def load_data_with_departureIndex(departureIndex,
                                  filePrefix="BCN_BUD",
                                  dataset="Specific"):
    """
    Given the departureIndex, return the records with that specific departure date from the chosen dataset.
    """
    datas = load_data_with_prefix_and_dataset(filePrefix, dataset)

    # collect the distinct departure dates for this flight number,
    # so Q values can be computed per (flight number, departure date) pair
    departureDates = [data["Date"] for data in datas]
    departureDates = util.remove_duplicates(departureDates)

    # choose the departure date by departureIndex
    departureDate = departureDates[departureIndex]
    print "Evaluating departure date " + departureDate + "..."
    """
    # remove duplicate observedDate-departureDate pair
    observedDates = []
    [observedDates.append(data["ObservedDate"]) for data in datas if data["Date"]==departureDate]
    observedDates = util.remove_duplicates(observedDates)
    states = len(observedDates)
    #print states
    """

    specificDatas = [data for data in datas if data["Date"] == departureDate]

    #states = []
    #[states.append(data["State"]) for data in specificDatas]
    #print max(states)

    return specificDatas
Example #3
def filter_segments_spt(segments, max_dist_error, max_speed_error):
    new_segments = []
    for segment in segments:
        points = util.remove_duplicates(segment.points)
        new_segments.append(
            GPXTrackSegment(util.spt(points, max_dist_error, max_speed_error)))
    return new_segments
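A possible driver for filter_segments_spt, sketched under the assumption that the segments come from a GPX file parsed with gpxpy (as in the execute_with_gis example further down); the file name and the two error thresholds are illustrative assumptions, not values from the source:

import gpxpy

# Assumed thresholds; they play the same role as MAX_DIST_ERROR and
# MAX_SPEED_ERROR in the execute_with_gis example below.
MAX_DIST_ERROR = 10.0
MAX_SPEED_ERROR = 1.0

# Parse one GPX file (illustrative file name) and simplify every track.
# filter_segments_spt, defined above, is assumed to be in scope.
with open("track.gpx", "r") as gpx_file:
    gpx = gpxpy.parse(gpx_file)

for gpx_track in gpx.tracks:
    simplified = filter_segments_spt(gpx_track.segments,
                                     MAX_DIST_ERROR, MAX_SPEED_ERROR)
    # simplified is a list of GPXTrackSegment objects holding the
    # de-duplicated, SPT-simplified points of each original segment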
Example #4
def main():
    parser = argparse.ArgumentParser()
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)
    dataset, max_length, label_length = reader.read_trigger_txt(
        conf.trigger_file, -1)

    reader.merge_labels(dataset)

    trains = reader.read_txt(conf.train_all_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)
    print(len(dataset))
    if conf.context_emb == ContextEmb.bert:
        print('Loading the BERT vectors for all datasets.')
        conf.context_emb_size = load_bert_vec(
            conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

    # setting for data
    conf.use_iobes(trains)
    conf.use_iobes(dataset)
    conf.use_iobes(devs)
    conf.use_iobes(tests)

    conf.optimizer = opt.trig_optimizer
    conf.build_label_idx(dataset)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(dataset)
    conf.map_insts_ids(trains)
    conf.map_insts_ids(devs)
    conf.map_insts_ids(tests)

    dataset = reader.trigger_percentage(dataset, conf.percentage)
    encoder = SoftMatcher(conf, label_length)
    trainer = SoftMatcherTrainer(encoder, conf, devs, tests)

    # matching module training
    random.shuffle(dataset)
    trainer.train_model(conf.num_epochs_soft, dataset)
    logits, predicted, triggers = trainer.get_triggervec(dataset)
    # all the trigger vectors, trigger type, string name of the trigger
    triggers_remove = remove_duplicates(logits, predicted, triggers, dataset)

    numbers = int(len(trains) * (1 - opt.unlabeled_percentage))
    print("number of train instances : ", numbers)
    initial_trains = trains[:numbers]
    unlabeled_x = trains[numbers:]

    for data in unlabeled_x:
        data.output_ids = None

    # sequence labeling module self-training
    random.shuffle(dataset)
    inference = SoftSequence(conf, encoder)
    sequence_trainer = SoftSequenceTrainer(inference, conf, devs, tests,
                                           triggers_remove)
    sequence_trainer.self_training(conf.num_epochs, dataset, unlabeled_x)
Example #5
 def get_all(self, property, collection='sentences', pair_container=False):
     lst = []
     if isinstance(collection, str):
         c = getattr(self, collection)
     else:
         c = collection
     for item in c:
         if not pair_container:
             lst += getattr(item, property)
         else:
             lst += [(item, i) for i in getattr(item, property)]
     return util.remove_duplicates(lst)
Example #6
    def _clean_mentions_and_coref(self):
        # fixing the hierarchy of mentions
        # fixing the mention tags
        if True:
            # TODO this doesn't work with the current annotations for AAAI, used for SIGDIAL
            token_to_mention_dict = collections.defaultdict(list)
            for mention in self.document.get_all_mentions():
                for j in mention.tokens:
                    token_to_mention_dict[j].append(mention)

            mention_groups = util.remove_duplicates(
                [tuple(i) for i in token_to_mention_dict.values()])
            for mentions in mention_groups:
                if len(mentions) == 1:
                    mentions[0].is_independent = True
                else:
                    logger.info("Multiple mentions in a token: %d" %
                                len(mentions))
                    mentions = self._clean_mentions_set_hierarchy(mentions)
                    self._clean_mentions_set_tags(mentions)
                    for mention in mentions:
                        if not mention.is_independent:
                            self.document.remove_mention(mention)
        # fixing split coreference groups
        for entity in self.document.coreference.entities:
            if entity.number_of_distinct_coref_groups() > 1:
                groups = entity.distinct_coref_groups()
                head = groups.pop(0)
                for group in groups:
                    util.union_list_without_duplicates(head.mentions,
                                                       group.mentions)
                    self.document.coreference.remove_coref_and_entity(group.id)

        # create singleton coreference groups
        mentions = set(self.document.get_all_mentions())
        for coref in self.document.coreference.get_coreference_groups():
            for mention in coref.mentions:
                try:
                    mentions.remove(mention)
                except (ValueError, KeyError):
                    logger.warning(
                        "When removing coreference, mention not found")
        logger.info("Singleton mentions %d" % len(mentions))
        for mention in mentions:
            if 'CH' in mention.get_taxonomy(
                    voz.entitymanager.TaxonomyContainer.TAXONOMY_NONCHARACTER):
                logger.info("Singleton mention character %s" % mention)

        pass
def get_departure_len(filePrefix="BCN_BUD", dataset="Specific"):
    """
    So far used only in QLearning; return the number of distinct departure dates in the chosen dataset.
    """
    datas = load_data_with_prefix_and_dataset(filePrefix, dataset)

    # collect the distinct departure dates for this flight number,
    # so Q values can be computed per (flight number, departure date) pair
    departureDates = [data["Date"] for data in datas]
    departureDates = util.remove_duplicates(departureDates)

    return len(departureDates)
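A minimal sketch of how these two helpers could be combined to walk over every departure date in a dataset; the import line and module name are hypothetical, and the loop body is purely illustrative:

# Hypothetical driver loop. Assumes get_departure_len and
# load_data_with_departureIndex are importable from the module that defines
# them; the module name "load_data" is an assumption, not from the source.
from load_data import get_departure_len, load_data_with_departureIndex

n_departures = get_departure_len(filePrefix="BCN_BUD", dataset="Specific")
for departureIndex in range(n_departures):
    specificDatas = load_data_with_departureIndex(departureIndex,
                                                  filePrefix="BCN_BUD",
                                                  dataset="Specific")
    # every record in specificDatas now shares the same departure date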
def check_if_only_one_flightNum(datas):
    """
    Check whether the datas contain only one flight number.
    :param datas: input data
    :return: True if the datas contain only one flight number, False otherwise
    """
    kinds = []
    for data in datas:
        kinds += data["Flights"]

    flightNums = []
    for kind in kinds:
        flightNums.append(kind["FlightNumber"])

    return len(util.remove_duplicates(flightNums)) == 1
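A hedged follow-up sketch of how this check might guard the per-departure records returned by load_data_with_departureIndex; it assumes those records carry the "Flights" list this function expects, and that both helpers are in scope:

# Sketch only; assumes each record has a "Flights" list of dicts with a
# "FlightNumber" key, as check_if_only_one_flightNum expects.
specificDatas = load_data_with_departureIndex(0, filePrefix="BCN_BUD",
                                              dataset="Specific")
if not check_if_only_one_flightNum(specificDatas):
    print("Multiple flight numbers for this departure date; split the records"
          " by flight number before computing Q values.")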
Example #11
def get_reversed_hosts(value, extensive):
    
    source1 = source.get_reverse_from_yougetsignal(value, extensive)
    source2 = source.get_reverse_from_logontube(value, extensive)
    
    domains=[]
    error=False
    if source1:
        domains = domains + source1
    else:
        error=True
    if source2:
        domains = domains + source2
    else:
        error=True
    if error:
        logger.warning('[*] One source responded badly: Reverse ip lookup may be inaccurate')
    domains = util.remove_duplicates(domains)
    domains = util.sort(domains)
    return domains
Example #12
def execute_with_gis(directory):
    # Start ORM engine and get Session
    engine = create_engine('postgresql://*****:*****@localhost/hiking', echo=False)
    Session = sessionmaker(bind=engine)
    session = Session()

    # Create table (drop if already exists)
    if model.Segment.__table__.exists(engine):
        model.Segment.__table__.drop(engine)
    model.Segment.__table__.create(engine)

    # Parsing an existing gpx file
    for filename in glob.glob(directory):
        gpx = gpxpy.parse(open(filename, 'r'))

        for gpx_track in gpx.tracks:

            # Analyze each segment of track
            for segment_id, segment in enumerate(gpx_track.segments):

                # Remove points with same timestamp, if they are consecutive
                points = util.remove_duplicates(segment.points)

                # Simplify using SPT algorithm
                new_points_spt = util.spt(points, MAX_DIST_ERROR, MAX_SPEED_ERROR)

                # Apply segmentation using turning points
                new_lines = splitter.bearing_splitter(new_points_spt, DEGREE_THRESHOLD, MIN_LENGTH)

                # Create geometry and store in GIS
                for new_line in new_lines:
                    ls = LineString(new_line)

                    # Store segment in GIS
                    gis_segment = model.Segment(name=gpx_track.name, geom=ls.wkb_hex)
                    session.add(gis_segment)

    # Save changes
    session.commit()
Example #13
def get_reversed_hosts(value, extensive):

    source1 = source.get_reverse_from_yougetsignal(value, extensive)
    source2 = source.get_reverse_from_logontube(value, extensive)

    domains = []
    error = False
    if source1:
        domains = domains + source1
    else:
        error = True
    if source2:
        domains = domains + source2
    else:
        error = True
    if error:
        logger.warning(
            '[*] One source responded badly: Reverse ip lookup may be inaccurate'
        )
    domains = util.remove_duplicates(domains)
    domains = util.sort(domains)
    return domains
Example #14
 def remove_duplicates(self):
     return Drawing(util.remove_duplicates(self.paths))
Example #15
 def test_remove_duplicates(self):
     self.longMessage = True
     self.assertEqual(util.remove_duplicates([]), [], 'empty list failed')
     self.assertEqual(util.remove_duplicates([1, 1]), [1])
     self.assertItemsEqual(util.remove_duplicates([2, 2, 1, 3, 1, 2]), [1, 2, 3])
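The implementation of util.remove_duplicates itself is not shown on this page; a minimal order-preserving version consistent with the test above, assuming the items are hashable, might look like this:

def remove_duplicates(items):
    """Keep only the first occurrence of each item, preserving input order."""
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result

# remove_duplicates([])                  -> []
# remove_duplicates([1, 1])              -> [1]
# remove_duplicates([2, 2, 1, 3, 1, 2])  -> [2, 1, 3]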
Example #16
        conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

# setting for data
conf.use_iobes(dataset)
conf.use_iobes(devs)
conf.use_iobes(tests)

conf.optimizer = opt.trig_optimizer
conf.build_label_idx(dataset)
conf.build_word_idx(dataset, devs, tests)
conf.build_emb_table()
conf.map_insts_ids(dataset)
conf.map_insts_ids(devs)
conf.map_insts_ids(tests)

encoder = SoftMatcher(conf, label_length)
trainer = SoftMatcherTrainer(encoder, conf, devs, tests)

# matching module training
random.shuffle(dataset)
trainer.train_model(conf.num_epochs_soft, dataset)
logits, predicted, triggers = trainer.get_triggervec(dataset)
triggers_remove = remove_duplicates(logits, predicted, triggers, dataset)

# sequence labeling module training
random.shuffle(dataset)
inference = SoftSequence(conf, encoder)
sequence_trainer = SoftSequenceTrainer(inference, conf, devs, tests,
                                       triggers_remove)
sequence_trainer.train_model(conf.num_epochs, dataset, True)
Example #17
def test_paths(prime_paths, N, E, N0, Nf):
    edges = edges_to_dict(N, E)
    tests = extend_to_n0(prime_paths, edges, N0)
    tests = extend_to_nf(tests, edges, Nf)
    return remove_duplicates(tests)
Example #18
 def remove_duplicates(self):
     return Drawing(util.remove_duplicates(self.paths))
Example #19
 def remove_duplicate_samples(self):
     '''Remove samples that appear more than once in the sample set.'''
     self.samples = util.remove_duplicates(self.samples)
Example #20
            bad_pages.append(i)
            break

    if i == 63: # on page 63, the word "I" will be filtered by 'A' <= 'I' <= 'Z'
        idx = filtered_page.index("11")
        filtered_page.insert(idx + 1, "I")
        bad_pages.remove(i)
    
    grouped = group_lst(filtered_page, 3)
    try:
        sorted(grouped, key=lambda entry: int(entry[0]))
    except ValueError:
        print("Can't sort on page", i)
        pprint(grouped)
        exit()
    entries.extend(grouped)
    
    if test_page > 0:
        pprint(grouped)
        break
    
if bad_pages == []:
    sorted_entries = sorted(remove_duplicates(entries), key=lambda entry: int(entry[0]))
    str_entries = ['\t'.join(entry) for entry in sorted_entries]
    write_lines("word_freq_list.txt", str_entries)
    print("Number of words:", len(str_entries))
    print("Supposed number of words:", sorted_entries[-1][0])
else:
    print("Bad pages:", bad_pages)