def load_data_with_departureIndex(departureIndex, filePrefix="BCN_BUD", dataset="Specific"):
    """
    Given the departureIndex, return the records with that specific departure date
    from the chosen dataset.
    """
    datas = load_data_with_prefix_and_dataset(filePrefix, dataset)

    # Get the distinct departure dates for the same flight number,
    # to compute the Q values for each (flight number, departure date) pair.
    departureDates = [data["Date"] for data in datas]
    departureDates = util.remove_duplicates(departureDates)

    # Choose the departure date by departureIndex.
    departureDate = departureDates[departureIndex]
    print "Evaluating departure date " + departureDate + "..."

    # Keep only the records for the chosen departure date.
    specificDatas = [data for data in datas if data["Date"] == departureDate]
    return specificDatas
def filter_segments_spt(segments, max_dist_error, max_speed_error):
    new_segments = []
    for segment in segments:
        points = util.remove_duplicates(segment.points)
        new_segments.append(
            GPXTrackSegment(util.spt(points, max_dist_error, max_speed_error)))
    return new_segments
def main():
    parser = argparse.ArgumentParser()
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    dataset, max_length, label_length = reader.read_trigger_txt(conf.trigger_file, -1)
    reader.merge_labels(dataset)

    trains = reader.read_txt(conf.train_all_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)
    print(len(dataset))

    if conf.context_emb == ContextEmb.bert:
        print('Loading the BERT vectors for all datasets.')
        conf.context_emb_size = load_bert_vec(
            conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

    # setting for data
    conf.use_iobes(trains)
    conf.use_iobes(dataset)
    conf.use_iobes(devs)
    conf.use_iobes(tests)

    conf.optimizer = opt.trig_optimizer
    conf.build_label_idx(dataset)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(dataset)
    conf.map_insts_ids(trains)
    conf.map_insts_ids(devs)
    conf.map_insts_ids(tests)

    dataset = reader.trigger_percentage(dataset, conf.percentage)

    encoder = SoftMatcher(conf, label_length)
    trainer = SoftMatcherTrainer(encoder, conf, devs, tests)

    # matching module training
    random.shuffle(dataset)
    trainer.train_model(conf.num_epochs_soft, dataset)
    logits, predicted, triggers = trainer.get_triggervec(dataset)
    # all the trigger vectors, trigger type, string name of the trigger
    triggers_remove = remove_duplicates(logits, predicted, triggers, dataset)

    numbers = int(len(trains) * (1 - opt.unlabeled_percentage))
    print("number of train instances : ", numbers)
    initial_trains = trains[:numbers]
    unlabeled_x = trains[numbers:]
    for data in unlabeled_x:
        data.output_ids = None

    # sequence labeling module self-training
    random.shuffle(dataset)
    inference = SoftSequence(conf, encoder)
    sequence_trainer = SoftSequenceTrainer(inference, conf, devs, tests, triggers_remove)
    sequence_trainer.self_training(conf.num_epochs, dataset, unlabeled_x)
def get_all(self, property, collection='sentences', pair_container=False):
    lst = []
    if isinstance(collection, str):
        c = getattr(self, collection)
    else:
        c = collection
    for item in c:
        if not pair_container:
            lst += getattr(item, property)
        else:
            lst += [(item, i) for i in getattr(item, property)]
    return util.remove_duplicates(lst)
def _clean_mentions_and_coref(self):
    # fixing the hierarchy of mentions
    # fixing the mention tags
    if True:
        # TODO this doesn't work with the current annotations for AAAI, used for SIGDIAL
        token_to_mention_dict = collections.defaultdict(list)
        for mention in self.document.get_all_mentions():
            for j in mention.tokens:
                token_to_mention_dict[j].append(mention)
        mention_groups = util.remove_duplicates(
            [tuple(i) for i in token_to_mention_dict.values()])
        for mentions in mention_groups:
            if len(mentions) == 1:
                mentions[0].is_independent = True
            else:
                logger.info("Multiple mentions in a token: %d" % len(mentions))
                mentions = self._clean_mentions_set_hierarchy(mentions)
                self._clean_mentions_set_tags(mentions)
            for mention in mentions:
                if not mention.is_independent:
                    self.document.remove_mention(mention)

    # fixing split coreference groups
    for entity in self.document.coreference.entities:
        if entity.number_of_distinct_coref_groups() > 1:
            groups = entity.distinct_coref_groups()
            head = groups.pop(0)
            for group in groups:
                util.union_list_without_duplicates(head.mentions, group.mentions)
                self.document.coreference.remove_coref_and_entity(group.id)

    # create singleton coreference groups
    mentions = set(self.document.get_all_mentions())
    for coref in self.document.coreference.get_coreference_groups():
        for mention in coref.mentions:
            try:
                mentions.remove(mention)
            except ValueError:
                logger.warning("When removing coreference, mention not found")
            except KeyError:
                logger.warning("When removing coreference, mention not found")
    logger.info("Singleton mentions %d" % len(mentions))
    for mention in mentions:
        if 'CH' in mention.get_taxonomy(
                voz.entitymanager.TaxonomyContainer.TAXONOMY_NONCHARACTER):
            logger.info("Singleton mention character %s" % mention)
            pass
def get_departure_len(filePrefix="BCN_BUD", dataset="Specific"):
    """
    So far used in QLearning; return the total number of departure dates
    in the chosen dataset.
    """
    datas = load_data_with_prefix_and_dataset(filePrefix, dataset)

    # Get the distinct departure dates for the same flight number,
    # to compute the Q values for each (flight number, departure date) pair.
    departureDates = [data["Date"] for data in datas]
    departureDates = util.remove_duplicates(departureDates)
    return len(departureDates)
def check_if_only_one_flightNum(datas):
    """
    Check whether the datas contain only one flight number.
    :param datas: input data
    :return: True if the datas contain only one flight number, False otherwise
    """
    kinds = []
    for data in datas:
        kinds += data["Flights"]

    flightNums = [kind["FlightNumber"] for kind in kinds]
    return len(util.remove_duplicates(flightNums)) == 1
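The function above only assumes that each record carries a "Flights" list whose items expose a "FlightNumber" key. For illustration, here is a hypothetical call showing that shape; the flight numbers are invented, not data from the project.

# Hypothetical input; flight numbers are made up for illustration only.
datas = [
    {"Flights": [{"FlightNumber": "VY8756"}]},
    {"Flights": [{"FlightNumber": "VY8756"}, {"FlightNumber": "VY8756"}]},
]
assert check_if_only_one_flightNum(datas)  # only one distinct flight number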
def get_reversed_hosts(value, extensive):
    source1 = source.get_reverse_from_yougetsignal(value, extensive)
    source2 = source.get_reverse_from_logontube(value, extensive)

    domains = []
    error = False

    if source1:
        domains = domains + source1
    else:
        error = True

    if source2:
        domains = domains + source2
    else:
        error = True

    if error:
        logger.warning('[*] One source responded badly: Reverse ip lookup may be inaccurate')

    domains = util.remove_duplicates(domains)
    domains = util.sort(domains)
    return domains
def execute_with_gis(directory):
    # Start ORM engine and get Session
    engine = create_engine('postgresql://*****:*****@localhost/hiking', echo=False)
    Session = sessionmaker(bind=engine)
    session = Session()

    # Create table (drop if it already exists)
    if model.Segment.__table__.exists(engine):
        model.Segment.__table__.drop(engine)
    model.Segment.__table__.create(engine)

    # Parse the existing gpx files
    for filename in glob.glob(directory):
        gpx = gpxpy.parse(open(filename, 'r'))
        for gpx_track in gpx.tracks:
            # Analyze each segment of the track
            for segment_id, segment in enumerate(gpx_track.segments):
                # Remove points with the same timestamp, if they are consecutive
                points = util.remove_duplicates(segment.points)
                # Simplify using the SPT algorithm
                new_points_spt = util.spt(points, MAX_DIST_ERROR, MAX_SPEED_ERROR)
                # Apply segmentation using turning points
                new_lines = splitter.bearing_splitter(new_points_spt, DEGREE_THRESHOLD, MIN_LENGTH)
                # Create geometry and store in GIS
                for new_line in new_lines:
                    ls = LineString(new_line)
                    # Store segment in GIS
                    gis_segment = model.Segment(name=gpx_track.name, geom=ls.wkb_hex)
                    session.add(gis_segment)

    # Save changes
    session.commit()
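The comment above describes util.remove_duplicates as dropping consecutive track points that share a timestamp. That helper is not shown in these excerpts, so the following is only a minimal sketch of such a filter under that assumption; the function name is hypothetical, and points are assumed to expose a time attribute as gpxpy track points do.

def drop_consecutive_duplicate_times(points):
    """Keep a point only when its timestamp differs from the last kept point's.

    Hypothetical stand-in for the consecutive-duplicate filtering described above.
    """
    filtered = []
    for point in points:
        if not filtered or point.time != filtered[-1].time:
            filtered.append(point)
    return filtered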
def remove_duplicates(self):
    return Drawing(util.remove_duplicates(self.paths))
def test_remove_duplicates(self):
    self.longMessage = True
    self.assertEqual(util.remove_duplicates([]), [], 'empty list failed')
    self.assertEqual(util.remove_duplicates([1, 1]), [1])
    self.assertItemsEqual(util.remove_duplicates([2, 2, 1, 3, 1, 2]), [1, 2, 3])
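The test above pins down the contract of util.remove_duplicates: it returns a list containing each distinct element once (ordering is only checked loosely via assertItemsEqual). A minimal implementation consistent with that test might look like the sketch below; it assumes hashable items and is not necessarily the project's actual helper, since other snippets here (e.g. the GPX one) may rely on different duplicate semantics.

def remove_duplicates(items):
    """Return a new list keeping only the first occurrence of each item.

    Sketch of an order-preserving dedup; assumes the items are hashable.
    """
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique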
conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset) # setting for data conf.use_iobes(dataset) conf.use_iobes(devs) conf.use_iobes(tests) conf.optimizer = opt.trig_optimizer conf.build_label_idx(dataset) conf.build_word_idx(dataset, devs, tests) conf.build_emb_table() conf.map_insts_ids(dataset) conf.map_insts_ids(devs) conf.map_insts_ids(tests) encoder = SoftMatcher(conf, label_length) trainer = SoftMatcherTrainer(encoder, conf, devs, tests) # matching module training random.shuffle(dataset) trainer.train_model(conf.num_epochs_soft, dataset) logits, predicted, triggers = trainer.get_triggervec(dataset) triggers_remove = remove_duplicates(logits, predicted, triggers, dataset) # sequence labeling module training random.shuffle(dataset) inference = SoftSequence(conf, encoder) sequence_trainer = SoftSequenceTrainer(inference, conf, devs, tests, triggers_remove) sequence_trainer.train_model(conf.num_epochs, dataset, True)
def test_paths(prime_paths, N, E, N0, Nf):
    edges = edges_to_dict(N, E)
    tests = extend_to_n0(prime_paths, edges, N0)
    tests = extend_to_nf(tests, edges, Nf)
    return remove_duplicates(tests)
def remove_duplicate_samples(self):
    '''Remove samples that appear more than once in the sample set.'''
    self.samples = util.remove_duplicates(self.samples)
            bad_pages.append(i)
            break

        if i == 63:
            # on page 63, the word "I" will be filtered by 'A' <= 'I' <= 'Z'
            idx = filtered_page.index("11")
            filtered_page.insert(idx + 1, "I")
            bad_pages.remove(i)

        grouped = group_lst(filtered_page, 3)
        try:
            sorted(grouped, key=lambda entry: int(entry[0]))
        except:
            print("Can't sort on page", i)
            pprint(grouped)
            exit()

        entries.extend(grouped)
        if test_page > 0:
            pprint(grouped)
            break

    if bad_pages == []:
        sorted_entries = sorted(remove_duplicates(entries),
                                key=lambda entry: int(entry[0]))
        str_entries = ['\t'.join(entry) for entry in sorted_entries]
        write_lines("word_freq_list.txt", str_entries)
        print("Number of words:", len(str_entries))
        print("Supposed number of words:", sorted_entries[-1][0])
    else:
        print("Bad pages:", bad_pages)