class NestedContainmentList(object):
    """Wrapper around ``NCLS`` that also supports an empty interval set.

    An instance built without ``starts`` or without ``indices`` holds no
    underlying ``NCLS`` object; every overlap query on it returns ``[]``.
    """

    def __init__(self, starts=None, ends=None, indices=None, reduce=False):
        """Build the underlying ``NCLS`` index.

        :param starts: Interval start coordinates, or None for an empty object.
        :param ends: Interval end coordinates. When None, each interval is
            given the end ``start + 1``.
        :param indices: Integer payload stored with each interval, or None for
            an empty object.
        :param reduce: When true, the (start, end, index) triples are passed
            through ``merge_overlaps`` before the index is built.
        """
        self.ncls = None
        if starts is not None and indices is not None:
            if ends is None:
                ends = [s + 1 for s in starts]
            if reduce:
                starts, ends, indices = list(
                    zip(*merge_overlaps(zip(starts, ends, indices))))
            starts = np.array(starts, dtype='i8')
            ends = np.array(ends, dtype='i8')
            indices = np.array(indices, dtype='i8')
            self.ncls = NCLS(starts, ends, indices)

    def find_overlaps(self, start, end):
        """Return an ``Interval`` for every stored interval hit by the query.

        :param start: Query start coordinate.
        :param end: Query end coordinate.
        :return: List of ``Interval`` objects; empty when nothing overlaps or
            when this object holds no intervals.
        """
        if self.ncls is None:
            # we allow for empty objects, in which case nothing overlaps
            # use case: non-matching seqids
            return []
        return [
            Interval(*overlap)
            for overlap in self.ncls.find_overlap(start, end)
        ]

    @staticmethod
    def from_intervals(intervals, reduce=False):
        """Build a list from an iterable of (start, end, index) triples.

        :param intervals: Iterable of 3-item records. May be empty, in which
            case an empty object is returned.
        :param reduce: Forwarded to the constructor.
        :return: A new ``NestedContainmentList``.
        """
        # Materialize first so generators can be tested for emptiness.
        intervals = list(intervals)
        if not intervals:
            # zip(*[]) yields nothing and cannot be unpacked into three names;
            # an empty input maps onto the supported "empty object" state.
            return NestedContainmentList(reduce=reduce)
        starts, ends, indices = zip(*intervals)
        starts = np.array(starts, dtype='i8')
        ends = np.array(ends, dtype='i8')
        indices = np.array(indices, dtype='i8')
        return NestedContainmentList(starts, ends, indices, reduce=reduce)
def test_ncls():
    """Check point and whole-range queries on the module-level fixture.

    Uses the module-level ``starts``, ``ends`` and ``ids`` arrays.
    """
    print(starts, ends, ids)

    tree = NCLS(starts, ends, ids)
    print(tree)
    print(tree.intervals())

    # A window before the first interval must not match anything.
    assert list(tree.find_overlap(0, 2)) == []

    # A window spanning the full 32-bit range must return both intervals.
    expected_hits = [(5, 6, 0), (2_147_483_645, 2_147_483_646, 3)]
    assert list(tree.find_overlap(0, 2_147_483_647)) == expected_hits

    # Querying the index with its own intervals returns both ids on each side.
    first_ids, second_ids = tree.all_overlaps_both(starts, ends, ids)
    assert list(first_ids) == [0, 3]
    assert list(second_ids) == [0, 3]
def test_ncls():
    """Check queries on a fixture whose coordinates reach the int64 limit.

    Uses the module-level ``starts``, ``ends`` and ``ids`` arrays.
    """
    print(starts, ends, ids)

    tree = NCLS(starts, ends, ids)
    print(tree)
    print(tree.intervals())

    # A window before the first interval must not match anything.
    assert list(tree.find_overlap(0, 2)) == []

    # Diagnostic output for a narrow window at the top of the range.
    top_window_hits = list(
        tree.find_overlap(9_223_372_036_854_775_805,
                          9_223_372_036_854_775_806))
    print("aaa", top_window_hits)

    # A window spanning almost the full 64-bit range returns both intervals.
    expected_hits = [
        (5, 6, 2147483647),
        (9223372036854775805, 9223372036854775807, 3),
    ]
    assert list(tree.find_overlap(0, 9_223_372_036_854_775_806)) == expected_hits

    # Querying the index with its own intervals returns both ids on each side.
    first_ids, second_ids = tree.all_overlaps_both(starts, ends, ids)
    assert list(first_ids) == [2147483647, 3]
    assert list(second_ids) == [2147483647, 3]
def filter_by_human_annotations(self, article, annotations):
    """Remove annotations that overlap one of the article's own annotations.

    :param article: Mapping whose ``'annotations'`` entry holds the reference
        annotations the candidates are compared against.
    :param annotations: Candidate annotations to filter.
    :return: Tuple ``(kept, dropped_count)``: the candidates with no
        overlapping reference annotation, and how many were removed.
    """
    reference_index = NCLS(*get_intervals(article['annotations']))

    kept = []
    dropped = 0
    for candidate in annotations:
        cand_start, cand_end = get_start_end(candidate)
        hits = list(reference_index.find_overlap(cand_start, cand_end))
        if not hits:
            kept.append(candidate)
            continue

        # Sanity-check the first hit: the third field of a hit is the
        # position of the reference annotation in article['annotations'].
        reference = article['annotations'][hits[0][2]]
        ref_start, ref_end = get_start_end(reference)
        assert intersect(ref_start, ref_end, cand_start, cand_end)
        dropped += 1

    assert len(kept) + dropped == len(annotations)
    return kept, dropped
class MultiNode(Node):
    """A node composed of several sub-nodes, indexed by their timestep ranges."""

    def __init__(self, node_type, node_id, nodes_list, is_robot=False):
        """Wrap ``nodes_list`` and index each sub-node's timestep range.

        :param node_type: Forwarded to ``Node.__init__``.
        :param node_id: Forwarded to ``Node.__init__``.
        :param nodes_list: Sub-nodes; each gets its ``is_robot`` flag overwritten.
        :param is_robot: Robot flag applied to this node and every sub-node.
        """
        super(MultiNode, self).__init__(node_type,
                                        node_id,
                                        data=None,
                                        is_robot=is_robot)
        self.nodes_list = nodes_list
        for member in self.nodes_list:
            member.is_robot = is_robot

        self.first_timestep = min(member.first_timestep
                                  for member in self.nodes_list)
        self._last_timestep = max(member.last_timestep
                                  for member in self.nodes_list)

        # One interval per sub-node; the stored id is its position in nodes_list.
        range_starts = np.array(
            [member.first_timestep for member in self.nodes_list],
            dtype=np.int64)
        range_ends = np.array(
            [member.last_timestep for member in self.nodes_list],
            dtype=np.int64)
        positions = np.arange(len(self.nodes_list), dtype=np.int64)
        self.interval_tree = NCLS(range_starts, range_ends, positions)

    @staticmethod
    def find_non_overlapping_nodes(nodes_list, min_timesteps=1) -> list:
        """
        Greedily pick nodes, in order of last timestep, that start no earlier
        than the last timestep of the previously picked node.

        :param nodes_list: Candidate nodes.
        :param min_timesteps: Minimum ``timesteps`` a node needs to be picked.
        :return: A list of the picked nodes.
        """
        picked = []
        frontier = 0
        for candidate in sorted(nodes_list, key=lambda n: n.last_timestep):
            if (candidate.first_timestep < frontier
                    or candidate.timesteps < min_timesteps):
                continue
            # Include the node
            picked.append(candidate)
            frontier = candidate.last_timestep
        return picked

    def get_node_at_timesteps(self, scene_ts) -> Node:
        """Return a randomly chosen sub-node overlapping ``scene_ts``.

        :param scene_ts: Pair of scene timesteps (first, last).
        :return: One matching sub-node, or a NaN-filled node with id
            ``'EMPTY'`` when no sub-node matches.
        """
        matches = list(
            self.interval_tree.find_overlap(scene_ts[0], scene_ts[1] + 1))
        if matches:
            chosen_position = random.choice(matches)[2]
            return self.nodes_list[chosen_position]

        return Node(node_type=self.type,
                    node_id='EMPTY',
                    data=self.nodes_list[0].data * np.nan,
                    is_robot=self.is_robot)

    def scene_ts_to_node_ts(self, scene_ts) -> (Node, np.ndarray, int, int):
        """
        Transforms timestamp from scene into timeframe of node data.

        :param scene_ts: Scene timesteps
        :return: node: The randomly chosen matching sub-node,
                 ts: Transformed timesteps,
                 paddingl: Number of timesteps in scene range which are not
                     available in node data before data is available.
                 paddingu: Number of timesteps in scene range which are not
                     available in node data after data is available.
                 All four are None when no sub-node matches.
        """
        matches = list(
            self.interval_tree.find_overlap(scene_ts[0], scene_ts[1] + 1))
        if not matches:
            return None, None, None, None

        node = self.nodes_list[random.choice(matches)[2]]

        paddingl = (node.first_timestep - scene_ts[0]).clip(0)
        paddingu = (scene_ts[1] - node.last_timestep).clip(0)
        clipped = np.array(scene_ts).clip(min=node.first_timestep,
                                          max=node.last_timestep)
        ts = clipped - node.first_timestep
        return node, ts, paddingl, paddingu

    def get(self, tr_scene, state, padding=np.nan) -> np.ndarray:
        """Return the requested state over a scene time range, padded.

        :param tr_scene: Scene timestep or inclusive (first, last) pair.
        :param state: Mapping whose values' lengths sum to the state width;
            also used to index the columns of the sub-node data.
        :param padding: Fill value for timesteps without data.
        :return: Array with one row per timestep in the range.
        """
        if tr_scene.size == 1:
            tr_scene = np.array([tr_scene, tr_scene])
        length = tr_scene[1] - tr_scene[0] + 1  # tr is inclusive

        node, tr, paddingl, paddingu = self.scene_ts_to_node_ts(tr_scene)
        if node is None:
            state_length = sum(len(entity_dims)
                               for entity_dims in state.values())
            return np.full((length, state_length), fill_value=padding)

        data_array = node.data[tr[0]:tr[1] + 1, state]
        result = np.full((length, data_array.shape[1]), fill_value=padding)
        result[paddingl:length - paddingu] = data_array
        return result

    def get_all(self, tr_scene, state, padding=np.nan) -> np.ndarray:
        """Return the data of every sub-node inside one scene-sized array.

        :param tr_scene: Inclusive (0, last) scene range covering this node.
        :param state: Mapping whose values' lengths sum to the state width;
            also used to index the columns of each sub-node's data.
        :param padding: Fill value for timesteps without data.
        :return: Array with one row per timestep in the range.
        """
        # Assumption here is that the user is asking for all of the data in
        # this MultiNode and to return it within a full scene-sized output
        # array.
        assert (tr_scene.size == 2 and tr_scene[0] == 0
                and self.last_timestep <= tr_scene[1])
        length = tr_scene[1] - tr_scene[0] + 1  # tr is inclusive
        state_length = sum(len(entity_dims) for entity_dims in state.values())

        result = np.full((length, state_length), fill_value=padding)
        for member in self.nodes_list:
            first, last = member.first_timestep, member.last_timestep
            result[first:last + 1] = member.data[:, state]
        return result

    def history_points_at(self, ts) -> int:
        """
        Number of history points in trajectory. Timestep is exclusive.

        :param ts: Scene timestep where the number of history points are queried.
        :return: Number of history timesteps.
        """
        first_match = next(self.interval_tree.find_overlap(ts, ts + 1))
        node = self.nodes_list[first_match[2]]
        return ts - node.first_timestep

    @property
    def timesteps(self) -> int:
        """
        Number of available timesteps for node.

        :return: Number of available timesteps.
        """
        return self._last_timestep - self.first_timestep + 1
# Run the same bulk query and single-interval query against four interval
# libraries, using the (starts1, ends1, ids1) set as the index and the
# (starts2, ends2, ids2) set as the queries.

# Test AIList
i = AIList()
i.from_array(starts1, ends1, ids1, values1)
i.construct()
ai_res = i.intersect_from_array(starts2, ends2, ids2)
i.intersect(starts2[50], ends2[50])

# Test NCLS
n = NCLS(starts1, ends1, ids1)
n_res = n.all_overlaps_both(starts2, ends2, ids2)
list(n.find_overlap(starts2[50], ends2[50]))

# Test pandas
p = pd.IntervalIndex.from_tuples(list(zip(starts1, ends1)))
p.overlaps(pd.Interval(starts2[50], ends2[50]))

# Test quicksect
b = quicksect.IntervalTree()
# Iterate the coordinate pairs directly; an index variable named `i` would
# overwrite the AIList object bound to `i` above.
for start1, end1 in zip(starts1, ends1):
    b.add(start1, end1)
b.search(starts2[50], ends2[50])

# NOTE(review): `milo` is not defined anywhere in the visible code, so this
# assignment raises NameError unless it is defined elsewhere - confirm or remove.
KIFYH5 = milo
from ncls import NCLS
import numpy as np

# Load whitespace-separated (start, end, id) rows from data.txt, stopping
# after the first row whose id exceeds 70000, then query one window.
starts = []
ends = []
ids = []

with open('data.txt') as fh:
    for line in fh:
        cols = line.strip().split()
        if not cols:
            # Skip blank lines instead of failing on cols[0].
            continue
        interval_id = int(cols[2])
        starts.append(int(cols[0]))
        ends.append(int(cols[1]))
        ids.append(interval_id)
        if interval_id > 70000:
            break

# np.long was removed in NumPy 1.24 and is platform-dependent where it
# exists; request 64-bit integers explicitly.
starts = np.array(starts, dtype=np.int64)
ends = np.array(ends, dtype=np.int64)
ids = np.array(ids, dtype=np.int64)

ncls = NCLS(starts, ends, ids)

for i in ncls.find_overlap(76623690, 76624000):
    print(i)
from ncls import NCLS
import pickle
import pandas as pd
import numpy as np

# Build an index over random intervals, pickle it to disk, load it back and
# run the same query on both objects.
starts = np.random.randint(0, int(1e8), int(1e7))
ends = starts + 100
ids = starts

ncls = NCLS(starts, ends, ids)

for i in ncls.find_overlap(0, 2):
    print(i)

# Close the file before reading it back so the pickle is fully flushed.
with open("test.pckl", "wb") as fh:
    pickle.dump(ncls, fh)

# NOTE: pickle.load is only safe here because the file was written just above.
with open("test.pckl", "rb") as fh:
    ncls2 = pickle.load(fh)

for i in ncls2.find_overlap(0, 2):
    print(i)
def __call__(self, path):
    """Process one file of JSON-encoded articles, one article per line.

    Each article's ``'el'`` annotations are cleaned and filtered, merged with
    the article's own ``'annotations'``, and assigned to the sentences that
    fully contain them.

    When ``self.entity_vocab`` is None, only the annotated entity URIs are
    counted. Otherwise each kept sentence is encoded and written to an
    indexed dataset, and word-level annotation records are collected.

    :param path: Path of the UTF-8 input file.
    :return: A 13-tuple: the entity ``Counter`` (counting mode) or the output
        file prefix (dataset mode); the int64 annotation array or None; the
        total encoded length or 0; the document, sentence and annotation
        counts; and the seven per-filter removal counters.
    """
    global vocab
    global entities

    num_annotations, num_sentences, num_documents = 0, 0, 0
    total_length = 0
    num_filtered_xao = 0
    num_filtered_by_candidate_set, num_filtered_by_human_annotations, num_filtered_by_self_overlaps = 0, 0, 0
    num_filtered_by_crossing_sentence_boundaries, num_filtered_solo_annotion_in_sentence = 0, 0
    num_filtered_by_entity_vocab = 0

    # Item appended after every article when a dataset is being built.
    empty_line_tensor = vocab.encode_line(line='', append_eos=self.append_eos)
    assert len(empty_line_tensor) == int(self.append_eos)

    if self.entity_vocab is None:
        annotation_entities = Counter()
    else:
        output_prefix = self.generate_tmp_filename()
        dataset_builder = indexed_dataset.make_builder(
            output_prefix + '.text.bin',
            impl=self.dataset_impl,
            vocab_size=len(vocab),
        )
        annotations_list = list()

    with codecs.open(path, 'r', 'utf8') as f:
        for line in f:
            # Strip the line terminator only if there is one: slicing off the
            # last character unconditionally corrupts a final record that is
            # not followed by a newline.
            article = json.loads(line.rstrip())
            annotations = article['el']

            article['annotations'], _num_filtered_xao = self.fix_annotations(
                article['annotations'])
            num_filtered_xao += _num_filtered_xao
            annotations, _num_filtered_xao = self.fix_annotations(annotations)
            num_filtered_xao += _num_filtered_xao

            annotations, _num_filtered_by_candidate_set = self.filter_by_candidate_set(
                article, annotations)
            annotations, _num_filtered_by_human_annotations = self.filter_by_human_annotations(
                article, annotations)
            annotations, _num_filtered_by_self_overlaps = self.filter_by_self_overlaps(
                annotations)

            annotations = article['annotations'] + annotations

            if self.entity_vocab is not None:
                annotations, _num_filtered_by_entity_vocab = self.filter_by_entity_vocab(
                    annotations)
                num_filtered_by_entity_vocab += _num_filtered_by_entity_vocab

            num_filtered_by_candidate_set += _num_filtered_by_candidate_set
            num_filtered_by_human_annotations += _num_filtered_by_human_annotations
            num_filtered_by_self_overlaps += _num_filtered_by_self_overlaps

            # Interval index over the merged annotations; the third field of
            # each hit is used below as a position in `annotations`.
            ncls = NCLS(*get_intervals(annotations))

            text = article['text'].replace(u'\xa0', u' ')

            for sentence, offset in self.split_into_sentences(text):
                sentence_begin = offset
                sentence_end = offset + len(sentence)
                assert sentence == text[sentence_begin:sentence_end]

                # Keep only annotations lying entirely inside the sentence.
                annotations_per_sentence = []
                for annotation_id in ncls.find_overlap(sentence_begin,
                                                       sentence_end):
                    annotation = annotations[annotation_id[2]]
                    start, end = get_start_end(annotation)
                    if sentence_begin <= start and end <= sentence_end:
                        annotations_per_sentence.append(annotation)
                    else:
                        num_filtered_by_crossing_sentence_boundaries += 1

                num_unique_entities = len({
                    annotation['uri']
                    for annotation in annotations_per_sentence
                })
                if num_unique_entities < self.min_entities_per_sentence:
                    num_filtered_solo_annotion_in_sentence += 1
                    continue

                num_annotations += len(annotations_per_sentence)

                if self.entity_vocab is None:
                    annotation_entities.update([
                        annotation['uri']
                        for annotation in annotations_per_sentence
                    ])
                else:
                    annotations_per_sentence = self.set_local_offsets(
                        offset, annotations_per_sentence)
                    fixed_sentence, annotations_per_sentence = self.strip_whitespaces(
                        sentence, annotations_per_sentence)
                    fixed_sentence, annotations_per_sentence = self.strip_double_whitespaces(
                        fixed_sentence, annotations_per_sentence)
                    fixed_sentence, annotations_per_sentence = self.add_margin_to_annotations(
                        fixed_sentence, annotations_per_sentence)
                    annotations_per_sentence = self.get_word_based_offsets(
                        fixed_sentence, annotations_per_sentence)
                    ids, annotations_per_sentence = self.apply_gt2_bpe(
                        fixed_sentence, annotations_per_sentence)

                    ids_tensor = vocab.encode_line(
                        line=' '.join(ids), append_eos=self.append_eos)
                    assert len(ids_tensor) == len(ids) + int(self.append_eos)
                    dataset_builder.add_item(ids_tensor)

                    # Word offsets are shifted by the running encoded length.
                    annotations_list.extend([[
                        x['start_word'] + total_length,
                        x['end_word'] + total_length,
                        num_sentences,
                        num_documents,
                        int(entities[x['uri']]),
                    ] for x in annotations_per_sentence])

                    total_length += len(ids_tensor)
                    # NOTE(review): counted only for sentences written to the
                    # dataset - confirm this is the intended scope.
                    num_sentences += 1

            if self.entity_vocab is not None:
                # The per-article empty item counts as a dataset item too.
                dataset_builder.add_item(empty_line_tensor)
                total_length += len(empty_line_tensor)
                num_sentences += 1

            num_documents += 1

    if self.entity_vocab is not None:
        dataset_builder.finalize(output_prefix + '.text.idx')
        annotations_list = np.array(annotations_list, dtype=np.int64)

    return (
        annotation_entities if self.entity_vocab is None else output_prefix,
        annotations_list if self.entity_vocab is not None else None,
        total_length if self.entity_vocab is not None else 0,
        num_documents,
        num_sentences,
        num_annotations,
        num_filtered_by_candidate_set,
        num_filtered_by_human_annotations,
        num_filtered_by_self_overlaps,
        num_filtered_by_crossing_sentence_boundaries,
        num_filtered_solo_annotion_in_sentence,
        num_filtered_xao,
        num_filtered_by_entity_vocab,
    )
from ncls import NCLS
import pandas as pd
import numpy as np

# Write a large index to binary files, then populate a second, tiny index
# from the written interval file and query it.
starts = pd.Series(range(int(1e7)))
ends = starts + 100
ids = starts

ncls = NCLS(
    starts.values,
    ends.values,
    ids.values,
)
ncls.write_binaries(b"hello")

# Placeholder index with a single interval; its contents are then built from
# the interval file written above.
ncls2 = NCLS(
    np.array([0]),
    np.array([2]),
    np.array([3]),
)
ncls2.buildFromUnsortedFile(b"hello.idb", n=int(1e7))

for i in ncls2.find_overlap(0, 500):
    print(i)