def setUp(self) -> None:
    intervals = []
    for i in range(0, 30, 5):
        intervals.append(Interval(i, i + 7))
    iv = IntervalTree()
    for f in intervals:
        iv.insert(f)
    self.intervals = intervals
    self.tree = iv
def test_1(self):
    tree = IntervalTree()
    tree.add(1, 3, 1.0)
    tree.add(3, 5, 2.0)
    tree.add(5, 7, 3.0)
    self.assertEqual(str(tree.search(3, 4)), '[Inv(3, 5, d=2.0)]')
    print(tree.search(2, 4))
    self.assertTrue('Inv(1, 3, d=1.0)' in str(tree.search(2, 4)))
    self.assertTrue('Inv(3, 5, d=2.0)' in str(tree.search(2, 4)))
    self.assertEqual(len(tree.search(2, 4)), 2)
    self.assertEqual(str(tree.search(5, 7)), '[Inv(5, 7, d=3.0)]')
class TestIssue9(unittest.TestCase):
    def setUp(self):
        self.tree4 = IntervalTree()
        self.tree4.insert(Interval(22, 33, data='example1'))
        self.tree4.insert(Interval(22, 33, data='example2'))

    def test_right(self):
        self.assertEqual(0, len(self.tree4.right(Interval(44, 55))))
        self.assertEqual(2, len(self.tree4.right(Interval(11, 12))))

    def test_left(self):
        self.assertEqual(2, len(self.tree4.left(Interval(44, 55))))
        self.assertEqual(0, len(self.tree4.left(Interval(11, 12))))
def addDeterminants(self, text, deter_rule, matches, match_begin, match_end, current_position):
    deter_rule = deter_rule[FastCNER.END]
    end = current_position if match_end == 0 else match_end
    # In case the rules were not configured properly, swapping the bounds
    # ensures a malformed match won't break the execution.
    if match_begin > end:
        match_begin, end = end, match_begin
    current_span = Span(match_begin + self.offset, end + self.offset, text[match_begin:end])
    overlap_checkers = self.overlap_checkers
    for key in deter_rule.keys():
        rule_id = deter_rule[key]
        if self.logger is not None:
            self.logger.debug('try add matched rule ({}-{})\t{}'.format(
                match_begin, match_end, str(self.rule_store[rule_id])))
        current_span.rule_id = rule_id
        if key in overlap_checkers:
            current_spans_list = matches[key]
            overlap_checker = overlap_checkers[key]
            overlapped_pos = overlap_checker.search(current_span.begin, current_span.end)
            if len(overlapped_pos) > 0:
                # An existing span overlaps the new one: keep whichever span
                # compareSpan prefers, and update the checker accordingly.
                pos = overlapped_pos.pop().data
                overlapped_span = current_spans_list[pos]
                if not self.compareSpan(current_span, overlapped_span):
                    continue
                current_spans_list[pos] = current_span
                overlap_checker.remove(Interval(current_span.begin, current_span.end))
                overlap_checker.add(current_span.begin, current_span.end, pos)
            else:
                overlap_checker.add(current_span.begin, current_span.end, len(current_spans_list))
                current_spans_list.append(current_span)
        else:
            # Start a fresh spans list for this key, so that different keys
            # never share (and corrupt) the same list.
            current_spans_list = []
            matches[key] = current_spans_list
            overlap_checker = IntervalTree()
            # quicksect's search includes both the lower and the upper bound,
            # so subtract one from the end to treat spans as half-open.
            overlap_checker.add(current_span.begin, current_span.end - 1, len(current_spans_list))
            current_spans_list.append(current_span)
            overlap_checkers[key] = overlap_checker
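# A minimal sketch of the half-open workaround described in the comment above
# (assumption: the underlying tree's search treats both bounds as inclusive,
# as the comment states for quicksect; the helper below is illustrative, not
# part of the library):
def spans_overlap(a_begin, a_end, b_begin, b_end):
    # Half-open spans [a_begin, a_end) and [b_begin, b_end) overlap iff each
    # starts before the other ends; storing end - 1 in an inclusive-bounds
    # tree reproduces exactly this behavior.
    return a_begin < b_end and b_begin < a_end

assert not spans_overlap(5, 8, 8, 10)  # touching spans do not overlap
assert spans_overlap(5, 8, 7, 10)      # genuinely overlapping spans do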
def test_error(self):
    iv = IntervalTree()
    iv.add(10, 17)
    iv.add(25, 32)
    iv.add(15, 22)
    iv.add(20, 27)
    print(iv.pretty_print())
    print(iv.search(0, 220))
def to_data_dict_on_types(concepts: List[Span],
                          type_filter: Set = set(),
                          default_label: str = "NEG",
                          data_dict: dict = {
                              'X': [],
                              'concept': [],
                              'y': []
                          },
                          sent_idx: IntervalTree = None,
                          context_sents: List[List[Span]] = None,
                          doc_name: str = '') -> Dict:
    """
    Convert a SpaCy doc into a labeled data dictionary.

    Assuming the doc has been labeled with concepts (snippets), Vectorizer extends the input to the
    concepts' context sentences (depending on the sent_window size), generates the labeled
    context-sentence data, and returns a dictionary (with three keys: 'X'---the text of the context
    sentences, 'concept'---the text of the labeled concepts, 'y'---the labels).

    @param concepts: a list of concepts (of Span type)
    @param type_filter: a set of type names to be included in vectorization
    @param default_label: if there is no labeled concept in the context sentences, label them with this default_label
    @param data_dict: a dictionary to hold the output and pass on across documents, so that a corpus can be aggregated
    @param sent_idx: an IntervalTree built from all sentences in the doc
    @param context_sents: a 2-d list of sentences with a predefined window size
    @param doc_name: doc file name (for tracking purposes)
    @return: a dictionary
    """
    if sent_idx is None or context_sents is None:
        return data_dict
    get_doc_name = 'doc_name' in data_dict
    labeled_sents_id = set()
    for concept in concepts:
        if len(type_filter) > 0 and concept.label_ not in type_filter:
            continue
        context_sents_ids = sent_idx.search(concept.start, concept.end)
        for id in context_sents_ids:
            labeled_sents_id.add(id.data)
            context = context_sents[id.data]
            if concept.start >= context[0].start and concept.end <= context[-1].end:
                data_dict['X'].append(' '.join([str(s) for s in context]))
                data_dict['y'].append(concept.label_)
                data_dict['concept'].append(str(concept))
                if get_doc_name:
                    data_dict['doc_name'].append(doc_name)
    # add unlabeled context windows with the default label
    for i, context in enumerate(context_sents):
        if i not in labeled_sents_id:
            data_dict['X'].append(' '.join([str(s) for s in context]))
            data_dict['y'].append(default_label)
            data_dict['concept'].append('')
            if get_doc_name:
                data_dict['doc_name'].append(doc_name)
    return data_dict
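# A minimal usage sketch of the lookup pattern above: sentence windows are
# indexed by their token offsets in an IntervalTree, and a concept's
# (start, end) is searched to recover every window that contains it.
# The window offsets below are made up for illustration.
from quicksectx import IntervalTree

windows = [(0, 12), (12, 25), (25, 40)]  # hypothetical (start, end) offsets
sent_idx = IntervalTree()
for i, (start, end) in enumerate(windows):
    sent_idx.add(start, end, i)  # the data payload is the window index

for hit in sent_idx.search(10, 14):  # a concept spanning offsets 10..14
    print(hit.data)  # index into context_sents, as used in the loop above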
class EmptyTreeTestCase(unittest.TestCase):
    """Test search on an empty tree."""

    def setUp(self):
        self.tree = IntervalTree()

    def test_search(self):
        self.tree.search(46, 47)

    def test_find(self):
        self.tree.find(Interval(46, 47))

    def test_left(self):
        self.tree.left(Interval(46, 47))

    def test_right(self):
        self.tree.right(Interval(46, 47))
def to_data_dict(doc: Doc,
                 sent_window: int = 1,
                 type_filter: Union[Set[str], Dict] = set(),
                 default_label: str = "NEG",
                 data_dict: dict = {
                     'X': [],
                     'concept': [],
                     'y': []
                 }) -> Dict:
    """
    Convert a SpaCy doc into a labeled data dictionary.

    Assuming the doc has been labeled with concepts (snippets), Vectorizer extends the input to the
    concepts' context sentences (depending on the sent_window size), generates the labeled
    context-sentence data, and returns a dictionary (with three keys: 'X'---the text of the context
    sentences, 'concept'---the text of the labeled concepts, 'y'---the labels).

    @param doc: a SpaCy Doc
    @param sent_window: the window size (in sentences) around the target concept that needs to be pulled
    @param type_filter: specifies whether and which annotation types will be used to generate the output
        DataFrame; this parameter can be defined as a set (only concept names are included) or a
        dictionary (where attributes and values can be included), which maps a matched concept (string
        and its context string) to a new value in the "y" column of the output. The expected dictionary
        structure is: concept_type->attr1->value1->...(other attr->value pairs if needed)->mapped key name
    @param default_label: if there is no labeled concept in the context sentences, label them with this default_label
    @param data_dict: a dictionary to hold the output and pass on across documents, so that a corpus can be aggregated
    @return: a dictionary
    """
    sent_idx = IntervalTree()
    sents = list(doc.sents)
    context_sents = []
    for i in range(0, len(sents) - sent_window + 1):
        begin_sent = sents[i]
        end_sent = sents[i + sent_window - 1]
        sent_idx.add(begin_sent.start, end_sent.end, len(context_sents))
        context_sents.append(sents[i:i + sent_window])
    concepts = []
    if hasattr(doc._, "concepts"):
        for type in doc._.concepts:
            if len(type_filter) == 0 or type in type_filter:
                concepts.extend(doc._.concepts[type])
    else:
        # ent.label_ is the string name of the entity type (ent.label is its hash),
        # so compare the string form against the type_filter names
        concepts = [
            ent for ent in doc.ents
            if (len(type_filter) == 0 or ent.label_ in type_filter)
        ]
    get_doc_name = 'doc_name' in data_dict
    doc_name = doc._.doc_name if get_doc_name else ''
    if isinstance(type_filter, Set):
        data_dict = Vectorizer.to_data_dict_on_types(concepts=concepts,
                                                     type_filter=type_filter,
                                                     default_label=default_label,
                                                     data_dict=data_dict,
                                                     sent_idx=sent_idx,
                                                     context_sents=context_sents,
                                                     doc_name=doc_name)
    elif isinstance(type_filter, Dict):
        if len(type_filter) == 0:
            data_dict = Vectorizer.to_data_dict_on_types(concepts=concepts,
                                                         default_label=default_label,
                                                         data_dict=data_dict,
                                                         sent_idx=sent_idx,
                                                         context_sents=context_sents,
                                                         doc_name=doc_name)
        else:
            data_dict = Vectorizer.to_data_dict_on_type_attr_values(concepts=concepts,
                                                                    type_filter=type_filter,
                                                                    default_label=default_label,
                                                                    data_dict=data_dict,
                                                                    sent_idx=sent_idx,
                                                                    context_sents=context_sents,
                                                                    doc_name=doc_name)
    else:
        raise TypeError(
            'The arg "type_filter" needs to be either a set of concept names or a dictionary, not a {}:\n\t{}'
            .format(type(type_filter), str(type_filter)))
    return data_dict
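# Standalone sketch of the sliding-window loop above (no spaCy needed):
# with sent_window = 2, each index i yields the window sents[i:i + 2],
# so consecutive windows overlap by one sentence.
sents = ['s0', 's1', 's2', 's3']
sent_window = 2
context_sents = [sents[i:i + sent_window]
                 for i in range(0, len(sents) - sent_window + 1)]
print(context_sents)  # [['s0', 's1'], ['s1', 's2'], ['s2', 's3']]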
def test_duplicates(self):
    tree = IntervalTree()
    tree.add(1, 3, 1.0)
    tree.add(1, 3, 1.0)
    self.assertEqual(len(tree.search(1, 1.5)), 2)
def test_3(self):
    tree = IntervalTree()
    tree.add(1, 1, 1.0)
    print(tree.pretty_print())
    print(tree.search(1, 3))
def test_2(self):
    tree = IntervalTree()
    tree.add(1, 3, 1.0)
    tree.add(2, 3, 2.0)
    tree.add(3, 4, 3.0)
    tree.add(3, 5, 4.0)
    tree.add(4, 5, 5.0)
    tree.add(5, 6, 5.0)
    tree.add(2, 6, 6.0)
    print(tree.pretty_print())
    self.assertEqual(len(tree.search(4, 4)), 3)
    self.assertEqual(len(tree.search(3, 3)), 3)
    self.assertEqual(len(tree.search(4, 6)), 4)
def setUp(self):
    self.tree4 = IntervalTree()
    self.tree4.insert(Interval(22, 33, data='example1'))
    self.tree4.insert(Interval(22, 33, data='example2'))
def setUp(self):
    self.tree = IntervalTree()
def test_tree_pickle(self):
    a = IntervalTree()
    for ichr in range(5):
        for i in range(10, 100, 6):
            f = Interval(i - 4, i + 4)
            a.insert(f)
    a.dump('a.pkl')
    b = IntervalTree()
    b.load('a.pkl')
    for ichr in range(5):
        for i in range(10, 100, 6):
            f = Interval(i - 4, i + 4)
            af = sorted(a.find(f), key=operator.attrgetter('start'))
            bf = sorted(b.find(f), key=operator.attrgetter('start'))
            assert len(bf) > 0
            self.assertEqual(len(af), len(bf))
            self.assertEqual(af[0].start, bf[0].start)
            self.assertEqual(af[-1].start, bf[-1].start)
def test_1(self):
    tree = IntervalTree()
    tree.add(1, 3, 100)
    tree.add(3, 7, 110)
    tree.add(2, 5, 120)
    tree.add(4, 6, 130)
    tree.add(4, 8, 140)
    tree.add(4, 8, 150)
    tree.add(5, 7, 160)
    print(tree.pretty_print())
    print(tree.find(Interval(2, 5)))
    tree.remove(Interval(2, 5))
    print(tree.find(Interval(2, 5)))
    print(tree.pretty_print())
    # smoke test: passes as long as the calls above do not raise
    self.assertEqual(True, True)
def to_data_dict_on_type_attr_values(concepts: List[Span],
                                     type_filter: Dict = dict(),
                                     default_label: str = "NEG",
                                     data_dict: dict = {
                                         'X': [],
                                         'concept': [],
                                         'y': []
                                     },
                                     sent_idx: IntervalTree = None,
                                     context_sents: List[List[Span]] = None,
                                     doc_name: str = '') -> Dict:
    """
    Convert a SpaCy doc into a labeled data dictionary.

    Assuming the doc has been labeled with concepts (snippets), Vectorizer extends the input to the
    concepts' context sentences (depending on the sent_window size), generates the labeled
    context-sentence data, and returns a dictionary (with three keys: 'X'---the text of the context
    sentences, 'concept'---the text of the labeled concepts, 'y'---the labels).

    @param concepts: a list of concepts (of Span type)
    @param type_filter: specifies whether and which annotation types, with which attribute values, will
        be used to generate the output DataFrame; this parameter is defined as a dictionary (where
        attributes and values can be included), which maps a matched concept (string and its context
        string) to a new value in the "y" column of the output. The expected dictionary structure is:
        concept_type->attr1->value1->...(other attr->value pairs if needed)->mapped key name
    @param default_label: if there is no labeled concept in the context sentences, label them with this default_label
    @param data_dict: a dictionary to hold the output and pass on across documents, so that a corpus can be aggregated
    @param sent_idx: an IntervalTree built from all sentences in the doc
    @param context_sents: a 2-d list of sentences with a predefined window size
    @param doc_name: doc file name (for tracking purposes)
    @return: a dictionary
    """
    if sent_idx is None or context_sents is None:
        return data_dict
    get_doc_name = 'doc_name' in data_dict
    labeled_sents_id = set()
    for concept in concepts:
        conclusions = Vectorizer.get_mapped_names(concept=concept, type_filter=type_filter)
        if len(conclusions) > 0:
            context_sents_ids = sent_idx.search(concept.start, concept.end)
            for id in context_sents_ids:
                labeled_sents_id.add(id.data)
                context = context_sents[id.data]
                if concept.start >= context[0].start and concept.end <= context[-1].end:
                    for conclusion in conclusions:
                        data_dict['X'].append(' '.join([str(s) for s in context]))
                        data_dict['y'].append(conclusion)
                        data_dict['concept'].append(str(concept))
                        if get_doc_name:
                            data_dict['doc_name'].append(doc_name)
    # add unlabeled sentences with the default label
    for i, context in enumerate(context_sents):
        if i not in labeled_sents_id:
            data_dict['X'].append(' '.join([str(s) for s in context]))
            data_dict['y'].append(default_label)
            data_dict['concept'].append('')
            if get_doc_name:
                data_dict['doc_name'].append(doc_name)
    return data_dict
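# A sketch of the nested type_filter shape the docstring describes; the
# concept type, attribute names, and mapped labels below are made up for
# illustration:
type_filter = {
    'PROBLEM': {                            # concept_type
        'negation': {                       # attr1
            'affirmed': 'PROBLEM_PRESENT',  # value1 -> mapped name for 'y'
            'negated': 'PROBLEM_ABSENT',
        }
    }
}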
from quicksectx import IntervalTree, Interval

tree = IntervalTree()
tree.add(0, 3, 100)
tree.add(5, 8, 110)
tree.add(6, 10, 120)
tree.add(8, 9, 130)
tree.add(15, 23, 140)
tree.add(19, 20, 150)
tree.add(17, 19, 160)
tree.add(26, 26, 160)
tree.add(25, 30, 160)
tree.add(16, 21, 160)
print(tree.search(3, 15))
print(tree.pretty_print())

print('\n\n---\n\n\n')

# The same intervals inserted in a different (sorted) order can produce a
# different tree shape; compare the two pretty_print outputs.
tree = IntervalTree()
tree.add(0, 3, 100)
tree.add(5, 8, 110)
tree.add(6, 10, 120)
tree.add(8, 9, 130)
tree.add(15, 23, 140)
tree.add(16, 21, 160)
tree.add(17, 19, 160)
tree.add(19, 20, 150)
tree.add(25, 30, 160)
tree.add(26, 26, 160)
tree.add(27, 28, 160)
tree.add(27, 28, 160)
tree.add(27, 28, 160)
print(tree.pretty_print())