Example #1
 def setUp(self) -> None:
     intervals = []
     for i in range(0, 30, 5):
         intervals.append(Interval(i, i + 7))
     iv = IntervalTree()
     for f in intervals:
         iv.insert(f)
     self.intervals = intervals
     self.tree = iv
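
A tree populated this way can then be queried. Below is a minimal sketch (not part of the original suite), assuming the same quicksectx API exercised in the later examples; the expected counts follow directly from the inserted intervals (0,7), (5,12), (10,17), (15,22), (20,27), (25,32):

from quicksectx import IntervalTree, Interval

tree = IntervalTree()
for i in range(0, 30, 5):
    tree.insert(Interval(i, i + 7))

# (6, 11) overlaps (0, 7), (5, 12) and (10, 17); (40, 50) overlaps nothing.
print(len(tree.search(6, 11)))    # 3
print(len(tree.search(40, 50)))   # 0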
Example #2
 def test_1(self):
     tree = IntervalTree()
     tree.add(1, 3, 1.0)
     tree.add(3, 5, 2.0)
     tree.add(5, 7, 3.0)
     self.assertEqual(str(tree.search(3, 4)), '[Inv(3, 5, d=2.0)]')
     print(tree.search(2, 4))
     self.assertTrue('Inv(1, 3, d=1.0)' in str(tree.search(2, 4)))
     self.assertTrue('Inv(3, 5, d=2.0)' in str(tree.search(2, 4)))
     self.assertEqual(len(tree.search(2, 4)), 2)
     self.assertEqual(str(tree.search(5, 7)), '[Inv(5, 7, d=3.0)]')
Example #3
class TestIssue9(unittest.TestCase):
    def setUp(self):
        self.tree4 = IntervalTree()
        self.tree4.insert(Interval(22, 33, data='example1'))
        self.tree4.insert(Interval(22, 33, data='example2'))

    def test_right(self):
        self.assertEqual(0, len(self.tree4.right(Interval(44, 55))))
        self.assertEqual(2, len(self.tree4.right(Interval(11, 12))))

    def test_left(self):
        self.assertEqual(2, len(self.tree4.left(Interval(44, 55))))
        self.assertEqual(0, len(self.tree4.left(Interval(11, 12))))
Example #4
    def addDeterminants(self, text, deter_rule, matches, match_begin,
                        match_end, current_position):
        deter_rule = deter_rule[FastCNER.END]
        end = current_position if match_end == 0 else match_end
        # If the rule was misconfigured and the boundaries come out reversed, swap them so they cannot break execution.
        if match_begin > end:
            match_begin, end = end, match_begin
        current_span = Span(match_begin + self.offset, end + self.offset,
                            text[match_begin:end])

        current_spans_list = []
        overlap_checkers = self.overlap_checkers
        for key in deter_rule.keys():
            rule_id = deter_rule[key]
            if self.logger is not None:
                self.logger.debug('try add matched rule ({}-{})\t{}'.format(
                    match_begin, match_end, str(self.rule_store[rule_id])))
            current_span.rule_id = rule_id
            if key in overlap_checkers:
                current_spans_list = matches[key]
                overlap_checker = overlap_checkers[key]
                overlapped_pos = overlap_checker.search(
                    current_span.begin, current_span.end)
                if len(overlapped_pos) > 0:
                    pos = overlapped_pos.pop().data
                    overlapped_span = current_spans_list[pos]
                    if not self.compareSpan(current_span, overlapped_span):
                        continue
                    current_spans_list[pos] = current_span
                    overlap_checker.remove(
                        Interval(current_span.begin, current_span.end))
                    overlap_checker.add(current_span.begin, current_span.end,
                                        pos)
                else:
                    overlap_checker.add(current_span.begin, current_span.end,
                                        len(current_spans_list))
                    current_spans_list.append(current_span)
            else:
                matches[key] = current_spans_list
                overlap_checker = IntervalTree()
                # quicksectx's search includes both the lower and upper bounds, so subtract one from the end.
                overlap_checker.add(current_span.begin, current_span.end - 1,
                                    len(current_spans_list))
                current_spans_list.append(current_span)
                overlap_checkers[key] = overlap_checker

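The overlap_checkers bookkeeping above, stripped to its essentials: for each rule key there is a list of accepted spans plus an IntervalTree whose data field stores a span's index in that list, so an overlapping candidate is either discarded or swapped in place. A minimal, self-contained sketch with a made-up Span tuple and a hypothetical "longer span wins" rule standing in for compareSpan:

from collections import namedtuple
from quicksectx import IntervalTree, Interval

Span = namedtuple('Span', ['begin', 'end', 'text'])  # stand-in for FastCNER's Span class

def prefer(new, old):
    # Hypothetical tie-breaker standing in for compareSpan: keep the longer span.
    return (new.end - new.begin) > (old.end - old.begin)

spans = []                # index -> accepted span
checker = IntervalTree()  # interval data -> index into spans

for cand in [Span(0, 5, 'renal'), Span(0, 13, 'renal failure'), Span(20, 25, 'edema')]:
    hits = checker.search(cand.begin, cand.end - 1)
    if len(hits) > 0:
        pos = hits.pop().data
        if prefer(cand, spans[pos]):
            checker.remove(Interval(spans[pos].begin, spans[pos].end - 1))
            checker.add(cand.begin, cand.end - 1, pos)
            spans[pos] = cand
    else:
        checker.add(cand.begin, cand.end - 1, len(spans))
        spans.append(cand)

print([s.text for s in spans])   # expected: ['renal failure', 'edema']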
Example #5
 def test_error(self):
     iv = IntervalTree()
     iv.add(10, 17)
     iv.add(25, 32)
     iv.add(15, 22)
     iv.add(20, 27)
     print(iv.pretty_print())
     print(iv.search(0, 220))
Example #6
 def to_data_dict_on_types(concepts: List[Span],
                           type_filter: Set = set(),
                           default_label: str = "NEG",
                           data_dict: dict = {
                               'X': [],
                               'concept': [],
                               'y': []
                           },
                           sent_idx: IntervalTree = None,
                           context_sents: List[List[Span]] = None,
                           doc_name: str = '') -> Dict:
     """
     Convert a SpaCy doc into a labeled data dictionary. Assuming the doc has been labeled with concepts (snippets),
     the Vectorizer extends each concept to its context sentences (depending on the sent_window size), generates
     labeled context-sentence data, and returns a dictionary with three keys: 'X' (the text of the context
     sentences), 'concept' (the text of the labeled concepts), and 'y' (the label).
     @param concepts: a list of concepts (of Span type)
     @param type_filter: a set of concept type names to include when vectorizing
     @param default_label: the label to use when the context sentences contain no labeled concept
     @param data_dict: a dictionary that accumulates the output across documents so a corpus can be aggregated
     @param sent_idx: an IntervalTree built over all sentences in the doc
     @param context_sents: a 2-D list of sentence windows of the predefined size
     @param doc_name: the doc file name (for tracking purposes)
     @return: a dictionary
     """
     if sent_idx is None or context_sents is None:
         return data_dict
     get_doc_name = 'doc_name' in data_dict
     labeled_sents_id = set()
     for concept in concepts:
         if len(type_filter) > 0 and concept.label_ not in type_filter:
             continue
         context_sents_ids = sent_idx.search(concept.start, concept.end)
         for id in context_sents_ids:
             labeled_sents_id.add(id.data)
             context = context_sents[id.data]
             if concept.start >= context[
                     0].start and concept.end <= context[-1].end:
                 data_dict['X'].append(' '.join([str(s) for s in context]))
                 data_dict['y'].append(concept.label_)
                 data_dict['concept'].append(str(concept))
                 if get_doc_name:
                     data_dict['doc_name'].append(doc_name)
     for i, context in enumerate(context_sents):
         if i not in labeled_sents_id:
             data_dict['X'].append(' '.join([str(s) for s in context]))
             data_dict['y'].append(default_label)
             data_dict['concept'].append('')
             if get_doc_name:
                 data_dict['doc_name'].append(doc_name)
     return data_dict
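
The sent_idx lookup driving the loop above, in isolation: each context window is indexed by its token span, with the window's position stored in the interval's data field, so a concept's start/end maps straight back to the windows that may contain it, and the containment guard then keeps only windows that fully cover the concept. A minimal sketch (not from the original code) with made-up token offsets standing in for spaCy's Span.start and Span.end:

from quicksectx import IntervalTree

sentence_bounds = [(0, 8), (8, 17), (17, 30)]   # token bounds of three single-sentence windows
sent_idx = IntervalTree()
for i, (start, end) in enumerate(sentence_bounds):
    sent_idx.add(start, end, i)                 # data = index into context_sents

concept_start, concept_end = 10, 12             # a hypothetical labeled concept
for hit in sent_idx.search(concept_start, concept_end):
    window_start, window_end = sentence_bounds[hit.data]
    # Same containment guard as above: only label the window if it fully covers the concept.
    if concept_start >= window_start and concept_end <= window_end:
        print('labeled window', hit.data)       # expected: window 1 only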
Example #7
class EmptyTreeTestCase(unittest.TestCase):
    """ test search on an empty tree."""
    def setUp(self):
        self.tree = IntervalTree()

    def test_search(self):
        self.tree.search(46, 47)

    def test_find(self):
        self.tree.find(Interval(46, 47))

    def test_left(self):
        self.tree.left(Interval(46, 47))

    def test_right(self):
        self.tree.right(Interval(46, 47))
Example #8
    def to_data_dict(
            doc: Doc,
            sent_window: int = 1,
            type_filter: Union[Set[str], Dict] = set(),
            default_label: str = "NEG",
            data_dict: dict = {
                'X': [],
                'concept': [],
                'y': []
            }) -> Dict:
        """
        Convert a SpaCy doc into a labeled data dictionary. Assuming the doc has been labeled with concepts (snippets),
        the Vectorizer extends each concept to its context sentences (depending on the sent_window size), generates
        labeled context-sentence data, and returns a dictionary with three keys: 'X' (the text of the context
        sentences), 'concept' (the text of the labeled concepts), and 'y' (the label).
        @param doc: a SpaCy Doc
        @param sent_window: the window size (in sentences) around the target concept that will be pulled
        @param type_filter: specifies which annotation types are used to generate the output. This parameter can be
        defined as a set (only concept names are included) or a dictionary (where attributes and values can also be
        included) that maps a matched concept (its string and context string) to a new value in the 'y' column of the
        output. The expected dictionary structure is:
        concept_type->attr1->value1->...(other attr->value pairs if needed)->mapped key name
        @param default_label: the label to use when the context sentences contain no labeled concept
        @param data_dict: a dictionary that accumulates the output across documents so a corpus can be aggregated
        @return: a dictionary
        """
        sent_idx = IntervalTree()
        sents = list(doc.sents)
        context_sents = []
        for i in range(0, len(sents) - sent_window + 1):
            begin_sent = sents[i]
            end_sent = sents[i + sent_window - 1]
            sent_idx.add(begin_sent.start, end_sent.end, len(context_sents))
            context_sents.append(sents[i:i + sent_window])
        concepts = []
        if hasattr(doc._, "concepts"):
            for type in doc._.concepts:
                if len(type_filter) == 0 or type in type_filter:
                    concepts.extend(doc._.concepts[type])
        else:
            concepts = [
                ent for ent in doc.ents
                if (len(type_filter) == 0 or ent.label_ in type_filter)
            ]

        get_doc_name = 'doc_name' in data_dict
        doc_name = doc._.doc_name if get_doc_name else ''

        if isinstance(type_filter, Set):
            data_dict = Vectorizer.to_data_dict_on_types(
                concepts=concepts,
                type_filter=type_filter,
                default_label=default_label,
                data_dict=data_dict,
                sent_idx=sent_idx,
                context_sents=context_sents,
                doc_name=doc_name)
        elif isinstance(type_filter, Dict):
            if len(type_filter) == 0:
                data_dict = Vectorizer.to_data_dict_on_types(
                    concepts=concepts,
                    default_label=default_label,
                    data_dict=data_dict,
                    sent_idx=sent_idx,
                    context_sents=context_sents,
                    doc_name=doc_name)
            else:
                data_dict = Vectorizer.to_data_dict_on_type_attr_values(
                    concepts=concepts,
                    type_filter=type_filter,
                    default_label=default_label,
                    data_dict=data_dict,
                    sent_idx=sent_idx,
                    context_sents=context_sents,
                    doc_name=doc_name)
        else:
            raise TypeError(
                'The arg: "type_filter" needs to be either a set of concept names or a dictionary. Not a {}:\n\t{}'
                .format(type(type_filter), str(type_filter)))
        return data_dict
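
The window construction at the top of to_data_dict is the piece that feeds the lookup above: with sent_window greater than one, each index entry spans from the first sentence's start to the last sentence's end of the window. A minimal sketch (not from the original code) with plain (start, end) tuples standing in for spaCy sentence Spans and an assumed sent_window of 2:

from quicksectx import IntervalTree

sents = [(0, 8), (8, 17), (17, 30), (30, 41)]   # hypothetical token bounds standing in for list(doc.sents)
sent_window = 2

sent_idx = IntervalTree()
context_sents = []
for i in range(0, len(sents) - sent_window + 1):
    begin_sent = sents[i]
    end_sent = sents[i + sent_window - 1]
    sent_idx.add(begin_sent[0], end_sent[1], len(context_sents))
    context_sents.append(sents[i:i + sent_window])

# A concept at tokens 10-12 falls inside windows 0 (tokens 0-17) and 1 (tokens 8-30).
print(sorted(hit.data for hit in sent_idx.search(10, 12)))   # expected: [0, 1]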
Example #9
 def test_duplicates(self):
     tree = IntervalTree()
     tree.add(1, 3, 1.0)
     tree.add(1, 3, 1.0)
     self.assertEqual(len(tree.search(1, 1.5)), 2)
Example #10
 def test_3(self):
     tree = IntervalTree()
     tree.add(1, 1, 1.0)
     print(tree.pretty_print())
     print((tree.search(1, 3)))
Example #11
 def test_2(self):
     tree = IntervalTree()
     tree.add(1, 3, 1.0)
     tree.add(2, 3, 2.0)
     tree.add(3, 4, 3.0)
     tree.add(3, 5, 4.0)
     tree.add(4, 5, 5.0)
     tree.add(5, 6, 5.0)
     tree.add(2, 6, 6.0)
     print(tree.pretty_print())
     self.assertEqual(len(tree.search(4, 4)), 3)
     self.assertEqual(len(tree.search(3, 3)), 3)
     self.assertEqual(len(tree.search(4, 6)), 4)
Example #12
 def setUp(self):
     self.tree4 = IntervalTree()
     self.tree4.insert(Interval(22, 33, data='example1'))
     self.tree4.insert(Interval(22, 33, data='example2'))
Example #13
 def setUp(self):
     self.tree = IntervalTree()
Example #14
    def test_tree_pickle(self):
        a = IntervalTree()
        for ichr in range(5):
            for i in range(10, 100, 6):
                f = Interval(i - 4, i + 4)
                a.insert(f)

        a.dump('a.pkl')

        b = IntervalTree()
        b.load('a.pkl')
        for ichr in range(5):
            for i in range(10, 100, 6):
                f = Interval(i - 4, i + 4)
                af = sorted(a.find(f), key=operator.attrgetter('start'))
                bf = sorted(b.find(f), key=operator.attrgetter('start'))

                assert len(bf) > 0
                self.assertEqual(len(af), len(bf))
                self.assertEqual(af[0].start, bf[0].start)
                self.assertEqual(af[-1].start, bf[-1].start)
Example #15
 def test_1(self):
     tree = IntervalTree()
     tree.add(1, 3, 100)
     tree.add(3, 7, 110)
     tree.add(2, 5, 120)
     tree.add(4, 6, 130)
     tree.add(4, 8, 140)
     tree.add(4, 8, 150)
     tree.add(5, 7, 160)
     print(tree.pretty_print())
     print(tree.find(Interval(2, 5)))
     tree.remove(Interval(2, 5))
     print(tree.find(Interval(2, 5)))
     print(tree.pretty_print())
     self.assertEqual(True, True)
Example #16
    def to_data_dict_on_type_attr_values(concepts: List[Span],
                                         type_filter: Dict = dict(),
                                         default_label: str = "NEG",
                                         data_dict: dict = {
                                             'X': [],
                                             'concept': [],
                                             'y': []
                                         },
                                         sent_idx: IntervalTree = None,
                                         context_sents: List[
                                             List[Span]] = None,
                                         doc_name: str = '') -> Dict:
        """
        Convert a SpaCy doc into a labeled data dictionary. Assuming the doc has been labeled with concepts (snippets),
        the Vectorizer extends each concept to its context sentences (depending on the sent_window size), generates
        labeled context-sentence data, and returns a dictionary with three keys: 'X' (the text of the context
        sentences), 'concept' (the text of the labeled concepts), and 'y' (the label).
        @param concepts: a list of concepts (of Span type)
        @param type_filter: specifies which annotation types, with which attribute values, are used to generate the
        output. This parameter is defined as a dictionary (where attributes and values can be included) that maps a
        matched concept (its string and context string) to a new value in the 'y' column of the output. The expected
        dictionary structure is:
        concept_type->attr1->value1->...(other attr->value pairs if needed)->mapped key name
        @param default_label: the label to use when the context sentences contain no labeled concept
        @param data_dict: a dictionary that accumulates the output across documents so a corpus can be aggregated
        @param sent_idx: an IntervalTree built over all sentences in the doc
        @param context_sents: a 2-D list of sentence windows of the predefined size
        @param doc_name: the doc file name (for tracking purposes)
        @return: a dictionary
        """
        if sent_idx is None or context_sents is None:
            return data_dict
        get_doc_name = 'doc_name' in data_dict

        labeled_sents_id = set()
        for concept in concepts:
            conclusions = Vectorizer.get_mapped_names(concept=concept,
                                                      type_filter=type_filter)
            if len(conclusions) > 0:
                context_sents_ids = sent_idx.search(concept.start, concept.end)
                for id in context_sents_ids:
                    labeled_sents_id.add(id.data)
                    context = context_sents[id.data]
                    if concept.start >= context[
                            0].start and concept.end <= context[-1].end:
                        for conclusion in conclusions:
                            data_dict['X'].append(' '.join(
                                [str(s) for s in context]))
                            data_dict['y'].append(conclusion)
                            data_dict['concept'].append(str(concept))
                            if get_doc_name:
                                data_dict['doc_name'].append(doc_name)
        # add unlabeled sentences as default label
        for i, context in enumerate(context_sents):
            if i not in labeled_sents_id:
                data_dict['X'].append(' '.join([str(s) for s in context]))
                data_dict['y'].append(default_label)
                data_dict['concept'].append('')
                if get_doc_name:
                    data_dict['doc_name'].append(doc_name)
        return data_dict
Example #17
from quicksectx import IntervalTree, Interval
import unittest
tree = IntervalTree()
tree.add(0, 3, 100)
tree.add(5, 8, 110)
tree.add(6, 10, 120)
tree.add(8, 9, 130)
tree.add(15, 23, 140)
tree.add(19, 20, 150)
tree.add(17, 19, 160)
tree.add(26, 26, 160)
tree.add(25, 30, 160)
tree.add(16, 21, 160)
print(tree.search(3, 15))
print(tree.pretty_print())
print('\n\n---\n\n\n')
tree = IntervalTree()
tree.add(0, 3, 100)
tree.add(5, 8, 110)
tree.add(6, 10, 120)
tree.add(8, 9, 130)
tree.add(15, 23, 140)
tree.add(16, 21, 160)
tree.add(17, 19, 160)
tree.add(19, 20, 150)
tree.add(25, 30, 160)
tree.add(26, 26, 160)
tree.add(27, 28, 160)
tree.add(27, 28, 160)
tree.add(27, 28, 160)
print(tree.pretty_print())