Ejemplo n.º 1
0
class EmptyTreeTestCase(unittest.TestCase):
    """Smoke tests that call each query method on a tree holding no intervals.

    None of the tests inspect the returned value; a test passes as long as
    the call completes without raising.
    """

    def setUp(self):
        # Each test starts from a freshly constructed, never-populated tree.
        empty_tree = IntervalTree()
        self.tree = empty_tree

    def test_search(self):
        """``search`` takes the start/end coordinates directly."""
        start, end = 46, 47
        self.tree.search(start, end)

    def test_find(self):
        """``find`` takes an ``Interval`` query object."""
        query = Interval(46, 47)
        self.tree.find(query)

    def test_left(self):
        """``left`` takes an ``Interval`` query object."""
        query = Interval(46, 47)
        self.tree.left(query)

    def test_right(self):
        """``right`` takes an ``Interval`` query object."""
        query = Interval(46, 47)
        self.tree.right(query)
Ejemplo n.º 2
0
class EmptyTreeTestCase(unittest.TestCase):
    """Exercise every lookup method of an ``IntervalTree`` that has no entries.

    The tests only check that the calls do not raise; results are discarded.
    """

    def setUp(self):
        # A brand-new tree with nothing inserted, rebuilt before every test.
        fresh = IntervalTree()
        self.tree = fresh

    def test_search(self):
        # ``search`` is the only lookup here that takes raw coordinates.
        lo, hi = 46, 47
        self.tree.search(lo, hi)

    def test_find(self):
        probe = Interval(46, 47)
        self.tree.find(probe)

    def test_left(self):
        probe = Interval(46, 47)
        self.tree.left(probe)

    def test_right(self):
        probe = Interval(46, 47)
        self.tree.right(probe)
Ejemplo n.º 3
0
    def test_tree_pickle(self):
        """Round-trip a populated tree through ``dump``/``load`` and compare lookups.

        A tree is filled with ``Feature`` objects for five ``chr`` values,
        dumped to a pickle file, and loaded into a second tree. Every inserted
        feature is then looked up in both trees; the hit lists (sorted by
        ``start``) must be non-empty, have equal length, and agree on the
        ``start`` of their first and last elements.

        The pickle is written inside a private temporary directory that is
        removed afterwards, so the test leaves no ``a.pkl`` behind in the
        current working directory.
        """
        import os
        import shutil
        import tempfile

        a = IntervalTree()
        for ichr in range(5):
            for i in range(10, 100, 6):
                a.insert(Feature(i - 4, i + 4, strand=1, chr=ichr))

        # Build the sort key once instead of on every loop iteration.
        by_start = operator.attrgetter('start')

        workdir = tempfile.mkdtemp()
        try:
            pickle_path = os.path.join(workdir, 'a.pkl')
            a.dump(pickle_path)

            b = IntervalTree()
            b.load(pickle_path)
            for ichr in range(5):
                for i in range(10, 100, 6):
                    f = Feature(i - 4, i + 4, strand=1, chr=ichr)
                    af = sorted(a.find(f), key=by_start)
                    bf = sorted(b.find(f), key=by_start)

                    # unittest assertion rather than a bare ``assert``, which
                    # is stripped when Python runs with -O.
                    self.assertGreater(len(bf), 0)
                    self.assertEqual(len(af), len(bf))
                    self.assertEqual(af[0].start, bf[0].start)
                    self.assertEqual(af[-1].start, bf[-1].start)
        finally:
            # Always clean up, even when an assertion above fails.
            shutil.rmtree(workdir, ignore_errors=True)
Ejemplo n.º 4
0
    def test_tree_pickle(self):
        """Round-trip a populated tree through ``dump``/``load`` and compare lookups.

        The same set of ``Interval`` objects is inserted five times, the tree
        is dumped to a pickle file, and the file is loaded into a second tree.
        Each interval is then looked up in both trees; the hit lists (sorted
        by ``start``) must be non-empty, have equal length, and agree on the
        ``start`` of their first and last elements.

        The pickle is written inside a private temporary directory that is
        removed afterwards, so the test leaves no ``a.pkl`` behind in the
        current working directory.
        """
        import os
        import shutil
        import tempfile

        a = IntervalTree()
        # The outer loop only repeats the insertions; its index is unused.
        for _ in range(5):
            for i in range(10, 100, 6):
                a.insert(Interval(i - 4, i + 4))

        # Build the sort key once instead of on every loop iteration.
        by_start = operator.attrgetter('start')

        workdir = tempfile.mkdtemp()
        try:
            pickle_path = os.path.join(workdir, 'a.pkl')
            a.dump(pickle_path)

            b = IntervalTree()
            b.load(pickle_path)
            for _ in range(5):
                for i in range(10, 100, 6):
                    f = Interval(i - 4, i + 4)
                    af = sorted(a.find(f), key=by_start)
                    bf = sorted(b.find(f), key=by_start)

                    # unittest assertion rather than a bare ``assert``, which
                    # is stripped when Python runs with -O.
                    self.assertGreater(len(bf), 0)
                    self.assertEqual(len(af), len(bf))
                    self.assertEqual(af[0].start, bf[0].start)
                    self.assertEqual(af[-1].start, bf[-1].start)
        finally:
            # Always clean up, even when an assertion above fails.
            shutil.rmtree(workdir, ignore_errors=True)
Ejemplo n.º 5
0
def _create_intervaltree(locs):
    """Build an ``IntervalTree`` from the rows of ``locs``, first come first served.

    Rows are visited in ``locs.iterrows()`` order and each row is unpacked
    as a ``(start, end)`` pair. A row is added to the tree, tagged with its
    row label, only when ``find(start, end)`` on the tree built so far
    returns a falsy result; otherwise the row is skipped.

    Returns the populated tree.
    """
    tree = IntervalTree()

    for label, (begin, stop) in locs.iterrows():
        # Only rows with no hit among the already-accepted rows are added.
        if not tree.find(begin, stop):
            tree.add(begin, stop, label)

    return tree
Ejemplo n.º 6
0
class Document(HString):
    """A text document whose annotations are indexed in an ``IntervalTree``.

    The document keeps its (optionally preprocessed) text, an interval tree
    of annotations keyed by their start/end offsets, a lookup table from
    annotation id to annotation, and a record of which annotation types have
    already been produced by ``annotate``.
    """

    def __init__(self,
                 content,
                 doc_id=None,
                 language=lng.ENGLISH,
                 preprocessors=None):
        """Create a document over ``content``.

        Args:
            content: The document text.
            doc_id: Identifier for the document. When ``None`` a fresh random
                id is generated for this instance.
            language: Stored under the ``LANGUAGE`` attribute key.
            preprocessors: If truthy, ``content`` is passed through
                ``preprocess(content, preprocessors)`` before being stored.
        """
        # Preprocess first so the span length passed to HString matches the
        # text that is actually stored in ``_content``.
        if preprocessors:
            content = preprocess(content, preprocessors)
        # NOTE(review): the document itself is passed as the first argument
        # of HString.__init__ -- presumably the owning document; confirm
        # against HString's signature.
        super().__init__(self, 0, len(content))
        self._content = content
        self._annotations = IntervalTree()
        # The id is generated here, per instance. A ``rand_id()`` call in the
        # signature would be evaluated once and shared by every document
        # created without an explicit id.
        self._doc_id = rand_id(10) if doc_id is None else doc_id
        self._completed = {}
        self._next_id = 0
        self[LANGUAGE] = language
        self._aid_dict = {}

    @property
    def content(self) -> str:
        """The stored document text."""
        return self._content

    @property
    def doc_id(self):
        """The document identifier."""
        return self._doc_id

    def annotation(self,
                   annotation_type,
                   start=None,
                   end=None) -> typing.List[Annotation]:
        """Return the sorted annotations of a type, optionally within a range.

        Args:
            annotation_type: Type name to match case-insensitively. A falsy
                value selects annotations of every type.
            start: Start of the query range. If either ``start`` or ``end``
                is ``None`` the whole document (``0`` to ``self.end``) is
                queried.
            end: End of the query range.

        Returns:
            A sorted list of matching annotations, excluding any that compare
            equal to this document. An empty list is returned when the tree
            lookup raises.
        """
        try:
            if end is None or start is None:
                anno_iter = self._annotations.find(Interval(0, self.end))
            else:
                # The tree lookup runs now; the overlap filter stays lazy and
                # is evaluated while the result list is built below.
                anno_iter = (
                    x for x in self._annotations.find(Interval(start, end))
                    if x.data.overlaps(Span(start, end)))
        except Exception:
            # Best effort: a failed lookup yields no annotations. Unlike a
            # bare ``except`` this lets KeyboardInterrupt/SystemExit through.
            return []
        if annotation_type:
            annotation_type = annotation_type.lower()
            return sorted(
                x.data for x in anno_iter
                if x.data.annotation_type.lower() == annotation_type
                and x.data != self)
        return sorted(x.data for x in anno_iter if x.data != self)

    def annotation_by_id(self, annotation_id: int):
        """Return the annotation registered under ``annotation_id``, or ``None``."""
        return self._aid_dict.get(annotation_id)

    def previous_annotation(self,
                            annotation: Annotation,
                            annotation_type: str = None) -> 'Annotation':
        """Return the last annotation found between ``-1`` and ``annotation.start``.

        ``annotation_type`` defaults to the type of ``annotation``. When
        nothing is found, an empty annotation (no document, span 0-0) of the
        requested type is returned instead.
        """
        if not annotation_type:
            annotation_type = annotation.annotation_type
        a = self.annotation(annotation_type, start=-1, end=annotation.start)
        if not a:
            return Annotation(None, 0, 0, annotation_type, [])
        return a[-1]

    def next_annotation(self,
                        annotation: Annotation,
                        annotation_type: str = None) -> 'Annotation':
        """Return the first annotation found between ``annotation.end`` and ``self.end``.

        ``annotation_type`` defaults to the type of ``annotation``. When
        nothing is found, an empty annotation (no document, span 0-0) of the
        requested type is returned instead.
        """
        if not annotation_type:
            annotation_type = annotation.annotation_type
        a = self.annotation(annotation_type,
                            start=annotation.end,
                            end=self.end)
        if not a:
            return Annotation(None, 0, 0, annotation_type, [])
        return a[0]

    def create_annotation(self,
                          type: str,
                          start: int,
                          end: int,
                          attributes=None) -> Annotation:
        """Create an annotation on this document and index it.

        The annotation receives the next sequential id, is inserted into the
        interval tree under its own start/end, and is registered for lookup
        by id.

        Args:
            type: Annotation type name.
            start: Start offset of the annotation.
            end: End offset of the annotation.
            attributes: Attributes passed to ``Annotation``; defaults to an
                empty list.

        Returns:
            The newly created annotation.
        """
        if attributes is None:
            attributes = []
        annotation = Annotation(self, start, end, type, attributes,
                                self._next_id)
        self._next_id += 1
        self._annotations.insert(
            Interval(annotation.start, annotation.end, annotation))
        self._aid_dict[annotation.annotation_id] = annotation
        return annotation

    def annotate(self, *args):
        """Run the language's annotator for each requested annotation type.

        Types already recorded as completed are skipped. For every other type
        the document's language is loaded and asked for an annotator.

        Raises:
            Exception: If the language provides no annotator for a type.
        """
        for arg in args:
            if arg in self._completed:
                continue
            language = self.language()
            language.load()
            annotator = language.get_annotator(arg)
            if not annotator:
                raise Exception("No annotator for {} annotations in {}".format(
                    arg, self.language()))
            annotator.annotate(self)
            self._completed[arg] = '1.0'

    def language(self):
        """Return the ``LANGUAGE`` attribute, or ``lng.UNKNOWN`` when it is unset."""
        if LANGUAGE in self.attributes:
            return self.attributes[LANGUAGE]
        return lng.UNKNOWN

    @staticmethod
    def from_spacy(parsed):
        """Build a ``Document`` from a parsed spaCy object.

        Creates token annotations (for tokens with a non-blank lemma) with a
        "dep" relation to their head, then entity, sentence and noun-chunk
        annotations.

        Returns:
            The populated document.
        """
        document = Document(content=str(parsed))
        for token in parsed:
            if token.lemma_.strip() != "":
                t = document.create_annotation(
                    "token", token.idx, token.idx + len(token),
                    [(type.INDEX, token.i), (type.LEMMA, token.lemma_),
                     ("prob", token.prob),
                     (type.PART_OF_SPEECH, PartOfSpeech.of(token.tag_))])
                # A token that is its own head gets no relation. Test
                # identity rather than the truthiness of the head index, so
                # a head at token index 0 is not skipped.
                if token.head is not token:
                    t.add_relation(target=token.head.i,
                                   type="dep",
                                   relation=token.dep_)

        for entity in parsed.ents:
            document.create_annotation(type.ENTITY, entity.start_char,
                                       entity.end_char,
                                       [(type.ENTITY_TYPE, entity.label_)])
        for i, sentence in enumerate(parsed.sents):
            document.create_annotation(type.SENTENCE, sentence.start_char,
                                       sentence.end_char, [(type.INDEX, i)])
        for np in parsed.noun_chunks:
            document.create_annotation(
                type.PHRASE_CHUNK, np.start_char, np.end_char,
                [(type.PART_OF_SPEECH, PennTreebank.NP)])
        return document

    @staticmethod
    def from_json(json_str):
        """Build a ``Document`` from a JSON string in the ``to_json`` layout."""
        doc = Document(content='')
        doc.__read_json(json.loads(json_str))
        return doc

    def __getstate__(self):
        """Pickle the document as its ``to_dict`` representation."""
        return self.to_dict()

    def __setstate__(self, state):
        """Restore the document from a ``to_dict``-style mapping."""
        self.__read_json(state)

    def __read_json(self, obj):
        """Re-initialise this document from a ``to_dict``-style mapping.

        Reads the keys ``content`` (required), ``id``, ``attributes``,
        ``completed`` and ``annotations``. Attribute values are passed
        through ``get_decoder(key)``.
        """
        self.__init__(content=obj['content'])
        self._doc_id = obj.get('id', self._doc_id)
        for (k, v) in obj.get("attributes", {}).items():
            self[k] = get_decoder(k)(v)
        for (k, v) in obj.get('completed', {}).items():
            self._completed[k] = v
        max_id = -1
        for annotation in obj.get("annotations", []):
            ann = Annotation(
                document=self,
                start=annotation["start"],
                end=annotation["end"],
                annotation_type=annotation["type"],
                attributes=[
                    (k, get_decoder(k)(v))
                    for k, v in annotation.get("attributes", {}).items()
                ],
                annotation_id=annotation["id"])
            max_id = max(max_id, ann.annotation_id)
            self._annotations.add(ann.start, ann.end, ann)
            # Register by id as create_annotation does, so annotation_by_id
            # also works on deserialized documents.
            self._aid_dict[ann.annotation_id] = ann
            for rel in annotation.get("relations", []):
                ann.add_relation(target=rel["target"],
                                 type=rel["type"],
                                 relation=rel["value"])
        self.language().load()
        # Continue numbering after the highest id that was loaded.
        self._next_id = max_id + 1

    def to_json(self) -> str:
        """Serialize the document to a JSON string."""
        return json.dumps(self.to_dict(), default=default)

    def to_dict(self) -> typing.Dict[str, typing.Any]:
        """Return the document as a plain dictionary.

        Keys: ``id``, ``content``, ``attributes``, ``completed`` and
        ``annotations`` (each annotation rendered with ``as_dict()``).
        """
        return {
            "id": self._doc_id,
            "content": self.content,
            "attributes": self._attributes,
            "completed": self._completed,
            "annotations":
                [a.as_dict() for a in self.annotation(annotation_type=None)],
        }