Beispiel #1
0
def test_getitem_slice_open_left(tested):
    """The ``[:4]`` slice of the index should equal the four expected labels."""
    expected = [
        GenericLabel(0, 5, document=document, i=0),
        GenericLabel(0, 7, document=document, i=1),
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(6, 7, document=document, i=3),
    ]
    assert tested[:4] == expected
Beispiel #2
0
def test_get_repr_infinite_recursion():
    """repr() of two labels that reference each other should elide the cycle."""
    outer = GenericLabel(0, 20, document=document)
    inner = GenericLabel(0, 20, document=document, a=outer)
    # Close the reference cycle: outer -> inner -> outer.
    outer.b = inner

    expected = 'GenericLabel(0, 20, b=GenericLabel(0, 20, a=GenericLabel(...)))'
    assert repr(outer) == expected
Beispiel #3
0
    def add_sentence(self, sentence):
        """Append the words of one sentence element to the accumulated state.

        Every ``w`` child of ``sentence`` contributes its text. A word whose
        ``c`` attribute (after dropping any ``|`` alternatives) is ``'*'`` is
        treated as a fragment: its text is held back and prefixed to the next
        word, which keeps the fragment's start offset. Each completed word is
        appended to ``self.text``, gets a tag label in ``self.tags``, and
        moves ``self.length`` to one past the word's end. If at least one
        word was completed, a label spanning from the sentence start to the
        last word's end is appended to ``self.sentences``.

        Args:
            sentence: An element exposing ``findall('w')`` whose results have
                ``.text`` and ``.get('c')``.
        """
        sentence_start = self.length
        end = None
        frag = None
        for word in sentence.findall('w'):
            if frag is None:
                start = self.length
                text = word.text
            else:
                # Continue a fragmented word: keep the fragment's ``start``.
                text = frag + word.text
            tag = word.get('c')
            if '|' in tag:
                # Only the first of several alternative tags is used.
                tag = tag.split('|')[0]
            if tag == '*':
                frag = text
                continue

            # The pending fragment (if any) is consumed by this word. Without
            # this reset, every following word would be prefixed with the
            # stale fragment and would reuse the stale ``start`` offset.
            frag = None
            self.text.append(text)
            end = start + len(text)
            self.tags.append(
                GenericLabel(start_index=start, end_index=end, tag=tag))
            # One past the end; presumably leaves room for a separator
            # between words when the text is joined.
            self.length = end + 1
        if end is not None:
            self.sentences.append(
                GenericLabel(start_index=sentence_start, end_index=end))
Beispiel #4
0
def test_begins_inside(tested):
    """``beginning_inside(1, 9)`` should yield the labels with i = 2, 3 and 4."""
    expected = [
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(6, 7, document=document, i=3),
        GenericLabel(6, 8, document=document, i=4),
    ]
    assert list(tested.beginning_inside(1, 9)) == expected
def test_getitem_slice_open_right(tested):
    """The ``[4:]`` slice of the index should equal the four expected labels."""
    expected = [
        GenericLabel(6, 7, document=document, i=4),
        GenericLabel(2, 6, document=document, i=5),
        GenericLabel(0, 7, document=document, i=6),
        GenericLabel(0, 5, document=document, i=7),
    ]
    assert tested[4:] == expected
def test_getitem_slice_open_left(tested):
    """The ``[:4]`` slice of the index should equal the four expected labels."""
    expected = [
        GenericLabel(9, 13, document=document, i=0),
        GenericLabel(9, 13, document=document, i=1),
        GenericLabel(9, 10, document=document, i=2),
        GenericLabel(6, 8, document=document, i=3),
    ]
    assert tested[:4] == expected
Beispiel #7
0
def test_getitem_slice_open_right(tested):
    """The ``[4:]`` slice of the index should equal the four expected labels."""
    expected = [
        GenericLabel(6, 8, document=document, i=4),
        GenericLabel(9, 10, document=document, i=5),
        GenericLabel(9, 13, document=document, i=6),
        GenericLabel(9, 13, document=document, i=7),
    ]
    assert tested[4:] == expected
Beispiel #8
0
def test_filter(tested):
    """Filtering on an even ``i`` field should keep the labels with i = 0, 2, 4, 6."""
    expected = [
        GenericLabel(0, 5, document=document, i=0),
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(6, 8, document=document, i=4),
        GenericLabel(9, 13, document=document, i=6),
    ]
    even_only = tested.filter(lambda x: x.i % 2 == 0)
    assert even_only == expected
Beispiel #9
0
def test_inside(tested):
    """``inside(1, 8)`` should yield the labels with i = 2, 3 and 4."""
    expected = [
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(6, 7, document=document, i=3),
        GenericLabel(6, 8, document=document, i=4),
    ]
    assert list(tested.inside(1, 8)) == expected
Beispiel #10
0
def test_covering(tested):
    """``covering(2, 4)`` should yield the labels with i = 0, 1 and 2."""
    expected = [
        GenericLabel(0, 5, document=document, i=0),
        GenericLabel(0, 7, document=document, i=1),
        GenericLabel(2, 6, document=document, i=2),
    ]
    assert list(tested.covering(2, 4)) == expected
Beispiel #11
0
def test_get_repr_ref():
    """repr() of a label should include the repr of a label stored in a field."""
    referenced = GenericLabel(0, 20, x="a")
    label = GenericLabel(0, 20, document=document, a=referenced)

    expected = 'GenericLabel(0, 20, a=GenericLabel(0, 20, x=\'a\'))'
    assert repr(label) == expected
Beispiel #12
0
def test_before(tested):
    """``before(8)`` should equal the labels with i = 0 through 4."""
    expected = [
        GenericLabel(0, 5, document=document, i=0),
        GenericLabel(0, 7, document=document, i=1),
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(6, 7, document=document, i=3),
        GenericLabel(6, 8, document=document, i=4),
    ]
    assert tested.before(8) == expected
Beispiel #13
0
def test_dict_attr():
    """A dict assigned to a label attribute should read back as an equal dict."""
    label = GenericLabel(0, 10, document=document)
    label.bar = dict(a=1, b=2)
    # Compare against a separately built literal rather than the assigned object.
    expected = {'a': 1, 'b': 2}
    assert label.bar == expected
Beispiel #14
0
    def process_document(self,
                         document: Document,
                         params: Dict[str, Any]):
        """Add ``dependencies`` and ``upos_tags`` label indices to ``document``.

        The text of every label in ``document.labels['sentences']`` is passed
        to ``self.nlp``. For each resulting sentence, one dependency label is
        created per dependency triple, linked to the label of its head and
        recorded in the head's ``dependents`` list, and one tag label is
        created per word carrying its ``upos``.

        Raises:
            ValueError: if resolving the dependencies of a sentence takes
                more than ``MAX_ITER`` iterations.
        """
        sentences = document.labels['sentences']
        stanza_doc = self.nlp([sentence.text for sentence in sentences])

        dep_labels = []
        upos_labels = []
        for stanza_sentence, sentence in zip(stanza_doc.sentences, sentences):
            labels_by_id = {}
            pending = list(stanza_sentence.dependencies)
            iterations = 0
            while pending:
                iterations += 1
                if iterations > MAX_ITER:
                    raise ValueError(
                        'Maximum Iterations reached while processing dependency graph.')
                head, deprel, dep = pending.pop()
                head_id = int(head.id)
                head_label = None
                if head_id != 0:
                    if head_id not in labels_by_id:
                        # The head has no label yet: push the triple to the
                        # far end of the work list and retry it later.
                        pending.insert(0, (head, deprel, dep))
                        continue
                    head_label = labels_by_id[head_id]

                # Offsets are shifted by the first token's start_char and
                # then by the sentence label's start index.
                begin = (sentence.start_index + dep.parent.start_char
                         - stanza_sentence.tokens[0].start_char)
                end = (sentence.start_index + dep.parent.end_char
                       - stanza_sentence.tokens[0].start_char)
                label = GenericLabel(begin, end, head=head_label, deprel=deprel)
                label.reference_cache['dependents'] = []
                labels_by_id[int(dep.id)] = label
                if head_label is not None:
                    head_label.dependents.append(label)
                dep_labels.append(label)

            for word in stanza_sentence.words:
                token = word.parent
                begin = (sentence.start_index + token.start_char
                         - stanza_sentence.tokens[0].start_char)
                end = (sentence.start_index + token.end_char
                       - stanza_sentence.tokens[0].start_char)
                upos_labels.append(GenericLabel(begin, end, tag=word.upos))

        document.add_labels('dependencies', dep_labels)
        document.add_labels('upos_tags', upos_labels)
Beispiel #15
0
def stanza_deps_and_upos_tags(sentence, stanza_sentence):
    """Build dependency and UPOS tag labels for one parsed sentence.

    The dependency triples of ``stanza_sentence`` are walked breadth-first
    from the root (head id 0), so the label of a head always exists before
    the labels of its dependents are created. Each dependency label stores
    its head label and relation, and is appended to the head's
    ``dependents`` list.

    Args:
        sentence: Label of the sentence; only ``start_index`` is read.
        stanza_sentence: Parsed sentence exposing ``dependencies``
            (``(head, deprel, dep)`` triples), ``words`` and ``tokens``.

    Returns:
        A ``(sentence_deps, sentence_upos_tags)`` tuple of label lists.

    Raises:
        ValueError: if a node is visited before its head has a label, or if
            not every dependency was reached from the root.
    """
    from collections import deque

    def to_span(token):
        # Shift by the first token's start_char, then by the sentence
        # label's start index.
        shift = sentence.start_index - stanza_sentence.tokens[0].start_char
        return shift + token.start_char, shift + token.end_char

    stanza_dependencies = list(stanza_sentence.dependencies)
    node_count = len(stanza_dependencies) + 1  # +1 for the root, id 0

    # Adjacency matrix: graph[head_id, dep_id] == 1 when dep depends on head.
    graph = np.zeros((node_count, node_count))
    dep_map = {}
    for head, deprel, dep in stanza_dependencies:
        graph[int(head.id), int(dep.id)] = 1
        dep_map[int(dep.id)] = (dep, deprel)

    sentence_deps = []
    dependencies = {}
    queue = deque([0])
    while queue:
        head_id = queue.popleft()
        head_dep_label = dependencies.get(head_id)
        if head_id != 0 and head_dep_label is None:
            raise ValueError("Dep seen before governor.")
        for dep_id in range(node_count):
            if graph[head_id, dep_id] > 0:
                dep, deprel = dep_map[dep_id]
                token_begin, token_end = to_span(dep.parent)
                dep_label = GenericLabel(token_begin,
                                         token_end,
                                         head=head_dep_label,
                                         deprel=deprel)
                dep_label.reference_cache['dependents'] = []
                dependencies[int(dep.id)] = dep_label
                if head_dep_label is not None:
                    head_dep_label.dependents.append(dep_label)
                queue.append(dep_id)
                sentence_deps.append(dep_label)

    # Every dependency must have been reached from the root. The previous
    # check only fired when exactly one was missing.
    if len(dependencies) != len(stanza_dependencies):
        raise ValueError("Unexpected number of dependencies")

    sentence_upos_tags = []
    for word in stanza_sentence.words:
        token_begin, token_end = to_span(word.parent)
        sentence_upos_tags.append(
            GenericLabel(token_begin, token_end, tag=word.upos))
    return sentence_deps, sentence_upos_tags
Beispiel #16
0
def test_get_attr():
    """Keyword fields should be readable as attributes and through ``fields``."""
    label = GenericLabel(0, 20, document=document, a="x", y=20, z=20.0)

    assert label.a == 'x'
    assert label.y == 20
    assert label.z == 20.0
    expected_fields = {'a': 'x', 'y': 20, 'z': 20.0}
    assert label.fields == expected_fields
Beispiel #17
0
def test_get_repr():
    """repr() should start with the span and mention each field with its value."""
    label = GenericLabel(0, 20, document=document, a="x", y=20, z=20.0)

    text = repr(label)
    assert text.startswith("GenericLabel(0, 20, ")
    # Either quoting style is accepted for the string field.
    assert any(form in text for form in ('a="x"', "a='x'"))
    assert 'y=20' in text
    assert 'z=20.0' in text
Beispiel #18
0
def test_copy_document():
    """CopyDocument('first', 'second') should copy the labels to a 'second' document."""
    event = Event()
    source = Document(document_name='first',
                      text='The quick brown fox jumped over the lazy dog.')
    event.add_document(source)
    with source.get_labeler('some_index') as add_label:
        for begin, end, word in ((0, 3, 'The'), (4, 9, 'quick'), (10, 15, 'brown')):
            add_label(begin, end, word=word)

    CopyDocument('first', 'second').process(event, {})

    copied = event.documents['second']
    assert copied is not None
    expected = [
        GenericLabel(0, 3, word='The'),
        GenericLabel(4, 9, word='quick'),
        GenericLabel(10, 15, word='brown'),
    ]
    assert copied.labels['some_index'] == expected
Beispiel #19
0
def test_iter():
    """Iterating a label should yield the two index names plus each field name."""
    label = GenericLabel(0, 20, document=document, a="x", y=20, z=20.0)
    names = list(iter(label))
    assert len(names) == 5
    for name in ('start_index', 'end_index', 'a', 'y', 'z'):
        assert name in names
Beispiel #20
0
def test_count_multiple(tested):
    """``count`` should report 2 for a label that occurs twice in the index."""
    labels = [
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(6, 7, document=document, i=3),
        GenericLabel(6, 8, document=document, i=4),
        GenericLabel(9, 10, document=document, i=5),
        GenericLabel(9, 13, document=document, i=6),
        GenericLabel(9, 13, document=document, i=7),
        GenericLabel(9, 13, document=document, i=6),
    ]
    index = presorted_label_index(labels)
    # A separately constructed, equal label is used as the query.
    query = GenericLabel(9, 13, document=document, i=6)
    assert index.count(query) == 2
Beispiel #21
0
def tested():
    """Return a presorted label index of eight labels with ``i`` running 0..7."""
    spans = [(0, 5), (0, 7), (2, 6), (6, 7), (6, 8), (9, 10), (9, 13), (9, 13)]
    return presorted_label_index([
        GenericLabel(start, end, document=document, i=i)
        for i, (start, end) in enumerate(spans)
    ])
Beispiel #22
0
def test_reversed(tested):
    """``reversed()`` on the index should yield the labels from i = 7 down to 0."""
    expected = [
        GenericLabel(9, 13, document=document, i=7),
        GenericLabel(9, 13, document=document, i=6),
        GenericLabel(9, 10, document=document, i=5),
        GenericLabel(6, 8, document=document, i=4),
        GenericLabel(6, 7, document=document, i=3),
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(0, 7, document=document, i=1),
        GenericLabel(0, 5, document=document, i=0),
    ]
    backwards = list(reversed(tested))
    assert backwards == expected
Beispiel #23
0
def test_descending(tested):
    """``descending()`` should equal the labels from i = 7 down to 0."""
    expected = [
        GenericLabel(9, 13, document=document, i=7),
        GenericLabel(9, 13, document=document, i=6),
        GenericLabel(9, 10, document=document, i=5),
        GenericLabel(6, 8, document=document, i=4),
        GenericLabel(6, 7, document=document, i=3),
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(0, 7, document=document, i=1),
        GenericLabel(0, 5, document=document, i=0),
    ]
    assert tested.descending() == expected
Beispiel #24
0
def test_add_labels_distinct(mocker):
    """``add_labels(..., distinct=True)`` should upload and return a distinct index."""
    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    labels = [
        GenericLabel(begin, end, document=document, x=x)
        for begin, end, x in ((0, 10, 1), (11, 15, 2), (16, 20, 3))
    ]

    returned_index = document.add_labels('index', labels, distinct=True)

    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=labels,
                                         adapter=mocker.ANY)
    assert returned_index == labels
    assert returned_index.distinct
def read_into_documents(conllu_document: str,
                        client: EventsClient = None,
                        sentences_per_document: int = 15):
    """Parse CoNLL-U text and yield documents built from batches of sentences.

    Non-empty lines are parsed with ``ConllUEntry.parse_line`` and their
    forms appended to a ``DocumentBuilder``. An empty line ends the current
    sentence: a dependency label is created for each entry, linked to the
    label of its head, and the tokens and dependencies are handed to the
    builder. After ``sentences_per_document`` sentences the builder's
    documents are yielded and a fresh builder is started.

    Args:
        conllu_document: The CoNLL-U formatted text.
        client: Passed through to every ``DocumentBuilder``.
        sentences_per_document: Number of sentences per yielded batch.

    Yields:
        Whatever ``DocumentBuilder.create_document()`` yields.
    """
    from collections import deque

    def flush_sentence(builder, sentence_entries, sentence_end):
        """Add one sentence with its tokens and dependency labels to ``builder``."""
        assert sentence_end is not None
        builder.add_sentence(sentence_end)
        ids = {entry.id for entry, _, _ in sentence_entries}
        # Start from the head-sorted order, but defer any entry whose head
        # has no label yet. Sorting by head index alone does not put heads
        # before their dependents, and such dependents used to get head=None.
        pending = deque(sorted(sentence_entries, key=lambda x: x[0].head))
        dep_map = {}
        deferred = 0  # consecutive deferrals since the last created label
        while pending:
            entry, token_begin, token_end = pending.popleft()
            head_missing = entry.head in ids and entry.head not in dep_map
            if head_missing and deferred < len(pending):
                pending.append((entry, token_begin, token_end))
                deferred += 1
                continue
            # Either the head is resolved, the head is outside the sentence
            # (root), or everything left is stuck in a cycle; in the last
            # two cases the head stays None.
            deferred = 0
            head = dep_map.get(entry.head, None)
            dep = GenericLabel(token_begin,
                               token_end,
                               head=head,
                               deprel=entry.deprel)
            dep.reference_cache['dependents'] = []
            dep_map[entry.id] = dep
            if head is not None:
                head.dependents.append(dep)
            builder.add_token(token_begin, token_end, entry.upos,
                              entry.lemma)
        builder.add_deps(dep_map.values())

    entries = []
    end = None
    sentences = 0
    document_builder = DocumentBuilder(client)
    for line in conllu_document.splitlines():
        if len(line) == 0:
            if len(entries) == 0:
                continue
            # empty line - new sentence
            flush_sentence(document_builder, entries, end)
            entries = []
            sentences += 1
            if sentences == sentences_per_document:
                yield from document_builder.create_document()
                document_builder = DocumentBuilder(client)
                sentences = 0
        else:
            # parse token
            entry = ConllUEntry.parse_line(line)
            begin, end = document_builder.append_token(entry.form)
            entries.append((entry, begin, end))
    if entries:
        # The input did not end with a blank line; keep the last sentence
        # instead of silently dropping it.
        flush_sentence(document_builder, entries, end)
        sentences += 1
    if sentences > 0:
        yield from document_builder.create_document()
Beispiel #26
0
def test_create_sort(tested):
    """``label_index`` built from shuffled labels should equal the ``tested`` index."""
    shuffled = [
        GenericLabel(9, 13, document=document, i=6),
        GenericLabel(0, 7, document=document, i=1),
        GenericLabel(6, 8, document=document, i=4),
        GenericLabel(6, 7, document=document, i=3),
        GenericLabel(9, 10, document=document, i=5),
        GenericLabel(9, 13, document=document, i=7),
        GenericLabel(0, 5, document=document, i=0),
        GenericLabel(2, 6, document=document, i=2),
    ]
    index = label_index(shuffled)

    assert index == tested
Beispiel #27
0
def test_add_labels_not_distinct(mocker):
    """``add_labels`` without ``distinct`` should upload and store a non-distinct index."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_label_index_info.return_value = []
    event = Event(event_id='1', client=client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    labels = [
        GenericLabel(begin, end, document=document, x=x)
        for begin, end, x in ((0, 10, 1), (11, 15, 2), (16, 20, 3))
    ]

    document.add_labels('index', labels)

    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=labels,
                                         adapter=mocker.ANY)
    stored_index = document.labels['index']
    assert stored_index == labels
    assert not stored_index.distinct
Beispiel #28
0
def test_labeler_distinct(mocker):
    """A distinct labeler should upload its labels with ``DistinctGenericLabelAdapter``."""
    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    with document.get_labeler('index', distinct=True) as add_generic_label:
        for begin, end, x in ((0, 10, 1), (11, 15, 2), (16, 20, 3)):
            add_generic_label(begin, end, x=x)

    expected = [
        GenericLabel(0, 10, document=document, x=1),
        GenericLabel(11, 15, document=document, x=2),
        GenericLabel(16, 20, document=document, x=3),
    ]
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=expected,
                                         adapter=DistinctGenericLabelAdapter)
    assert document.get_label_index('index') == expected
Beispiel #29
0
def test_labeler_distinct(mocker):
    """A distinct labeler should upload its labels with ``DISTINCT_GENERIC_ADAPTER``."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_label_index_info.return_value = []
    event = Event(event_id='1', client=client)
    document = Document(document_name='plaintext',
                        text='The quick brown fox jumped over the lazy dog.',
                        event=event)
    with document.get_labeler('index', distinct=True) as add_generic_label:
        for begin, end, x in ((0, 10, 1), (11, 15, 2), (16, 20, 3)):
            add_generic_label(begin, end, x=x)

    expected = [
        GenericLabel(0, 10, document=document, x=1),
        GenericLabel(11, 15, document=document, x=2),
        GenericLabel(16, 20, document=document, x=3),
    ]
    client.add_labels.assert_called_with(event_id='1',
                                         document_name='plaintext',
                                         index_name='index',
                                         labels=expected,
                                         adapter=DISTINCT_GENERIC_ADAPTER)
    assert document.labels['index'] == expected
Beispiel #30
0
def test_after(tested):
    """``after(2)`` should equal the labels with i = 2 through 7."""
    expected = [
        GenericLabel(2, 6, document=document, i=2),
        GenericLabel(6, 7, document=document, i=3),
        GenericLabel(6, 8, document=document, i=4),
        GenericLabel(9, 10, document=document, i=5),
        GenericLabel(9, 13, document=document, i=6),
        GenericLabel(9, 13, document=document, i=7),
    ]
    assert tested.after(2) == expected