def test_getitem_slice_open_left(tested):
    """Slicing with only a stop bound returns the first four labels."""
    head = tested[:4]
    spans = [(0, 5), (0, 7), (2, 6), (6, 7)]
    expected = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate(spans)
    ]
    assert head == expected
def test_get_repr_infinite_recursion():
    """repr() of two labels that reference each other must terminate."""
    outer = GenericLabel(0, 20, document=document)
    inner = GenericLabel(0, 20, document=document, a=outer)
    # Close the cycle: outer -> inner -> outer.
    outer.b = inner
    expected = 'GenericLabel(0, 20, b=GenericLabel(0, 20, a=GenericLabel(...)))'
    assert repr(outer) == expected
def add_sentence(self, sentence):
    """Append the words of one sentence element to the builder state.

    Every ``w`` child of ``sentence`` contributes its text; its ``c``
    attribute is used as the tag (only the part before the first ``|`` is
    kept). A word tagged ``*`` is treated as a fragment: it is not emitted
    on its own but is prepended to the text of the following word.

    For every emitted word a tag ``GenericLabel`` is appended to
    ``self.tags`` and the word text to ``self.text``; ``self.length`` is
    advanced past the word plus one separator position. If at least one word
    was emitted, a sentence ``GenericLabel`` is appended to
    ``self.sentences``.

    Args:
        sentence: An element exposing ``findall('w')`` whose results have a
            ``text`` attribute and a ``get('c')`` accessor.
    """
    sentence_start = self.length
    end = None
    frag = None
    for word in sentence.findall('w'):
        if frag is None:
            start = self.length
            text = word.text
        else:
            # Continue the pending fragment; ``start`` still points at the
            # beginning of the first fragment piece.
            text = frag + word.text
        tag = word.get('c').partition('|')[0]
        if tag == '*':
            frag = text
            continue
        # The pending fragment (if any) is consumed by this word. Without
        # this reset every later word in the sentence would be prefixed with
        # the stale fragment and reuse the stale ``start`` offset.
        frag = None
        self.text.append(text)
        end = start + len(text)
        self.tags.append(
            GenericLabel(start_index=start, end_index=end, tag=tag))
        self.length = end + 1
    # A trailing fragment with no following word is never emitted.
    if end is not None:
        self.sentences.append(
            GenericLabel(start_index=sentence_start, end_index=end))
def test_begins_inside(tested):
    """beginning_inside(1, 9) yields the labels with i = 2, 3 and 4."""
    found = list(tested.beginning_inside(1, 9))
    expected = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate([(2, 6), (6, 7), (6, 8)], start=2)
    ]
    assert found == expected
def test_getitem_slice_open_right(tested):
    """Slicing with only a start bound returns the labels from position 4 on."""
    tail = tested[4:]
    spans = [(6, 7), (2, 6), (0, 7), (0, 5)]
    expected = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate(spans, start=4)
    ]
    assert tail == expected
def test_getitem_slice_open_left(tested):
    """Slicing with only a stop bound returns the first four labels."""
    head = tested[:4]
    spans = [(9, 13), (9, 13), (9, 10), (6, 8)]
    expected = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate(spans)
    ]
    assert head == expected
def test_getitem_slice_open_right(tested):
    """Slicing with only a start bound returns the labels from position 4 on."""
    tail = tested[4:]
    spans = [(6, 8), (9, 10), (9, 13), (9, 13)]
    expected = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate(spans, start=4)
    ]
    assert tail == expected
def test_filter(tested):
    """filter() keeps only the labels whose ``i`` field is even."""

    def has_even_i(x):
        return x.i % 2 == 0

    kept = tested.filter(has_even_i)
    expected = [
        GenericLabel(begin, end, document=document, i=number)
        for begin, end, number in [(0, 5, 0), (2, 6, 2), (6, 8, 4), (9, 13, 6)]
    ]
    assert kept == expected
def test_inside(tested):
    """inside(1, 8) yields the labels with i = 2, 3 and 4."""
    found = list(tested.inside(1, 8))
    expected = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate([(2, 6), (6, 7), (6, 8)], start=2)
    ]
    assert found == expected
def test_covering(tested):
    """covering(2, 4) yields the labels with i = 0, 1 and 2."""
    found = list(tested.covering(2, 4))
    expected = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate([(0, 5), (0, 7), (2, 6)])
    ]
    assert found == expected
def test_get_repr_ref():
    """repr() of a label renders a referenced label field in nested form."""
    referenced = GenericLabel(0, 20, x="a")
    label = GenericLabel(0, 20, document=document, a=referenced)
    expected = 'GenericLabel(0, 20, a=GenericLabel(0, 20, x=\'a\'))'
    assert repr(label) == expected
def test_before(tested):
    """before(8) returns the labels with i = 0 through 4."""
    result = tested.before(8)
    spans = [(0, 5), (0, 7), (2, 6), (6, 7), (6, 8)]
    expected = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate(spans)
    ]
    assert result == expected
def test_dict_attr():
    """A dict assigned to a label attribute reads back equal to what was set."""
    label = GenericLabel(0, 10, document=document)
    label.bar = dict(a=1, b=2)
    # Compare against a separately built dict, not the assigned object.
    expected = {'a': 1, 'b': 2}
    assert label.bar == expected
def process_document(self, document: Document, params: Dict[str, Any]):
    """Add dependency and UPOS tag labels to ``document``.

    The text of every label in the ``sentences`` index is passed to
    ``self.nlp``. For each returned sentence, one ``GenericLabel`` per
    dependency (fields ``head`` and ``deprel``) and one per word (field
    ``tag``) is created, with offsets shifted from stanza's character
    positions to the sentence's position in the document. The results are
    stored in the ``dependencies`` and ``upos_tags`` indices.

    Args:
        document: Document holding a ``sentences`` label index.
        params: Not read by this method.

    Raises:
        ValueError: If resolving one sentence's dependencies takes more
            than ``MAX_ITER`` iterations.
    """
    sentences = document.labels['sentences']
    stanza_doc = self.nlp([sentence.text for sentence in sentences])

    all_deps = []
    all_upos_tags = []
    for stanza_sentence, sentence in zip(stanza_doc.sentences, sentences):
        labels_by_id = {}
        pending = list(stanza_sentence.dependencies)
        iterations = 0
        while pending:
            iterations += 1
            if iterations > MAX_ITER:
                raise ValueError(
                    'Maximum Iterations reached while processing dependency graph.')
            head, deprel, dep = pending.pop()
            head_id = int(head.id)
            if head_id == 0:
                # Head id 0 is stored as "no head label".
                governor = None
            elif head_id in labels_by_id:
                governor = labels_by_id[head_id]
            else:
                # The head has no label yet: requeue at the front and retry
                # after the other pending dependencies.
                pending.insert(0, (head, deprel, dep))
                continue

            parent = dep.parent
            begin = (sentence.start_index + parent.start_char
                     - stanza_sentence.tokens[0].start_char)
            finish = (sentence.start_index + parent.end_char
                      - stanza_sentence.tokens[0].start_char)
            dep_label = GenericLabel(begin, finish, head=governor, deprel=deprel)
            dep_label.reference_cache['dependents'] = []
            labels_by_id[int(dep.id)] = dep_label
            if governor is not None:
                governor.dependents.append(dep_label)
            all_deps.append(dep_label)

        for word in stanza_sentence.words:
            token = word.parent
            begin = (sentence.start_index + token.start_char
                     - stanza_sentence.tokens[0].start_char)
            finish = (sentence.start_index + token.end_char
                      - stanza_sentence.tokens[0].start_char)
            all_upos_tags.append(GenericLabel(begin, finish, tag=word.upos))

    document.add_labels('dependencies', all_deps)
    document.add_labels('upos_tags', all_upos_tags)
def stanza_deps_and_upos_tags(sentence, stanza_sentence):
    """Build dependency and UPOS tag labels for one sentence.

    The stanza dependencies are placed in an adjacency matrix indexed by
    head id and dependent id, then walked breadth-first starting from id 0
    so that every head label exists before its dependents are created.
    Offsets are shifted from stanza's character positions to the sentence's
    position in the document.

    Args:
        sentence: Label providing ``start_index`` for the sentence.
        stanza_sentence: Stanza sentence providing ``dependencies``,
            ``tokens`` and ``words``.

    Returns:
        A tuple ``(sentence_deps, sentence_upos_tags)`` of two lists of
        ``GenericLabel``.

    Raises:
        ValueError: If a dependent is visited before its head has a label,
            or if not every dependency was reached from id 0.
    """
    sentence_deps = []
    sentence_upos_tags = []
    stanza_dependencies = list(stanza_sentence.dependencies)
    size = len(stanza_dependencies) + 1  # +1 for head id 0

    graph = np.zeros((size, size))
    dep_map = {}
    for head, deprel, dep in stanza_dependencies:
        graph[int(head.id), int(dep.id)] = 1
        dep_map[int(dep.id)] = (dep, deprel)

    dependencies = {}
    q = [0]
    while q:
        head_id = q.pop()
        head_dep_label = dependencies.get(head_id)
        if head_id != 0 and head_dep_label is None:
            raise ValueError("Dep seen before governor.")
        for dep_id in range(size):
            if graph[head_id, dep_id] > 0:
                dep, deprel = dep_map[dep_id]
                token_begin = (sentence.start_index + dep.parent.start_char
                               - stanza_sentence.tokens[0].start_char)
                token_end = (sentence.start_index + dep.parent.end_char
                             - stanza_sentence.tokens[0].start_char)
                dep_label = GenericLabel(token_begin, token_end,
                                         head=head_dep_label, deprel=deprel)
                dep_label.reference_cache['dependents'] = []
                dependencies[int(dep.id)] = dep_label
                if head_dep_label is not None:
                    head_dep_label.dependents.append(dep_label)
                # Insert at the front / pop from the back: FIFO order.
                q.insert(0, dep_id)
                sentence_deps.append(dep_label)

    # Every dependency must have been reached from id 0. The previous check
    # (``== len(stanza_dependencies) - 1``) only fired when exactly one was
    # missing and let larger mismatches through silently.
    if len(dependencies) != len(stanza_dependencies):
        raise ValueError("Unexpected number of dependencies")

    for word in stanza_sentence.words:
        token = word.parent
        token_begin = (sentence.start_index + token.start_char
                       - stanza_sentence.tokens[0].start_char)
        token_end = (sentence.start_index + token.end_char
                     - stanza_sentence.tokens[0].start_char)
        sentence_upos_tags.append(
            GenericLabel(token_begin, token_end, tag=word.upos))
    return sentence_deps, sentence_upos_tags
def test_get_attr():
    """Keyword fields are readable as attributes and through ``fields``."""
    given = dict(a="x", y=20, z=20.0)
    label = GenericLabel(0, 20, document=document, **given)
    for name, value in (('a', 'x'), ('y', 20), ('z', 20.0)):
        assert getattr(label, name) == value
    assert label.fields == {'a': 'x', 'y': 20, 'z': 20.0}
def test_get_repr():
    """repr() starts with the span and mentions every field."""
    rendered = repr(GenericLabel(0, 20, document=document, a="x", y=20, z=20.0))
    assert rendered.startswith("GenericLabel(0, 20, ")
    # Either quoting style is accepted for the string field.
    assert any(form in rendered for form in ('a="x"', "a='x'"))
    for fragment in ('y=20', 'z=20.0'):
        assert fragment in rendered
def test_copy_document():
    """CopyDocument copies the labels of 'first' onto a new 'second' document."""
    words = [(0, 3, 'The'), (4, 9, 'quick'), (10, 15, 'brown')]

    event = Event()
    source = Document(document_name='first',
                      text='The quick brown fox jumped over the lazy dog.')
    event.add_document(source)
    with source.get_labeler('some_index') as add_label:
        for begin, end, word in words:
            add_label(begin, end, word=word)

    CopyDocument('first', 'second').process(event, {})

    copied = event.documents['second']
    assert copied is not None
    assert copied.labels['some_index'] == [
        GenericLabel(begin, end, word=word) for begin, end, word in words
    ]
def test_iter():
    """Iterating a label yields its two index names plus every field name."""
    label = GenericLabel(0, 20, document=document, a="x", y=20, z=20.0)
    names = list(iter(label))
    assert len(names) == 5
    for expected_name in ('start_index', 'end_index', 'a', 'y', 'z'):
        assert expected_name in names
def test_count_multiple(tested):
    """count() reports 2 for a label that occurs twice in the index."""
    entries = [
        (2, 6, 2),
        (6, 7, 3),
        (6, 8, 4),
        (9, 10, 5),
        (9, 13, 6),
        (9, 13, 7),
        (9, 13, 6),  # duplicate of the i=6 label above
    ]
    index = presorted_label_index([
        GenericLabel(begin, end, document=document, i=number)
        for begin, end, number in entries
    ])
    duplicated = GenericLabel(9, 13, document=document, i=6)
    assert index.count(duplicated) == 2
def tested():
    """Return a presorted label index of eight labels numbered by ``i``."""
    spans = [
        (0, 5),
        (0, 7),
        (2, 6),
        (6, 7),
        (6, 8),
        (9, 10),
        (9, 13),
        (9, 13),
    ]
    labels = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate(spans)
    ]
    return presorted_label_index(labels)
def test_reversed(tested):
    """reversed() walks the index from the last label (i=7) to the first (i=0)."""
    backwards = list(reversed(tested))
    expected = [
        GenericLabel(begin, end, document=document, i=number)
        for begin, end, number in [
            (9, 13, 7),
            (9, 13, 6),
            (9, 10, 5),
            (6, 8, 4),
            (6, 7, 3),
            (2, 6, 2),
            (0, 7, 1),
            (0, 5, 0),
        ]
    ]
    assert backwards == expected
def test_descending(tested):
    """descending() compares equal to the labels listed from i=7 down to i=0."""
    view = tested.descending()
    ascending_spans = [
        (0, 5), (0, 7), (2, 6), (6, 7), (6, 8), (9, 10), (9, 13), (9, 13),
    ]
    ascending = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate(ascending_spans)
    ]
    assert view == ascending[::-1]
def test_add_labels_distinct(mocker):
    """add_labels(distinct=True) uploads the labels and returns a distinct index."""
    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(
        document_name='plaintext',
        text='The quick brown fox jumped over the lazy dog.',
        event=event,
    )
    labels = [
        GenericLabel(begin, end, document=document, x=number)
        for number, (begin, end) in enumerate([(0, 10), (11, 15), (16, 20)], start=1)
    ]

    returned = document.add_labels('index', labels, distinct=True)

    client.add_labels.assert_called_with(
        event_id='1',
        document_name='plaintext',
        index_name='index',
        labels=labels,
        adapter=mocker.ANY,
    )
    assert returned == labels
    assert returned.distinct
def read_into_documents(conllu_document: str,
                        client: EventsClient = None,
                        sentences_per_document: int = 15):
    """Split CoNLL-U text into documents of a fixed number of sentences.

    Non-empty lines are parsed with ``ConllUEntry.parse_line`` and appended
    to the current ``DocumentBuilder``; an empty line closes the current
    sentence. After ``sentences_per_document`` sentences the builder's
    documents are yielded and a fresh builder is started. Remaining
    sentences are yielded at the end of the input.

    A final sentence that is not followed by an empty line is closed as
    well, so its tokens are labeled instead of being left as unlabeled text.

    Args:
        conllu_document: Full CoNLL-U text.
        client: Passed through to every ``DocumentBuilder``.
        sentences_per_document: Number of sentences per yielded document.

    Yields:
        Whatever ``DocumentBuilder.create_document()`` yields.
    """
    entries = []
    sentences = 0
    document_builder = DocumentBuilder(client)
    for line in conllu_document.splitlines():
        if line:
            # parse token
            entry = ConllUEntry.parse_line(line)
            begin, end = document_builder.append_token(entry.form)
            entries.append((entry, begin, end))
            continue
        if not entries:
            # consecutive empty lines - nothing to close
            continue
        # empty line - new sentence
        _add_conllu_sentence(document_builder, entries)
        entries = []
        sentences += 1
        if sentences == sentences_per_document:
            yield from document_builder.create_document()
            document_builder = DocumentBuilder(client)
            sentences = 0
    if entries:
        # Input ended without a terminating empty line: close the last
        # sentence rather than dropping its labels.
        _add_conllu_sentence(document_builder, entries)
        sentences += 1
    if sentences > 0:
        yield from document_builder.create_document()


def _add_conllu_sentence(document_builder, entries):
    """Add the sentence, token and dependency labels for one parsed sentence.

    ``entries`` holds ``(entry, token_begin, token_end)`` tuples in text
    order and must be non-empty.
    """
    # The sentence ends where its last token (in text order) ends; read this
    # before the entries are reordered below.
    sentence_end = entries[-1][2]
    document_builder.add_sentence(sentence_end)

    dep_map = {}
    # NOTE(review): ordering by head id does not guarantee that a head's
    # label exists before its dependents are processed; a dependent whose
    # head has not been seen yet gets ``head=None``. Confirm whether that is
    # intended.
    for entry, token_begin, token_end in sorted(entries, key=lambda x: x[0].head):
        head = dep_map.get(entry.head, None)
        dep = GenericLabel(token_begin, token_end, head=head, deprel=entry.deprel)
        dep.reference_cache['dependents'] = []
        dep_map[entry.id] = dep
        if head is not None:
            head.dependents.append(dep)
        document_builder.add_token(token_begin, token_end, entry.upos, entry.lemma)
    document_builder.add_deps(dep_map.values())
def test_create_sort(tested):
    """label_index() given unordered labels compares equal to the presorted index."""
    shuffled = [
        (9, 13, 6),
        (0, 7, 1),
        (6, 8, 4),
        (6, 7, 3),
        (9, 10, 5),
        (9, 13, 7),
        (0, 5, 0),
        (2, 6, 2),
    ]
    index = label_index([
        GenericLabel(begin, end, document=document, i=number)
        for begin, end, number in shuffled
    ])
    assert index == tested
def test_add_labels_not_distinct(mocker):
    """add_labels() without ``distinct`` uploads the labels as a non-distinct index."""
    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_label_index_info.return_value = []
    event = Event(event_id='1', client=client)
    document = Document(
        document_name='plaintext',
        text='The quick brown fox jumped over the lazy dog.',
        event=event,
    )
    labels = [
        GenericLabel(begin, end, document=document, x=number)
        for number, (begin, end) in enumerate([(0, 10), (11, 15), (16, 20)], start=1)
    ]

    document.add_labels('index', labels)

    client.add_labels.assert_called_with(
        event_id='1',
        document_name='plaintext',
        index_name='index',
        labels=labels,
        adapter=mocker.ANY,
    )
    stored = document.labels['index']
    assert stored == labels
    assert not stored.distinct
def test_labeler_distinct(mocker):
    """A distinct labeler uploads its labels with DistinctGenericLabelAdapter."""
    spans = [(0, 10), (11, 15), (16, 20)]

    client = mocker.Mock(EventsClient)
    event = Event(event_id='1', client=client)
    document = Document(
        document_name='plaintext',
        text='The quick brown fox jumped over the lazy dog.',
        event=event,
    )
    with document.get_labeler('index', distinct=True) as add_generic_label:
        for number, (begin, end) in enumerate(spans, start=1):
            add_generic_label(begin, end, x=number)

    labels = [
        GenericLabel(begin, end, document=document, x=number)
        for number, (begin, end) in enumerate(spans, start=1)
    ]
    client.add_labels.assert_called_with(
        event_id='1',
        document_name='plaintext',
        index_name='index',
        labels=labels,
        adapter=DistinctGenericLabelAdapter,
    )
    assert document.get_label_index('index') == labels
def test_labeler_distinct(mocker):
    """A distinct labeler uploads its labels with DISTINCT_GENERIC_ADAPTER."""
    spans = [(0, 10), (11, 15), (16, 20)]

    client = mocker.Mock(EventsClient)
    client.get_local_instance.return_value = client
    client.get_label_index_info.return_value = []
    event = Event(event_id='1', client=client)
    document = Document(
        document_name='plaintext',
        text='The quick brown fox jumped over the lazy dog.',
        event=event,
    )
    with document.get_labeler('index', distinct=True) as add_generic_label:
        for number, (begin, end) in enumerate(spans, start=1):
            add_generic_label(begin, end, x=number)

    labels = [
        GenericLabel(begin, end, document=document, x=number)
        for number, (begin, end) in enumerate(spans, start=1)
    ]
    client.add_labels.assert_called_with(
        event_id='1',
        document_name='plaintext',
        index_name='index',
        labels=labels,
        adapter=DISTINCT_GENERIC_ADAPTER,
    )
    assert document.labels['index'] == labels
def test_after(tested):
    """after(2) returns the labels with i = 2 through 7."""
    result = tested.after(2)
    spans = [(2, 6), (6, 7), (6, 8), (9, 10), (9, 13), (9, 13)]
    expected = [
        GenericLabel(begin, end, document=document, i=position)
        for position, (begin, end) in enumerate(spans, start=2)
    ]
    assert result == expected