def conll_file(filename, fields, word_field, encoding="utf-8"):
    """Load a CoNLL file into a Document with token and sentence segmentations.

    The document is named after the base name of *filename*. Its corpus is
    read with ``Corpus.from_conll`` and its textual content is rebuilt from
    the *word_field* column: the words of a sentence are joined by a single
    space and sentences are joined by a newline.

    Two segmentations are attached to the document:

    * ``"tokens"``: one span per word, given as character offsets into the
      rebuilt content;
    * ``"sentences"``: one span per sentence, given as token indices and
      referencing the ``"tokens"`` segmentation.

    Args:
        filename: path of the CoNLL file to read.
        fields: field names handed to ``Corpus.from_conll``.
        word_field: name of the field holding the surface form of each token.
        encoding: encoding handed to both ``Document`` and
            ``Corpus.from_conll``.

    Returns:
        The populated Document.
    """
    document = Document(os.path.basename(filename), encoding=encoding)
    document._corpus = Corpus.from_conll(filename, fields, encoding=encoding)

    sentence_words = []
    token_spans = []
    sent_spans = []
    char_offset = 0
    token_offset = 0

    for sentence in document._corpus.sentences:
        words = [token[word_field] for token in sentence]
        sentence_words.append(words)

        for word in words:
            word_end = char_offset + len(word)
            token_spans.append(Span(char_offset, word_end))
            # Skip the one-character separator (space or newline) that
            # follows each word in the rebuilt content.
            char_offset = word_end + 1

        sentence_length = len(sentence)
        sent_spans.append(Span(token_offset, token_offset + sentence_length))
        token_offset += sentence_length

    document._content = u"\n".join(
        [u" ".join(words) for words in sentence_words])

    document.add_segmentation(Segmentation("tokens", spans=token_spans))
    tokens = document.segmentation("tokens")
    document.add_segmentation(
        Segmentation("sentences", reference=tokens, spans=sent_spans))
    return document
def test_enrich(self):
    """EnrichModule appends the BOS and EOS feature fields to the corpus."""
    document = Document("document", "Ceci est un test.")
    corpus = Corpus(
        [u"word"],
        sentences=[[
            {u"word": u"Ceci"},
            {u"word": u"est"},
            {u"word": u"un"},
            {u"word": u"test"},
            {u"word": u"."},
        ]])
    document._corpus = corpus

    # Both features read the "word" entry through the same getter.
    cwg = DictGetterFeature(entry="word", x=0)
    features = [
        BOSFeature(name="BOS", entry="word", getter=cwg),
        EOSFeature(name="EOS", entry="word", getter=cwg),
    ]
    informations = Informations(bentries=[Entry(u"word")], features=features)
    enrich = EnrichModule(informations)

    # assertEqual replaces the assertEquals alias, which was deprecated and
    # then removed from unittest.TestCase in Python 3.12.
    self.assertEqual(document._corpus.fields, [u"word"])
    enrich.process_document(document)
    self.assertEqual(document._corpus.fields, [u"word", u"BOS", u"EOS"])
def test_clean(self):
    """CleanModule drops every corpus field not listed in ``to_keep``."""
    document = Document("document", "Ceci est un test.")
    corpus = Corpus(
        [u"word", u"remove"],
        sentences=[[
            {u"word": u"Ceci", u"remove": u"Ceci"},
            {u"word": u"est", u"remove": u"est"},
            {u"word": u"un", u"remove": u"un"},
            {u"word": u"test", u"remove": u"test"},
            {u"word": u".", u"remove": u"."},
        ]])
    document._corpus = corpus

    # assertEqual replaces the assertEquals alias, which was deprecated and
    # then removed from unittest.TestCase in Python 3.12.
    self.assertEqual(document._corpus.fields, [u"word", u"remove"])

    clean = CleanModule(to_keep=[u"word"])
    clean.process_document(document)

    self.assertEqual(document._corpus.fields, [u"word"])
def test_wapiti_label(self):
    """WapitiLabelModule adds the requested field with the expected labels.

    The module is built from the model stored under
    ``SEM_DATA_DIR/non-regression/models/model`` and asked to write its
    output to the ``the_new_field`` field.
    """
    document = Document("document", "Ceci est un test.")
    corpus = Corpus(
        [u"word"],
        sentences=[[
            {u"word": u"Ceci"},
            {u"word": u"est"},
            {u"word": u"un"},
            {u"word": u"test"},
            {u"word": u"."},
        ]])
    document._corpus = corpus

    # assertEqual replaces the assertEquals alias, which was deprecated and
    # then removed from unittest.TestCase in Python 3.12.
    self.assertEqual(document._corpus.fields, [u"word"])

    wapiti_label = WapitiLabelModule(
        os.path.join(SEM_DATA_DIR, "non-regression", "models", "model"),
        u"the_new_field")
    wapiti_label.process_document(document)

    self.assertEqual(document._corpus.fields, [u"word", u"the_new_field"])

    sentence = document._corpus.sentences[0]
    expected_labels = [u"A", u"B", u"B", u"A", u"O"]
    # Index explicitly so that a sentence shorter than expected fails
    # instead of being silently truncated.
    for index, label in enumerate(expected_labels):
        self.assertEqual(sentence[index]["the_new_field"], label)