Example #1
0
def conll_file(filename, fields, word_field, encoding="utf-8"):
    """Build a Document from the corpus loaded by ``Corpus.from_conll``.

    The document content is rebuilt from the ``word_field`` entry of every
    token: the words of a sentence are joined with a single space and the
    sentences are joined with a newline. Two segmentations are attached:

    * "tokens": one span per token, as character offsets into the content;
    * "sentences": one span per sentence, as token-index ranges, declared
      with the "tokens" segmentation as its reference.

    Args:
        filename: path handed to ``Corpus.from_conll``; its basename is
            passed as the first argument of ``Document``.
        fields: forwarded unchanged to ``Corpus.from_conll``.
        word_field: key of the token entry that holds the word text.
        encoding: forwarded to both ``Document`` and ``Corpus.from_conll``.

    Returns:
        The populated ``Document``.
    """
    document = Document(os.path.basename(filename), encoding=encoding)
    document._corpus = Corpus.from_conll(filename, fields, encoding=encoding)

    lines = []
    word_spans = []
    sentence_spans = []
    line_start = 0  # character offset of the current sentence in the content
    first_token = 0  # index of the current sentence's first token
    for sentence in document._corpus.sentences:
        words = [token[word_field] for token in sentence]
        line = u" ".join(words)
        lines.append(line)

        offset = line_start
        for word in words:
            word_spans.append(Span(offset, offset + len(word)))
            offset += len(word) + 1  # step over the joining space

        sentence_spans.append(Span(first_token, first_token + len(sentence)))
        first_token += len(sentence)

        # Advance by the joined line plus the newline that follows it. Doing
        # this per line (rather than per token) keeps later offsets aligned
        # with the content even when a sentence contains no token at all.
        line_start += len(line) + 1

    document._content = u"\n".join(lines)
    document.add_segmentation(Segmentation("tokens", spans=word_spans))
    document.add_segmentation(
        Segmentation("sentences",
                     reference=document.segmentation("tokens"),
                     spans=sentence_spans))
    return document
Example #2
0
    def test_clean(self):
        """CleanModule(to_keep=["word"]) leaves only the "word" field."""
        document = Document("document", "Ceci est un test.")
        # Every token carries the same text under both fields; only the
        # field list of the corpus is checked below.
        corpus = Corpus([u"word", u"remove"],
                        sentences=[[
                            {u"word": u"Ceci", u"remove": u"Ceci"},
                            {u"word": u"est", u"remove": u"est"},
                            {u"word": u"un", u"remove": u"un"},
                            {u"word": u"test", u"remove": u"test"},
                            {u"word": u".", u"remove": u"."},
                        ]])
        document._corpus = corpus

        # assertEqual instead of the assertEquals alias: the alias is
        # deprecated and no longer exists in Python 3.12+.
        self.assertEqual(document._corpus.fields, [u"word", u"remove"])

        clean = CleanModule(to_keep=[u"word"])
        clean.process_document(document)

        self.assertEqual(document._corpus.fields, [u"word"])
Example #3
0
    def test_enrich(self):
        """EnrichModule appends the "BOS" and "EOS" fields to the corpus."""
        document = Document("document", "Ceci est un test.")
        corpus = Corpus([u"word"],
                        sentences=[[
                            {u"word": u"Ceci"},
                            {u"word": u"est"},
                            {u"word": u"un"},
                            {u"word": u"test"},
                            {u"word": u"."},
                        ]])
        document._corpus = corpus

        # Both features share one getter built on the "word" entry (x=0).
        cwg = DictGetterFeature(entry="word", x=0)
        features = [
            BOSFeature(name="BOS", entry="word", getter=cwg),
            EOSFeature(name="EOS", entry="word", getter=cwg),
        ]

        informations = Informations(bentries=[Entry(u"word")],
                                    features=features)

        enrich = EnrichModule(informations)

        # assertEqual instead of the assertEquals alias: the alias is
        # deprecated and no longer exists in Python 3.12+.
        self.assertEqual(document._corpus.fields, [u"word"])

        enrich.process_document(document)

        self.assertEqual(document._corpus.fields, [u"word", u"BOS", u"EOS"])
Example #4
0
    def test_wapiti_label(self):
        """WapitiLabelModule adds "the_new_field" with the expected labels.

        The module is built from the file at
        SEM_DATA_DIR/non-regression/models/model, and the labels asserted
        below are the ones expected from that file for this sentence.
        """
        document = Document("document", "Ceci est un test.")
        corpus = Corpus([u"word"],
                        sentences=[[
                            {u"word": u"Ceci"},
                            {u"word": u"est"},
                            {u"word": u"un"},
                            {u"word": u"test"},
                            {u"word": u"."},
                        ]])
        document._corpus = corpus

        # assertEqual instead of the assertEquals alias: the alias is
        # deprecated and no longer exists in Python 3.12+.
        self.assertEqual(document._corpus.fields, [u"word"])

        wapiti_label = WapitiLabelModule(
            os.path.join(SEM_DATA_DIR, "non-regression", "models", "model"),
            u"the_new_field")
        wapiti_label.process_document(document)

        self.assertEqual(document._corpus.fields, [u"word", u"the_new_field"])

        sentence = document._corpus.sentences[0]
        self.assertEqual(sentence[0]["the_new_field"], u"A")
        self.assertEqual(sentence[1]["the_new_field"], u"B")
        self.assertEqual(sentence[2]["the_new_field"], u"B")
        self.assertEqual(sentence[3]["the_new_field"], u"A")
        self.assertEqual(sentence[4]["the_new_field"], u"O")
Example #5
0
    def test_wapiti_label(self):
        """LabelConsistencyModule spreads "B-tag" to the other "Ceci".

        Three sentences with the same words are built; only the first token
        of the first sentence is tagged "B-tag". After processing, the first
        token of the second sentence ("Ceci") is expected to be "B-tag" too,
        while the first token of the third sentence ("ceci", lowercase) is
        expected to stay "O".
        """
        # NOTE(review): the method name looks copy-pasted from the Wapiti
        # test; it is kept as is so that test selection does not change.
        corpus = Corpus([u"word", u"tag"],
                        sentences=[
                            [
                                {u"word": u"Ceci", u"tag": u"B-tag"},
                                {u"word": u"est", u"tag": u"O"},
                                {u"word": u"un", u"tag": u"O"},
                                {u"word": u"test", u"tag": u"O"},
                                {u"word": u".", u"tag": u"O"},
                            ],
                            [
                                {u"word": u"Ceci", u"tag": u"O"},
                                {u"word": u"est", u"tag": u"O"},
                                {u"word": u"un", u"tag": u"O"},
                                {u"word": u"test", u"tag": u"O"},
                                {u"word": u".", u"tag": u"O"},
                            ],
                            [
                                {u"word": u"ceci", u"tag": u"O"},
                                {u"word": u"est", u"tag": u"O"},
                                {u"word": u"un", u"tag": u"O"},
                                {u"word": u"test", u"tag": u"O"},
                                {u"word": u".", u"tag": u"O"},
                            ],
                        ])
        document = Document.from_corpus("document", corpus, u"word")

        # Tag counts before processing: a single "B-tag" among 15 tokens.
        tags = [token[u"tag"]
                for sentence in document._corpus.sentences
                for token in sentence]
        # assertEqual instead of the assertEquals alias: the alias is
        # deprecated and no longer exists in Python 3.12+.
        self.assertEqual(tags.count(u"O"), 14)
        self.assertEqual(tags.count(u"B-tag"), 1)

        label_consistency = LabelConsistencyModule(u"tag", token_field=u"word")
        label_consistency.process_document(document)

        self.assertEqual(document._corpus.sentences[0][0][u"tag"], u"B-tag")
        self.assertEqual(document._corpus.sentences[1][0][u"tag"], u"B-tag")
        self.assertEqual(document._corpus.sentences[2][0][u"tag"], u"O")

        # Tag counts after processing: exactly one "O" turned into "B-tag".
        tags = [token[u"tag"]
                for sentence in document._corpus.sentences
                for token in sentence]
        self.assertEqual(tags.count(u"O"), 13)
        self.assertEqual(tags.count(u"B-tag"), 2)