Ejemplo n.º 1
0
def read_conll(path):
    """Yield the sentences of a tab-separated CoNLL file, one list at a time.

    Sentences are separated by blank (or whitespace-only) lines. Every yielded
    list starts with a synthetic root ``ConllEntry`` (id 0) followed by one
    item per non-blank line of the sentence:

    * a ``ConllEntry`` built from the first ten columns of a token line;
    * the stripped line itself (``str``) for comment lines starting with
      ``#`` and for lines whose ID column contains ``-`` or ``.`` (in CoNLL-U
      these are multiword-token ranges and empty nodes).

    Blocks that contain no lines at all are not yielded.

    NOTE: the very same root object is placed at index 0 of every yielded
    sentence, so a caller that mutates it affects all sentences.

    Args:
        path (str): path of the CoNLL file. The file is decoded as UTF-8.

    Yields:
        list: the root entry followed by ``ConllEntry`` / ``str`` items of the
        next sentence.

    Raises:
        ValueError: if the ID column, or a head column other than ``_``, of a
            token line is not an integer.
        IndexError: if a token line has fewer than ten columns.
    """
    # Decode explicitly as UTF-8 rather than with the locale-dependent default.
    with open(path, "r", encoding="utf-8") as conll_fp:
        root = ConllEntry(0, "*root*", "*root*", "ROOT-POS", "ROOT-CPOS", "_",
                          -1, "rroot", "_", "_")
        tokens = [root]
        for line in conll_fp:
            stripped_line = line.strip()
            if not stripped_line:
                # Blank line: close the current sentence if it has content.
                if len(tokens) > 1:
                    yield tokens
                tokens = [root]
                continue

            tok = stripped_line.split("\t")
            # Test the stripped line so that an indented comment is still
            # recognised instead of being parsed as a token line.
            if (stripped_line.startswith("#") or "-" in tok[0]
                    or "." in tok[0]):
                # Kept verbatim; these lines are not converted to entries.
                # noinspection PyTypeChecker
                tokens.append(stripped_line)
                continue

            tokens.append(
                ConllEntry(
                    int(tok[0]),
                    tok[1],
                    tok[2],
                    # Column 5 is passed before column 4, matching the
                    # "ROOT-POS", "ROOT-CPOS" order used for the root entry.
                    tok[4],
                    tok[3],
                    tok[5],
                    # "_" marks a missing head; it is stored as -1.
                    int(tok[6]) if tok[6] != "_" else -1,
                    tok[7],
                    tok[8],
                    tok[9],
                ))
        # The last sentence may not be followed by a blank line.
        if len(tokens) > 1:
            yield tokens
Ejemplo n.º 2
0
    def to_conll(self, doc_text):
        """Generate one CoNLL entry list per sentence of a raw document.

        The text is run through ``self.spacy_parser``. Each sentence becomes a
        list that begins with a root ``ConllEntry`` (id 0) followed by one entry
        per kept token, numbered consecutively from 1. Tokens flagged
        ``is_space`` and ``-`` tokens tagged ``HYPH`` are left out; the tag of
        every other token is mapped with ``_spacy_pos_to_ptb``. When
        ``self.verbose`` is set, the tokens and the resulting entries are
        printed.

        Args:
            doc_text (str): raw document text.

        Yields:
            list of ConllEntry: The next sentence in the document in CoNLL format.
        """
        validate((doc_text, str))
        parsed_doc = self.spacy_parser(doc_text)
        for sentence in parsed_doc.sents:
            entries = [ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS', '_',
                                  -1, 'rroot', '_', '_')]
            for token in sentence:
                if self.verbose:
                    print('\t'.join((token.text, token.tag_)))

                if token.is_space:
                    continue

                tag = token.tag_
                word = token.text
                if word == '-' and tag == 'HYPH':
                    continue

                ptb_tag = _spacy_pos_to_ptb(tag, word)
                # The root sits at index 0, so the current list length is
                # exactly the id of the token about to be appended.
                entries.append(
                    ConllEntry(len(entries), word, token.lemma_, ptb_tag, ptb_tag,
                               token.ent_type_, -1, '_', '_', token.idx))

            if self.verbose:
                print('-----------------------\ninput conll form:')
                for entry in entries:
                    print('\t'.join((str(entry.id), entry.form, entry.pos, '')))
            yield entries
Ejemplo n.º 3
0
def read_conll(path):
    """Yield the sentences of a tab-separated CoNLL file, one list at a time.

    Sentences are separated by blank (or whitespace-only) lines. Every yielded
    list starts with a synthetic root ``ConllEntry`` (id 0) followed by one
    item per non-blank line of the sentence:

    * a ``ConllEntry`` built from the first ten columns of a token line;
    * the stripped line itself (``str``) for comment lines starting with
      ``#`` and for lines whose ID column contains ``-`` or ``.`` (in CoNLL-U
      these are multiword-token ranges and empty nodes).

    Blocks that contain no lines at all are not yielded.

    NOTE: the very same root object is placed at index 0 of every yielded
    sentence, so a caller that mutates it affects all sentences.

    Args:
        path (str): path of the CoNLL file. The file is decoded as UTF-8.

    Yields:
        list: the root entry followed by ``ConllEntry`` / ``str`` items of the
        next sentence.

    Raises:
        ValueError: if the ID column, or a head column other than ``_``, of a
            token line is not an integer.
        IndexError: if a token line has fewer than ten columns.
    """
    # Decode explicitly as UTF-8 rather than with the locale-dependent default.
    with open(path, 'r', encoding='utf-8') as conll_fp:
        root = ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS', '_',
                          -1, 'rroot', '_', '_')
        tokens = [root]
        for line in conll_fp:
            stripped_line = line.strip()
            if not stripped_line:
                # Blank line: close the current sentence if it has content.
                if len(tokens) > 1:
                    yield tokens
                tokens = [root]
                continue

            tok = stripped_line.split('\t')
            # Test the stripped line so that an indented comment is still
            # recognised instead of being parsed as a token line.
            if (stripped_line.startswith('#') or '-' in tok[0]
                    or '.' in tok[0]):
                # Kept verbatim; these lines are not converted to entries.
                # noinspection PyTypeChecker
                tokens.append(stripped_line)
                continue

            # Column 5 is passed before column 4, matching the
            # 'ROOT-POS', 'ROOT-CPOS' order used for the root entry.
            # A head of '_' marks a missing value and is stored as -1.
            tokens.append(
                ConllEntry(int(tok[0]), tok[1], tok[2], tok[4], tok[3],
                           tok[5],
                           int(tok[6]) if tok[6] != '_' else -1,
                           tok[7], tok[8], tok[9]))
        # The last sentence may not be followed by a blank line.
        if len(tokens) > 1:
            yield tokens
Ejemplo n.º 4
0
    def to_conll(self, doc_text):
        """Turn raw text into per-sentence CoNLL entry lists using spaCy tags.

        ``self.spacy_parser`` splits the text into sentences. For each one a
        list is built whose first element is a root ``ConllEntry`` (id 0); every
        token that is neither flagged ``is_space`` nor a ``-`` tagged ``HYPH``
        is appended as a ``ConllEntry`` with consecutive ids starting at 1 and a
        tag converted by ``_spacy_pos_to_ptb``. With ``self.verbose`` enabled,
        each token and the finished entry list are printed.

        Args:
            doc_text (str): raw document text.

        Yields:
            list of ConllEntry: The next sentence in the document in CoNLL format.
        """
        validate((doc_text, str))
        for sent in self.spacy_parser(doc_text).sents:
            root_entry = ConllEntry(0, "*root*", "*root*", "ROOT-POS",
                                    "ROOT-CPOS", "_", -1, "rroot", "_", "_")
            conll_rows = [root_entry]
            for spacy_tok in sent:
                if self.verbose:
                    print("\t".join((spacy_tok.text, spacy_tok.tag_)))

                if spacy_tok.is_space:
                    continue

                raw_tag = spacy_tok.tag_
                surface = spacy_tok.text
                is_hyphen = surface == "-" and raw_tag == "HYPH"
                if is_hyphen:
                    continue

                converted = _spacy_pos_to_ptb(raw_tag, surface)
                # With the root at index 0, the number of rows collected so
                # far equals the id of the next token.
                next_id = len(conll_rows)
                conll_rows.append(
                    ConllEntry(
                        next_id,
                        surface,
                        spacy_tok.lemma_,
                        converted,
                        converted,
                        spacy_tok.ent_type_,
                        -1,
                        "_",
                        "_",
                        spacy_tok.idx,
                    ))

            if self.verbose:
                print("-----------------------\ninput conll form:")
                for row in conll_rows:
                    print("\t".join((str(row.id), row.form, row.pos, "")))
            yield conll_rows