Example #1
class SpacyBISTParser(object):
    """Main class which handles parsing with Spacy-BIST parser.

    Args:
        verbose (bool, optional): Controls output verbosity.
        spacy_model (str, optional): Spacy model to use
        (see https://spacy.io/api/top-level#spacy.load).
        bist_model (str, optional): Path to a .model file to load. Defaults pre-trained model'.
    """
    dir = path.dirname(path.realpath(__file__))
    _pretrained = path.join(dir, 'bist-pretrained', 'bist.model')

    def __init__(self, verbose=False, spacy_model='en', bist_model=None):
        validate((verbose, bool), (spacy_model, str, 0, 1000),
                 (bist_model, (type(None), str), 0, 1000))
        if not bist_model:
            print("Using pre-trained BIST model.")
            _download_pretrained_model()
            bist_model = SpacyBISTParser._pretrained

        self.verbose = verbose
        self.bist_parser = BISTModel()
        self.bist_parser.load(bist_model if bist_model else SpacyBISTParser._pretrained)
        self.spacy_parser = SpacyInstance(spacy_model,
                                          disable=['ner', 'vectors', 'textcat']).parser

    def to_conll(self, doc_text):
        """Converts a document to CoNLL format with spacy POS tags.

        Args:
            doc_text (str): raw document text.

        Yields:
            list of ConllEntry: The next sentence in the document in CoNLL format.
        """
        validate((doc_text, str))
        for sentence in self.spacy_parser(doc_text).sents:
            sentence_conll = [ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS', '_',
                                         -1, 'rroot', '_', '_')]
            i_tok = 0
            for tok in sentence:
                if self.verbose:
                    print(tok.text + '\t' + tok.tag_)

                if not tok.is_space:
                    pos = tok.tag_
                    text = tok.text

                    if text != '-' or pos != 'HYPH':
                        pos = _spacy_pos_to_ptb(pos, text)
                        token_conll = ConllEntry(i_tok + 1, text, tok.lemma_, pos, pos,
                                                 tok.ent_type_, -1, '_', '_', tok.idx)
                        sentence_conll.append(token_conll)
                        i_tok += 1

            if self.verbose:
                print('-----------------------\ninput conll form:')
                for entry in sentence_conll:
                    print(str(entry.id) + '\t' + entry.form + '\t' + entry.pos + '\t')
            yield sentence_conll

    def parse(self, doc_text, show_tok=True, show_doc=True):
        """Parse a raw text document.

        Args:
            doc_text (str): Raw document text.
            show_tok (bool, optional): Specifies whether to include token text in output.
            show_doc (bool, optional): Specifies whether to include document text in output.

        Returns:
            CoreNLPDoc: The annotated document.
        """
        validate((doc_text, str), (show_tok, bool), (show_doc, bool))
        doc_conll = self.to_conll(doc_text)
        parsed_doc = CoreNLPDoc()

        if show_doc:
            parsed_doc.doc_text = doc_text

        for sent_conll in self.bist_parser.predict_conll(doc_conll):
            parsed_sent = []
            conj_governors = {'and': set(), 'or': set()}

            for tok in sent_conll:
                gov_id = int(tok.pred_parent_id)
                rel = tok.pred_relation

                if tok.form != '*root*':
                    if tok.form.lower() == 'and':
                        conj_governors['and'].add(gov_id)
                    if tok.form.lower() == 'or':
                        conj_governors['or'].add(gov_id)

                    if rel == 'conj':
                        if gov_id in conj_governors['and']:
                            rel += '_and'
                        if gov_id in conj_governors['or']:
                            rel += '_or'

                    parsed_tok = {'start': tok.misc, 'len': len(tok.form),
                                  'pos': tok.pos, 'ner': tok.feats,
                                  'lemma': tok.lemma, 'gov': gov_id - 1,
                                  'rel': rel}

                    if show_tok:
                        parsed_tok['text'] = tok.form
                    parsed_sent.append(parsed_tok)
            if parsed_sent:
                parsed_doc.sentences.append(parsed_sent)
        return parsed_doc
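
A minimal usage sketch for the example above. The import line is an assumption about where SpacyBISTParser lives in your project (adjust it to your setup), and the sample sentence is made up:

# Minimal usage sketch; the import path below is an assumption, adjust it to your project.
from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

parser = SpacyBISTParser(verbose=False)  # falls back to the pre-trained BIST model
doc = parser.parse('The quick brown fox jumped over the lazy dog.')

for sentence in doc.sentences:  # CoreNLPDoc.sentences holds one list of token dicts per sentence
    for token in sentence:
        print(token['text'], token['pos'], token['rel'], token['gov'])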
Example #2
class SpacyBISTParser(object):
    """Main class which handles parsing with Spacy-BIST parser.

    Args:
        verbose (bool, optional): Controls output verbosity.
        spacy_model (str, optional): Spacy model to use
        (see https://spacy.io/api/top-level#spacy.load).
        bist_model (str, optional): Path to a .model file to load. Defaults pre-trained model'.
    """

    dir = LIBRARY_OUT / "bist-pretrained"
    _pretrained = dir / "bist.model"

    def __init__(self, verbose=False, spacy_model="en", bist_model=None):
        validate(
            (verbose, bool), (spacy_model, str, 0, 1000), (bist_model, (type(None), str), 0, 1000)
        )
        if not bist_model:
            print("Using pre-trained BIST model.")
            _download_pretrained_model()
            bist_model = SpacyBISTParser._pretrained

        self.verbose = verbose
        self.bist_parser = BISTModel()
        self.bist_parser.load(bist_model if bist_model else SpacyBISTParser._pretrained)
        self.spacy_parser = SpacyInstance(spacy_model, disable=["ner", "vectors", "textcat"]).parser

    def to_conll(self, doc_text):
        """Converts a document to CoNLL format with spacy POS tags.

        Args:
            doc_text (str): raw document text.

        Yields:
            list of ConllEntry: The next sentence in the document in CoNLL format.
        """
        validate((doc_text, str))
        for sentence in self.spacy_parser(doc_text).sents:
            sentence_conll = [
                ConllEntry(
                    0, "*root*", "*root*", "ROOT-POS", "ROOT-CPOS", "_", -1, "rroot", "_", "_"
                )
            ]
            i_tok = 0
            for tok in sentence:
                if self.verbose:
                    print(tok.text + "\t" + tok.tag_)

                if not tok.is_space:
                    pos = tok.tag_
                    text = tok.text

                    if text != "-" or pos != "HYPH":
                        pos = _spacy_pos_to_ptb(pos, text)
                        token_conll = ConllEntry(
                            i_tok + 1,
                            text,
                            tok.lemma_,
                            pos,
                            pos,
                            tok.ent_type_,
                            -1,
                            "_",
                            "_",
                            tok.idx,
                        )
                        sentence_conll.append(token_conll)
                        i_tok += 1

            if self.verbose:
                print("-----------------------\ninput conll form:")
                for entry in sentence_conll:
                    print(str(entry.id) + "\t" + entry.form + "\t" + entry.pos + "\t")
            yield sentence_conll

    def parse(self, doc_text, show_tok=True, show_doc=True):
        """Parse a raw text document.

        Args:
            doc_text (str): Raw document text.
            show_tok (bool, optional): Specifies whether to include token text in output.
            show_doc (bool, optional): Specifies whether to include document text in output.

        Returns:
            CoreNLPDoc: The annotated document.
        """
        validate((doc_text, str), (show_tok, bool), (show_doc, bool))
        doc_conll = self.to_conll(doc_text)
        parsed_doc = CoreNLPDoc()

        if show_doc:
            parsed_doc.doc_text = doc_text

        for sent_conll in self.bist_parser.predict_conll(doc_conll):
            parsed_sent = []
            conj_governors = {"and": set(), "or": set()}

            for tok in sent_conll:
                gov_id = int(tok.pred_parent_id)
                rel = tok.pred_relation

                if tok.form != "*root*":
                    if tok.form.lower() == "and":
                        conj_governors["and"].add(gov_id)
                    if tok.form.lower() == "or":
                        conj_governors["or"].add(gov_id)

                    if rel == "conj":
                        if gov_id in conj_governors["and"]:
                            rel += "_and"
                        if gov_id in conj_governors["or"]:
                            rel += "_or"

                    parsed_tok = {
                        "start": tok.misc,
                        "len": len(tok.form),
                        "pos": tok.pos,
                        "ner": tok.feats,
                        "lemma": tok.lemma,
                        "gov": gov_id - 1,
                        "rel": rel,
                    }

                    if show_tok:
                        parsed_tok["text"] = tok.form
                    parsed_sent.append(parsed_tok)
            if parsed_sent:
                parsed_doc.sentences.append(parsed_sent)
        return parsed_doc
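
Since to_conll is a generator that yields one sentence at a time, the intermediate CoNLL representation can be inspected on its own. A short sketch, reusing the (assumed) import above with a made-up input sentence:

# Sketch: inspect the ConllEntry objects yielded per sentence by to_conll.
parser = SpacyBISTParser()
for sentence_conll in parser.to_conll("Apples and oranges are fruit."):
    for entry in sentence_conll:
        # id, form, pos and feats are the same ConllEntry fields the class reads elsewhere.
        print(entry.id, entry.form, entry.pos, entry.feats)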
Example #3
class SpacyBISTParser(object):
    """Main class which handles parsing with Spacy-BIST parser.

    Args:
        verbose (bool, optional): Controls output verbosity.
        spacy_model (str, optional): Spacy model to use
        (see https://spacy.io/api/top-level#spacy.load).
        bist_model (str, optional): Path to a .model file to load. Defaults pre-trained model'.
    """
    dir = path.dirname(path.realpath(__file__))
    pretrained = path.join(dir, 'bist-pretrained', 'bist.model')

    def __init__(self, verbose=False, spacy_model='en', bist_model=None):
        validate((verbose, bool), (spacy_model, str, 0, 1000),
                 (bist_model, (type(None), str), 0, 1000))
        if not bist_model:
            print("Using pre-trained BIST model.")
            _download_pretrained_model()
            bist_model = SpacyBISTParser.pretrained

        self.verbose = verbose
        self.bist_parser = BISTModel()
        self.bist_parser.load(bist_model if bist_model else SpacyBISTParser.pretrained)
        self.spacy_parser = SpacyInstance(spacy_model,
                                          disable=['ner', 'vectors', 'textcat']).parser

    def to_conll(self, doc_text):
        """Converts a document to CoNLL format with spacy POS tags.

        Args:
            doc_text (str): raw document text.

        Yields:
            list of ConllEntry: The next sentence in the document in CoNLL format.
        """
        validate((doc_text, str))
        for sentence in self.spacy_parser(doc_text).sents:
            sentence_conll = [ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS', '_',
                                         -1, 'rroot', '_', '_')]
            i_tok = 0
            for tok in sentence:
                if self.verbose:
                    print(tok.text + '\t' + tok.tag_)

                if not tok.is_space:
                    pos = tok.tag_
                    text = tok.text

                    if text != '-' or pos != 'HYPH':
                        pos = _spacy_pos_to_ptb(pos, text)
                        token_conll = ConllEntry(i_tok + 1, text, tok.lemma_, pos, pos,
                                                 tok.ent_type_, -1, '_', '_', tok.idx)
                        sentence_conll.append(token_conll)
                        i_tok += 1

            if self.verbose:
                print('-----------------------\ninput conll form:')
                for entry in sentence_conll:
                    print(str(entry.id) + '\t' + entry.form + '\t' + entry.pos + '\t')
            yield sentence_conll

    def parse(self, doc_text, show_tok=True, show_doc=True):
        """Parse a raw text document.

        Args:
            doc_text (str): Raw document text.
            show_tok (bool, optional): Specifies whether to include token text in output.
            show_doc (bool, optional): Specifies whether to include document text in output.

        Returns:
            CoreNLPDoc: The annotated document.
        """
        validate((doc_text, str), (show_tok, bool), (show_doc, bool))
        doc_conll = self.to_conll(doc_text)
        parsed_doc = CoreNLPDoc()

        if show_doc:
            parsed_doc.doc_text = doc_text

        for sent_conll in self.bist_parser.predict_conll(doc_conll):
            parsed_sent = []
            conj_governors = {'and': set(), 'or': set()}

            for tok in sent_conll:
                gov_id = int(tok.pred_parent_id)
                rel = tok.pred_relation

                if tok.form != '*root*':
                    if tok.form.lower() == 'and':
                        conj_governors['and'].add(gov_id)
                    if tok.form.lower() == 'or':
                        conj_governors['or'].add(gov_id)

                    if rel == 'conj':
                        if gov_id in conj_governors['and']:
                            rel += '_and'
                        if gov_id in conj_governors['or']:
                            rel += '_or'

                    parsed_tok = {'start': tok.misc, 'len': len(tok.form),
                                  'pos': tok.pos, 'ner': tok.feats,
                                  'lemma': tok.lemma, 'gov': gov_id - 1,
                                  'rel': rel}

                    if show_tok:
                        parsed_tok['text'] = tok.form
                    parsed_sent.append(parsed_tok)
            if parsed_sent:
                parsed_doc.sentences.append(parsed_sent)
        return parsed_doc
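
The parse method also refines bare 'conj' relations: when a conjunct's governor is also the governor of an 'and' or 'or' token, the relation is rewritten as 'conj_and' or 'conj_or'. Below is a standalone sketch of that relabeling rule, applied to hypothetical (form, governor id, relation) tuples rather than real parser output:

# Standalone sketch of the conj relabeling rule from parse(), on hypothetical tuples.
tokens = [  # "apples and oranges": governor ids are 1-based, as in the CoNLL entries
    ('apples', 0, 'root'),
    ('and', 1, 'cc'),
    ('oranges', 1, 'conj'),
]

conj_governors = {'and': set(), 'or': set()}
for form, gov_id, rel in tokens:
    if form.lower() in conj_governors:
        conj_governors[form.lower()].add(gov_id)
    if rel == 'conj':
        if gov_id in conj_governors['and']:
            rel += '_and'
        if gov_id in conj_governors['or']:
            rel += '_or'
    print(form, rel)  # 'oranges' is printed with relation 'conj_and'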