Example #1
 def __init__(self,
              wordvecs=False,
              sentiws_path='data/sentiws/',
              polarity_modifiers_path='data/polarity_modifiers.pickle'):
     """
     Parameters:
     - wordvecs: True or False; whether to load the de_core_news_md
                 (with word vectors) or the de_core_news_sm German spaCy model
     - sentiws_path: path to your SentiWS data
     - polarity_modifiers_path: path to your polarity
       modifier dict stored as a pickle
     """
     # load the German spaCy model
     if wordvecs:
         self.nlp = spacy.load('de_core_news_md')
     else:
         self.nlp = spacy.load('de_core_news_sm')
     # integrate the SentiWS corpus as a token attribute
     sentiws = spaCySentiWS(sentiws_path=sentiws_path)
     self.nlp.add_pipe(sentiws)
     self.doc = None
     self.modifiers = pickle.load(open(polarity_modifiers_path, 'rb'))
     if not Token.has_extension("modified"):
         Token.set_extension("modified", getter=self.modify_polarity)
     if not Token.has_extension("negated"):
         Token.set_extension("negated", getter=self.negate)
Example #2
 def set_extensions():
     if not Doc.has_extension('coref_chains'):
         Doc.set_extension('coref_chains', default=None)
     if not Token.has_extension('coref_chains'):
         Token.set_extension('coref_chains', default=None)
     if not Doc.has_extension('holmes_document_info'):
         Doc.set_extension('holmes_document_info', default=None)
     if not Token.has_extension('holmes'):
         Token.set_extension('holmes', default=None)
Example #3
    def fijar_extensiones(self):
        """Fija extensiones globalmente."""
        if not Token.has_extension("ok_token"):
            Token.set_extension("ok_token", default=True)

        if self.grupos:
            for grupo in self.grupos:
                if not Token.has_extension(grupo):
                    Token.set_extension(grupo, default=False)
Example #4
    def test_call_lexicon_component(self):
        """
        Test running a doc through the lexicon component and properly overlaying features from
        the lexicon.
        """
        lexicon_component = LexiconOverlayer(self.nlp, self.lexicon)
        self.assertFalse(Token.has_extension('feature_is_ADE_from_lexicon'))
        self.assertFalse(Token.has_extension('feature_is_DRUG_from_lexicon'))

        lexicon_component(self.doc)
        self.assertTrue(Token.has_extension('feature_is_ADE_from_lexicon'))
        self.assertTrue(Token.has_extension('feature_is_DRUG_from_lexicon'))
Example #5
 def test_call_lexicon_component(self):
     """
     Test running a doc through the lexicon component and properly overlaying features from
     the lexicon.
     :return:
     """
     lexicon_component = LexiconComponent(self.nlp, self.lexicon)
     self.assertIs(Token.has_extension('feature_is_ADE_from_lexicon'),
                   False)
     self.assertIs(Token.has_extension('feature_is_DRUG_from_lexicon'),
                   False)
     doc = lexicon_component(self.doc)
     self.assertIs(Token.has_extension('feature_is_ADE_from_lexicon'), True)
     self.assertIs(Token.has_extension('feature_is_DRUG_from_lexicon'),
                   True)
Example #6
    def _set_extensions(self):
        """Sets the default extensions if they do not exist yet."""
        for obj in Doc, Span, Token:
            if not obj.has_extension(self.ext_names["conll_str"]):
                obj.set_extension(self.ext_names["conll_str"], default=None)
            if not obj.has_extension(self.ext_names["conll"]):
                obj.set_extension(self.ext_names["conll"], default=None)

            if PD_AVAILABLE and not self.disable_pandas:
                if not obj.has_extension(self.ext_names["conll_pd"]):
                    obj.set_extension(self.ext_names["conll_pd"], default=None)

        # Add fields from the CoNLL-U format that are not available in spaCy.
        # However, ConllParser may fill these fields when converting a CoNLL string into a spaCy Doc
        if not Token.has_extension("conll_deps_graphs_field"):
            Token.set_extension("conll_deps_graphs_field", default="_")
        if not Token.has_extension("conll_misc_field"):
            Token.set_extension("conll_misc_field", default="_")
        if not Span.has_extension("conll_metadata"):
            Span.set_extension("conll_metadata", default=None)
Example #7
 def __init__(self, nlp: Language):
     """Initialise components"""
     if not Token.has_extension("dependency_distance"):
         Token.set_extension("dependency_distance",
                             getter=self.token_dependency)
     if not Span.has_extension("dependency_distance"):
         Span.set_extension("dependency_distance",
                            getter=self.span_dependency)
     if not Doc.has_extension("dependency_distance"):
         Doc.set_extension("dependency_distance",
                           getter=self.doc_dependency)
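
A hypothetical usage sketch for the component above, assuming it has been added to a loaded pipeline bound to nlp:

doc = nlp("The quick brown fox jumps over the lazy dog.")
# The getters registered in __init__ are evaluated lazily on access.
print(doc._.dependency_distance)       # document-level value
print(doc[0:4]._.dependency_distance)  # span-level value
print(doc[0]._.dependency_distance)    # token-level value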
Example #8
 def __init__(self, in_names, extension_name="is_name"):
     """names is an iterable of names where names consist of max 1 word (so no spaces)
     If there are names with spaces they will be removed from the set of names"""
     in_names_set = set(in_names)
     names = {
         AccentRemover.remove_accents(name.casefold())
         for name in in_names_set if len(name.split(" ")) == 1
     }
     self.names = names
     self.extension_name = extension_name
     if not Token.has_extension(extension_name):
         Token.set_extension(extension_name, default=False)
Example #9
def write_conllu(docs, file_):
    if not Token.has_extension("get_conllu_lines"):
        Token.set_extension("get_conllu_lines", method=get_token_conllu)
    if not Token.has_extension("begins_fused"):
        Token.set_extension("begins_fused", default=False)
    if not Token.has_extension("inside_fused"):
        Token.set_extension("inside_fused", default=False)

    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = []
        if doc.is_parsed:
            matches = merger(doc)
        spans = [doc[start:end + 1] for _, start, end in matches]
        seen_tokens = set()
        with doc.retokenize() as retokenizer:
            for span in spans:
                span_tokens = set(range(span.start, span.end))
                if not span_tokens.intersection(seen_tokens):
                    retokenizer.merge(span)
                    seen_tokens.update(span_tokens)

        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                if token.head.i > sent[-1].i or token.head.i < sent[0].i:
                    for word in doc[sent[0].i - 10:sent[0].i]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in sent:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in doc[sent[-1].i:sent[-1].i + 10]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    raise ValueError(
                        "Invalid parse: head outside sentence (%s)" %
                        token.text)
                file_.write(token._.get_conllu_lines(k) + "\n")
            file_.write("\n")
Example #10
    def __init__(self):

        # register Token attributes if they are not registered already
        from spacy.tokens import Token

        for attr_name in [
                "speaker", "start_time", "end_time", "confidence",
                "entity_linking", "addressee"
        ]:
            if not Token.has_extension(attr_name):
                Token.set_extension(attr_name, default=None)

        # register Span attributes if they are not registered already
        from spacy.tokens import Span

        if not Span.has_extension("speaker"):
            Span.set_extension("speaker", getter=self.span_speaker)

        if not Span.has_extension("start_time"):
            Span.set_extension("start_time", getter=self.span_start_time)

        if not Span.has_extension("end_time"):
            Span.set_extension("end_time", getter=self.span_end_time)

        if not Span.has_extension("confidence"):
            Span.set_extension("confidence",
                               getter=self.span_average_confidence)

        if not Span.has_extension("entity_linking"):
            Span.set_extension("entity_linking",
                               getter=self.span_entity_linking)

        if not Span.has_extension("addressee"):
            Span.set_extension("addressee", getter=self.span_addressee)

        # minimalist spaCy pipeline (used only for its tokenizer)
        self.tokenizer = spacy.load("en_core_web_sm",
                                    disable=["tagger", "parser", "ner"])

        # custom spaCy pipeline (that adds forced alignment attributes and ensures
        # that a new sentence starts at every speaker change)
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp.add_pipe(self.placeholder,
                          name="forced_alignment",
                          first=True)
        self.nlp.add_pipe(self.start_sentence_at_speaker_change,
                          after="forced_alignment")
Example #11
    def test_overlays_cuis(self):
        """Tests that the MetaMapOverlayer overlays CUIs correctly given a document that hasn't been metamapped"""
        doc = self.nlp('I took Tylenol and it gave me nausea and chest pain')

        metamap = MetaMap(metamap_path)
        metamap_component = MetaMapOverlayer(self.nlp, metamap)

        metamap_component(doc)
        self.assertTrue(Token.has_extension('feature_cui'))
        cuis = [token._.feature_cui for token in doc]

        # Test that at least one of the features is a CUI
        any_match = any(re.match(r'C\d+', c) for c in cuis)
        self.assertTrue(any_match)

        # Test that all features are a CUI or '-1'
        all_match = all(re.match(r'(C\d+)|(-1)', c) for c in cuis)
        self.assertTrue(all_match)
Example #12
    def __init__(self,
                 first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
                 last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):

        self.token_extension_name = self.TOKEN_EXTENSION_NAME
        self.span_extension_name = self.SPAN_EXTENSION_NAME
        self.doc_extension_name = self.DOC_EXTENSION_NAME
        self.first_name_extension_name = first_name_extension_name
        self.last_name_extension_name = last_name_extension_name

        if not Token.has_extension(self.token_extension_name):
            Token.set_extension(self.token_extension_name,
                                default=self.ANOT_NONE)
        if not Span.has_extension(self.span_extension_name):
            Span.set_extension(self.span_extension_name,
                               getter=self.is_full_name_getter)
        if not Doc.has_extension(self.doc_extension_name):
            Doc.set_extension(self.doc_extension_name, default=[])
Example #13
    def test_overlays_cuis(self):
        """Tests that the MetaMapOverlayer overlays CUIs correctly given a document that hasn't been metamapped"""
        sample_doc = sample_dataset.data_files[0].txt_path
        with open(sample_doc) as f:
            sample_text = f.read()

        doc = self.nlp(sample_text)

        metamap = MetaMap(metamap_path)
        metamap_component = MetaMapOverlayer(self.nlp, metamap)

        metamap_component(doc)
        self.assertTrue(Token.has_extension('feature_cui'))
        cuis = [token._.feature_cui for token in doc]

        # Test that at least one of the features is a CUI
        any_match = any(re.match(r'C\d+', c) for c in cuis)
        self.assertTrue(any_match)

        # Test that all features are a CUI or '-1'
        all_match = all(re.match(r'(C\d+)|(-1)', c) for c in cuis)
        self.assertTrue(all_match)
Example #14
    def __init__(self,
                 nlp,
                 keywords,
                 label,
                 tokentag,
                 doctag=None,
                 spantag=None):
        nlp.vocab.strings.add(label)
        self.label = nlp.vocab.strings[label]
        self._label_str = label
        self._token_tag = tokentag
        self._doctag = doctag
        self._spantag = spantag
        self._keywordtag = "is_keyword"
        self._labeltag = "label_"
        # Set up the PhraseMatcher; it can take Doc objects as patterns,
        # so even if the list of keywords is long, it's very efficient
        patterns = [nlp(key) for key in keywords]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(self._token_tag, None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension(self._token_tag, default=False)
        if not Token.has_extension(self._keywordtag):
            Token.set_extension(self._keywordtag, default=False)
            Token.set_extension(self._labeltag, default=None)
        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens has the token-level flag set to True.
        Doc.set_extension(self._doctag,
                          getter=lambda tokens: any(
                              [t._.get(self._token_tag) for t in tokens]))
        Span.set_extension(self._spantag,
                           getter=lambda tokens: any(
                               [t._.get(self._token_tag) for t in tokens]))
        if not Span.has_extension("dep_"):
            Span.set_extension("dep_", default="")
            Span.set_extension("head_", default=None)
Example #15
def tool_hashtags(text: list):
    # nlp = es_core_news_md.load()
    matcher = Matcher(nlp.vocab)
    matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
    # This raised an error because "is_hashtag" was already added to Token; Token.has_extension is used
    # to check whether it is already registered, otherwise it is added. See the documentation: https://spacy.io/api/token
    if not Token.has_extension("is_hashtag"):
        Token.set_extension("is_hashtag", default=False)
    within_hash = []
    with_hash = []
    # doc = nlp(" ".join(text))
    doc = nlp(text)
    matches = matcher(doc)
    hashtags = []
    for match_id, start, end in matches:
        if doc.vocab.strings[match_id] == "HASHTAG":
            hashtags.append(doc[start:end])
    with doc.retokenize() as retokenizer:
        for span in hashtags:
            # print(hashtags)
            retokenizer.merge(span)
            for token in span:
                if not token._.is_hashtag:
                    token._.is_hashtag = True
                #   print(token)
    for token in doc:
        # print(token.text, token._.is_hashtag)
        if not token._.is_hashtag:
            within_hash.append(token.text)
            with_hash.append(token.text)
        else:
            with_hash.append(token.text)
    # print(within_hash, with_hash, len(hashtags))
    vector = within_hash, with_hash, len(hashtags)
    # print(vector[1])

    return vector
Example #16
    def extract(self, text):
        # instantiate the parser
        nlp = spacy.load('ja_ginza')
        # instantiate the matcher
        matcher = Matcher(nlp.vocab)
        #TextRank
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)

        if not Token.has_extension("_5w1h"):
            Token.set_extension("_5w1h", default='None')

        if not Token.has_extension("_type"):
            Token.set_extension("_type", default=None)

        # 5W1H patterns
        Where_pattern1 = [{
            "POS": {
                "REGEX": "NOUN|PROPN|PRON"
            },
            "ENT_TYPE": {
                "REGEX": "Country|Province"
            }
        }]
        Where_pattern2 = [{"TEXT": {"REGEX": "Amazon"}}]
        Where_pattern3 = [{"TEXT": "中"}, {"TEXT": "に"}]

        When_pattern1 = [{
            "TEXT": {
                "REGEX": "今日|昨日|おととい|明日|先日|明後日|今回|前回|後ほど|その後"
            }
        }]
        When_pattern2 = [{
            "POS": "NUM"
        }, {
            "TEXT": {
                "REGEX": "時|日|月|年"
            },
            "LENGTH": 1
        }]
        When_pattern3 = [{"TEXT": {"REGEX": "今"}, "LENGTH": 1}]
        When_pattern4 = [{"TEXT": "この"}, {"TEXT": {"REGEX": "後|前"}}]

        Why_pattern1 = [{
            "POS": {
                "REGEX": "VERB|ADJ"
            },
            "OP": "+"
        }, {
            "POS": {
                "REGEX": "AUX"
            },
            "OP": "?"
        }, {
            "TEXT": "から"
        }]

        How_pattern1 = [{
            "POS": {
                "REGEX": "VERB|ADJ|AUX|PART"
            },
            "DEP": {
                "REGEX": "ROOT|punct"
            }
        }]
        How_pattern2 = [{
            "POS": {
                "REGEX": "VERB|ADJ"
            },
            "OP": "+"
        }, {
            "DEP": {
                "REGEX": "ROOT|cc"
            }
        }]
        How_pattern3 = [{
            "POS": {
                "REGEX": "VERB|ADJ|NOUN"
            },
            "OP": "+"
        }, {
            "POS": {
                "REGEX": "AUX|ADP"
            },
            "OP": "?"
        }, {
            "DEP": {
                "REGEX": "ROOT"
            }
        }]
        How_pattern4 = [{
            "POS": {
                "REGEX": "VERB|ADJ"
            },
            "OP": "+"
        }, {
            "OP": "?"
        }, {
            "POS": "PUNCT"
        }]
        How_pattern5 = [{
            "POS": {
                "REGEX": "VERB|ADJ|AUX"
            },
            "OP": "*"
        }, {
            "TEXT": "の"
        }, {
            "TEXT": "に"
        }]
        How_pattern6 = [{
            "POS": "VERB",
            "OP": "?"
        }, {
            "POS": "AUX",
            "OP": "+"
        }, {
            "TEXT": "が",
            "POS": "CCONJ"
        }, {
            "POS": "PUNCT",
            "OP": "?"
        }]
        How_pattern7 = [{
            "POS": "VERB"
        }, {
            "POS": "AUX"
        }, {
            "TEXT": "ところ",
            "POS": "NOUN"
        }]
        How_pattern8 = [{
            "POS": "VERB",
            "DEP": "advcl"
        }, {
            "TEXT": "し",
            "POS": "CCONJ"
        }]
        How_pattern9 = [{
            "POS": {
                "REGEX": "VERB|ADJ"
            }
        }, {
            "TEXT": "ん",
            "OP": "?"
        }, {
            "TEXT": "です",
            "OP": "?"
        }, {
            "LEMMA": "けれど"
        }, {
            "TEXT": "も",
            "OP": "?"
        }, {
            "TEXT": "、",
            "OP": "?"
        }]
        How_pattern10 = [{
            "POS": "AUX",
            "TEXT": {
                "NOT_IN": ["で"]
            }
        }, {
            "POS": {
                "REGEX": "PUNCT"
            }
        }]
        How_pattern11 = [{
            "POS": {
                "REGEX": "VERB|ADJ|AUX"
            },
            "OP": "*"
        }, {
            "TEXT": "けれど"
        }, {
            "TEXT": "も"
        }]
        How_pattern12 = [{
            "POS": "VERB",
            "OP": "?"
        }, {
            "POS": "AUX",
            "OP": "+"
        }, {
            "TEXT": "と"
        }, {
            "TEXT": "か",
            "OP": "?"
        }, {
            "POS": "PUNCT",
            "OP": "?"
        }]
        How_pattern13 = [{
            "POS": "AUX"
        }, {
            "TEXT": "が",
            "POS": {
                "REGEX": "CCONJ"
            }
        }]
        How_pattern14 = [{
            "POS": "NOUN"
        }, {
            "TEXT": "です"
        }, {
            "POS": {
                "REGEX": "PART"
            },
            "OP": "*"
        }]
        How_pattern15 = [{
            "POS": "NOUN"
        }, {
            "TEXT": "か"
        }, {
            "TEXT": "な"
        }, {
            "POS": {
                "REGEX": "PUNCT"
            }
        }]

        Who_pattern1 = [{
            "TEXT": "の",
            "OP": "?"
        }, {
            "POS": "NOUN",
            "DEP": "compound",
            "OP": "*"
        }, {
            "POS": {
                "REGEX": "NOUN|PRON|PROPN"
            },
            "DEP": {
                "REGEX": "iobj|obl|nsubj"
            },
            "TAG": {
                "NOT_IN": ["名詞-普通名詞-助数詞可能"]
            },
            "TEXT": {
                "NOT_IN": ["幾つ"]
            }
        }, {
            "TEXT": {
                "REGEX": "が|は|も"
            }
        }, {
            "TEXT": {
                "REGEX": "です|ね|、"
            },
            "OP": "*"
        }]
        Who_pattern2 = [{
            "DEP": {
                "REGEX": "amod|advmod|acl"
            },
            "OP": "+"
        }, {
            "TEXT": "ところ",
            "DEP": {
                "REGEX": "compound"
            }
        }]
        Who_pattern3 = [{
            "POS": {
                "REGEX": "NOUN|PRON|PROPN"
            },
            "DEP": {
                "REGEX": "iobj|obl|nsubj"
            }
        }, {
            "TEXT": {
                "REGEX": "に"
            }
        }, {
            "TEXT": {
                "REGEX": "は"
            }
        }]
        Who_pattern4 = [{
            "POS": {
                "REGEX": "NOUN|PRON|PROPN"
            },
            "DEP": {
                "REGEX": "obl|nmod|dep"
            },
            "TEXT": {
                "NOT_IN": ["幾つ"]
            },
            "TAG": {
                "NOT_IN": ["名詞-普通名詞-助数詞可能"]
            }
        }, {
            "TEXT": {
                "REGEX": "が|は|も|って"
            }
        }]
        Who_pattern5 = [{
            "TEXT": "こと",
            "DEP": "compound"
        }, {
            "TEXT": {
                "REGEX": "が|は|も"
            }
        }]
        Who_pattern6 = [{
            "POS": "NUM",
            "OP": "!"
        }, {
            "POS": {
                "REGEX": "NOUN|PRON|PROPN"
            },
            "DEP": {
                "REGEX": "iobj|obl|nsubj"
            },
            "TAG": "名詞-普通名詞-助数詞可能"
        }, {
            "TEXT": {
                "REGEX": "が|は|も"
            }
        }]
        Who_pattern7 = [{
            "POS": {
                "REGEX": "VERB|ADJ"
            },
            "OP": "+"
        }, {
            "POS": "NOUN",
            "OP": "?"
        }, {
            "TEXT": "の"
        }, {
            "TEXT": "が"
        }]
        Who_pattern8 = [{
            "TEXT": "と"
        }, {
            "TEXT": "いう"
        }, {
            "TEXT": "の"
        }, {
            "TEXT": "は"
        }]
        Who_pattern9 = [{
            "POS": "NOUN"
        }, {
            "TEXT": "に"
        }, {
            "TEXT": "おい"
        }, {
            "TEXT": "て"
        }, {
            "TEXT": "は"
        }]

        What_pattern1 = [{
            "POS": {
                "REGEX": "NOUN|PRON|PROPN"
            },
            "DEP": {
                "REGEX": "obl|obj|iobj"
            }
        }, {
            "TEXT": {
                "REGEX": "を"
            }
        }]
        What_pattern2 = [{
            "POS": {
                "REGEX": "SYM"
            },
            "DEP": {
                "REGEX": "dep"
            }
        }, {
            "TEXT": {
                "REGEX": "を"
            }
        }]

        Mod_pattern1 = [{
            "DEP": {
                "REGEX":
                "amod|advmod|nmod|case|obl|case|acl|aux|det|nsubj|dep|mark|compound|nummod|advcl|iobj|det|obj"
            }
        }]
        Mod_pattern2 = [{"TEXT": "いつ"}]

        Task_pattern1 = [{
            "TEXT": "たい"
        }, {
            "TEXT": "の"
        }, {
            "POS": {
                "REGEX": "PUNCT"
            },
            "OP": "*"
        }]
        Task_pattern2 = [{
            "TEXT": "ない"
        }, {
            "TEXT": "か"
        }, {
            "TEXT": "な"
        }, {
            "POS": {
                "REGEX": "PUNCT"
            },
            "OP": "*"
        }]
        Task_pattern3 = [{
            "POS": {
                "REGEX": "VERB|AUX"
            }
        }, {
            "TEXT": "ない"
        }, {
            "TEXT": "で"
        }, {
            "TEXT": "ね"
        }, {
            "POS": {
                "REGEX": "PUNCT"
            },
            "OP": "*"
        }]
        Task_pattern4 = [{
            "POS": "AUX"
        }, {
            "POS": "SCONJ"
        }, {
            "TEXT": "ください"
        }, {
            "POS": {
                "REGEX": "PUNCT"
            },
            "OP": "*"
        }]

        # Matcher callback functions
        # assign only the 5W1H label
        def add_label(matcher, doc, id, matches):
            l = list(matches[id])
            for t in doc[l[1]:l[2]]:
                if t._._5w1h == 'None' or t._._5w1h == "Mod":
                    t._._5w1h = nlp.vocab.strings[l[0]]

        def add_right(matcher, doc, id, matches):
            l = list(matches[id])
            tag = nlp.vocab.strings[l[0]]
            end = l[-1]

            if end != len(doc):
                while l[1] <= doc[end].head.i <= l[-1] - 1:

                    end = end + 1
                    if end == len(doc):
                        break

            l[-1] = end - 1
            if end < len(doc):
                if (re.search("ので|だから", doc[l[-1] - 2:l[-1]].text)
                        or re.search("ので、|だから、", doc[l[-1] - 3:l[-1]].text)
                        or re.search("ため", doc[l[-1] - 1:l[-1]].text)
                        or re.search("ため、", doc[l[-1] - 2:l[-1] - 1].text)
                    ) and doc[l[-1]].pos_ != "ADP":
                    tag = "Why"

            for t in doc[l[1]:end]:
                if t._._5w1h == 'None' or t._._5w1h == "Mod" or tag == "Why":
                    t._._5w1h = tag

            matches[id] = tuple(l)

        def add_right_left(matcher, doc, id, matches):
            l = list(matches[id])
            tag = nlp.vocab.strings[l[0]]
            end = l[-1]
            start = l[1]

            if end != len(doc):
                while l[1] <= doc[end].head.i <= l[-1] - 1:

                    end = end + 1
                    if end == len(doc):
                        break

            if start != 0:
                while doc[start].head.i == l[1]:
                    start = start - 1
                    if start == 0:
                        break

            l[1] = start
            l[-1] = end

            if end < len(doc):
                if (re.search("ので|だから", doc[l[-1] - 2:l[-1]].text)
                        or re.search("ので、|だから、", doc[l[-1] - 3:l[-1]].text)
                        or re.search("ため", doc[l[-1] - 1:l[-1]].text)
                        or re.search("ため、", doc[l[-1] - 2:l[-1]].text)
                    ) and doc[l[-1]].pos_ != "ADP":
                    tag = "Why"

            for t in doc[l[1]:l[2]]:
                if t._._5w1h == 'None' or t._._5w1h == "Mod" or tag == "Why":
                    t._._5w1h = tag

            matches[id] = tuple(l)

        def add_label_type(matcher, doc, id, matches):
            l = list(matches[id])
            for t in doc[l[1]:l[2]]:
                t._._type = nlp.vocab.strings[l[0]]

        # register the matchers
        matcher.add("When", add_right, When_pattern1, When_pattern2,
                    When_pattern3, When_pattern4)
        matcher.add("Where", add_right, Where_pattern1, Where_pattern2,
                    Where_pattern3)
        matcher.add("How", add_right, How_pattern1, How_pattern2, How_pattern3,
                    How_pattern4, How_pattern5, How_pattern6, How_pattern7,
                    How_pattern8, How_pattern9, How_pattern10, How_pattern11,
                    How_pattern12, How_pattern13, How_pattern14, How_pattern15)
        matcher.add("Who", add_label, Who_pattern1, Who_pattern2, Who_pattern3,
                    Who_pattern4, Who_pattern5, Who_pattern6, Who_pattern7,
                    Who_pattern8, Who_pattern9)
        matcher.add("What", add_right, What_pattern1, What_pattern2)
        matcher.add("Why", add_label, Why_pattern1)
        matcher.add("Mod", add_right_left, Mod_pattern1, Mod_pattern2)

        matcher.add("Task", add_label_type, Task_pattern1, Task_pattern2,
                    Task_pattern3, Task_pattern4)

        doc = nlp(text)
        text2 = [
            s.text for s in doc if not re.fullmatch("まあ|まぁ|ま|えー|あのー|あ", s.text)
        ]
        text2 = ''.join(text2)
        doc = nlp(text2)
        for sent in doc.sents:
            matches = matcher(doc)
            num = 0
            start = 0
            end = 0

            tmp_label = None

            # additional processing
            for token in doc:
                if (token._._5w1h == "Who"
                        or token._._5w1h == "What") and re.search(
                            "VERB|ADJ", doc[token.head.i].pos_
                        ) and doc[token.head.i].i > token.i:
                    tag2 = "How"
                    start = token.head.i

                    end = start + 1

                    while end < len(doc):
                        if doc[end].head.i != start:
                            break

                        end = end + 1

                    if (re.search("ので|だから", doc[end - 2:end - 1].text)
                            or re.search("ので、|だから、", doc[end - 3:end - 1].text)
                            or re.search("ため", doc[end - 1:end - 1].text)
                            or re.search("ため、", doc[end - 2:end - 1].text)
                        ) and doc[end - 1].pos_ != "ADP":
                        tag2 = "Why"

                    for t in doc[start:end]:
                        if not t._._5w1h or t._._5w1h == "Mod" or tag2 == "Why":
                            t._._5w1h = tag2

            for token in reversed(doc):

                if token._._5w1h != "Mod":
                    if tmp_label == "Who" and token._._5w1h == "What":
                        token._._5w1h = "Who"
                    tmp_label = token._._5w1h
                else:
                    token._._5w1h = tmp_label
        self.doc = doc
Example #17
 def set_extensions():
     if not Doc.has_extension('coref_chains'):
         Doc.set_extension('coref_chains', default=None)
     if not Token.has_extension('coref_chains'):
         Token.set_extension('coref_chains', default=None)
Example #18
 def __init__(self, extension_name="sin_accents"):
     self.extension_name = extension_name
     if not Token.has_extension(extension_name):
         Token.set_extension(extension_name, default=False)
Example #19
import spacy
from pdfminer.high_level import extract_text

from rich.progress import track
from spacy.tokens import Doc, Token, Span

from .console import console
from .parsers import pdfminer
from .parsers.base import BaseParser
from ._utils import _filter_doc_by_page, _get_number_of_pages


# Set up the spacy custom extensions.

if not Token.has_extension("page_number"):
    Token.set_extension("page_number", default=None)

if not Doc.has_extension("pdf_file_name"):
    Doc.set_extension("pdf_file_name", default=None)

if not Doc.has_extension("page"):
    Doc.set_extension("page", method=_filter_doc_by_page)

if not Doc.has_extension("first_page"):
    Doc.set_extension("first_page", getter=lambda doc: doc[0]._.page_number)

if not Doc.has_extension("last_page"):
    Doc.set_extension("last_page", getter=lambda doc: doc[-1]._.page_number)

if not Doc.has_extension("page_range"):
Example #20
    def parse_conll_text_as_spacy(
        self,
        text: str,
        ner_tag_pattern: str = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$",
        ner_map: Dict[str, str] = None,
    ) -> Doc:
        """Parses a given CoNLL-U string into a spaCy doc. Parsed sentence section must be separated by a new line (\n\n).
        Note that we do our best to retain as much information as possible but that not all CoNLL-U fields are
        supported in spaCy. We add a Token._.conll_misc_field extension to save CoNLL-U MISC field, and a
        Token._.conll_deps_graphs_field extension to save CoNLL-U DEPS field. The metadata (lines starting with #)
        is saved in Span._.conll_metadata of sentence Spans.

        This method has been adapted from the work by spaCy.
        See: https://github.com/explosion/spaCy/blob/a1c5b694be117ac92e21f9860309821ad6da06f7/spacy/cli/converters/conllu2json.py#L179

        Multi-word tokens and empty nodes are not supported.

        :param text: CoNLL-U formatted text
        :param ner_tag_pattern: Regex pattern for entity tag in the MISC field
        :param ner_map: Map old NER tag names to new ones, '' maps to O
        :return: a spacy Doc containing all the tokens and sentences from the CoNLL file including
         the custom CoNLL extensions
        """
        if not Token.has_extension("conll_misc_field"):
            Token.set_extension("conll_misc_field", default="_")
        if not Token.has_extension("conll_deps_graphs_field"):
            Token.set_extension("conll_deps_graphs_field", default="_")
        if not Span.has_extension("conll_metadata"):
            Span.set_extension("conll_metadata", default=None)

        docs = []
        for chunk in text.split("\n\n"):
            lines = [
                l for l in chunk.splitlines() if l and not l.startswith("#")
            ]
            words, spaces, tags, poses, morphs, lemmas, miscs = [], [], [], [], [], [], []
            heads, deps, deps_graphs = [], [], []
            for i in range(len(lines)):
                line = lines[i]
                parts = line.split("\t")

                if any(not p for p in parts):
                    raise ValueError(
                        "According to the CoNLL-U Format, fields cannot be empty. See"
                        " https://universaldependencies.org/format.html")

                id_, word, lemma, pos, tag, morph, head, dep, deps_graph, misc = parts

                if any(" " in f
                       for f in (id_, pos, tag, morph, head, dep, deps_graph)):
                    raise ValueError(
                        "According to the CoNLL-U Format, only FORM, LEMMA, and MISC fields can contain"
                        " spaces. See https://universaldependencies.org/format.html"
                    )

                if "." in id_ or "-" in id_:
                    raise NotImplementedError(
                        "Multi-word tokens and empty nodes are not supported in spacy_conll"
                    )

                words.append(word)

                if "SpaceAfter=No" in misc:
                    spaces.append(False)
                else:
                    spaces.append(True)

                id_ = int(id_) - 1
                lemmas.append(lemma)
                poses.append(pos)
                tags.append(pos if tag == "_" else tag)
                morphs.append(morph if morph != "_" else "")
                heads.append((int(head) - 1) if head not in ("0",
                                                             "_") else id_)
                deps.append("ROOT" if dep == "root" else dep)
                deps_graphs.append(deps_graph)
                miscs.append(misc)

            doc = Doc(
                self.nlp.vocab,
                words=words,
                spaces=spaces,
                tags=tags,
                pos=poses,
                morphs=morphs,
                lemmas=lemmas,
                heads=heads,
                deps=deps,
            )

            # Set custom Token extensions
            for i in range(len(doc)):
                doc[i]._.conll_misc_field = miscs[i]
                doc[i]._.conll_deps_graphs_field = deps_graphs[i]

            ents = get_entities(lines, ner_tag_pattern, ner_map)
            doc.ents = spans_from_biluo_tags(doc, ents)

            # The deprel relations ensure that this CoNLL chunk is one sentence
            # DEPREL therefore cannot be empty, or each word is considered a separate sentence
            if len(list(doc.sents)) != 1:
                raise ValueError(
                    "Your data is in an unexpected format. Make sure that it follows the CoNLL-U format"
                    " requirements. See https://universaldependencies.org/format.html. Particularly make"
                    " sure that the DEPREL field is filled in.")

            # Save the metadata in a custom sentence Span attribute so that the formatter can use it
            metadata = "\n".join(
                [l for l in chunk.splitlines() if l.startswith("#")])
            # We really only expect one sentence
            for sent in doc.sents:
                sent._.conll_metadata = f"{metadata}\n" if metadata else ""

            docs.append(doc)

        # Add CoNLL custom extensions
        return self.nlp.get_pipe("conll_formatter")(Doc.from_docs(docs))
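
A hedged usage sketch for this method, assuming it is called on an instance of spacy_conll's ConllParser (here named parser); the two-token CoNLL-U string is purely illustrative:

conll_str = (
    "# text = Hello world\n"
    "1\tHello\thello\tINTJ\tUH\t_\t2\tdiscourse\t_\t_\n"
    "2\tworld\tworld\tNOUN\tNN\t_\t0\troot\t_\tSpaceAfter=No\n"
)
doc = parser.parse_conll_text_as_spacy(conll_str)
for token in doc:
    # The MISC column is preserved in the custom extension described above.
    print(token.text, token.dep_, token._.conll_misc_field)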
Example #21
    def __call__(self, doc):
        """
        Runs a document through the metamap_annotator pipeline component. This overlays rich medical features by
        utilizing MetaMap output and aligning it with the passed spaCy Doc object. By medaCy conventions, each
        overlaid feature is available as a token extension starting with 'feature_'. This component overlays
        'feature_cui' and a separate boolean feature for each semantic type to detect, available under
        'feature_is_{type}'. The component was originally designed to increase recall on Drug entities, hence it
        overlays 'feature_is_orch' and 'feature_is_phsu' by default, where orch and phsu are the semantic types
        for organic chemicals and pharmacological substances respectively.
        :param doc: spaCy Doc object to run through pipeline
        :return: the same Doc object
        """
        logging.debug("Called MetaMap Component")

        # register all extensions
        if self.cuis:
            Token.set_extension('feature_cui', default="-1",
                                force=True)  #cui feature
        for label in self.semantic_type_labels:  # is_semantic type features
            Token.set_extension('feature_is_' + label,
                                default=False,
                                force=True)

        if not hasattr(doc._, 'file_name'):
            metamap_dict = self.metamap.map_text(str(doc))
        elif doc._.file_name is None or doc._.file_name == 'STRING_INPUT':
            metamap_dict = self.metamap.map_text(str(doc))
        elif os.path.isfile(doc._.file_name):
            # Check if pre-metamapped file exists at expected location
            txt_file_path = doc._.file_name
            metamapped_path = _get_metamapped_path(txt_file_path)
            if not os.path.isfile(metamapped_path):
                warnings.warn(
                    f"No metamapped file was found for '{txt_file_path}'; attempting to run MetaMap over document (results in slower runtime); ensure MetaMap is running"
                )
                metamap_dict = self.metamap.map_text(str(doc))
            else:
                # This branch of the decision tree is reached if the file is already metamapped
                metamap_dict = self.metamap.load(metamapped_path)

        # TODO refactor second part of if statement when implementing live model prediction
        if metamap_dict == '' or metamap_dict['metamap'] is None:
            if hasattr(doc._, 'file_name'):
                warnings.warn(
                    f"MetaMap produced no output for given file: {doc._.file_name}"
                )
            warnings.warn("MetaMap failed")
            return doc

        mapped_terms = self.metamap.extract_mapped_terms(
            metamap_dict)  # parse terms out of mappings dictionary

        spans = []  # for displaying NER output with displacy

        # Overlays semantic type presence if the given semantic type is set in metamap span.
        for semantic_type_label in self.semantic_type_labels:

            entity_name = semantic_type_label
            self.nlp.entity.add_label(entity_name)  # register entity label

            entity_tags = self.metamap.get_term_by_semantic_type(
                mapped_terms, include=[semantic_type_label])
            entity_annotations = self.metamap.mapped_terms_to_spacy_ann(
                entity_tags, semantic_type_label)

            with doc.retokenize() as retokenizer:
                for start, end, label in entity_annotations:
                    span = doc.char_span(
                        start, end, label=self.nlp.vocab.strings[entity_name])

                    #TODO spans are none when indices and token boundaries don't line up.
                    if span not in spans:
                        if span is not None:
                            logging.debug(
                                "Found from metamap: (label=%s,raw_text=\"%s\",location=(%i, %i))"
                                % (label, span.text, start, end))
                            spans.append(span)
                            for token in span:
                                token._.set('feature_is_' + label, True)
                            if self.merge_tokens:
                                try:
                                    retokenizer.merge(span)
                                except BaseException:
                                    continue
                        else:
                            logging.debug(
                                "Metamap span could not be overlayed due to tokenization mis-match: (%i, %i)"
                                % (start, end))

        # Overlays CUI of each term
        if Token.has_extension('feature_cui'):
            with doc.retokenize() as retokenizer:
                for term in mapped_terms:
                    cui = term['CandidateCUI']
                    start, end = self.metamap.get_span_by_term(term)[0]
                    span = doc.char_span(start, end)
                    if span is not None:
                        for token in span:
                            token._.set('feature_cui', cui)
                        if self.merge_tokens:
                            try:
                                retokenizer.merge(span)
                            except BaseException:
                                continue
        return doc
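
A brief, hedged sketch of inspecting the features overlaid by the component above, assuming component is an instance of this class (hypothetical name) and doc was produced by the same pipeline:

doc = component(doc)
if Token.has_extension('feature_cui'):
    for token in doc:
        # '-1' is the default set above when MetaMap produced no CUI for the token
        print(token.text, token._.feature_cui)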
Example #22
    def __call__(self, doc):
        """
        Runs a document through the metamap_annotator pipeline component. This overlays rich medical features by
        utilizing MetaMap output and aligning it with the passed spaCy Doc object. By medaCy conventions, each
        overlaid feature is available as a token extension starting with 'feature_'. This component overlays
        'feature_cui' and a separate boolean feature for each semantic type to detect, available under
        'feature_is_{type}'. The component was originally designed to increase recall on Drug entities, hence it
        overlays 'feature_is_orch' and 'feature_is_phsu' by default, where orch and phsu are the semantic types
        for organic chemicals and pharmacological substances respectively.
        :param doc: document to run through pipeline
        :return:
        """
        logging.debug("Called MetaMap Component")
        metamap = self.metamap
        nlp = self.nlp
        semantic_type_labels = self.semantic_type_labels

        #register all extensions
        if self.cuis:
            Token.set_extension('feature_cui', default="-1",
                                force=True)  #cui feature
        for semantic_type_label in semantic_type_labels:  #is_semantic type features
            Token.set_extension('feature_is_' + semantic_type_label,
                                default=False,
                                force=True)

        #check if pre-metamapped file has been assigned to the document
        if hasattr(doc._, 'metamapped_file'):
            metamap_dict = metamap.load(doc._.metamapped_file)
        else:
            if hasattr(doc._, 'file_name'):
                logging.debug("%s: Could not find metamap file for document." %
                              doc._.file_name)
            metamap_dict = metamap.map_text(
                doc.text)  #TODO metamap.map_text is broken currently

        if not hasattr(doc._, 'file_name'
                       ):  #TODO REMOVE when implementing live model prediction
            return doc

        # TODO refactor second part of if statement when implementing live model prediction
        if metamap_dict['metamap'] is None:
            if hasattr(doc._, 'metamapped_file'):
                warnings.warn(
                    "%s: This metamap file is invalid and cannot be parsed in MetaMapComponent: %s \n Ignore this warning if this is a unittest - all may be fine."
                    % (doc._.file_name, doc._.metamapped_file))
            else:
                warnings.warn(
                    "Metamapping text on the fly failed - aborting. Try to pre-metamap with DataLoader."
                )
            return doc

        mapped_terms = metamap.extract_mapped_terms(
            metamap_dict)  #parse terms out of mappings dictionary

        spans = []  #for displaying NER output with displacy

        #Overlays semantic type presence if the given semantic type is set in metamap span.
        for semantic_type_label in semantic_type_labels:

            entity_name = semantic_type_label
            nlp.entity.add_label(entity_name)  #register entity label

            entity_tags = metamap.get_term_by_semantic_type(
                mapped_terms, include=[semantic_type_label])
            entity_annotations = metamap.mapped_terms_to_spacy_ann(
                entity_tags, semantic_type_label)

            for start, end, label in [
                    entity_annotations['entities'][key]
                    for key in entity_annotations['entities'].keys()
            ]:
                span = doc.char_span(start,
                                     end,
                                     label=nlp.vocab.strings[entity_name])

                #TODO spans are none when indices and token boundaries don't line up.
                if span not in spans:
                    if span is not None:
                        logging.debug(
                            "Found from metamap: (label=%s,raw_text=\"%s\",location=(%i, %i))"
                            % (label, span.text, start, end))
                        spans.append(span)
                        for token in span:
                            token._.set('feature_is_' + label, True)
                    else:
                        logging.debug(
                            "Metamap span could not be overlayed due to tokenization mis-match: (%i, %i)"
                            % (start, end))

        #adds labels for displaying NER output with displacy.

        for span in spans:
            try:
                doc.ents = list(doc.ents) + [span]
            except ValueError as error:
                logging.warning(
                    str(error)
                )  #This gets called when the same token may match multiple semantic types

        #Overlays CUI of each term
        if Token.has_extension('feature_cui'):
            for term in mapped_terms:
                cui = term['CandidateCUI']
                start, end = metamap.get_span_by_term(term)[0]
                span = doc.char_span(start, end)
                if span is not None:
                    for token in span:
                        token._.set('feature_cui', cui)

        return doc
# The Language class is loaded with the English model 'en_core_web_sm'
nlp = spacy.load('en_core_web_sm')
# The input text string is converted to a Doc object
doc = nlp(
    "The French Revolution was a period of time in France when the people overthrew the monarchy and took control of the government."
)

# Define the extension attribute on the token level with the name 'context' and a default value of False
Token.set_extension('context', default=False, force=True)

# Try printing each token of the Doc object and the value stored by the extension attribute. All the values default to False
for d in doc:
    print(d.text, d._.context)

# The entity types of the previous, current, and next tokens are combined and stored via the 'set' method
for i, d in enumerate(doc):
    if i > 0 and (i < len(doc) - 1):
        meaning = '|' + doc[i - 1].ent_type_ + '-' + d.ent_type_ + '-' + doc[
            i + 1].ent_type_
        d._.set('context', meaning)

#Printing the tokens again to see the modified values
for d in doc:
    print(d.text, d._.context)

Token.has_extension('context')  #returns True
Token.remove_extension('context')  #removes the attribute
Token.has_extension('context')  #returns False
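
The same guarded pattern works for Doc- and Span-level extensions, as several of the examples above show. A minimal sketch with illustrative attribute names:

from spacy.tokens import Doc, Span

# Guarded registration is identical for Doc and Span extensions.
if not Doc.has_extension('summary'):
    Doc.set_extension('summary', default=None)
if not Span.has_extension('is_heading'):
    Span.set_extension('is_heading', default=False)

doc._.summary = 'A short description of the document.'
print(doc._.summary)
print(Doc.has_extension('summary'))  # returns True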
