Code Example #1
def get_tagged_sequence(sentence):
    clean_text = return_clean_text(sentence)
    tagger = sken_singleton.Singletons.get_instance().get_tagger()
    doc = sken_singleton.Singletons.get_instance().get_nlp(clean_text)
    logger.info("Made {} tokens for sentence={}".format(len(doc), sentence))
    resp = {"sentence": sentence, "sequence": ''}
    context_verbs = []
    context_nouns = []
    for token in doc:
        if 'W' in token.tag_:
            if str(token) in tagger['wquestions']:
                resp['sequence'] += "<wquestion>"
            else:
                resp['sequence'] += '<' + str(spacy.explain(token.pos_)) + '>'
        elif 'VERB' in token.pos_:
            if str(token) in tagger["context_verb"]:
                resp["sequence"] += '<context_verb>'
                context_verbs.append(str(token))
            else:
                resp['sequence'] += '<' + str(spacy.explain(token.pos_)) + '>'
        elif 'NOUN' in token.pos_:
            if str(token) in tagger['context_noun']:
                resp['sequence'] += "<context_noun>"
                context_nouns.append(str(token))
            else:
                resp['sequence'] += '<' + str(spacy.explain(token.pos_)) + '>'
        else:
            resp['sequence'] += '<' + str(spacy.explain(token.pos_)) + '>'
    return resp, context_nouns, context_verbs
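For reference, spacy.explain simply looks up a human-readable gloss for a label string, and it works the same way for coarse POS tags, fine-grained tags, dependency labels, and entity labels; it returns None for anything it does not recognize. A minimal sketch (glosses as found in recent spaCy releases):

import spacy

print(spacy.explain("VERB"))    # verb
print(spacy.explain("VBZ"))     # verb, 3rd person singular present
print(spacy.explain("nsubj"))   # nominal subject
print(spacy.explain("PERSON"))  # People, including fictional
print(spacy.explain("no-such-label"))  # None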
Code Example #2
def _test(model):
    for dep in ["nsubj", "attr", "prep", "pobj", "punct", "det"]:
        predicted = model.wv.most_similar(positive=[dep])
        print("Predictions for {}:".format(spacy.explain(dep)))
        for (w, sim) in predicted:
            print("\t", spacy.explain(w), round(sim, 3))
        print("")
Code Example #3
File: routes.py  Project: iris-lux/pronoun-correct
def list_present_tense_heads(pronouns, pronoun_replacement):
    present_tense_heads = []
    they_them = ['they', 'them', 'their', 'theirs', 'themselves', 'themself']
    for pronoun in pronouns:
        print(pronoun['token'].text, pronoun['token'].head,
              pronoun['token'].head.tag_,
              spacy.explain(pronoun['token'].head.tag_))
        if (pronoun_replacement.gramatically_plural
                and pronoun['token'].dep_ == 'nsubj'
                and pronoun['token'].head.tag_ == 'VBZ'):
            print(spacy.explain(pronoun['token'].head.tag_))
            present_tense_heads.append({
                'token':
                pronoun['token'].head,
                'replacement_text':
                head_replacement(pronoun['token'].head)
            })
        elif ((pronoun['token'].text.lower() in they_them)
              and pronoun['token'].dep_ == 'nsubj'
              and pronoun['token'].head.tag_ == 'VBP'):
            present_tense_heads.append({
                'token':
                pronoun['token'].head,
                'replacement_text':
                replace_plural_head(pronoun['token'].head)
            })

    return present_tense_heads
Code Example #4
def get_verb_tense_frequencies(lines):
    freq = dict()
    freq['present'] = 0
    freq['future'] = 0
    freq['past'] = 0
    verbs_no = 0

    for line in lines:
        doc = nlp(line)
        for i in range(len(doc)):
            token = doc[i]
            if token.pos_ == 'VERB' and token.tag_ != 'MD':
                verbs_no += 1
                if 'present' in spacy.explain(token.tag_):
                    freq['present'] += 1
                elif 'past' in spacy.explain(token.tag_):
                    freq['past'] += 1
            elif (token.pos_ == 'VERB' and token.tag_ == 'MD'
                  and token.text.lower() == 'will'):
                # "will" followed by a base-form verb signals future tense
                if i < len(doc) - 1:
                    next_token = doc[i + 1]
                    if next_token.tag_ == 'VB':
                        verbs_no += 1
                        freq['future'] += 1

    if verbs_no > 0:
        for key, value in freq.items():
            freq[key] = value / verbs_no

    return freq
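Because spacy.explain returns None for strings it does not know, substring checks like 'present' in spacy.explain(token.tag_) raise TypeError on an unrecognized tag. A minimal defensive variant of the tense check, assuming the same nlp pipeline as above:

for token in nlp("She walks while they walked."):
    explanation = spacy.explain(token.tag_) or ""  # treat a missing gloss as empty
    if token.pos_ == 'VERB' and 'present' in explanation:
        print(token.text, '-> present tense')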
Code Example #5
def ner_spacy(text):
    print(spacy.__version__)
    assert spacy.util.is_package("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

    doc = nlp(text)
    entities = []
    labels = []
    position_start = []
    position_end = []

    for ent in doc.ents:
        entities.append(ent)
        labels.append(ent.label_)
        position_start.append(ent.start_char)
        position_end.append(ent.end_char)

    df = pd.DataFrame({
        'Entities': entities,
        'Labels': labels,
        'Position_Start': position_start,
        'Position_End': position_end
    })
    print(df)
    print(spacy.explain('PERSON'))  # e.g. "People, including fictional"
    return df
Code Example #6
def print_token_info(token):
    text = token.text if token.text is not None else ""
    pos = spacy.explain(token.pos_) if token.pos_ is not None else ""
    dep = spacy.explain(token.dep_) if token.dep_ is not None else ""
    lemma = token.lemma_ if token.lemma_ is not None else ""

    return text, pos, dep, lemma
Code Example #7
def _test(model):
    for pos in ["VB", "POS", "WRB", "JJ", "NN", "."]:
        predicted = model.wv.most_similar(positive=[pos])
        print("Predictions for {}:".format(spacy.explain(pos)))
        for (w, sim) in predicted:
            print("\t", spacy.explain(w), round(sim, 3))
        print("")
Code Example #8
def pos_tagging_and_display(sentence):

    nlp = spacy.load("en_core_web_sm")  # full pipeline, not just a tokenizer
    token_list = nlp(sentence)
    adj = []  # collect adjective tokens
    for token in token_list:
        print(token, token.tag_, token.pos_, spacy.explain(token.tag_))
        if spacy.explain(token.tag_) == 'adjective':
            adj.append(token)
    return adj
Code Example #9
def show_entsproduct(doc):
    if doc.ents:
        for ent in doc.ents:
            if (ent.label_ == "PRODUCT"):
                print(ent.text + ' - ' + ent.label_ + ' - ' +
                      str(spacy.explain(ent.label_)))
                produse.append(ent.text + ' - ' + ent.label_ + ' - ' +
                               str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')
Code Example #10
def test_de():
  TAG = ["$(", "$,", "$.", "ADJA", "ADJD", "ADV", "APPO", "APPR", "APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", "KOUI", "KOUS", "NE", "NN", "NNE", "PDAT", "PDS", "PIAT", "PIS", "PPER", "PPOSAT", "PPOSS", "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", "PWAT", "PWAV", "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP", "XY"]
  for t in TAG:
    e = spacy.explain(t)
    print("{} {}".format(t, e))

  DEP = ["ROOT", "ac", "adc", "ag", "ams", "app", "avc", "cc", "cd", "cj", "cm", "cp", "cvc", "da", "dep", "dm", "ep", "ju", "mnr", "mo", "ng", "nk", "nmc", "oa", "oc", "og", "op", "par", "pd", "pg", "ph", "pm", "pnc", "punct", "rc", "re", "rs", "sb", "sbp", "svp", "uc", "vo"]
  for d in DEP:
    e = spacy.explain(d)
    print("{} {}".format(d, e))
Code Example #11
def pos_tagging_s():
    sp = spacy.load('en_core_web_sm')
    sen = sp("I like to play football. I hated it in my childhood though")
    print(sen.text)
    print(sen[1].pos_)
    print(sen[1].tag_)
    print(spacy.explain(sen[1].tag_))
    for word in sen:
        print("Word:", word.text, "\t", "POS Tag:", word.pos_, "\t",
              "Tag for Word:", word.tag_, "Explanatation:",
              spacy.explain(word.tag_), "\n")
Code Example #12
def test_en():
  TAG = ["$", "''", ",", "-LRB-", "-RRB-", ".", ":", "ADD", "AFX", "CC", "CD", "DT", "EX", "FW", "HYPH", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NFP", "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB", "XX", "``"]

  for t in TAG:
    e = spacy.explain(t)
    print("{} {}".format(t, e))

  DEP = ["ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nmod", "npadvmod", "nsubj", "nsubjpass", "nummod", "oprd", "parataxis", "pcomp", "pobj", "poss", "preconj", "predet", "prep", "prt", "punct", "quantmod", "relcl", "xcomp"]
  for d in DEP:
    e = spacy.explain(d)
    print("{} {}".format(d, e))
Code Example #13
def question_likelihood(parsed_data, sub_component=False):
    """
    Determines likelihood that a parsed spacy sentence is a question.

    Keyword arguments:
    parsed_data -- Spacy text object
    sub_component -- If True will not attempt recursion (default = False)

    Return:
    float -- likelihood of question
    """
    starts_with_wh = spacy.explain(parsed_data[0].tag_).startswith(u'wh-')
    is_question = 0.95 if starts_with_wh else 0.0

    # check if the sentence starts with to be or has
    non_wh_question_starters = [
        u'be', u'do', u'could', u'should', u'may', u'can', u'shall', u'have',
        u'will', u'doe', u'would'
    ]
    if parsed_data[0].lemma_ in non_wh_question_starters:
        is_question = 0.95

    # if second word is not a verb then this is probably not a question
    if len(parsed_data) > 1 and starts_with_wh:
        is_question += -0.45 if parsed_data[1].pos_ != u'VERB' else 0

    # the case of 'to whom should I write this check?'
    if parsed_data[0].lemma_ == u'to':
        if spacy.explain(parsed_data[1].tag_).startswith(u'wh-'):
            is_question = 0.95

    # break down the comma-separated components of a sentence
    # analyze the components
    # Todo: enhance this to break on multiple punct types.
    if not sub_component and is_question <= 0.5:
        component_tokens = [[]]
        i = 0
        for token in parsed_data:
            if token.pos_ == 'PUNCT' and token.orth_ == ',':
                i += 1
                component_tokens.append([])
                continue
            component_tokens[i].append(token)

        if len(component_tokens) > 1:
            is_question = max([
                question_likelihood(component, True)
                for component in component_tokens if component
            ])

    return is_question
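A possible usage sketch for question_likelihood, assuming an English pipeline is loaded (the exact scores depend on the tagger's output):

import spacy

nlp = spacy.load("en_core_web_sm")
print(question_likelihood(nlp("Where is the train station?")))   # starts with a wh-adverb
print(question_likelihood(nlp("The train station is closed.")))  # declarative sentence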
Code Example #14
 def extract_debug_data(self, parsedData):
     lexicon = []
     deps = []
     for token in parsedData:
         lexicon.append([
             token.orth_,
             spacy.explain(token.pos_),
             spacy.explain(token.tag_), token.tag_, token.lemma_
         ])
         deps.append([
             token.orth_, token.dep_, token.head.orth_,
             ' '.join([t.orth_ for t in token.lefts]),
             ' '.join([t.orth_ for t in token.rights])
         ])
     return lexicon, deps
Code Example #15
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + "-" + ent.label_ + "-" +
                  str(spacy.explain(ent.label)))
    else:
        print("No entity found")
Code Example #16
def print_examples(examples_for):
    for tag, doc_and_index in examples_for.items():
        print('tag :: {}\nexplanation :: {}'.format(tag, spacy.explain(tag)))
        print('----example-----')
        for doc, indices in doc_and_index.items():
            print(highlight.by_token(indices, doc))
        print('================\n\n')
Code Example #17
 def tense(self, verb):
     """ Determines whether verb is present tense or not
     """
     if str(verb.tag_) in self.present_tense or 'base form' in spacy.explain(verb.tag_):
         return 1
     else:
         return 0
Code Example #18
    def __init__(self, word, lemma, tag):
        self.word = word
        self.root = lemma
        self.tag = tag
        self.pos = TAG_DICT[tag]
        # set the css class
        if self.pos in ['Noun', 'Verb', 'Adjective', 'Unknown']:
            self.css_cat = self.pos.lower()
        else:
            self.css_cat = 'other'
        # if there can be a translation, look it up and save it
        if self.pos not in ['Proper Noun', 'Other', 'Numeral']:
            self.found, translation, grammar = dictionary.lookup(word, lemma, self.pos)
            if self.found:
                self.english = self.gen_english_string(translation)
                self.grammar_features = self.list_features(grammar)
            else:
                self.english = 'No translation found'
                self.grammar_features = []

        # if it is not possible for the word to have a translation, don't bother.
        else:
            self.found = False
            self.english = 'Not translatable'
            self.grammar_features = []

        # get a tag explanation from spaCy
        self.grammar_explanation = spacy.explain(tag)
Code Example #19
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' +
                  str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')
Code Example #20
    def _determine_aux(self, clause, verb, verb_tense, subj) -> str:
        '''
        Determines the auxiliary verb to be used in the question by checking the verb tense,
        trying to find the aux verb in the sentence, or using defaults.

        Args:
            clause: spacy.Span
            verb: spacy.Token
            verb_tense: string
            subj: spacy.Token

        Returns:
            The auxiliary verb for the question.
        '''
        # verb is preceded by the auxiliary verb
        if verb.nbor(-1).pos_ == 'AUX':
            return self._fg_aux(verb.nbor(-1), verb_tense)

        # look for the auxiliary verb
        for token in clause:
            if token.pos_ == 'AUX' or nlp.vocab[token.dep].text == 'aux':
                if verb == token:  # aux is root verb
                    return self._fg_aux(verb, verb_tense)
                return self._fg_aux(token, verb_tense)

        # if no auxiliary verb could be found in the sentence, use default aux verb (do)
        if verb_tense == 'PAST_TENSE':
            return 'did'
        elif verb_tense == 'PRESENT':
            if 'non-3rd' in spacy.explain(verb.tag_):  # check form of verb
                return 'do'
            return 'does'
        else:
            print('err: could not determine aux verb')
Code Example #21
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + '--' + ent.label_ + '--' +
                  str(spacy.explain(ent.label_)))
    else:
        print('No Entities found')
Code Example #22
 def get_morphology(self, token):
     morph = Morphology()
     morph_dict = self._nlp.vocab.morphology.tag_map[token.tag_]
     if not token.is_stop and token.is_alpha:
         if 'Tense_past' in morph_dict.keys():
             if 'VerbForm_part' in morph_dict.keys():
                 morph.tense = PAST+PARTICIPLE
             else:
                 morph.tense = PAST
         if 'Tense_pres' in morph_dict.keys():
             if 'VerbForm_part' in morph_dict.keys():
                 morph.tense = PRESENT+PARTICIPLE
             else:
                 morph.tense = PRESENT
         if 'Person_three' in morph_dict.keys():
             morph.is_third_person = True
         if 'Number_plur' in morph_dict.keys():
             morph.is_plural = True
         if 'Number_sing' in morph_dict.keys():
             morph.is_singular = True
         if 'Degree_sup' in morph_dict.keys():
             morph.is_superlative =  True
         if 'Degree_comp' in morph_dict.keys():
             morph.is_comparative = True
         print(token.text, token.lemma_ , spacy.explain(token.tag_), morph_dict)
     return morph
Code Example #23
    def _find_nsubj_in_tokens(self, clause) -> 'spacy.Token or None':
        '''
        Finds a valid nominal subject in the clause.
        
        Args:
            clause: spacy.Span
            
        Returns:
            A spacy.Token of the subject found in the clause or None
        '''
        in_punct = False  # ignore all tokens in parentheses, brackets, and curly braces
        for token in clause:
            if token.text in {'(', '[', '{'}:
                in_punct = True
            if token.text in {')', ']', '}'} and in_punct:
                in_punct = False
            if in_punct:
                continue

            # checks for validity of dependency of subject, and whether it's a wh-determiner
            if nlp.vocab[token.dep].text in {
                    'csubj', 'csubjpass', 'nsubj', 'nsubjpass'
            } and 'wh-determiner' not in spacy.explain(token.tag_):
                return token

        return None
Code Example #24
 def _determine_verb_tense(self, verb) -> str:
     '''
     Determines the tense of a verb.
     
     Args:
         verb: spacy.Token
     
     Returns:
         A string describing the verb's tense
     '''
     verb_detail = spacy.explain(verb.tag_)
     if 'past tense' in verb_detail:
         return 'PAST_TENSE'
     elif 'past participle' in verb_detail:
         return 'PAST_PART'
     elif 'present' in verb_detail:
         return 'PRESENT'
     elif 'future' in verb_detail:
         return 'FUTURE'
     elif 'base form' in verb_detail:
         return 'BASE'
     else:
         print('err: could not determine verb tense')
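In spaCy v3 the same information is exposed as structured morphology on the token, which avoids matching substrings of explanation text; a minimal sketch of an equivalent check (v3 API, not taken from the source project):

def determine_verb_tense_v3(verb):
    # spaCy v3: read features from token.morph instead of explanation strings.
    # Feature values come back as lists and are empty when the feature is absent.
    tense = verb.morph.get("Tense")          # e.g. ["Past"] or ["Pres"]
    verb_form = verb.morph.get("VerbForm")   # e.g. ["Part"] for participles
    if "Past" in tense and "Part" in verb_form:
        return 'PAST_PART'
    if "Past" in tense:
        return 'PAST_TENSE'
    if "Pres" in tense:
        return 'PRESENT'
    if "Inf" in verb_form:
        return 'BASE'
    return None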
Code Example #25
File: salmonspam.py  Project: schlupov/hermes
 def analyze_email_main_topic(self, tokens):
     email_labels = {}
     email_favorite_topics = {}
     for ent in tokens.ents:
         if ent.label_ not in ["PERCENT", "CARDINAL", "DATE"]:
             if ent.label_ not in email_labels.keys():
                 email_labels[ent.label_] = 1
                 email_favorite_topics[ent.label_] = [ent.text.strip()]
             else:
                 email_labels[ent.label_] += 1
                 email_favorite_topics[ent.label_].append(ent.text.strip())
     most_common_label = 0
     for key, value in email_labels.items():
         if value > most_common_label:
             most_common_label = value
     for key, value in email_labels.items():
         # the email should mention the topic at least three times
         if value == most_common_label and value >= 3:
             self.rating -= 10
             if utils.settings.data["relay"]["save_statistics"]:
                 update_statistics(23)
             favorite_topic = collections.Counter(
                 email_favorite_topics[key])
             favorite_topic = favorite_topic.most_common(1)[0][0]
             logging.info(
                 "[+] (salmonspam.py) - This email mostly talk about %s, especially %s"
                 % (spacy.explain(key).lower(), favorite_topic))
             break
Code Example #26
    def number(self, word, roles):
        """ Determines whether noun is singular or plural (conjoined counts as plural)
        """
        explain = spacy.explain(word.tag_)

        if word.pos_ == "NOUN":
            if 'singular' in explain:
                return 1
            elif 'plural' in explain:
                return 0
            else:
                return 2

        elif word.pos_ == "PROPN":
            if word.text in roles:
                if 'PERSON' in roles[word.text]:
                    return 1
                else:
                    return 0

        elif word.pos_ == "PRON":
            if word.text.lower() in self.personal_sg:
                return 1
            elif word.text.lower() in self.personal_plu:
                return 0
            else:
                return 2
        else:
            return 2
Code Example #27
def show_entsall(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' +
                  str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')
Code Example #28
def print_original_heading_and_new_tokens_list(doc, new_tokens_list):
    for i, token in enumerate(doc):
        print("{:<15}{:<15}{:<7}{:<7}{:<15}".format(token.text,
                                                    str(new_tokens_list[i]),
                                                    token.pos_, token.tag_,
                                                    spacy.explain(token.pos_)))
Code Example #29
def show_entsdatetime(doc):
    if doc.ents:
        for ent in doc.ents:
            if (ent.label_ == "DATETIME"):
                print(ent.text + ' - ' + ent.label_ + ' - ' +
                      str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')
Code Example #30
def show_entsquantity(doc):
    if doc.ents:
        for ent in doc.ents:
            if (ent.label_ == "QUANTITY"):
                print(ent.text + ' - ' + ent.label_ + ' - ' +
                      str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')
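Examples #9, #15, #19, #21, #27, #29, and #30 all repeat the same entity-printing loop and differ only in the label filter; a single parameterized helper (a sketch, not taken from any of the source projects) could cover them all:

def show_ents_filtered(doc, label=None):
    # Print every entity, or only those carrying the requested label.
    ents = [ent for ent in doc.ents
            if label is None or ent.label_ == label]
    if not ents:
        print('No named entities found.')
        return
    for ent in ents:
        print(ent.text + ' - ' + ent.label_ + ' - ' +
              str(spacy.explain(ent.label_)))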