def get_tagged_sequence(sentence):
    """Build a '<tag>' sequence string for *sentence* and collect context words.

    Each token contributes one '<...>' chunk: special markers for known
    wh-question words, context verbs, and context nouns (looked up in the
    singleton tagger), otherwise the coarse POS explanation.

    Returns:
        (resp, context_nouns, context_verbs) where resp is a dict with the
        original 'sentence' and the concatenated 'sequence'.
    """
    clean_text = return_clean_text(sentence)
    tagger = sken_singleton.Singletons.get_instance().get_tagger()
    doc = sken_singleton.Singletons.get_instance().get_nlp(clean_text)
    logger.info("Made {} tokens for sentence={}".format(len(doc), sentence))

    def default_tag(token):
        # Fallback chunk: the coarse POS explanation, e.g. '<noun>'.
        # (Was duplicated four times in the original body.)
        return '<' + str(spacy.explain(token.pos_)) + '>'

    resp = {"sentence": sentence, "sequence": ''}
    context_verbs = []
    context_nouns = []
    for token in doc:
        if 'W' in token.tag_:
            if str(token) in tagger['wquestions']:
                resp['sequence'] += "<wquestion>"
            else:
                resp['sequence'] += default_tag(token)
        elif 'VERB' in token.pos_:
            if str(token) in tagger["context_verb"]:
                resp["sequence"] += '<context_verb>'
                context_verbs.append(str(token))
            else:
                resp['sequence'] += default_tag(token)
        elif 'NOUN' in token.pos_:
            if str(token) in tagger['context_noun']:
                resp['sequence'] += "<context_noun>"
                context_nouns.append(str(token))
            else:
                resp['sequence'] += default_tag(token)
        else:
            resp['sequence'] += default_tag(token)
    return resp, context_nouns, context_verbs
def _test(model):
    """Print the model's most-similar labels for a few dependency tags."""
    for dep in ("nsubj", "attr", "prep", "pobj", "punct", "det"):
        predictions = model.wv.most_similar(positive=[dep])
        print("Predictions for {}:".format(spacy.explain(dep)))
        for word, similarity in predictions:
            print("\t", spacy.explain(word), round(similarity, 3))
        print("")
def list_present_tense_heads(pronouns, pronoun_replacement):
    """Find present-tense head verbs of subject pronouns whose number must change.

    When the replacement is grammatically plural, singular present verbs (VBZ)
    heading an 'nsubj' pronoun are collected; for singular-'they' pronouns,
    plural present verbs (VBP) are collected instead.

    Returns a list of {'token': head, 'replacement_text': str} dicts.
    """
    heads = []
    they_them = ['they', 'them', 'their', 'theirs', 'themselves', 'themself']
    for pronoun in pronouns:
        token = pronoun['token']
        head = token.head
        print(token.text, head, head.tag_, spacy.explain(head.tag_))
        is_subject = token.dep_ == 'nsubj'
        if pronoun_replacement.gramatically_plural and is_subject and head.tag_ == 'VBZ':
            print(spacy.explain(head.tag_))
            heads.append({'token': head,
                          'replacement_text': head_replacement(head)})
        elif token.text.lower() in they_them and is_subject and head.tag_ == 'VBP':
            heads.append({'token': head,
                          'replacement_text': replace_plural_head(head)})
    return heads
def get_verb_tense_frequencies(lines):
    """Return relative frequencies of present/past/future verbs across *lines*.

    Each line is parsed with the module-level `nlp`. Non-modal verbs are
    classified by their tag explanation; "will" + base-form verb counts as
    future. Frequencies are normalized by the total verb count (all zeros
    when no verbs were seen).
    """
    freq = {'present': 0, 'future': 0, 'past': 0}
    verbs_no = 0
    for line in lines:
        doc = nlp(line)
        for i, token in enumerate(doc):
            if token.pos_ == 'VERB' and token.tag_ != 'MD':
                verbs_no += 1
                # spacy.explain may return None for unknown tags; avoid
                # `'present' in None` raising TypeError.
                detail = spacy.explain(token.tag_) or ''
                if 'present' in detail:
                    freq['present'] += 1
                elif 'past' in detail:
                    freq['past'] += 1
            elif (token.pos_ == 'VERB' and token.tag_ == 'MD'
                  and token.text.lower() == 'will'):
                # "will" followed by a base-form verb marks the future tense.
                if i < len(doc) - 1:
                    next_token = doc[i + 1]
                    # BUG FIX: the original compared next_token.text to 'VB';
                    # 'VB' is the fine-grained TAG, so check tag_ instead
                    # (the old test was essentially never true).
                    if next_token.tag_ == 'VB':
                        verbs_no += 1
                        freq['future'] += 1
    if verbs_no > 0:
        for key in freq:
            freq[key] = freq[key] / verbs_no
    return freq
def ner_spacy(text):
    """Run spaCy NER over *text* and return the entities as a DataFrame.

    Columns: Entities, Labels, Position_Start, Position_End. The frame is
    printed before being returned (debug aid).
    """
    print(spacy.__version__)
    # NOTE(review): assert is stripped under `python -O`; consider an
    # explicit raise if this check must always run.
    assert spacy.util.is_package("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    ents = list(doc.ents)
    df = pd.DataFrame({
        'Entities': ents,
        'Labels': [ent.label_ for ent in ents],
        'Position_Start': [ent.start_char for ent in ents],
        'Position_End': [ent.end_char for ent in ents],
    })
    print(df)
    # Removed a stray no-op call to spacy.explain('PERSON') whose result
    # was discarded.
    return df
def print_token_info(token):
    """Return (text, pos explanation, dep explanation, lemma) for *token*,
    substituting "" for anything missing.

    BUG FIX: spacy.explain() returns None for unknown labels; the original
    guarded token.pos_/token.dep_ (which are plain strings) but let the
    explain() result pass through as None. Guard the explain() result.
    """
    text = token.text if token.text is not None else ""
    pos = spacy.explain(token.pos_) or ""
    dep = spacy.explain(token.dep_) or ""
    lemma = token.lemma_ if token.lemma_ is not None else ""
    return text, pos, dep, lemma
def _test(model):
    """Print the model's most-similar labels for a few POS tags."""
    for pos_tag in ("VB", "POS", "WRB", "JJ", "NN", "."):
        predictions = model.wv.most_similar(positive=[pos_tag])
        print("Predictions for {}:".format(spacy.explain(pos_tag)))
        for word, similarity in predictions:
            print("\t", spacy.explain(word), round(similarity, 3))
        print("")
def pos_tagging_and_display(sentence):
    """POS-tag *sentence*, print each token, and stash adjectives in the
    module-level `adj` list."""
    # NOTE: loads the model on every call — presumably a demo script.
    nlp = spacy.load("en_core_web_sm")
    for token in nlp(sentence):
        print(token, token.tag_, token.pos_, spacy.explain(token.tag_))
        if spacy.explain(token.tag_) == 'adjective':
            adj.append(token)
def show_entsproduct(doc):
    """Print PRODUCT entities and append their description to the
    module-level `produse` list."""
    if not doc.ents:
        print('No named entities found.')
        return
    for ent in doc.ents:
        if ent.label_ == "PRODUCT":
            line = ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_))
            print(line)
            produse.append(line)
def test_de():
    """Print spaCy's explanation for every German (TIGER) POS tag and
    dependency label."""
    tags = ["$(", "$,", "$.", "ADJA", "ADJD", "ADV", "APPO", "APPR",
            "APPRART", "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON",
            "KOUI", "KOUS", "NE", "NN", "NNE", "PDAT", "PDS", "PIAT", "PIS",
            "PPER", "PPOSAT", "PPOSS", "PRELAT", "PRELS", "PRF", "PROAV",
            "PTKA", "PTKANT", "PTKNEG", "PTKVZ", "PTKZU", "PWAT", "PWAV",
            "PWS", "TRUNC", "VAFIN", "VAIMP", "VAINF", "VAPP", "VMFIN",
            "VMINF", "VMPP", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP",
            "XY"]
    deps = ["ROOT", "ac", "adc", "ag", "ams", "app", "avc", "cc", "cd",
            "cj", "cm", "cp", "cvc", "da", "dep", "dm", "ep", "ju", "mnr",
            "mo", "ng", "nk", "nmc", "oa", "oc", "og", "op", "par", "pd",
            "pg", "ph", "pm", "pnc", "punct", "rc", "re", "rs", "sb", "sbp",
            "svp", "uc", "vo"]
    # Tags first, then dependency labels — same order as two separate loops.
    for label in tags + deps:
        print("{} {}".format(label, spacy.explain(label)))
def pos_tagging_s():
    """Demo: tag a fixed sample sentence and print POS details per token."""
    nlp = spacy.load('en_core_web_sm')
    doc = nlp("I like to play football. I hated it in my childhood though")
    print(doc.text)
    second = doc[1]
    print(second.pos_)
    print(second.tag_)
    print(spacy.explain(second.tag_))
    for word in doc:
        # "Explanatation" typo kept: runtime output must stay identical.
        print("Word:", word.text, "\t", "POS Tag:", word.pos_, "\t",
              "Tag for Word:", word.tag_, "Explanatation:",
              spacy.explain(word.tag_), "\n")
def test_en():
    """Print spaCy's explanation for every English (PTB) POS tag and
    dependency label."""
    tags = ["$", "''", ",", "-LRB-", "-RRB-", ".", ":", "ADD", "AFX", "CC",
            "CD", "DT", "EX", "FW", "HYPH", "IN", "JJ", "JJR", "JJS", "LS",
            "MD", "NFP", "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP",
            "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB",
            "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB",
            "XX", "``"]
    deps = ["ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod",
            "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp",
            "compound", "conj", "csubj", "csubjpass", "dative", "dep",
            "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nmod",
            "npadvmod", "nsubj", "nsubjpass", "nummod", "oprd", "parataxis",
            "pcomp", "pobj", "poss", "preconj", "predet", "prep", "prt",
            "punct", "quantmod", "relcl", "xcomp"]
    # Tags first, then dependency labels — same order as two separate loops.
    for label in tags + deps:
        print("{} {}".format(label, spacy.explain(label)))
def question_likelihood(parsed_data, sub_component=False):
    """
    Determines likelihood that a parsed spacy sentence is a question.

    Keyword arguments:
    parsed_data -- Spacy text object (or list of tokens when recursing)
    sub_component -- If True will not attempt recursion (default = False)
    Return: float -- likelihood of question
    """
    # Robustness: an empty span/list cannot be a question (the original
    # raised IndexError on parsed_data[0]).
    if not len(parsed_data):
        return 0.0

    def _explain(tag):
        # spacy.explain returns None for unknown tags; the original would
        # then crash on .startswith — fall back to ''.
        return spacy.explain(tag) or u''

    starts_with_wh = _explain(parsed_data[0].tag_).startswith(u'wh-')
    is_question = 0.95 if starts_with_wh else 0.0

    # check if the sentence starts with "to be" or an auxiliary starter
    non_wh_question_starters = [
        u'be', u'do', u'could', u'should', u'may', u'can', u'shall',
        u'have', u'will', u'doe', u'would'
    ]
    if parsed_data[0].lemma_ in non_wh_question_starters:
        is_question = 0.95

    # if second word is not a verb then this is probably not a question
    if len(parsed_data) > 1 and starts_with_wh:
        is_question += -0.45 if parsed_data[1].pos_ != u'VERB' else 0

    # the case of 'to whom should I write this check?'
    # (length guard added: a lone "to" previously raised IndexError)
    if parsed_data[0].lemma_ == u'to' and len(parsed_data) > 1:
        if _explain(parsed_data[1].tag_).startswith(u'wh-'):
            is_question = 0.95

    # break down the comma separated components of a sentence and
    # analyze the components.
    # Todo: enhance this to break on multiple punct types.
    if not sub_component and is_question <= 0.5:
        component_tokens = [[]]
        i = 0
        for token in parsed_data:
            if token.pos_ == 'PUNCT' and token.orth_ == ',':
                i += 1
                component_tokens.append([])
                continue
            component_tokens[i].append(token)
        if len(component_tokens) > 1:
            is_question = max([
                question_likelihood(component, True)
                for component in component_tokens if component
            ])
    return is_question
def extract_debug_data(self, parsedData):
    """Return (lexicon, deps) debug tables, one row per token.

    lexicon rows: [orth, pos explanation, tag explanation, tag, lemma]
    deps rows:    [orth, dep, head orth, left children, right children]
    """
    lexicon, deps = [], []
    for token in parsedData:
        lexicon.append([token.orth_,
                        spacy.explain(token.pos_),
                        spacy.explain(token.tag_),
                        token.tag_,
                        token.lemma_])
        left_text = ' '.join(t.orth_ for t in token.lefts)
        right_text = ' '.join(t.orth_ for t in token.rights)
        deps.append([token.orth_, token.dep_, token.head.orth_,
                     left_text, right_text])
    return lexicon, deps
def show_ents(doc):
    """Print each named entity in *doc* as 'text-LABEL-explanation'."""
    if doc.ents:
        for ent in doc.ents:
            # BUG FIX: the original passed ent.label (the integer hash) to
            # spacy.explain, which expects the string label ent.label_
            # (as the sibling show_ents variants in this file do).
            print(ent.text + "-" + ent.label_ + "-" +
                  str(spacy.explain(ent.label_)))
    else:
        print("No entity found")
def print_examples(examples_for):
    """Print each tag with its spaCy explanation and highlighted examples.

    *examples_for* maps tag -> {doc: token indices}.
    """
    for tag, examples in examples_for.items():
        explanation = spacy.explain(tag)
        print('tag :: {}\nexplanation :: {}'.format(tag, explanation))
        print('----example-----')
        for doc, token_indices in examples.items():
            print(highlight.by_token(token_indices, doc))
        print('================\n\n')
def tense(self, verb):
    """Return 1 if *verb* is present tense (per self.present_tense) or a
    base form, else 0.

    BUG FIX: spacy.explain() returns None for unknown tags, and
    `'base form' in None` raises TypeError — fall back to '' instead.
    """
    explanation = spacy.explain(verb.tag_) or ''
    if str(verb.tag_) in self.present_tense or 'base form' in explanation:
        return 1
    return 0
def __init__(self, word, lemma, tag):
    """Build a token entry: resolve POS, CSS class, translation, and
    grammar metadata for *word*."""
    self.word = word
    self.root = lemma
    self.tag = tag
    self.pos = TAG_DICT[tag]

    # CSS class mirrors the broad part of speech.
    if self.pos in ['Noun', 'Verb', 'Adjective', 'Unknown']:
        self.css_cat = self.pos.lower()
    else:
        self.css_cat = 'other'

    if self.pos in ['Proper Noun', 'Other', 'Numeral']:
        # These categories can never carry a dictionary translation.
        self.found = False
        self.english = 'Not translatable'
        self.grammar_features = []
    else:
        # A translation may exist: look it up and record the result.
        self.found, translation, grammar = dictionary.lookup(word, lemma, self.pos)
        if self.found:
            self.english = self.gen_english_string(translation)
            self.grammar_features = self.list_features(grammar)
        else:
            self.english = 'No translation found'
            self.grammar_features = []

    # Human-readable tag explanation from spaCy.
    self.grammar_explanation = spacy.explain(tag)
def show_ents(doc):
    """Print every named entity in *doc* (German demo output)."""
    if not doc.ents:
        print('Keine benamten Entitäten gefunden.')
        return
    for ent in doc.ents:
        description = str(spacy.explain(ent.label_))
        print(ent.text + ' - ' + ent.label_ + ' - ' + description)
def _determine_aux(self, clause, verb, verb_tense, subj) -> str:
    '''
    Determines the auxillary verb to be used in the question by checking
    the verb tense, trying to find the aux verb in the sentence, or using
    defaults.

    Args:
        clause: spacy.Span
        verb: spacy.Token
        verb_tense: string
    Returns:
        The auxillary verb for the question.
    '''
    # The token immediately preceding the verb is already an auxiliary.
    previous = verb.nbor(-1)
    if previous.pos_ == 'AUX':
        return self._fg_aux(previous, verb_tense)

    # Otherwise scan the clause for any auxiliary token.
    for token in clause:
        if token.pos_ == 'AUX' or nlp.vocab[token.dep].text == 'aux':
            if verb == token:
                # the aux is the root verb itself
                return self._fg_aux(verb, verb_tense)
            return self._fg_aux(token, verb_tense)

    # No auxiliary in the sentence: fall back to a form of "do".
    if verb_tense == 'PAST_TENSE':
        return 'did'
    if verb_tense == 'PRESENT':
        # 'non-3rd person' in the tag explanation selects "do" over "does".
        if 'non-3rd' in spacy.explain(verb.tag_):
            return 'do'
        return 'does'
    print('err: could not determine aux verb')
def show_ents(doc):
    """Print all named entities in *doc*, one per line."""
    if not doc.ents:
        print('No Entities found')
        return
    for ent in doc.ents:
        label_info = str(spacy.explain(ent.label_))
        print(ent.text + '--' + ent.label_ + '--' + label_info)
def get_morphology(self, token):
    """Derive a Morphology object from the spaCy tag-map entry for *token*.

    Stop words and non-alphabetic tokens keep the default Morphology.
    NOTE(review): vocab.morphology.tag_map is a spaCy v2-era API — confirm
    the pinned spaCy version before upgrading.
    """
    morph = Morphology()
    morph_dict = self._nlp.vocab.morphology.tag_map[token.tag_]
    if not token.is_stop and token.is_alpha:
        if 'Tense_past' in morph_dict.keys():
            if 'VerbForm_part' in morph_dict.keys():
                morph.tense = PAST + PARTICIPLE
            else:
                morph.tense = PAST
            # BUG FIX: removed a stray unconditional `morph.tense = PAST`
            # here that clobbered the PAST+PARTICIPLE assignment above.
        if 'Tense_pres' in morph_dict.keys():
            if 'VerbForm_part' in morph_dict.keys():
                morph.tense = PRESENT + PARTICIPLE
            else:
                morph.tense = PRESENT
        if 'Person_three' in morph_dict.keys():
            morph.is_third_person = True
        if 'Number_plur' in morph_dict.keys():
            morph.is_plural = True
        if 'Number_sing' in morph_dict.keys():
            morph.is_singular = True
        if 'Degree_sup' in morph_dict.keys():
            morph.is_superlative = True
        if 'Degree_comp' in morph_dict.keys():
            morph.is_comparative = True
        # Debug trace of the analyzed token.
        print(token.text, token.lemma_, spacy.explain(token.tag_), morph_dict)
    return morph
def _find_nsubj_in_tokens(self, clause) -> 'spacy.Token' or None:
    '''
    Finds a valid nominal subject in the clause.

    Args:
        clause: spacy.Span
    Returns:
        A spacy.Token of the subject found in the clause or None
    '''
    subject_deps = {'csubj', 'csubjpass', 'nsubj', 'nsubjpass'}
    in_punct = False
    # ignore all tokens in parentheses, brackets, and curly braces
    for token in clause:
        if token.text in {'(', '[', '{'}:
            in_punct = True
        if token.text in {')', ']', '}'} and in_punct:
            in_punct = False
        if in_punct:
            continue
        # BUG FIX: spacy.explain() can return None for unknown tags, and
        # `'wh-determiner' not in None` raises TypeError — fall back to ''.
        tag_detail = spacy.explain(token.tag_) or ''
        # checks dependency validity of subject, excluding wh-determiners
        if (nlp.vocab[token.dep].text in subject_deps
                and 'wh-determiner' not in tag_detail):
            return token
    return None
def _determine_verb_tense(self, verb) -> str:
    '''
    Determines the tense of a verb.

    Args:
        verb: spacy.Token
    Returns:
        A string describing the verb's tense (None with an error message
        when the tense cannot be determined)
    '''
    # BUG FIX: spacy.explain() returns None for unknown tags; substring
    # checks against None raise TypeError — fall back to ''.
    verb_detail = spacy.explain(verb.tag_) or ''
    if 'past tense' in verb_detail:
        return 'PAST_TENSE'
    elif 'past principle' in verb_detail:
        # NOTE(review): no stock spaCy tag explanation contains
        # 'past principle' (likely meant 'past participle', handled below);
        # dead branch kept for behavioral parity.
        return 'PAST_PRIN'
    elif 'past participle' in verb_detail:
        return 'PAST_PART'
    elif 'present' in verb_detail:
        return 'PRESENT'
    elif 'future' in verb_detail:
        return 'FUTURE'
    elif 'base form' in verb_detail:
        return 'BASE'
    else:
        print('err: could not determine verb tense')
def analyze_email_main_topic(self, tokens):
    """Penalize the spam rating when one entity topic dominates the email.

    Counts named-entity labels in *tokens* (ignoring PERCENT/CARDINAL/DATE);
    if the most frequent label appears at least three times, lowers
    self.rating by 10, optionally records a statistic, and logs the
    dominant topic with its most common entity text.
    """
    email_labels = {}          # label -> occurrence count
    email_favorite_topics = {}  # label -> list of entity texts
    for ent in tokens.ents:
        # Numeric/date-like labels don't indicate a "topic".
        if ent.label_ not in ["PERCENT", "CARDINAL", "DATE"]:
            if ent.label_ not in email_labels.keys():
                email_labels[ent.label_] = 1
                email_favorite_topics[ent.label_] = [ent.text.strip()]
            else:
                email_labels[ent.label_] += 1
                email_favorite_topics[ent.label_].append(ent.text.strip())
    # Find the highest label count.
    most_common_label = 0
    for key, value in email_labels.items():
        if value > most_common_label:
            most_common_label = value
    for key, value in email_labels.items():
        # the email should mention the topic at least three times
        if value == most_common_label and value >= 3:
            self.rating -= 10
            if utils.settings.data["relay"]["save_statistics"]:
                # presumably statistic id 23 tracks this rule — TODO confirm
                update_statistics(23)
            # Most frequent entity text for the dominant label.
            favorite_topic = collections.Counter(
                email_favorite_topics[key])
            favorite_topic = favorite_topic.most_common(1)[0][0]
            # NOTE(review): spacy.explain(key) returns None for custom
            # labels, which would make .lower() raise — confirm only stock
            # labels reach here.
            logging.info(
                "[+] (salmonspam.py) - This email mostly talk about %s, especially %s"
                % (spacy.explain(key).lower(), favorite_topic))
            # Only the first label with the maximal count is reported.
            break
def number(self, word, roles):
    """Classify grammatical number: 1 = singular, 0 = plural, 2 = unknown.

    Conjoined/ambiguous words count as 2 (unknown). Proper nouns are
    resolved through *roles*: PERSON entries are singular, others plural.
    """
    # spacy.explain() returns None for unknown tags; use '' so the
    # substring checks below cannot raise TypeError.
    explain = spacy.explain(word.tag_) or ''
    if word.pos_ == "NOUN":
        if 'singular' in explain:
            return 1
        elif 'plural' in explain:
            return 0
        else:
            return 2
    elif word.pos_ == "PROPN":
        if word.text in roles:
            if 'PERSON' in roles[word.text]:
                return 1
            else:
                return 0
        # BUG FIX: the original fell off the end (returning None) for a
        # proper noun absent from *roles*; return 2 (unknown) like every
        # other indeterminate path.
        return 2
    elif word.pos_ == "PRON":
        if word.text.lower() in self.personal_sg:
            return 1
        elif word.text.lower() in self.personal_plu:
            return 0
        else:
            return 2
    else:
        return 2
def show_entsall(doc):
    """Print every named entity in *doc* with its label explanation."""
    if not doc.ents:
        print('No named entities found.')
        return
    for ent in doc.ents:
        explanation = str(spacy.explain(ent.label_))
        print(ent.text + ' - ' + ent.label_ + ' - ' + explanation)
def print_original_heading_and_new_tokens_list(doc, new_tokens_list):
    """Print a table pairing each token of *doc* with its replacement.

    Columns: original text, new token, coarse POS, fine tag, POS explanation.
    Assumes new_tokens_list has (at least) one entry per token of doc.
    """
    # Idiom fix: the original used a list comprehension purely for its
    # print side effects, building a throwaway list of None — use a loop.
    for i, token in enumerate(doc):
        print("{:<15}{:<15}{:<7}{:<7}{:<15}".format(
            token.text, str(new_tokens_list[i]), token.pos_, token.tag_,
            spacy.explain(token.pos_)))
def show_entsdatetime(doc):
    """Print entities labeled DATETIME.

    NOTE(review): stock spaCy models emit DATE/TIME, not DATETIME —
    presumably this targets a custom NER model; confirm.
    """
    if not doc.ents:
        print('No named entities found.')
        return
    for ent in doc.ents:
        if ent.label_ == "DATETIME":
            print(ent.text + ' - ' + ent.label_ + ' - ' +
                  str(spacy.explain(ent.label_)))
def show_entsquantity(doc):
    """Print entities labeled QUANTITY with their label explanation."""
    if not doc.ents:
        print('No named entities found.')
        return
    for ent in doc.ents:
        if ent.label_ == "QUANTITY":
            print(ent.text + ' - ' + ent.label_ + ' - ' +
                  str(spacy.explain(ent.label_)))