from spacy.tokens import Token
from spacy_hunspell import spaCyHunSpell


def spell_check(query, indices_to_ignore):
    # temporarily add the Hunspell component, which registers the
    # 'hunspell_spell' and 'hunspell_suggest' token extensions
    hunspell = spaCyHunSpell(nlp, 'linux')
    nlp.add_pipe(hunspell, name='hunspell')
    doc = nlp(query)
    original_data = [token.text for token in doc]
    for i, token in enumerate(doc):
        if i in indices_to_ignore:
            continue
        if not token._.hunspell_spell and token._.hunspell_suggest:
            # replace the misspelled token with the suggestion most similar to it
            suggestions = token._.hunspell_suggest
            suggestions_vocab = [nlp.vocab[s] for s in suggestions]
            result = [token.similarity(lex) for lex in suggestions_vocab]
            original_data[i] = suggestions[result.index(max(result))]
    response = ' '.join(original_data)
    # clean up: remove the pipe and its extensions so the function is re-entrant
    nlp.remove_pipe('hunspell')
    Token.remove_extension('hunspell_spell')
    Token.remove_extension('hunspell_suggest')
    return response
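# A minimal usage sketch for spell_check above. Assumptions: a module-level
# `nlp` loaded from a model with vectors (e.g. 'en_core_web_lg') for the
# similarity ranking, and the spacy_hunspell package with its Linux
# dictionaries installed; the query string and the output are illustrative.
import spacy

nlp = spacy.load('en_core_web_lg')
fixed = spell_check('The weaher is nice today', indices_to_ignore=[])
print(fixed)  # e.g. "The weather is nice today"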
from spacy.matcher import Matcher
from spacy.tokens import Token


def manage_interruptions(doc, party, exception):
    matcher = Matcher(nlp.vocab)
    # 'regular' speaker pattern, e.g. "Muller (spd):"
    party_parenth = [{'TEXT': '('}, {'LOWER': {'IN': party}}, {'TEXT': ')'}]
    matcher.add('party_parenthesis', None, party_parenth)
    matches = matcher(doc)
    end_p = [end for match_id, start, end in matches]

    # identify interruptions in doc
    tmp = []
    for token in doc:
        if (token.text == '(' and doc[token.i + 1].lower_ not in party
                and doc[token.i + 1].lower_ not in exception
                and token.i + 80 < len(doc)):
            for i in range(1, 80):
                # find the first closing parenthesis that does not belong to a party tag
                if doc[token.i + i].text == ')' and token.i + i + 1 not in end_p:
                    # store the span of the interruption as a (start, end) tuple
                    tmp.append((token.i, token.i + i + 1))
                    break  # avoid capturing subsequent interruptions within the 80-token window
    matcher.remove('party_parenthesis')

    # check for long (> 50 tokens) interruptions among those found above
    for start, end in tmp:
        if end - start > 50:
            print('wow! This is a very long interruption: -->', doc[start:end], '\n')

    # collect the index of every token that falls within an interruption
    seen = set()
    t = []
    for start, end in tmp:
        for token in doc[start:end]:
            if token.i not in seen:  # skip parts of already identified interruptions
                seen.add(token.i)
                t.append(token.i)

    # getter that returns True if a token is part of an interruption
    def is_in_interruption(token):
        return token.i in t

    # set a token custom extension to check whether a token is in an interruption
    Token.set_extension('is_in_interruption', getter=is_in_interruption)

    # keep only the tokens that are not within interruptions
    clean_doc = [token for token in doc if not token._.is_in_interruption]
    Token.remove_extension('is_in_interruption')

    # create a new Doc object that does not contain the interruptions
    doc_clean = nlp(''.join(token.text_with_ws for token in clean_doc))
    return doc_clean
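# A hypothetical usage sketch for manage_interruptions. The party list, the
# exception list and the sample text are invented for illustration. Note that
# the scan only considers '(' tokens at least 80 tokens before the end of the
# doc, hence the padding sentence repeated below.
import spacy

nlp = spacy.load('en_core_web_sm')
party = ['spd', 'cdu/csu', 'fdp']       # invented placeholder party tags
exception = ['sic']                     # invented: '(' + these words is not an interruption

text = ('Muller (spd): We agree. (Applause from the SPD) '
        + 'This is important. ' * 30)   # padding for the 80-token lookahead
doc = nlp(text)
clean = manage_interruptions(doc, party, exception)
print(clean.text)  # "(spd)" survives; "(Applause from the SPD)" is removed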
import spacy
from spacy.tokens import Token

# Load the Language class with the English model 'en_core_web_sm'
nlp = spacy.load('en_core_web_sm')

# Convert the input text string to a Doc object
doc = nlp(
    "The French Revolution was a period of time in France when the people "
    "overthrew the monarchy and took control of the government."
)

# Define the extension attribute on the token level, named 'context',
# with a default value of False
Token.set_extension('context', default=False, force=True)

# Print each token of the Doc object and the value stored by the
# extension attribute; all the values default to False
for d in doc:
    print(d.text, d._.context)

# Combine the entity types of the previous, current and next tokens
# and store the result with the 'set' method
for i, d in enumerate(doc):
    if 0 < i < len(doc) - 1:
        meaning = '|' + doc[i - 1].ent_type_ + '-' + d.ent_type_ + '-' + doc[i + 1].ent_type_
        d._.set('context', meaning)

# Print the tokens again to see the modified values
for d in doc:
    print(d.text, d._.context)

Token.has_extension('context')     # returns True
Token.remove_extension('context')  # removes the attribute
Token.has_extension('context')     # returns False
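# A small companion sketch: set_extension raises a ValueError if the name is
# already registered, which is why force=True is passed above. An alternative
# is to guard the registration with has_extension:
if not Token.has_extension('context'):
    Token.set_extension('context', default=False)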
def _remove_metadata(self, key):
    # drop both the 'meta_<key>' token extension and its bookkeeping entry
    if key in self._metadata_attrs:
        Token.remove_extension('meta_' + key)
        del self._metadata_attrs[key]
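# A hypothetical counterpart to _remove_metadata, sketching how the
# 'meta_' + key extensions tracked in _metadata_attrs might have been
# registered; the method name, signature and default handling are
# assumptions, not taken from the source.
def _add_metadata(self, key, default=None):
    # register a 'meta_<key>' token extension and remember it for later removal
    if key not in self._metadata_attrs:
        Token.set_extension('meta_' + key, default=default)
        self._metadata_attrs[key] = default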