from spacy.tokens import Token
from spacy_hunspell import spaCyHunSpell


def spell_check(query, indices_to_ignore):
    # Temporarily add the hunspell component (spaCy v2 pipeline API),
    # which registers the token extensions used below.
    hunspell = spaCyHunSpell(nlp, 'linux')
    nlp.add_pipe(hunspell, name="hunspell")
    doc = nlp(query)
    original_data = [token.text for token in doc]
    for i, data in enumerate(doc):
        if i in indices_to_ignore:
            continue
        if not data._.hunspell_spell:  # token is misspelled
            suggestions = data._.hunspell_suggest
            if not suggestions:  # nothing to replace the token with
                continue
            # Pick the suggestion whose vector is most similar to the token.
            suggestions_vocab = [nlp.vocab[suggestion] for suggestion in suggestions]
            result = [data.similarity(ind) for ind in suggestions_vocab]
            max_word_index = result.index(max(result))
            original_data[i] = suggestions[max_word_index]
    response = " ".join(original_data)
    # Clean up: remove the component and the extensions it registered.
    nlp.remove_pipe('hunspell')
    Token.remove_extension("hunspell_spell")
    Token.remove_extension("hunspell_suggest")
    return response
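A minimal usage sketch (assumptions: spaCy v2 with the spacy_hunspell package installed, a module-level nlp model with word vectors, and a made-up query):

import spacy

nlp = spacy.load('en_core_web_md')  # a model with vectors, so similarity() is meaningful

# Correct every token except index 0, which the caller wants left as typed.
print(spell_check("Thsi sentense has typos", indices_to_ignore=[0]))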
Example #2
from spacy.matcher import Matcher
from spacy.tokens import Token


def manage_interruptions(doc, party, exception):
    matcher = Matcher(nlp.vocab)
    # Match a 'regular' speaker tag such as "Muller (spd):".
    party_parenth = [
        {"TEXT": "("},
        {"LOWER": {"IN": party}},
        {"TEXT": ")"},
    ]

    matcher.add("party_parenthesis", None, party_parenth)  # spaCy v2 Matcher API
    matches = matcher(doc)

    # Collect the end index of every party-parenthesis match
    # (a set, since it is only used for membership tests below).
    end_p = {end for match_id, start, end in matches}

    # identify interruptions in doc and print them
    tmp = []

    for token in doc:
        # An opening parenthesis that is not followed by a party name or a
        # known exception starts a candidate interruption; the length guard
        # comes first so doc[token.i + 1] is always a valid index.
        if (token.i + 80 < len(doc) and token.text == '('
                and doc[token.i + 1].lower_ not in party
                and doc[token.i + 1].lower_ not in exception):
            for i in range(1, 80):
                # Find the first closing parenthesis that does not belong to
                # a party tag matched above.
                if doc[token.i + i].text == ')' and token.i + i + 1 not in end_p:
                    # Store the interruption span as a (start, end) tuple.
                    tmp.append((token.i, token.i + i + 1))
                    # Stop at the first match to avoid capturing subsequent
                    # interruptions while scanning up to 80 tokens ahead.
                    break

    matcher.remove("party_parenthesis")

    ##check long (> 50 tokens) interruptions among those above
    for start, end in tmp:
        if end - start > 50:
            print('wow! This is a very long interruption: -->',
                  doc[start:end], '\n')

    # create variable that contains index of each token that is within interruptions
    seen = set()
    t = []
    for i in tmp:
        #print(doc[i[0]: i[1]])  #check it is printing all interruptions
        for token in doc[i[0]:i[1]]:
            if token.i in seen:  # avoid adding parts of already identified interruptions
                continue
            else:
                seen.add(token.i)
                t.append(token.i)

    # define getter function that returns True if a token is part of interruptions
    def is_in_interruption(token):
        return token.i in t

    # set a token custom extension to check whether token is in interruption
    Token.set_extension('is_in_interruption', getter=is_in_interruption)

    # keep only the tokens that are not within interruptions
    clean_doc = [token for token in doc if not token._.is_in_interruption]

    Token.remove_extension('is_in_interruption')

    # create a new doc object that does not contain interruptions
    doc_clean = nlp(''.join(map(lambda x: x.text_with_ws, clean_doc)))

    return doc_clean
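A minimal usage sketch for the function above (the model choice, party list, exception list, and transcript path are all made-up placeholders):

import spacy

nlp = spacy.load('de_core_news_sm')  # hypothetical model choice
party = ['spd', 'cdu', 'fdp']        # lowercase party tags marking real speakers
exception = ['zwischenfrage']        # parenthesised words that are not interruptions

# Placeholder path; the 80-token lookahead guard means the function is meant
# for long plenary transcripts rather than short snippets.
with open('plenary_session.txt') as f:
    doc = nlp(f.read())

doc_clean = manage_interruptions(doc, party, exception)
print(doc_clean.text)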
import spacy
from spacy.tokens import Token

#The Language class with the English model 'en_core_web_sm' is loaded
nlp = spacy.load('en_core_web_sm')
#The input text string is converted to a Doc object
doc = nlp(
    "The French Revolution was a period of time in France when the people overthrew the monarchy and took control of the government."
)

#Define the extension attribute on the token level with the name 'context' and the default value False
Token.set_extension('context', default=False, force=True)

#Print each token in the Doc along with the value stored by the extension attribute; all values default to False
for d in doc:
    print(d.text, d._.context)

#The entity types of the previous, current and next tokens are combined
#and stored on each token via the 'set' method
for i, d in enumerate(doc):
    if i > 0 and (i < len(doc) - 1):
        meaning = f'|{doc[i - 1].ent_type_}-{d.ent_type_}-{doc[i + 1].ent_type_}'
        d._.set('context', meaning)

#Printing the tokens again to see the modified values
for d in doc:
    print(d.text, d._.context)

Token.has_extension('context')  #returns True
Token.remove_extension('context')  #removes the attribute
Token.has_extension('context')  #returns False
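
One detail worth noting is the force=True flag used above: registering an extension name that already exists raises a ValueError, so force=True (or removing the attribute first) is needed when re-running the snippet. A minimal sketch:

from spacy.tokens import Token

Token.set_extension('context', default=False)
try:
    Token.set_extension('context', default=False)  # name already registered
except ValueError:
    # Overwrite the existing registration instead of failing.
    Token.set_extension('context', default=False, force=True)
Token.remove_extension('context')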

Example #4
def _remove_metadata(self, key):
    # Unregister the 'meta_'-prefixed extension and forget the key.
    if key in self._metadata_attrs:
        Token.remove_extension('meta_' + key)
        del self._metadata_attrs[key]
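For context, a sketch of the counterpart registration method such a class might pair with _remove_metadata (the class name, _add_metadata, and the default handling are assumptions, not part of the original source):

from spacy.tokens import Token


class MetadataPipe:  # hypothetical container class
    def __init__(self):
        self._metadata_attrs = {}

    def _add_metadata(self, key, default=None):
        # Register a 'meta_'-prefixed token extension and record it so that
        # _remove_metadata can unregister it later.
        if key not in self._metadata_attrs:
            Token.set_extension('meta_' + key, default=default)
            self._metadata_attrs[key] = default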