def checkgrammar(inp):
    sent = nltk.word_tokenize(inp.lower())
    suggestions = [[word] for word in sent]
    postag = nltk.pos_tag(nltk.word_tokenize(inp))
    for i in range(len(sent)):
        # Closed word classes: query the API with every member of the class
        # substituted at position i.
        if sent[i] in articles.split('/'):
            suggestions[i] = api(options(sent, i, articles)) or [sent[i]]
        elif sent[i] in demonstrative_pronouns.split('/'):
            suggestions[i] = api(options(sent, i, demonstrative_pronouns)) or [sent[i]]
        elif sent[i] in preposition.split('/'):
            x = options(sent, i, preposition)
            if i < len(sent) - 2:
                x = x + ' ' + sent[i + 2]
            suggestions[i] = api(x) or [sent[i]]
        elif sent[i] in possesives_1.split('/'):
            suggestions[i] = api(options(sent, i, possesives_1), 2) or [sent[i]]
        elif sent[i] in possesives_2.split('/'):
            suggestions[i] = api(options(sent, i, possesives_2), 2) or [sent[i]]
        elif sent[i] in possesives_3.split('/'):
            suggestions[i] = api(options(sent, i, possesives_3), 2) or [sent[i]]
        elif sent[i] in quantifiers.split('/'):
            suggestions[i] = api(options(sent, i, quantifiers), 2) or [sent[i]]
        elif sent[i] in interrogative_pronouns.split('/'):
            suggestions[i] = api(options(sent, i, interrogative_pronouns)) or [sent[i]]
        elif sent[i] in auxillary_verbs:
            # For verbs, build the alternative set from all verb forms of the lemma.
            l1 = list(get_word_forms(wn().lemmatize(sent[i], 'v'))['v'])
            verbs_combined = '"' + '"/"'.join(word for word in l1) + '"'
            if l1:
                suggestions[i] = api(options(sent, i, verbs_combined), 2, sent[i]) or [sent[i]]
        elif postag[i][1].startswith('VB'):
            l1 = list(get_word_forms(wn().lemmatize(sent[i], 'v'))['v'])
            verbs_combined = '"' + '"/"'.join(word for word in l1) + '"'
            if l1:
                suggestions[i] = api(options(sent, i, verbs_combined), 1, sent[i]) or [sent[i]]
        elif postag[i][1].startswith('NN') or (i < len(sent) - 1 and postag[i][1].startswith('JJ') and postag[i + 1][1].startswith('NN')):
            if i == 0:
                # Sentence-initial noun: try prepending each article.
                suggestions[i] = api(articles + ' ' + options(sent, i, sent[i])) or [sent[i]]
            elif postag[i - 1][1].startswith('VB'):
                # Wildcard query between the preceding verb and the noun.
                suggestions[i] = [x + ' ' + sent[i] for x in api(sent[i - 1] + ' ? ' + sent[i], 2)] or [sent[i]]
        # Commit the top suggestion so later tokens are checked against it.
        sent[i] = suggestions[i][0]
    # Drop the original token from its own suggestion list.
    newsent = nltk.word_tokenize(inp)
    for i in range(len(sent)):
        if newsent[i] in suggestions[i]:
            suggestions[i].remove(newsent[i])
    return suggestions
def preprocess_word_forms():
    start = time.time()
    dictionary = {}
    eqclasses = {}
    # Map every word form to the full set of forms it shares with some word
    # in the wordlist.
    for word in wordlist:
        wordset = set()
        forms = get_word_forms(word)
        for pos in forms:
            wordset |= forms[pos]
        for wordform in wordset:
            try:
                dictionary[wordform] |= wordset
            except KeyError:
                dictionary[wordform] = wordset
    # Collapse the sets into equivalence classes keyed by hash, probing
    # linearly past collisions between unrelated sets.
    for word in dictionary:
        frzset = frozenset(dictionary[word])
        wordhash = hash(frzset)
        goodkey = False
        while not goodkey:
            try:
                if frzset.issubset(eqclasses[wordhash]) or frzset.issuperset(eqclasses[wordhash]):
                    eqclasses[wordhash] |= frzset
                    goodkey = True
                else:
                    wordhash += 1
            except KeyError:
                eqclasses[wordhash] = frzset
                goodkey = True
        dictionary[word] = wordhash
    print("TIME: " + str(time.time() - start))
    return (dictionary, eqclasses)
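# A minimal usage sketch of the preprocessing above (assumes the module-level
# `wordlist` is defined and that "running" appeared among the collected
# forms; the lookup word is illustrative): each form maps to a hash key, and
# the key retrieves its equivalence class of related forms.
dictionary, eqclasses = preprocess_word_forms()
print(eqclasses[dictionary["running"]])  # e.g. frozenset({'run', 'runs', 'ran', 'running', ...})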
def get_wordforms(word):
    from word_forms.word_forms import get_word_forms
    try:
        allwords = []
        temp = get_word_forms(word)
        # If no forms were found for any part of speech, fall through to the
        # default return below.
        if len(temp['n']) == 0 and len(temp['v']) == 0 and len(temp['a']) == 0 and len(temp['r']) == 0:
            raise Exception
        # Rename the POS keys to human-readable labels.
        temp["Noun"] = temp.pop("n")
        temp["Adjective"] = temp.pop("a")
        temp["Adverb"] = temp.pop("r")
        temp["Verb"] = temp.pop("v")
        in_str = ""
        for key in temp:
            if len(temp[key]) > 0:
                for i in temp[key]:
                    if i not in allwords:
                        allwords.append(i)
                in_str = in_str + key + " Forms: " + " ".join(temp[key]) + "<br>"
        return [in_str, allwords]
    except Exception:
        return [" No word forms", " "]
def listIntersection(lst0, lst1, expand=False):
    '''
    Find the intersection of lists lst0 and lst1.
    -------
    params:
    -------
    lst0: list of key strings.
    lst1: list of strings to intersect against.
    expand: if True, also count a value as present when any of its
        word forms appears in lst1.
    -------
    return:
    -------
    list of values contained in both lists.
    '''
    if expand:
        values = []
        for value in lst0:
            sub_values = forms2list(get_word_forms(value))
            for sub_value in sub_values:
                if sub_value in lst1:
                    values.append(value)
                    break
        return values
    return [value for value in lst0 if value in lst1]
def find_word_forms(word):
    dic = get_word_forms(word)
    words = ""
    for form in dic:
        for w in dic[form]:
            words += w + " "
    return words
def generate_inflections(word):
    inflections_dict = get_word_forms(word)
    inflections = set()
    for forms in inflections_dict.values():
        for form in forms:
            inflections.add(form.lower())
    return list(inflections)
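# Quick sanity check for generate_inflections; the exact forms depend on the
# installed word_forms/WordNet data, and ordering is arbitrary because they
# come from a set, so the expected output is illustrative only.
print(sorted(generate_inflections("run")))  # e.g. ['ran', 'run', 'runner', 'runners', 'running', 'runs']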
def get_lemma(word):
    # `word` is a token object (e.g. from spaCy) exposing .text and .lemma_.
    # When enough related forms exist, take the shortest as the lemma;
    # otherwise fall back to the tagger's own lemma.
    word_forms = get_word_forms(word.text).values()
    flat_list_of_forms = list(set(chain.from_iterable(word_forms)))
    if len(flat_list_of_forms) > 3:
        return sorted(flat_list_of_forms, key=len)[0]
    return word.lemma_
def token_lemma_mapping(word_dict):
    token_lemma_map = {}
    for k, v in word_dict.items():
        external_forms = get_word_forms(k)
        all_lemmas = set()
        for e_k, e_v in external_forms.items():
            all_lemmas = all_lemmas.union(e_v)
        token_lemma_map[k] = all_lemmas
    return token_lemma_map
def get_wordforms(word):
    MINIMAL_WORDFORMS_REQ = 6
    regex = re.compile('[^a-zA-Z]')
    clean_word = regex.sub('', word)
    if len(clean_word) == 0:
        return []
    # POS-tag the single word and lemmatize it when the tag maps onto a
    # WordNet POS.
    pos_tagged_ngramm = nltk.pos_tag([clean_word])
    for word_el in pos_tagged_ngramm:
        pos = get_wordnet_pos(word_el[1])
        if pos:
            lemma = lemmatizer.lemmatize(word_el[0], pos=pos)
        else:
            lemma = word_el[0]
        break
    wordforms_dict = get_word_forms(clean_word)
    wordform_set = extract_variants_from_dict(wordforms_dict)
    if clean_word != lemma:
        # Also include the forms derived from the lemma itself.
        wordforms_lemma_dict = get_word_forms(lemma)
        wordform_set_from_lemma = extract_variants_from_dict(wordforms_lemma_dict)
        wordform_set = wordform_set.union(wordform_set_from_lemma)
    if len(wordform_set) < MINIMAL_WORDFORMS_REQ:
        # Pad with pre-collected words when too few forms were found.
        if lemma in dct_mixed_words:
            more_words_number = MINIMAL_WORDFORMS_REQ - len(wordform_set)
            add_words_list = dct_mixed_words[lemma][:more_words_number]
            wordform_set = wordform_set.union(set(add_words_list))
    wordform_list = list(wordform_set)
    if word in wordform_list:
        wordform_list.remove(word)
    return wordform_list
def get_lemmas(word):
    other_forms = list(set(
        item
        for sublist in (list(x) for x in get_word_forms(word).values())
        for item in sublist
        if item != word
    ))
    # Keep only all-lowercase forms that share the word's first few characters.
    min_matching_len = min(len(word), 3)
    return [
        x for x in other_forms
        if not any(s.isupper() for s in x)
        and x[0:min_matching_len] == word[0:min_matching_len]
    ]
def lemmatize(self, word):
    # Try each configured lemmatizer first; the first one that changes the
    # word wins.
    for lemmatizer in self.lemmatizers:
        base_form = lemmatizer.lemmatize(word)
        if word != base_form and base_form is not None:
            return {base_form}
    # Fall back to word_forms: collect every related form other than the
    # word itself.
    word_forms = get_word_forms(word)
    all_forms = set()
    for base_forms_pos in word_forms:
        for base_form in word_forms[base_forms_pos]:
            if base_form != word:
                all_forms.add(base_form)
    return all_forms
def find_lemma_map(name, word_set, dictionary, bypass_lemma=False):
    '''
    Generate a csv file that maps each word in SEND to the lemmas that exist
    in our dictionary. If no lemma exists in the dictionary, the word maps to
    an empty string; if multiple lemmas match, they are joined into a
    comma-separated string.
    '''
    from nltk.stem.wordnet import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    from word_forms.word_forms import get_word_forms

    lemma_map = dict()
    for word in word_set:
        # every word starts with an empty match
        lemma_map[word] = ''
        # first priority is given to words that appear in both sets
        if word in dictionary:
            lemma_map[word] = word
        else:
            if bypass_lemma:
                continue
            # if it does not appear in the dictionary, transform it in
            # different ways and look for a close match
            lemma_word = lemmatizer.lemmatize(word)
            if lemma_word in dictionary:
                lemma_map[word] = lemma_word
            else:
                # last resort: an external library. The output will be
                # manually examined, since we only have a couple thousand
                # words here.
                external_forms = get_word_forms(lemma_word)
                for form in external_forms.keys():
                    for w in external_forms[form]:
                        if w in dictionary:
                            if lemma_map[word] == '':
                                lemma_map[word] = w
                            else:
                                lemma_map[word] = lemma_map[word] + "," + w

    # write the dict to a csv file for manual examination
    missing_match = 0
    for word in lemma_map.keys():
        if lemma_map[word] == '':
            missing_match += 1
    print('Missing Count: %s, Total Count: %s' % (missing_match, len(lemma_map.keys())))

    output_file = "../warriner_valence/word_lemma_" + name + ".csv"
    with open(output_file, mode='w') as csv_file:
        file_writer = csv.writer(csv_file, delimiter=',')
        for w in lemma_map.keys():
            file_writer.writerow([w, lemma_map[w]])
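# Hypothetical invocation (all names and words illustrative, not from the
# original code): map a small word set onto a dictionary and write
# ../warriner_valence/word_lemma_demo.csv for manual review. Assumes that
# output directory exists and csv is imported at module scope.
find_lemma_map("demo", {"joyful", "sadness", "sad"}, {"joy", "sad", "sadness"})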
def getOptionlist(self, checkWord, optionPass):
    checkWord = checkWord.lower()
    outdic = get_word_forms(checkWord)
    # Map Penn Treebank tags onto word_forms' POS buckets; default to an
    # empty set so list() never receives None.
    if optionPass in ['NNS', 'NN', 'NNP', 'NNPS']:
        return list(outdic.get('n', set()))
    elif optionPass in ['JJ', 'JJR', 'JJS']:
        return list(outdic.get('a', set()))
    elif optionPass in ['RB', 'RBR', 'RBS']:
        return list(outdic.get('r', set()))
    elif optionPass in ['VB', 'VBD', 'VBG', 'VBN', 'VBZ']:
        return list(outdic.get('v', set()))
    return []
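# Illustrative call (hypothetical `checker` instance of the class above):
# a noun tag selects the 'n' bucket of word_forms. The expected output is a
# sketch; actual forms depend on the installed data.
print(checker.getOptionlist("Secret", "NN"))  # e.g. ['secret', 'secrets', 'secrecy', ...]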
def get_all_word_forms(word):
    """Takes a word as input and returns a collection of variants of the word.

    Parameters:
        word (string): a specific word

    Returns:
        words (list): the different variants of the input word
    """
    forms_dict = get_word_forms(word)
    return list(dict.fromkeys(flatten(forms_dict.values())))
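# Usage sketch: dict.fromkeys dedupes while preserving encounter order,
# unlike a plain set() cast. Output forms depend on the word_forms data.
print(get_all_word_forms("secret"))  # e.g. ['secret', 'secrets', 'secrecy', 'secretive', ...]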
def lemmatize(word):
    """
    Out of all the related word forms of ``word``, return the smallest form
    that appears first in the dictionary.
    """
    forms = [form for pos_form in get_word_forms(word).values() for form in pos_form]
    forms.sort()
    forms.sort(key=len)
    try:
        return forms[0]
    except IndexError:
        raise ValueError("{} is not a real word".format(word))
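# A quick sketch of the behaviour above: because the length sort is stable,
# the result is the shortest form, with ties broken alphabetically. Exact
# output depends on the WordNet-backed data.
print(lemmatize("operations"))  # e.g. 'operate'
try:
    lemmatize("zzzzzz")  # no known forms -> empty list -> ValueError
except ValueError as err:
    print(err)  # 'zzzzzz is not a real word'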
def extract_word(url):
    # parse the html and extract clear text from it
    words_set = set()
    req = Request(str(url), headers={'User-Agent': 'Mozilla/5.0'})
    try:
        html = urlopen(req).read()
        soup = BeautifulSoup(html, features="lxml")
        data = soup.findAll(text=True)

        def visible(element):
            if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
                return False
            elif re.match('<!--.*-->', str(element.encode('utf-8'))):
                return False
            return True

        result = filter(visible, data)
        list_to_str = ' '.join([str(element) for element in list(result)])
        # sentence-tokenize the clear text
        sent = nltk.sent_tokenize(list_to_str)
        # operations to extract words
        for item in sent:
            tokens = nltk.word_tokenize(item)
            # removing punctuation
            table = str.maketrans('', '', string.punctuation)
            stripped = [word.translate(table) for word in tokens]
            # keeping only alphabetic tokens
            words = [word for word in stripped if word.isalpha()]
            for word in words:
                word = ''.join([char for char in word if not char.isdigit()])
                # removing non-ASCII characters
                word = re.sub(r'[^\x00-\x7f]', r'', word)
                if len(word) >= 1:
                    words_set.add(str(word.casefold()))
                    # collect the different forms of the word as well
                    word_form = get_word_forms(word)
                    for forms in word_form.values():
                        for inner_item in forms:
                            words_set.add(str(inner_item.casefold()))
        return words_set
    except Exception:
        # log the unreachable URL (`count` is a module-level page counter)
        with open('unavailable_url.csv', 'a') as file:
            writer = csv.writer(file)
            writer.writerow([count])
            writer.writerow([url])
        # return a one-element set so the return type matches the success path
        return {"page " + str(count) + " not available"}
def main():
    # Command line arguments
    ap = argparse.ArgumentParser()
    ap.add_argument('corpus', help='The corpus file')
    ap.add_argument('out_file', help='The output file')
    ap.add_argument('nc_vocab', help='The vocabulary file')
    args = ap.parse_args()

    logger.info('Reading the vocabulary')
    with codecs.open(args.nc_vocab, 'r', 'utf-8') as f_in:
        nc_vocab = frozenset([line.strip() for line in f_in])

    logger.info('Computing variations...')
    variations = {}
    for nc in tqdm.tqdm(nc_vocab):
        w1, w2 = nc.split('\t')
        curr_nc = nc.replace('\t', '_')
        for w1_form, w2_form in itertools.product(get_word_forms(w1)['n'], get_word_forms(w2)['n']):
            curr_variation = '_'.join((w1_form, w2_form)).lower()
            variations[curr_variation] = curr_nc

    logger.info('Counting the number of sentences in the corpus')
    num_instances = corpus_size(args.corpus)

    logger.info('Processing...')
    with codecs.open(args.corpus, 'r', 'utf-8') as f_in:
        with codecs.open(args.out_file, 'w', 'utf-8') as f_out:
            try:
                for line in tqdm.tqdm(f_in, total=num_instances):
                    for sentence in get_sentences_with_bigrams(line.strip().lower(), variations):
                        f_out.write(sentence + '\n')
            except Exception as err:
                logger.error(err)
def nltk_word_forms_dictionary_refiner(input_dictionary):
    secondary_input_dictionary = dict(input_dictionary)
    for i in input_dictionary.keys():
        current_word_forms = get_word_forms(i)
        for parts_of_speech in current_word_forms.keys():
            for word_form in current_word_forms[parts_of_speech]:
                if word_form not in input_dictionary.keys():
                    secondary_input_dictionary[word_form] = {}
    update_input_dictionary = dict(secondary_input_dictionary)
    for i in update_input_dictionary.keys():
        current_word_forms = get_word_forms(i)
        for parts_of_speech in current_word_forms.keys():
            # loop through all the related words
            for word_form in current_word_forms[parts_of_speech]:
                if word_form in secondary_input_dictionary.keys():
                    for k in secondary_input_dictionary[word_form].keys():
                        # this forces the keys into a list each time
                        if k in list(update_input_dictionary[i]):
                            update_input_dictionary[i][k] += secondary_input_dictionary[word_form][k]
                        else:
                            update_input_dictionary[i][k] = secondary_input_dictionary[word_form][k]
    return update_input_dictionary
def generate_ace_adj_rules(self):
    for adj in self.adjectives():
        # Discard multi-word comparatives/superlatives ("more beautiful").
        comparative_word = comparative(adj)
        if len(word_tokenize(comparative_word)) > 1:
            comparative_word = None
        superlative_word = superlative(adj)
        if len(word_tokenize(superlative_word)) > 1:
            superlative_word = None
        adverb = get_word_forms(adj)["r"]
        if len(adverb) == 0:
            adverb = None
        adj_statement = "adj_itr({}, {}).".format(adj, adj)
        yield adj_statement
        if comparative_word is not None:
            adj_comp_statement = "adj_itr_comp({}, {}).".format(comparative_word, adj)
            self._inverse_map[comparative_word] = adj
            yield adj_comp_statement
        if superlative_word is not None:
            adj_sup_statement = "adj_itr_sup({}, {}).".format(superlative_word, adj)
            self._inverse_map[superlative_word] = adj
            yield adj_sup_statement
        if adverb is not None:
            for adv in adverb:
                adv_statement = "adv({}, {}).".format(adv, adv)
                self._inverse_map[adv] = adj
                yield adv_statement
def lemma_to_forms(lemma):
    """Return all forms of a lemma.

    Args:
        lemma (unicode)

    Returns:
        forms (set[unicode]): unique set of forms
    """
    from word_forms.word_forms import get_word_forms  # this import is slow, so do it lazily
    forms = set()
    forms.add(lemma)  # the original word always counts as a form
    # include forms of all POS
    for pos, pos_forms in get_word_forms(lemma).items():
        forms.update(pos_forms)
    return forms
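# Hypothetical call; the returned set always contains the lemma itself,
# plus whatever related forms word_forms knows about.
print(lemma_to_forms(u"succeed"))
# e.g. {'succeed', 'succeeds', 'succeeded', 'succeeding', 'success', ...}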
def get_related_forms(ws, should_ban=False):
    banned_words = set()
    nouns = set()
    adjs = set()
    advs = set()
    verbs = set()
    for idx, w in enumerate(ws, start=1):
        if w in banned_words and should_ban:
            continue
        # WordNet-derived forms, bucketed by POS
        wn_ws = get_word_forms(w)
        nns = wn_ws['n']
        adj = wn_ws['a']
        adv = wn_ws['r']
        vrb = wn_ws['v']
        candidates = nns.union(adv, adj, vrb)
        nouns = nouns.union(nns)
        adjs = adjs.union(adj)
        advs = advs.union(adv)
        verbs = verbs.union(vrb)
        if w in candidates:
            candidates.remove(w)
        for cand in candidates:
            if cand in ws:
                banned_words.add(cand)
    return nouns, adjs, advs, verbs
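# Illustrative call (words chosen for the example): forms are partitioned
# into four POS buckets, and with should_ban=True a later input word is
# skipped once it already appeared as a related form of an earlier one.
nouns, adjs, advs, verbs = get_related_forms(["study", "studied"], should_ban=True)
print(nouns)  # e.g. {'study', 'studies', 'student', ...}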
def word_forms(string):
    proc_word = get_words(string)
    word_dict = {}
    for wd in proc_word:
        word_dict[wd] = get_word_forms(wd)
    return word_dict
analyzer = SentimentIntensityAnalyzer()

tags = list(Tag.objects.all())
tags_synonyms = {
    tag: set(chain.from_iterable(
        [word.lemma_names() for word in wn.synsets(tag.tag_title)]))
    for tag in tags
}

# Expand each tag's synonyms with all their word forms.
optimised_tags = {}
stop = set(stopwords.words('english'))
for tag, syns in tags_synonyms.items():
    optimised_tags[tag] = set()
    for syn in syns:
        words = get_word_forms(syn)
        for key, values in words.items():
            for word in values:
                optimised_tags[tag].add(word)


def get_sentiments(response):
    sentiment_dict = {'pos': 0, 'neg': 0, 'neu': 0}
    for sentence in response.split("."):
        ret = get_sentiment(sentence)
        sentiment_dict[ret] += 1
    if sentiment_dict['pos'] + sentiment_dict['neg'] == 0:
        return 50.0
    # score the response as the share of positive (vs. negative) sentences
    # on a 0-100 scale, matching the 50.0 neutral default above
    return round(
        sentiment_dict['pos'] /
        (sentiment_dict['pos'] + sentiment_dict['neg']) * 100, 2)
def get_flattened_word_forms(word):
    wfs = get_word_forms(word)
    merged = []
    for k, s in wfs.items():
        merged.extend(s)
    return merged
def getSynonyms(p):
    regex = re.compile('[@_!#$%^&*()<>?/}{~:]')
    j = 0
    corrections = {}
    wordsLists = nltk.word_tokenize(p)
    for word in wordsLists:
        if word in iitblingo.keys():
            corrections[word] = [iitblingo[word]]
        else:
            wordsList = nltk.word_tokenize(word)
            wordsList = [w for w in wordsList if w not in stop_words]
            tagged = nltk.pos_tag(wordsList)
            if len(tagged) != 0:
                if tagged[0][1] in ('JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS',
                                    'VB', 'VBD', 'VBG', 'VBN', 'VBZ'):
                    # Context window: the previous word, the word, the next word.
                    string2 = ' '
                    if j - 1 >= 0 and j + 1 < len(wordsLists):
                        string2 = wordsLists[j - 1] + ' ' + wordsLists[j] + ' ' + wordsLists[j + 1]
                    num2 = 0
                    lemma = WordNetLemmatizer().lemmatize(tagged[0][0], 'v')
                    syns = wordnet.synsets(lemma)
                    ans = {}
                    x = 0
                    for syn in syns:
                        for w in syn.lemmas():
                            if w.name().lower() != lemma and regex.search(w.name()) is None:
                                verbs_combined = w.name().lower()
                                if tagged[0][1] in ('VB', 'VBZ', 'VBD', 'VBG', 'VBN'):
                                    l1 = list(get_word_forms(
                                        WordNetLemmatizer().lemmatize(w.name().lower(), 'v'))['v'])
                                    if len(l1) == 0:
                                        x += 1
                                        continue
                                    verbs_combined = '"' + '"/"'.join(words for words in l1) + '"'
                                if string2 != ' ' and string2 != "":
                                    string2 = wordsLists[j - 1] + ' ' + verbs_combined + ' ' + wordsLists[j + 1]
                                k = w.name()
                                if num2 == 0:
                                    if string2 != ' ' and string2 != "":
                                        # Score the candidate by its phrase frequency.
                                        encoded_query = urllib.parse.quote(string2)
                                        params = {'corpus': 'eng-us', 'query': encoded_query, 'topk': 3}
                                        params = '&'.join('{}={}'.format(name, value)
                                                          for name, value in params.items())
                                        response = requests.get('https://api.phrasefinder.io/search?' + params)
                                        if len(response.json()['phrases']) != 0:
                                            num2 = response.json()['phrases'][0]['mc']
                                            l = [i['tt'] for i in response.json()['phrases'][0]['tks']
                                                 if i['tg'] == 2]
                                            if len(l) != 0:
                                                k = l[0]
                                ans[k] = 10 * num2
                                num2 = 0
                                x += 1
                    # Keep the three highest-scoring candidates.
                    out = [x[0] for x in sorted(ans.items(), key=lambda x: -1 * x[1])[:3]]
                    corrections[tagged[0][0]] = out
        j = j + 1
    return corrections
def runTest(self):
    self.assertEqual(get_word_forms(self.text_input), self.expected_output, self.description)
def get_variants_and_derivatives(word):
    return get_word_forms(word)
from word_forms.word_forms import get_word_forms

get_word_forms("president")
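# The call above returns a dict keyed by POS ('n', 'v', 'a', 'r'), each
# mapping to a set of related forms, roughly:
# {'n': {'president', 'presidents', 'presidency', ...},
#  'a': {'presidential'},
#  'v': {'preside', 'presides', 'presided', 'presiding'},
#  'r': {'presidentially'}}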
from word_forms.word_forms import get_word_forms
from word_forms.lemmatizer import lemmatize

print(lemmatize("help"))
print(get_word_forms(lemmatize("help"))['n'])
for word in get_word_forms(lemmatize("help"))['n']:
    print(word)
from word_forms.word_forms import get_word_forms

# Print each POS bucket of the word_forms output for "secret".
outdic = get_word_forms("secret")
print(outdic)
print("noun", list(outdic.get('n')))
print("adj", list(outdic.get('a')))
print("adv", list(outdic.get('r')))
print("verb", list(outdic.get('v')))