def dict_ingest(path_to_dict): noun = [] verb = [] adjective = [] adverb = [] miscel = [] f = open(path_to_dict, 'r') for l in f: word = l.strip() if en.is_noun(word): noun.append(word) elif en.is_verb(word): verb.append(word) elif en.is_adjective(word): adjective.append(word) elif en.is_adverb(word): adverb.append(word) else: miscel.append(word) print noun[:5] print verb[:5] print adjective[:5] print adverb[:5] print miscel[:5] return noun, verb, adjective, adverb, miscel
def dict_ingest(path_to_dict): noun = [] verb = [] adjective = [] adverb = [] miscel = [] f = open(path_to_dict,'r') for l in f: word = l.strip() if en.is_noun(word): noun.append(word) elif en.is_verb(word): verb.append(word) elif en.is_adjective(word): adjective.append(word) elif en.is_adverb(word): adverb.append(word) else: miscel.append(word) print noun[:5] print verb[:5] print adjective[:5] print adverb[:5] print miscel[:5] return noun, verb, adjective, adverb, miscel
def verse(word):
    """Creates a small rhyme for a given word.

    The rhyme is based on WordNet's description for the word. This
    description is eloquated (alliterated or antonated), incorporated.
    """
    g = en.noun.gloss(word)
    words = g.split(" ")
    for i in range(len(words)):
        w = words[i]
        w = w.replace("\"", "")
        if en.is_noun(w):
            w = eloquate(w)
        # 40% chance to decorate the word according to its POS
        if random(100) > 60:
            if en.is_noun(w):
                w = incorporate(w).upper()
            if en.is_verb(w):
                w = incorporate(w, VERB)
            if en.is_adjective(w):
                w = incorporate(w, ADJECTIVE)
        # Break the verse into short lines every third word.
        # BUG FIX: the original appended "\n" to words[i] and then
        # unconditionally overwrote words[i] with w, so the newline was
        # always lost; append it to w before storing instead.
        if i > 0 and i % 3 == 0:
            w = w + "\n"
        words[i] = w
    g = " ".join(words)
    g = g.replace("type A ", "!")
    g = g.replace("group A ", "!")
    return g
def simplify_word(a):
    """Reduce a word to a simpler base form.

    Tries, in order: verb -> present tense, noun -> singular, and
    finally the word as-is when it is any recognised part of speech.
    Returns '' when the word cannot be classified at all.
    """
    # Verb test: normalise to present tense and return it if valid.
    try:
        try_present_verb = en.verb.present(a)
        if en.is_verb(try_present_verb):
            return try_present_verb
    except KeyError:
        # en.verb.present raises KeyError for words it cannot conjugate
        # (the original used a bare except); continue with the noun test.
        pass
    # Noun test: normalise to singular form.
    try_singular_noun = en.noun.singular(a)
    if en.is_noun(try_singular_noun):
        return try_singular_noun
    # Already identifiable as noun/verb/adjective/adverb/connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(a) or en.is_connective(a):
        return a
    return ''
def find_grammatical_kind(self):
    """Classify every word of the sentence, returning one label per word.

    Labels, tried in priority order: emotion, connective, verb,
    adjective, noun, persuasive, number — or "unclear" when no test matches.
    """
    sentence = re.sub(",", "", self.get_sentence())  # delete all commas
    labels = []
    for token in sentence.split(" "):
        if en.noun.is_emotion(token):
            labels.append("emotion")
        elif en.is_connective(token):
            labels.append("connective")
        elif en.is_verb(token):
            labels.append("verb")
        elif en.is_adjective(token):
            labels.append("adjective")
        elif en.is_noun(token):
            labels.append("noun")
        elif en.is_persuasive(token):
            labels.append("persuasive")
        elif en.is_number(token):
            labels.append("number")
        else:
            # no test matched; the original tracked this with a flag
            labels.append("unclear")
    return labels
def singular_to_plural(self):
    """Return the sentence with every recognised word pluralised.

    The sentence is split on commas, each chunk is pluralised word by
    word (noun/adjective/connective/persuasive rules), and the chunks
    are re-joined with ", ".
    """
    final_list = []
    st = self.get_sentence()
    # divide the sentence into chunks by all the ','
    list_seperate_by_comma = st.split(",")
    for chunk in list_seperate_by_comma:
        # Strip the single leading space left by ", " separators.
        # startswith() also copes with an empty chunk (e.g. a trailing
        # or doubled comma), which the original chunk[0] indexing
        # crashed on with IndexError.
        if chunk.startswith(" "):
            chunk = chunk[1:]
        plural_list = []
        # split each chunk into words (inner variable renamed: the
        # original reused 'each' and shadowed the outer loop variable)
        for word in chunk.split(" "):
            if en.is_noun(word):
                word = en.noun.plural(word)
            elif en.is_adjective(word):
                word = en.adjective.plural(word)
            elif en.is_connective(word):
                word = self.my_inflect.plural(word)
            elif en.is_persuasive(word):
                word = en.persuasive.plural(word)
            plural_list.append(word)
        # convert each word list back to a string
        final_list.append(" ".join(plural_list))
    return ", ".join(final_list)
def translate_x_of_assertion(brain, a):
    """Render an 'x_of' relation assertion as an English sentence."""
    prefix = a.relation[:-3]  # strip the trailing "_of"
    prefix_article = en.noun.article(prefix)
    # prefix_article_only = prefix_article.split(" ")[0]
    verb = get_tense(a, "was", a.l, brain)
    toReturn = ""
    if en.is_noun(en.noun.singular(prefix)):
        # noun relation, e.g. "capital_of" -> "X was a capital of Y"
        if is_plural(a.l, brain):
            prefix_article = en.noun.plural(prefix)
        toReturn = "{0} {1} {2} of {3}".format(
            list_concepts_naturally(brain, a.l), verb, prefix_article,
            list_words_naturally(a.r))
    elif en.is_verb(
            en.verb.infinitive(prefix)) and en.verb.infinitive(prefix) != "":
        # verb relation, e.g. "reminds_of" -> "X reminds <owner> of Y"
        if hasattr(a, "owner") and len(a.owner) > 0:
            owner = list_concepts_naturally(brain, a.owner)
        else:
            owner = "everyone"
        toReturn = "{0} {1} {2} of {3}".format(
            list_concepts_naturally(brain, a.l), prefix, owner,
            list_concepts_naturally(brain, a.r))
    elif en.is_adjective(prefix):
        # TODO for capable_of >> deal with action, action_object, action_recipient...
        # Similar for used_for >> when used_for is action / verbs
        toReturn = "{0} {1} {2} of {3}".format(
            list_concepts_naturally(brain, a.l), verb, prefix,
            list_words_naturally(a.r))
    toReturn = add_end_marks(a, toReturn)
    return toReturn
def simplify_word(a):
    """Simplify a word to a base form.

    Order of attempts: verb present tense, noun singular, then the word
    itself when it is any recognised part of speech; '' when nothing
    matches.
    """
    # Attempt 1: treat as a verb and normalise to present tense.
    try:
        try_present_verb = en.verb.present(a)
        if en.is_verb(try_present_verb):
            return try_present_verb
    except KeyError:
        # en.verb.present raises KeyError for unknown words; the
        # original's bare except also hid unrelated errors. Fall
        # through to the noun test.
        pass
    # Attempt 2: treat as a noun and normalise to singular.
    try_singular_noun = en.noun.singular(a)
    if en.is_noun(try_singular_noun):
        return try_singular_noun
    # Attempt 3: keep as-is when recognised as any POS or connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(
            a) or en.is_connective(a):
        return a
    return ''
def is_a_expression(self, word):
    """True when the word counts as an expression token: a hash tag,
    a negation, any major POS (noun/adjective/verb/adverb), or orality."""
    # checks run in the original order and short-circuit at the first hit
    checks = (
        self.is_a_hash_tag,
        self.is_negation,
        en.is_noun,
        en.is_adjective,
        en.is_verb,
        en.is_adverb,
        self.is_orality,
    )
    return any(check(word) for check in checks)
def adjectives(list):
    """Parses adjectives from a list of words.
    """
    # strip each entry, then keep only the ones the en library
    # recognises as adjectives
    stripped = (word.strip() for word in list)
    return [word for word in stripped if en.is_adjective(word)]
def get_gloss(word):
    """Return the dictionary gloss for a word, dispatched on its POS.

    Checks verb, adjective, adverb, then noun; falls back to the plain
    WordNet gloss when no POS test matches.
    """
    lookups = (
        (en.is_verb, en.verb.gloss),
        (en.is_adjective, en.adjective.gloss),
        (en.is_adverb, en.adverb.gloss),
        (en.is_noun, en.noun.gloss),
    )
    for matches, gloss in lookups:
        if matches(word):
            return gloss(word)
    return en.wordnet.gloss(word)
def giveNearestEmotion(self, word):
    """Return the nearest emotion for the word, picked per its POS."""
    # try verb, adverb, adjective in that order; first POS match wins
    for is_pos, pos_module in ((en.is_verb, en.verb),
                               (en.is_adverb, en.adverb),
                               (en.is_adjective, en.adjective)):
        if is_pos(word):
            return pos_module.is_emotion(word, boolean=False)
    # fallback: treat the word as a noun
    return en.noun.is_emotion(word, boolean=False)
def generate_word(list, pos):
    """Return a word of the given WordNet POS: usually an existing
    member of ``list``, sometimes a freshly generated related word.

    With probability ``percentage_chance`` (module-level constant) a new
    word is derived from a random synset (or one of its hyper-/hyponyms)
    of a random seed word in ``list``; the new word is appended to
    ``list`` (mutating the caller's list) before being returned.
    Otherwise a random existing member of ``list`` is returned.

    NOTE(review): if a seed word has no synsets for ``pos``,
    ``random.randint(0, -1)`` raises ValueError here — presumably the
    caller guarantees all seeds are valid for ``pos``; confirm.
    """
    #% chance to generate new word
    if random.random() < percentage_chance:
        #repeat until word = pos
        while True:
            #get all synsets of random word in list
            synsets = wn.synsets(list[random.randint(0, len(list) - 1)], pos=pos)
            #get random synset
            synset = synsets[random.randint(0, len(synsets) - 1)]
            # 25% chance each to hop to a random hypernym or hyponym
            ran = random.randint(0,3)
            if ran == 0 and synset.hypernyms():
                synset = synset.hypernyms()[random.randint(0, len(synset.hypernyms()) - 1)]
            elif ran == 1 and synset.hyponyms():
                synset = synset.hyponyms()[random.randint(0, len(synset.hyponyms()) - 1)]
            #get random name from synset that does not contain an _ or - (these make the lib go insane)
            #words = the names of the synset
            words = synset.lemma_names()
            #this loop is to make sure an infinite loop does not occur
            #where you are picking from all invalid choices
            while len(words) > 0:
                word = words[random.randint(0, len(words) - 1)]
                if "_" not in word and "-" not in word:
                    break
                else:
                    words.remove(word)
                    continue
            #if words doesn't have words in it, pick a new word from beginning
            if(len(words) == 0):
                continue
            # accept the candidate only when the en library agrees it
            # matches the requested POS
            if ((pos == wn.NOUN and en.is_noun(word))
                    or (pos == wn.VERB and en.is_verb(word))
                    or (pos == wn.ADJ and en.is_adjective(word))):
                #fix word based on pos
                #if verb, make sure the verb has a conjugation,
                #if it does, or is not a verb, the word gets appended to the word array,
                #and a word is returned
                if pos == wn.VERB:
                    try:
                        en.verb.present(word, person=3, negate=False)
                    except KeyError:
                        # conjugation unknown: reject and retry
                        continue
                    else:
                        if word not in list:
                            list.append(word)
                        return word
                else:
                    if word not in list:
                        list.append(word)
                    return word
    else:
        #just select a random word from the existing ones
        return list[random.randint(0, len(list) - 1)]
def normalize(word):
    """Best-effort reduction of a word to a base form.

    Tries, in order: verb -> present tense, noun -> singular,
    derived nouns ("-er"/"-r"/"-ment"/"-ness") -> underlying verb or
    adjective, adverbs ("-ly") -> adjective, and derived adjectives
    ("-ory"/"-ive"/"-er"/"-r") -> verb or base adjective.  Returns the
    first transformation that yields a different word the ``en`` library
    recognises; otherwise the word unchanged.
    """
    ## TODO: make this function nicer (UT, shorter).
    ## all verb to present
    try:
        new_word = en.verb.present(word)
        if new_word != word and en.is_verb(new_word):
            return new_word
    except KeyError:
        # en.verb.present raises KeyError for words it cannot conjugate
        pass
    # noun -> singular
    new_word = en.noun.singular(word)
    if new_word != word and en.is_noun(new_word):
        return new_word
    if en.is_noun(word):
        # agent/result nouns built from verbs: "worker" -> "work",
        # "payment" -> "pay"
        new_word = re.sub(r'er$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        new_word = re.sub(r'r$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        new_word = re.sub(r'ment$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        # nouns built from adjectives: "kindness" -> "kind"
        # NOTE(review): this pattern is unanchored (no '$'), so it also
        # removes "ness" mid-word — confirm that is intended.
        new_word = re.sub(r'ness', '', word)
        if new_word != word and en.is_adjective(new_word):
            return new_word
    ## adv to adj
    ## TODO: is there a quick way to do this in "en" libs
    new_word = re.sub(r'ly$', '', word)
    if new_word != word and en.is_adjective(new_word):
        return new_word
    if word.endswith('ly'):
        # adverbs whose adjective ends in "e": "truly" -> "true"
        new_word = re.sub(r'ly$', '', word) + 'e'
        if new_word != word and en.is_adjective(new_word):
            return new_word
    if en.is_adjective(word):
        # adjectives built from verbs: "advisory" -> "advise",
        # "creative" -> "create", "active" -> "act"
        new_word = re.sub(r'ory$', '', word) + 'e'
        if new_word != word and en.is_verb(new_word):
            return new_word
        new_word = re.sub(r'ive$', '', word) + 'e'
        if new_word != word and en.is_verb(new_word):
            return new_word
        new_word = re.sub(r'ive$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        # comparative forms: "-er"/"-r", e.g. "larger" -> "large"
        new_word = re.sub(r'er$', '', word)
        if new_word != word and en.is_adjective(new_word):
            return new_word
        new_word = re.sub(r'r$', '', word)
        if new_word != word and en.is_adjective(new_word):
            return new_word
    # nothing applied — return the word unchanged
    return word
def get_article(word, tokens, index):
    """Return the token preceding ``word``, with 'a'/'an' corrected to
    the proper indefinite article for ``word``."""
    article_index = index - 1
    if index <= 0:
        # no preceding token; fall back to the first token
        return tokens[0]
    # only content words (noun/adjective/adverb) get article correction
    content_word = is_noun(word) or is_adjective(word) or is_adverb(word)
    if not content_word:
        return tokens[article_index]
    if tokens[article_index] in ('a', 'an'):
        # noun.article() returns e.g. "an apple"; keep the article only
        return noun.article(word).split()[0]
    return tokens[article_index]
def simplify_word(a):
    """Reduce a word to a simpler base form.

    Accepts the word as-is when it is already a recognised POS or
    connective; otherwise tries verb -> present tense, then
    noun -> singular.  Unclassifiable words are recorded in the
    module-level ``otherwordlist`` and returned unchanged.
    """
    # Already identifiable as noun/verb/adjective/adverb/connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(a) or en.is_connective(a):
        return a
    # Verb test: normalise to present tense — computed once instead of
    # twice as in the original.  NOTE: the original called en.is_verb()
    # on the result but discarded the value, so any word with a present
    # form is returned regardless.
    try:
        present = en.verb.present(a)
        return present
    except KeyError:
        # en.verb.present raises KeyError for words it cannot conjugate
        # (the original used a bare except)
        pass
    # Noun test: normalise to singular form — also computed once.
    singular = en.noun.singular(a)
    if en.is_noun(singular):
        return singular
    # Unknown word: remember it for later inspection.
    otherwordlist.append(a)
    #print a
    return a
def getcategory(self, word):
    """Return a WordNet-style POS letter for the word: 'v', 'n', 'a',
    or None when nothing matches.

    Verbs get the highest priority: any word the en library can put
    into a (truthy) present tense counts as a verb.
    """
    #Higher prirority for verb
    try:
        if en.verb.present(word):
            return "v"
    except KeyError:
        # en.verb.present raises KeyError for unknown words (the
        # original used a bare except); fall through to noun/adjective.
        pass
    #Check if it is a noun
    if en.is_noun(word):
        return "n"
    #Check if it is an adjective
    elif en.is_adjective(word):
        return "a"
    else:
        return None
def getcategory(self, word):
    """Classify a word as verb ('v'), noun ('n'), adjective ('a'),
    or None.  Verb has the highest priority."""
    #Higher prirority for verb
    try:
        if en.verb.present(word):
            return "v"
    except KeyError:
        # narrowed from a bare except: en.verb.present raises KeyError
        # when it cannot conjugate the word
        pass
    #Check if it is a noun
    if en.is_noun(word):
        return "n"
    #Check if it is an adjective
    elif en.is_adjective(word):
        return "a"
    else:
        return None
def convertVerb(srclst): dstlst = [] itemnew = "" for item in srclst: #print(item) ############################when nos lib give error #if (item.endswith("ed") or item.endswith("ing")) \ if en.is_verb(item) \ and (not en.is_noun(item)) \ and (not en.is_adjective(item)) \ and (not en.is_adverb(item)) \ and (item not in WIERDWORDS): try: itemnew = en.verb.present(item) except: print "unrecognized word:", item itemnew = item else: itemnew = item dstlst.append(itemnew) return dstlst
def simplify_word(a):
    """Reduce a word to a simpler base form.

    Tries verb -> present tense, then noun -> singular, then accepts
    the word as-is when it is any recognised POS or connective.
    Unclassifiable words are appended to the module-level
    ``otherwordlist`` and returned unchanged.
    """
    # Verb test: normalise to present tense — computed once instead of
    # twice as in the original.  NOTE: the original called en.is_verb()
    # on the result but discarded the value, so any word with a present
    # form is returned regardless.
    try:
        present = en.verb.present(a)
        return present
    except KeyError:
        # raised for words en cannot conjugate (was a bare except)
        pass
    # Noun test: normalise to singular — also computed once.
    singular = en.noun.singular(a)
    if en.is_noun(singular):
        return singular
    # Already identifiable as noun/verb/adjective/adverb/connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(a) or en.is_connective(a):
        return a
    # Unknown word: remember it for later inspection.
    otherwordlist.append(a)
    return a
def translate_x_of_assertion(brain, a):
    """Turn an 'x_of' relation assertion into an English sentence."""
    prefix = a.relation[:-3]  # drop the "_of" suffix
    prefix_article = en.noun.article(prefix)
    # prefix_article_only = prefix_article.split(" ")[0]
    verb = get_tense(a, "was", a.l, brain)
    sentence = ""
    if en.is_noun(en.noun.singular(prefix)):
        # noun relation: "X was a/the <prefix> of Y"
        if is_plural(a.l, brain):
            prefix_article = en.noun.plural(prefix)
        left = list_concepts_naturally(brain, a.l)
        right = list_words_naturally(a.r)
        sentence = left + " " + verb + " " + prefix_article + " of " + right
    elif en.is_verb(en.verb.infinitive(prefix)) and en.verb.infinitive(prefix) != "":
        # verb relation: "X <prefix> <owner> of Y"
        if hasattr(a, "owner") and len(a.owner) > 0:
            owner = list_concepts_naturally(brain, a.owner)
        else:
            owner = "everyone"
        left = list_concepts_naturally(brain, a.l)
        right = list_concepts_naturally(brain, a.r)
        sentence = left + " " + prefix + " " + owner + " of " + right
    elif en.is_adjective(prefix):
        # TODO for capable_of >> deal with action, action_object, action_recipient...
        # Similar for used_for >> when used_for is action / verbs
        left = list_concepts_naturally(brain, a.l)
        right = list_words_naturally(a.r)
        sentence = left + " " + verb + " " + prefix + " of " + right
    return add_end_marks(a, sentence)
def get_adj(tokens_tagged):
    """Collect adjective tokens ('J*' tags) from tagged sentences,
    grouped by lowercased form and ordered by descending frequency.

    Returns a list of [word, count, occurrences], where each occurrence
    is [word, tag, lowercased_word, sentence_index, token_index].
    """
    non_alpha = re.compile(r'[^a-zA-Z]')
    occurrences = []
    for sent_idx, sentence in enumerate(tokens_tagged):
        for tok_idx, (w, t) in enumerate(sentence):
            # keep alphabetic-leading tokens tagged as adjectives
            if t and t.startswith('J') and not non_alpha.match(w):
                occurrences.append([w, t, unicode(w.lower()), sent_idx, tok_idx])
    # frequency per lowercased form
    count = defaultdict(int)
    for occ in occurrences:
        count[occ[2]] += 1
    by_freq = sorted(count.items(), key=lambda pair: pair[1], reverse=True)
    adj_all = []
    for w, c in by_freq:
        # only keep forms the en library confirms as adjectives
        if not en.is_adjective(w):
            continue
        adj_all.append([w, c, [occ for occ in occurrences if occ[2] == w]])
    return adj_all
def __init__(self, w, isTop):
    """Count POS categories over the given words, updating both the
    per-instance counters and the module-level top/bottom totals."""
    #maybe add time of post, what subreddit it came from?
    self.words = w
    self.verbCount = 0
    self.nounCount = 0
    self.adjCount = 0
    self.connectiveCount = 0
    self.other = 0
    # module-level aggregate counters (original declared topVerb twice)
    global topVerb
    global topNoun
    global topAdj
    global topCon
    global topOther
    global topCount
    global botVerb
    global botNoun
    global botAdj
    global botCon
    global botOther
    global botCount
    self.count = 0
    for word in self.words:
        self.count += 1
        fixedWord = unicode(word).lower()
        if en.is_verb(fixedWord):
            if isTop:
                topVerb += 1
            else:
                botVerb += 1
            self.verbCount += 1
        elif en.is_noun(fixedWord):
            if isTop:
                topNoun += 1
            else:
                botNoun += 1
            self.nounCount += 1
        elif en.is_adjective(fixedWord):
            if isTop:
                topAdj += 1
            else:
                botAdj += 1
            self.adjCount += 1
        elif en.is_connective(fixedWord):
            if isTop:
                topCon += 1
            else:
                botCon += 1
            self.connectiveCount += 1
        else:
            if isTop:
                topOther += 1
            else:
                botOther += 1
            self.other += 1
    # fold the word total into the matching aggregate
    if isTop:
        topCount += self.count
    else:
        botCount += self.count
import re
import en

# Smoke test / demo of the `en` (NodeBox Linguistics) library:
# POS predicates, sentence tagging, and POS-pattern matching.
if __name__ == "__main__":
    print(en.is_adjective("accomplished"))
    print(en.is_noun("wizard"))
    print(en.is_verb("accomplish"))
    # tag every token of the sentence with its part of speech
    print(
        en.parser.sentence_tag(
            "The day after today, before yesterday. And in pase years, later"))
    # find "adjective noun" (JJ NN) sequences in the sentence
    en.parser.matches(
        "The day after today, before yesterday. And in pase years, later",
        "JJ NN")
def get_frequncy_dist(dir_path):
    """Build a word-frequency lexicon from all .srt subtitle files in
    ``dir_path``.

    Pipeline: count raw words -> lemmatise -> drop stop words / first
    names / Swadesh core vocabulary -> keep only words WordNet knows ->
    drop unwanted POS tags (merging simple inflections into their base
    forms) -> drop Basic-English vocabulary -> drop a personal
    known-words list.  Every discarded word is written to ``wfd.csv``
    with the reason it was dropped; the surviving words are written as
    ',lexicon,' rows and returned as a dict {word: frequency}.

    NOTE(review): paths are built with '\\\\' separators, so this
    presumably targets Windows — confirm.
    """
    files = os.listdir(dir_path)
    all_words = 0
    words_wt_freq = {}
    '''get words'''
    # Stage 1: raw counts of purely-alphabetic, punctuation-stripped words.
    for filename in files:
        if (filename.endswith('.srt')):
            file_handler = open(dir_path + '\\' + filename, 'r')
            for line in file_handler :
                for word in line.strip().split():
                    sword = word.strip(punctuation)
                    if (sword.isalpha()):
                        lword = sword.lower()
                        words_wt_freq[lword] = words_wt_freq.get(lword, 0) + 1
                        all_words += 1
            file_handler.close()
    logger.debug('# all words: ' + str (all_words - 1))
    logger.debug('# unique words: ' + str (len(words_wt_freq.keys())))
    lexical_diversity_for_freq(words_wt_freq.values())
    # Stage 2: lemmatise, merging counts of words that share a lemma.
    lemmatized_words_wt_freq = {}
    for word in words_wt_freq.keys():
        lemmatized_word = nltk.WordNetLemmatizer().lemmatize(word)
        if (word != lemmatized_word and lemmatized_word != None):
            lemmatized_words_wt_freq[lemmatized_word] = lemmatized_words_wt_freq.get(lemmatized_word, 0) + words_wt_freq.get(word)
            #print(lemmatized_word, word)
        else:
            lemmatized_words_wt_freq[word] = words_wt_freq.get(word)
    lemmatized_size = len(lemmatized_words_wt_freq.keys())
    logger.debug ('# words after lemmatized: ' + str (lemmatized_size) + " diff: " + str (len(words_wt_freq.keys()) - lemmatized_size))
    lexical_diversity_for_freq(lemmatized_words_wt_freq.values())
    words_wt_freq = {} # Save memory
    # Stage 3: drop stop words, first names and Swadesh core vocabulary.
    stopwords_en = stopwords.words('english')
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    comparative = swadesh.words('en')
    ignore_list = [] ;
    ignore_list.extend(stopwords_en)
    ignore_list.extend(male_names)
    ignore_list.extend(female_names)
    ignore_list.extend(comparative)
    filtered_words = []
    out_file = open(dir_path + '\\wfd.csv', 'w')
    out_file.write ('Word, Type, Frequency \n')
    for word in lemmatized_words_wt_freq.keys():
        if len(word) > 2 and word not in ignore_list:
            filtered_words.append(word)
        else:
            out_file.write(word + ',stop words,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering stop words: ' + str (len(filtered_words)) + " diff: " + str (len(lemmatized_words_wt_freq.keys()) - len(filtered_words)))
    ignore_list = [] #save memory
    '''wordnet has 155k'''
    # Stage 4: keep only words WordNet knows about.
    usual_words = []
    for word in filtered_words:
        if (len(wordnet.synsets(word)) != 0):
            usual_words.append(word)
        else:
            out_file.write(word + ',not in wordnet,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering unused words: ' + str (len(usual_words)) + " diff: " + str (lemmatized_size - len(usual_words)))
    filtered_words = [] # save memory
    # Stage 5: POS-tag; drop unwanted tags and merge simple inflections
    # (plural/3rd-person 's', gerund '-ing', superlative '-est') into bases.
    tag_filtered_words_wt_freq = {}
    words_wt_tags = nltk.pos_tag(usual_words)
    for (word, tag) in words_wt_tags:
        if (tag not in ['EX', 'DET', 'CNJ', 'FW', 'MD', 'NP', 'NUM', 'PRO', 'P', 'TO', 'UH', 'WH', 'WP', 'NNP', 'MOD']):
            if(en.is_adverb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('ADV,' + word)
            elif (en.is_adjective(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('ADJ,' + word)
            elif (en.is_verb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('VB,' + word)
            elif (en.is_noun(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('N,' + word)
            else:
                if (tag in ['VBZ', 'NNS']):
                    # strip trailing 's' and merge into the base form
                    if word.endswith('s'):
                        new_word = word[:-1]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                        #print (word , new_word,tag)
                elif (tag == 'VBG'):
                    # gerund: merge into the infinitive
                    new_word = en.verb.infinitive(word)
                    if new_word != None and word != new_word:
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                elif (tag == 'JJS'):
                    # superlative: strip '-est' and merge into the base
                    if word.endswith('est'):
                        new_word = word[:-3]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                else:
                    tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                    #print (word,tag)
        else:
            out_file.write(word + ',unwanted pos,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering unwanted pos:' + str (len(tag_filtered_words_wt_freq.keys())) + " diff: " + str (len(usual_words) - len(tag_filtered_words_wt_freq.keys())))
    lexical_diversity_for_freq(tag_filtered_words_wt_freq.values())
    lemmatized_words_wt_freq = {} # save memory
    usual_words = [] #save memory
    # Stage 6: drop Basic-English vocabulary.
    basic_english_vocab = en.basic.words
    non_basic_words = set(tag_filtered_words_wt_freq.keys()).difference(basic_english_vocab)
    non_basic_words_wt_freq = {}
    for non_basic_word in non_basic_words:
        non_basic_words_wt_freq[non_basic_word] = tag_filtered_words_wt_freq[non_basic_word]
    words_in_both = set(tag_filtered_words_wt_freq.keys()).intersection(basic_english_vocab)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words,' + str(tag_filtered_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering basic words: ' + str (len(non_basic_words_wt_freq.keys())) + " diff: " + str (len(tag_filtered_words_wt_freq.keys()) - len(non_basic_words_wt_freq.keys())))
    lexical_diversity_for_freq(non_basic_words_wt_freq.values())
    tag_filtered_words_wt_freq = {} #save memory
    # Stage 7: drop words from the personal basic_words.csv list.
    fh = open(os.path.join(base.app_root(), 'etc\\basic_words.csv'), 'r')
    my_words = [word.lower() for line in fh for word in line.strip().split()]
    fh.close()
    new_words = set(non_basic_words).difference(my_words)
    words_in_both = set(non_basic_words).intersection(my_words)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words.mine,' + str(non_basic_words_wt_freq.get(word)) + '\n')
    new_words_wt_freq = {}
    for new_word in new_words:
        new_words_wt_freq[new_word] = non_basic_words_wt_freq[new_word]
    logger.debug ('# words after filtering my words: ' + str (len(new_words_wt_freq.keys())) + " diff: " + str (len(non_basic_words_wt_freq.keys()) - len(new_words_wt_freq.keys())))
    lexical_diversity_for_freq(new_words_wt_freq.values())
    # Final: write the surviving lexicon sorted by (frequency, word).
    sorted_words = sorted(new_words_wt_freq.items(), key=itemgetter(1, 0))
    for (word, frequency) in sorted_words:
        out_file.write (word + ',lexicon,' + str(frequency) + '\n')
    out_file.close()
    return new_words_wt_freq
def is_major(word):
    """True for major (content) words: verbs, adjectives, adverbs,
    or modal verbs."""
    # same order as the original or-chain; short-circuits on first hit
    for test in (en.is_verb, en.is_adjective, en.is_adverb):
        if test(word):
            return True
    return word in MODAL_VERBS
def autoPlural(word):
    """Pluralise a word: adjective rules when the word is an adjective,
    noun rules for everything else."""
    pluralise = (en.plural.adjective_plural
                 if en.is_adjective(word) else en.plural.noun_plural)
    return pluralise(word)
def valid_pos(word):
    """A word passes when it has any recognised POS, or is at least
    7 characters long."""
    has_pos = (is_noun(word) or is_adjective(word) or is_verb(word)
               or is_adverb(word))
    if has_pos or len(word) >= 7:
        return True
    return False