def simplify_word(a):
    """Reduce a word to a base form.

    Tries, in order: the present-tense verb form, the singular noun form,
    and finally the word itself if it is already a recognised part of
    speech.  Returns '' when the word cannot be classified at all.
    """
    # en.verb.present raises for words it does not know; treat that as
    # "not a verb" and fall through to the noun checks.
    try:
        try_present_verb = en.verb.present(a)
        if en.is_verb(try_present_verb):
            return try_present_verb
    except Exception:  # narrowed from a bare except
        pass
    # Try treating the word as a (possibly plural) noun.
    try_singular_noun = en.noun.singular(a)
    if en.is_noun(try_singular_noun):
        return try_singular_noun
    # Already recognisable as noun/verb/adjective/adverb/connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(
            a) or en.is_connective(a):
        return a
    return ''
def simplify_word(a):
    """Reduce a word to a base form.

    Tries, in order: the present-tense verb form, the singular noun form,
    and finally the word itself if it is already a recognised part of
    speech.  Returns '' when the word cannot be classified at all.
    """
    # en.verb.present raises for words it does not know; treat that as
    # "not a verb" and fall through to the noun checks.
    try:
        try_present_verb = en.verb.present(a)
        if en.is_verb(try_present_verb):
            return try_present_verb
    except Exception:  # narrowed from a bare except
        pass
    # Try treating the word as a (possibly plural) noun.
    try_singular_noun = en.noun.singular(a)
    if en.is_noun(try_singular_noun):
        return try_singular_noun
    # Already recognisable as noun/verb/adjective/adverb/connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(a) or en.is_connective(a):
        return a
    return ''
def verse(word):
    """Creates a small rhyme for a given word.

    The rhyme is based on WordNet's description for the word. This
    description is eloquated (alliterated or antonated), incorporated.
    """
    g = en.noun.gloss(word)
    words = g.split(" ")
    for i in range(len(words)):
        w = words[i]
        w = w.replace("\"", "")
        if en.is_noun(w):
            w = eloquate(w)
        # Randomly incorporate some words for variety.
        if random(100) > 60:
            if en.is_noun(w):
                w = incorporate(w).upper()
            if en.is_verb(w):
                w = incorporate(w, VERB)
            if en.is_adjective(w):
                w = incorporate(w, ADJECTIVE)
        # Break the verse into lines of three words.
        # BUG FIX: the newline used to be appended to words[i] and then
        # immediately overwritten by "words[i] = w"; append it to w instead.
        if i > 0 and i % 3 == 0:
            w = w + "\n"
        words[i] = w
    g = " ".join(words)
    g = g.replace("type A ", "!")
    g = g.replace("group A ", "!")
    return g
def normalize(word):
    """Normalise *word* toward a base form (present verb, singular noun,
    adjective, or verb root) by trying a fixed sequence of suffix rules.

    Each candidate is accepted only if it differs from the input AND the
    "en" library recognises it as the target part of speech; otherwise
    the next rule is tried.  Returns the word unchanged when no rule fires.
    """
    ## TODO: make this function nicer (UT, shorter).
    ## all verb to present
    try:
        new_word = en.verb.present(word)
        if new_word != word and en.is_verb(new_word):
            return new_word
    except KeyError:
        # en.verb.present raises KeyError for unknown verbs.
        pass
    # Plural noun -> singular.
    new_word = en.noun.singular(word)
    if new_word != word and en.is_noun(new_word):
        return new_word
    if en.is_noun(word):
        # Agent/result nouns back to their verb: worker -> work, payment -> pay.
        new_word = re.sub(r'er$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        new_word = re.sub(r'r$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        new_word = re.sub(r'ment$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        # Quality nouns back to their adjective: darkness -> dark.
        new_word = re.sub(r'ness', '', word)
        if new_word != word and en.is_adjective(new_word):
            return new_word
    ## adv to adj
    ## TODO: is there a quick way to do this in "en" libs
    new_word = re.sub(r'ly$', '', word)
    if new_word != word and en.is_adjective(new_word):
        return new_word
    if word.endswith('ly'):
        # e.g. simply -> simple (restore the dropped 'e').
        new_word = re.sub(r'ly$', '', word) + 'e'
        if new_word != word and en.is_adjective(new_word):
            return new_word
    if en.is_adjective(word):
        # Adjective back to verb: advisory -> advise, active -> act(e).
        new_word = re.sub(r'ory$', '', word) + 'e'
        if new_word != word and en.is_verb(new_word):
            return new_word
        new_word = re.sub(r'ive$', '', word) + 'e'
        if new_word != word and en.is_verb(new_word):
            return new_word
        new_word = re.sub(r'ive$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        # Comparative back to base adjective: bigger -> big (best effort).
        new_word = re.sub(r'er$', '', word)
        if new_word != word and en.is_adjective(new_word):
            return new_word
        new_word = re.sub(r'r$', '', word)
        if new_word != word and en.is_adjective(new_word):
            return new_word
    return word
def list_concepts_naturally(brain, arr):
    """Render a list of concept identifiers as a natural-English listing
    ("x, y, and z"), resolving names via *brain* and prepending articles
    where the concept looks like a noun phrase."""
    finalString = ""
    for i in range(0, len(arr)):
        concept = str(arr[i])
        # Remove keyword specifiers, if any.
        concept = concept.lstrip("$(").rstrip(")")
        # Refer to arr[i] by actual name.
        name = brain.get_assertions_with({
            "l": [concept],
            "relation": "has_name"
        })
        if name:
            # The "r" slot may be a single string or a list of strings.
            if isinstance(name[0]["r"], basestring):
                concept = name[0]["r"].title()
            else:
                concept = name[0]["r"][0].title()
        else:
            # TODO: Retrieve the article for concept ("the", "a", "an", or "").
            # article = get_article_for(concept, brain)
            # Due to the number of rules about article omission, we're going to use "the" for now as a simple solution.
            article = "the"
            if article != "":
                article = article + " "
            # Remove possessiveness if necessary for parsing.
            main_concept = concept
            if concept.endswith("'s"):
                main_concept = concept[:-2]
            main_concept = main_concept.rstrip("?:!.,;'")
            # Determine if we should use an article, either definite or indefinite.
            if main_concept != "his":
                if en.is_noun(en.noun.singular(main_concept)):
                    concept = article + concept
                else:
                    # e.g. gingerbread house >> a gingerbread house
                    words = main_concept.split(" ")
                    allNouns = True
                    for w in words:
                        if not en.is_noun(en.noun.singular(w)):
                            allNouns = False
                    if allNouns:
                        concept = article + concept
        # Oxford-comma style joining: ", " between items, ", and " (or
        # " and " for exactly two items) before the last one.
        if (i < len(arr) - 2):
            finalString += concept + ", "
        elif (i == len(arr) - 2):
            if len(arr) == 2:
                finalString += concept + " and "
            else:
                finalString += concept + ", and "
        else:
            finalString += concept
    return finalString
def get_tense(word, pos=None):
    """Classify *word*: a verb tense name, 'plural noun', 'singular noun',
    or the string 'None' when nothing matches.  *pos* == 'Noun' forces the
    noun interpretation."""
    base = apply_tense(word, 'infinitive')
    # A non-empty, recognised infinitive means we have a verb.
    if is_verb(base) and base != '':
        return verb.tense(word)
    sg = to_singular(word)
    noun_hint = (pos == 'Noun')
    if sg != word and (is_noun(sg) or noun_hint):
        return 'plural noun'
    if is_noun(word) or noun_hint:
        return 'singular noun'
    return 'None'
def list_concepts_naturally(brain,arr):
    """Render a list of concept identifiers as a natural-English listing
    ("x, y, and z"), resolving names via *brain* and prepending articles
    where the concept looks like a noun phrase."""
    finalString=""
    for i in range(0, len(arr)):
        concept = str(arr[i])
        # Remove keyword specifiers, if any.
        concept = concept.lstrip("$(").rstrip(")")
        # Refer to arr[i] by actual name.
        name = brain.get_assertions_with({"l":[concept],"relation":"has_name"})
        if name:
            # The "r" slot may be a single string or a list of strings.
            if isinstance(name[0]["r"],basestring):
                concept = name[0]["r"].title()
            else:
                concept = name[0]["r"][0].title()
        else:
            # TODO: Retrieve the article for concept ("the", "a", "an", or "").
            # article = get_article_for(concept, brain)
            # Due to the number of rules about article omission, we're going to use "the" for now as a simple solution.
            article = "the"
            if article!="":
                article = article + " "
            # Remove possessiveness if necessary for parsing.
            main_concept = concept
            if concept.endswith("'s"):
                main_concept=concept[:-2]
            main_concept = main_concept.rstrip("?:!.,;'")
            # Determine if we should use an article, either definite or indefinite.
            if main_concept != "his":
                if en.is_noun(en.noun.singular(main_concept)):
                    concept = article + concept
                else:
                    # e.g. gingerbread house >> a gingerbread house
                    words = main_concept.split(" ")
                    allNouns = True
                    for w in words:
                        if not en.is_noun(en.noun.singular(w)):
                            allNouns = False
                    if allNouns:
                        concept = article + concept
        # Oxford-comma style joining of the items.
        if (i<len(arr)-2):
            finalString += concept + ", "
        elif (i==len(arr)-2):
            if len(arr)==2:
                finalString+= concept + " and "
            else:
                finalString += concept + ", and "
        else:
            finalString += concept
    return finalString
def get_nouns(tokens_tagged):
    """Collect NN/NNS tokens from tagged sentences, lemmatise NNS to the
    singular, count lemma frequencies, keep only WordNet nouns, and return
    a list of [lemma, frequency, occurrence-entries] sorted by frequency
    (descending)."""
    # Tokens containing non-letters are dropped; they are sometimes mis-tagged.
    non_alpha = re.compile(r'[^a-zA-Z]')
    nouns = []
    for sent_idx, sentence in enumerate(tokens_tagged):
        for tok_idx, (w, t) in enumerate(sentence):
            if t == 'NN' and not non_alpha.match(w):
                nouns.append([w, t, unicode(w.lower()), sent_idx, tok_idx])
            elif t == 'NNS' and not non_alpha.match(w):
                nouns.append([w, t, unicode(en.noun.singular(w.lower())), sent_idx, tok_idx])
    # Frequency per lemma.
    count = defaultdict(int)
    for entry in nouns:
        count[entry[2]] += 1
    # Sort by frequency, descending; then drop lemmata WordNet does not know.
    sorted_counts = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
    sorted_counts = [(n, c) for (n, c) in sorted_counts if en.is_noun(n)]
    # Attach the list of occurrences to every remaining lemma.
    nouns_all = []
    for (n, c) in sorted_counts:
        occurrences = [entry for entry in nouns if entry[2] == n]
        nouns_all.append([n, c, occurrences])
    return nouns_all
def singular_to_plural(self):
    """Return the sentence with every recognised word pluralised.

    The sentence is split on ',' so commas keep their positions; within
    each clause, words are pluralised individually according to their
    part of speech.
    """
    final_list = []
    st = self.get_sentence()
    for clause in st.split(","):
        # Drop the single space left behind after a comma.  Using
        # startswith also fixes the old IndexError on an empty clause
        # (e.g. consecutive commas), where clause[0] would raise.
        if clause.startswith(" "):
            clause = clause[1:]
        plural_words = []
        # NOTE: the original reused the outer loop variable here; renamed
        # for clarity.  Unrecognised words pass through unchanged.
        for word in clause.split(" "):
            if en.is_noun(word):
                word = en.noun.plural(word)
            elif en.is_adjective(word):
                word = en.adjective.plural(word)
            elif en.is_connective(word):
                word = self.my_inflect.plural(word)
            elif en.is_persuasive(word):
                word = en.persuasive.plural(word)
            plural_words.append(word)
        final_list.append(" ".join(plural_words))
    return ", ".join(final_list)
def find_grammatical_kind(self):
    """Label every word of the sentence with its grammatical kind.

    Returns a list with one label per word; words matching none of the
    known categories are labelled "unclear".
    """
    text = re.sub(",", "", self.get_sentence())  # commas are not words
    # Ordered predicate table: the first match wins, mirroring the
    # original if/elif chain.
    checks = (
        (en.noun.is_emotion, "emotion"),
        (en.is_connective, "connective"),
        (en.is_verb, "verb"),
        (en.is_adjective, "adjective"),
        (en.is_noun, "noun"),
        (en.is_persuasive, "persuasive"),
        (en.is_number, "number"),
    )
    result = []
    for token in text.split(" "):
        for predicate, label in checks:
            if predicate(token):
                result.append(label)
                break
        else:
            result.append("unclear")
    return result
def simplify_word(a):
    """Return *a* simplified toward a base form; words that cannot be
    classified are recorded in the module-level otherwordlist and
    returned unchanged."""
    # A word already recognised as some part of speech passes through.
    recognised = (en.is_noun(a) or en.is_verb(a) or en.is_adjective(a)
                  or en.is_adverb(a) or en.is_connective(a))
    if recognised:
        return a
    # Try the present-tense verb form; en.verb.present raises for
    # words it does not know (bare except kept to preserve behaviour).
    try:
        present = en.verb.present(a)
        en.is_verb(present)
        return present
    except:
        pass
    # Then try the singular noun form.
    singular = en.noun.singular(a)
    if en.is_noun(singular):
        return singular
    # Nothing matched: remember the word for later inspection.
    otherwordlist.append(a)
    return a
def translate_x_of_assertion(brain, a):
    """Translate an "<x>_of" assertion *a* into an English sentence.

    The relation prefix (relation name minus the trailing "_of") decides
    the sentence shape: noun prefix -> "L was a/the X of R", verb prefix
    -> "L Xed OWNER of R", adjective prefix -> "L was X of R".
    """
    prefix = a.relation[:-3]
    prefix_article = en.noun.article(prefix)
    # prefix_article_only = prefix_article.split(" ")[0]
    verb = get_tense(a, "was", a.l, brain)
    toReturn = ""
    if en.is_noun(en.noun.singular(prefix)):
        # Plural left-hand side takes the plural of the prefix noun.
        if is_plural(a.l, brain):
            prefix_article = en.noun.plural(prefix)
        toReturn = list_concepts_naturally(
            brain, a.l
        ) + " " + verb + " " + prefix_article + " of " + list_words_naturally(
            a.r)
    elif en.is_verb(
            en.verb.infinitive(prefix)) and en.verb.infinitive(prefix) != "":
        # Verb prefix: include an owner ("everyone" when none is given).
        if hasattr(a, "owner") and len(a.owner) > 0:
            owner = list_concepts_naturally(brain, a.owner)
        else:
            owner = "everyone"
        toReturn = list_concepts_naturally(
            brain, a.l
        ) + " " + prefix + " " + owner + " of " + list_concepts_naturally(
            brain, a.r)
    elif en.is_adjective(prefix):
        # TODO for capable_of >> deal with action, action_object, action_recipient...
        # Similar for used_for >> when used_for is action / verbs
        toReturn = list_concepts_naturally(
            brain, a.l) + " " + verb + " " + prefix + " of " + list_words_naturally(
            a.r)
    toReturn = add_end_marks(a, toReturn)
    return toReturn
def dict_ingest(path_to_dict): noun = [] verb = [] adjective = [] adverb = [] miscel = [] f = open(path_to_dict, 'r') for l in f: word = l.strip() if en.is_noun(word): noun.append(word) elif en.is_verb(word): verb.append(word) elif en.is_adjective(word): adjective.append(word) elif en.is_adverb(word): adverb.append(word) else: miscel.append(word) print noun[:5] print verb[:5] print adjective[:5] print adverb[:5] print miscel[:5] return noun, verb, adjective, adverb, miscel
def dict_ingest(path_to_dict): noun = [] verb = [] adjective = [] adverb = [] miscel = [] f = open(path_to_dict,'r') for l in f: word = l.strip() if en.is_noun(word): noun.append(word) elif en.is_verb(word): verb.append(word) elif en.is_adjective(word): adjective.append(word) elif en.is_adverb(word): adverb.append(word) else: miscel.append(word) print noun[:5] print verb[:5] print adjective[:5] print adverb[:5] print miscel[:5] return noun, verb, adjective, adverb, miscel
def is_a_expression(self, word):
    """True when *word* is a hashtag, a negation, a content word
    (noun/adjective/verb/adverb) or an orality marker."""
    # Ordered predicate tuple mirrors the original or-chain, including
    # its short-circuit evaluation order.
    predicates = (
        self.is_a_hash_tag,
        self.is_negation,
        en.is_noun,
        en.is_adjective,
        en.is_verb,
        en.is_adverb,
        self.is_orality,
    )
    return any(predicate(word) for predicate in predicates)
def stem(word):
    """Reduce *word* to a verb infinitive or, failing that, a singular
    noun; return it unchanged when neither form is recognised."""
    infinitive = en.verb.infinitive(word)
    if len(infinitive) != 0 and en.is_verb(infinitive):
        return infinitive
    singular = en.noun.singular(word)
    if len(singular) != 0 and en.is_noun(singular):
        return singular
    return word
def is_clickable(self, node):
    """
    Every node that is a noun is clickable (except the root).
    """
    # BUG FIX: "and" binds tighter than "or", so the original expression
    # made noun nodes clickable even when they were the root, contradicting
    # the docstring.  Parenthesise so the root is never clickable.
    if (en.is_noun(str(node.id.lower())) or self.is_expandable(node.id)) \
            and node != self.root:
        return True
    else:
        return False
def _compose_vector(word, model):
    """Return the embedding for *word*, or None when unavailable.

    Out-of-vocabulary multi-part words (joined by '_') are composed from
    their in-vocabulary noun parts: the last two parts weighted 0.7/0.3,
    or the single available part as-is.
    """
    if word in model.dictionary:
        return model.word_vectors[model.dictionary[word]]
    ws = [
        w for w in word.split(u'_')
        if w in model.dictionary and en.is_noun(w)
    ]
    if not ws:
        return None
    if len(ws) > 1:
        # Simple composition by a weighted sum of the last two parts.
        return (model.word_vectors[model.dictionary[ws[-1]]] * 0.7
                + model.word_vectors[model.dictionary[ws[-2]]] * 0.3)
    return model.word_vectors[model.dictionary[ws[0]]]


def cosine_similarity(word1, word2, model):
    """Cosine similarity between two (possibly multi-part) words.

    Returns 0 when either word has no usable vector, and 0.0 when the
    distance computation itself fails.  The per-word lookup logic was
    duplicated verbatim in the original; it is factored into
    _compose_vector above.
    """
    v1 = _compose_vector(word1, model)
    if v1 is None:
        return 0
    v2 = _compose_vector(word2, model)
    if v2 is None:
        return 0
    try:
        return 1 - cosine(v1, v2)
    except Exception:
        # e.g. zero vectors; treat as no similarity.
        return 0.0
def nouns(list):
    """Parses nouns from a list of words. """
    stripped = (word.strip() for word in list)
    return [w for w in stripped if en.is_noun(w)]
def get_gloss(word):
    """Return the gloss (dictionary definition) for *word* from the most
    specific matching part-of-speech module, falling back to the generic
    WordNet gloss."""
    # Checked in the original priority order: verb, adjective, adverb, noun.
    dispatch = (
        (en.is_verb, en.verb),
        (en.is_adjective, en.adjective),
        (en.is_adverb, en.adverb),
        (en.is_noun, en.noun),
    )
    for matches, module in dispatch:
        if matches(word):
            return module.gloss(word)
    return en.wordnet.gloss(word)
def simplify_word(a):
    """Simplify *a* to a present-tense verb or singular noun; otherwise
    return it unchanged, recording unclassified words in otherwordlist."""
    # Verb first: en.verb.present raises for words it does not know
    # (bare except kept to preserve behaviour).
    try:
        present = en.verb.present(a)
        en.is_verb(present)
        return present
    except:
        pass
    # Then the singular noun form.
    singular = en.noun.singular(a)
    if en.is_noun(singular):
        return singular
    # Already a recognised part of speech: keep as-is.
    if (en.is_noun(a) or en.is_verb(a) or en.is_adjective(a)
            or en.is_adverb(a) or en.is_connective(a)):
        return a
    # Unknown word: remember it for later inspection.
    otherwordlist.append(a)
    return a
def webpages(self, q):
    """Find webpages matching the query *q*, expanded with synonyms.

    Each non-ignored keyword is expanded via WordNet synsets plus simple
    inflections (noun plural, verb tenses); all combinations of one
    meaning per keyword are generated, and the union of webpage sets for
    every full-length combination is returned.
    """
    new_mean = []
    # NOTE(review): shadows the builtin "dict"; maps keyword -> current meaning.
    dict = {}
    webpages = []
    temp = []
    keywords = q.split(' ')
    keywords = filter(None, keywords)
    length = len(keywords)
    global ignorewords
    for i in range(0, len(keywords)):
        if keywords[i] not in ignorewords:
            syn = wn.synsets(keywords[i])
            dict[keywords[i]] = ''
            # First lemma name of every synset, lower-cased.
            meanings = [s.lemmas[0].name for s in syn]
            meanings = [str(item).lower() for item in meanings]
            # Add inflected variants of the keyword itself.
            if en.is_noun(keywords[i]):
                meanings.append(en.noun.plural(keywords[i]))
            if en.is_verb(keywords[i]):
                meanings.append(en.verb.past(keywords[i]))
                meanings.append(en.verb.past_participle(keywords[i]))
                meanings.append(en.verb.present(keywords[i]))
            # Keep only meanings that are not (near-)identical to the keyword.
            for item in meanings:
                if self.similar(item, keywords[i]) != 1:
                    new_mean.append(item)
            if new_mean == []:
                # No alternative meanings: keyword stands for itself.
                dict[keywords[i]] = ''
                poss = [(k, v) if v else (k, ) for k, v in dict.items()]
                temp.append(list(product(*poss)))
            else:
                # One combination batch per distinct alternative meaning.
                new_mean = list(set(new_mean))
                for j in range(0, len(new_mean)):
                    dict[keywords[i]] = new_mean[j]
                    poss = [(k, v) if v else (k, ) for k, v in dict.items()]
                    temp.append(list(product(*poss)))
                new_mean = []
        else:
            # Ignored keywords shrink the expected combination length.
            length = length - 1
    # Flatten and keep only combinations covering every kept keyword.
    temp = [item for sublist in temp for item in sublist]
    for item in temp:
        if len(item) == length:
            new_mean.append(item)
    new_mean = list(set(new_mean))
    # Union of the webpage sets for every combination.
    for i in range(0, len(new_mean)):
        if i == 0:
            webpages = self.set_of_webpages(new_mean[i])
            #print i,' ',webpages,new_mean[i]
        else:
            temp = self.set_of_webpages(new_mean[i])
            #print i,' ',temp,new_mean[i]
            webpages = list(set(webpages) | set(temp))
    return webpages
def generate_word(list, pos):
    """Return a word with part of speech *pos*: with some probability a new
    word derived from a random existing word's WordNet neighbourhood (and
    appended to *list*), otherwise a random existing word from *list*."""
    #% chance to generate new word
    if random.random() < percentage_chance:
        #repeat until word = pos
        while True:
            #get all synsets of random word in list
            synsets = wn.synsets(list[random.randint(0, len(list) - 1)], pos=pos)
            #get random synset
            synset = synsets[random.randint(0, len(synsets) - 1)]
            # Sometimes hop to a random hypernym or hyponym for variety.
            ran = random.randint(0, 3)
            if ran == 0 and synset.hypernyms():
                synset = synset.hypernyms()[random.randint(0, len(synset.hypernyms()) - 1)]
            elif ran == 1 and synset.hyponyms():
                synset = synset.hyponyms()[random.randint(0, len(synset.hyponyms()) - 1)]
            #get random name from synset that does not contain an _ or - (these make the lib go insane)
            #words = the names of the synset
            words = synset.lemma_names()
            #this loop is to make sure an infinite loop does not occur
            #where you are picking from all invalid choices
            while len(words) > 0:
                word = words[random.randint(0, len(words) - 1)]
                if "_" not in word and "-" not in word:
                    break
                else:
                    words.remove(word)
                    continue
            #if words doesn't have words in it, pick a new word from beginning
            if (len(words) == 0):
                continue
            # The "en" library must agree with WordNet about the POS.
            if ((pos == wn.NOUN and en.is_noun(word))
                    or (pos == wn.VERB and en.is_verb(word))
                    or (pos == wn.ADJ and en.is_adjective(word))):
                #fix word based on pos
                #if verb, make sure the verb has a conjugation,
                #if it does, or is not a verb, the word gets appended to the word array,
                #and a word is returned
                if pos == wn.VERB:
                    try:
                        en.verb.present(word, person=3, negate=False)
                    except KeyError:
                        continue
                    else:
                        if word not in list:
                            list.append(word)
                        return word
                else:
                    if word not in list:
                        list.append(word)
                    return word
    else:
        #just select a random word from the existing ones
        return list[random.randint(0, len(list) - 1)]
def __init__(self, tableName):
    """Record a table's name, its word type (noun or verb) and the name
    variants it maps to for matching."""
    self.name = tableName
    tableName = tableName.lower()
    if en.is_noun(tableName):
        self.wordType = "noun"
        self.mapsTo = [
            en.noun.plural(tableName),
            en.noun.singular(tableName),
        ]
    else:
        # Anything not recognised as a noun is treated as a verb and
        # mapped to its main conjugations.
        self.wordType = "verb"
        self.mapsTo = [
            en.verb.infinitive(tableName),
            en.verb.present(tableName, person=3, negate=False),
            en.verb.past(tableName),
            en.verb.present_participle(tableName),
        ]
    self.columns = []
    self.expose = True
def correct_form(ans, word):
    """Inflect *ans* so its form matches *word*: verbs are conjugated to
    the same tense, nouns are singularised; otherwise *ans* is returned
    unchanged."""
    # Multi-word answers are left untouched.
    if "_" in ans:
        return ans
    if en.verb.infinitive(word):
        # *word* is a verb: conjugate the answer when it is one too.
        if not en.verb.infinitive(ans):
            return ans
        return en.verb.conjugate(ans, en.verb.tense(word))
    if en.is_noun(word):
        return en.noun.singular(ans)
    return ans
def get_article(word, tokens, index):
    """Return the token preceding *word* in *tokens*, substituting the
    proper indefinite article ('a'/'an') when needed."""
    if index <= 0:
        return tokens[0]
    prev = tokens[index - 1]
    # Only nouns, adjectives and adverbs get their article corrected.
    if not (is_noun(word) or is_adjective(word) or is_adverb(word)):
        return prev
    if prev == 'a' or prev == 'an':
        # First token of noun.article(word) is the correct article.
        return noun.article(word).split()[0]
    return prev
def getcategory(self, word):
    """Return a POS tag for *word*: 'v' (verb), 'n' (noun), 'a'
    (adjective), or None when nothing matches."""
    # Higher priority for verb; en.verb.present raises for unknown words.
    try:
        if en.verb.present(word):
            return "v"
    except Exception:  # narrowed from a bare except
        pass
    # Check if it is a noun
    if en.is_noun(word):
        return "n"
    # Check if it is an adjective
    elif en.is_adjective(word):
        return "a"
    else:
        return None
def getcategory(self, word):
    """Return a POS tag for *word*: 'v' (verb), 'n' (noun), 'a'
    (adjective), or None when nothing matches."""
    # Higher priority for verb; en.verb.present raises for unknown words.
    try:
        if en.verb.present(word):
            return "v"
    except Exception:  # narrowed from a bare except
        pass
    # Check if it is a noun
    if en.is_noun(word):
        return "n"
    # Check if it is an adjective
    elif en.is_adjective(word):
        return "a"
    else:
        return None
def convertVerb(srclst): dstlst = [] itemnew = "" for item in srclst: #print(item) ############################when nos lib give error #if (item.endswith("ed") or item.endswith("ing")) \ if en.is_verb(item) \ and (not en.is_noun(item)) \ and (not en.is_adjective(item)) \ and (not en.is_adverb(item)) \ and (item not in WIERDWORDS): try: itemnew = en.verb.present(item) except: print "unrecognized word:", item itemnew = item else: itemnew = item dstlst.append(itemnew) return dstlst
def _lazy_singularize(self, str):
    """
    Attempts to singularize the given string.

    Does some straightforward suffix inflections and accepts a candidate
    only when the en library pluralises it back to the given string and
    recognises it as a noun.
    """
    inflections = [
        ("ves", "f"),
        ("ies", "y"),
        ("es", ""),
        ("s", "")
    ]
    for pl, sg in inflections:
        # BUG FIX: the original used str.strip(pl), which removes a SET of
        # characters from BOTH ends (e.g. "sales".strip("s") -> "ale"),
        # not the suffix.  Remove the suffix explicitly instead.
        if not str.endswith(pl):
            continue
        singular = str[:-len(pl)] + sg
        if str == en.noun.plural(singular) \
                and en.is_noun(singular):
            return singular
    return str
def add_forms(self): forms = [] for w in self.phrases: if en.is_verb(w.name): try: vb = en.verb.infinitive(w.name) vbd = en.verb.past(w.name) vbp1 = en.verb.present(w.name, person=1) vbp2 = en.verb.present(w.name, person=2) vbz = en.verb.present(w.name, person=3) vbg = en.verb.present_participle(w.name) forms.append(Word(vb, "VB")) forms.append(Word(vbd, "VBD")) forms.append(Word(vbp1, "VBP")) forms.append(Word(vbz, "VBZ")) forms.append(Word(vbg, "VBG")) except: print "Error in conjugation for verb:" + w.name elif en.is_noun(w.name): nns = en.noun.plural(w.name) forms.append(Word(nns, "NNS")) return forms
def translate_x_of_assertion(brain,a):
    """Translate an "<x>_of" assertion *a* into an English sentence.

    The relation prefix (relation name minus the trailing "_of") decides
    the sentence shape: noun prefix -> "L was a/the X of R", verb prefix
    -> "L Xed OWNER of R", adjective prefix -> "L was X of R".
    """
    prefix = a.relation[:-3]
    prefix_article = en.noun.article(prefix)
    # prefix_article_only = prefix_article.split(" ")[0]
    verb = get_tense(a, "was", a.l, brain)
    toReturn = ""
    if en.is_noun(en.noun.singular(prefix)):
        # Plural left-hand side takes the plural of the prefix noun.
        if is_plural(a.l, brain):
            prefix_article = en.noun.plural(prefix)
        toReturn = list_concepts_naturally(brain,a.l) + " "+verb+" " + prefix_article + " of " + list_words_naturally(a.r)
    elif en.is_verb(en.verb.infinitive(prefix)) and en.verb.infinitive(prefix) !="":
        # Verb prefix: include an owner ("everyone" when none is given).
        if hasattr(a,"owner") and len(a.owner)>0:
            owner = list_concepts_naturally(brain, a.owner)
        else:
            owner = "everyone"
        toReturn = list_concepts_naturally(brain, a.l) + " "+prefix +" "+owner+ " of " + list_concepts_naturally(brain, a.r)
    elif en.is_adjective(prefix):
        # TODO for capable_of >> deal with action, action_object, action_recipient...
        # Similar for used_for >> when used_for is action / verbs
        toReturn = list_concepts_naturally(brain,a.l) + " "+verb+" " + prefix + " of " + list_words_naturally(a.r)
    toReturn = add_end_marks(a, toReturn)
    return toReturn
def add_forms(self): forms = [] for w in self.phrases: if en.is_verb(w.name): try: vb = en.verb.infinitive(w.name) vbd = en.verb.past(w.name) vbp1 = en.verb.present(w.name, person = 1) vbp2 = en.verb.present(w.name, person = 2) vbz = en.verb.present(w.name, person = 3) vbg = en.verb.present_participle(w.name) forms.append(Word(vb,"VB")) forms.append(Word(vbd,"VBD")) forms.append(Word(vbp1,"VBP")) forms.append(Word(vbz,"VBZ")) forms.append(Word(vbg,"VBG")) except: print "Error in conjugation for verb:" + w.name elif en.is_noun(w.name): nns = en.noun.plural(w.name) forms.append(Word(nns, "NNS")) return forms
def resolvePlural(self, sentence):
    """Resolve '<PLURAL>' tags in *sentence* (a list of tagged tokens),
    mutating it in place.

    Tagged tokens are replaced by their plural with the tag stripped.
    After a quantifier ('few', 'many', 'several', or a digit string other
    than '1'), the nearest following not-yet-plural noun is pluralised too.
    """
    for i in range(len(sentence)):
        word = sentence[i]
        if '<PLURAL>' in word:
            # Pluralise the text before the first '<' and re-attach the
            # remaining tags (minus the <PLURAL> marker).
            pluralized = en.noun.plural(word[:word.find('<')])
            word = word.replace(word[:word.find('<')], "")  # Remove word
            remainingTags = word.replace('<PLURAL>', "")  # Remove plural tag
            sentence[i] = pluralized + remainingTags
        # Convert nearest noun to "few", "many", "several" or number to plural
        if word in ["few", "many", "several"] or (word.isdigit() and word != '1'):
            j = i + 1
            while j < len(sentence):
                next = sentence[j]
                nextSplit = next.split('<')
                next_word = nextSplit[0]
                if ('<NOUN>' in next or en.is_noun(next_word)) and '<PLURAL>' not in next:
                    pluralized = en.noun.plural(next_word)
                    tag = ""
                    if (len(nextSplit) == 2):
                        tag = '<' + nextSplit[1]
                    sentence[j] = pluralized + tag
                    break
                j += 1
def _parse(chunk_):
    """Extract the head noun of a chunk, singularising it when it follows
    "the" and the singular is a known noun.

    e.g. "as deep as the oceans" -> "ocean".
    """
    noun = clean(chunk_[-1][0])
    # Guard len(chunk_) > 1: chunk_[-2] used to raise IndexError on a
    # single-token chunk.
    if len(chunk_) > 1 and chunk_[-2][0] == "the" \
            and en.is_noun(en.noun.singular(noun)):
        return en.noun.singular(noun)
    return noun
def get_frequncy_dist(dir_path):
    """Build a frequency distribution of "interesting" words from the .srt
    subtitle files in *dir_path*.

    Pipeline: tokenise -> lemmatise -> drop stopwords/names -> keep only
    WordNet words -> drop unwanted POS tags -> drop basic-English words ->
    drop words from a personal list.  Rejected words are logged with their
    reason to wfd.csv in *dir_path*; the surviving {word: freq} dict is
    returned.  Windows-style '\\\\' path separators are assumed.
    """
    files = os.listdir(dir_path)
    all_words = 0
    words_wt_freq = {}
    '''get words'''
    for filename in files:
        if (filename.endswith('.srt')):
            file_handler = open(dir_path + '\\' + filename, 'r')
            for line in file_handler:
                for word in line.strip().split():
                    sword = word.strip(punctuation)
                    if (sword.isalpha()):
                        lword = sword.lower()
                        words_wt_freq[lword] = words_wt_freq.get(lword, 0) + 1
                        all_words += 1
            file_handler.close()
    logger.debug('# all words: ' + str(all_words - 1))
    logger.debug('# unique words: ' + str(len(words_wt_freq.keys())))
    lexical_diversity_for_freq(words_wt_freq.values())
    # Merge counts of words that lemmatise to the same lemma.
    lemmatized_words_wt_freq = {}
    for word in words_wt_freq.keys():
        lemmatized_word = nltk.WordNetLemmatizer().lemmatize(word)
        if (word != lemmatized_word and lemmatized_word != None):
            lemmatized_words_wt_freq[lemmatized_word] = lemmatized_words_wt_freq.get(lemmatized_word, 0) + words_wt_freq.get(word)
            #print(lemmatized_word, word)
        else:
            lemmatized_words_wt_freq[word] = words_wt_freq.get(word)
    lemmatized_size = len(lemmatized_words_wt_freq.keys())
    logger.debug('# words after lemmatized: ' + str(lemmatized_size) + " diff: " + str(len(words_wt_freq.keys()) - lemmatized_size))
    lexical_diversity_for_freq(lemmatized_words_wt_freq.values())
    words_wt_freq = {}  # Save memory
    # Words to ignore: stopwords, first names, Swadesh core vocabulary.
    stopwords_en = stopwords.words('english')
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    comparative = swadesh.words('en')
    ignore_list = []
    ignore_list.extend(stopwords_en)
    ignore_list.extend(male_names)
    ignore_list.extend(female_names)
    ignore_list.extend(comparative)
    filtered_words = []
    out_file = open(dir_path + '\\wfd.csv', 'w')
    out_file.write('Word, Type, Frequency \n')
    for word in lemmatized_words_wt_freq.keys():
        if len(word) > 2 and word not in ignore_list:
            filtered_words.append(word)
        else:
            out_file.write(word + ',stop words,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering stop words: ' + str(len(filtered_words)) + " diff: " + str(len(lemmatized_words_wt_freq.keys()) - len(filtered_words)))
    ignore_list = []  # save memory
    '''wordnet has 155k'''
    # Keep only words WordNet knows at all.
    usual_words = []
    for word in filtered_words:
        if (len(wordnet.synsets(word)) != 0):
            usual_words.append(word)
        else:
            out_file.write(word + ',not in wordnet,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering unused words: ' + str(len(usual_words)) + " diff: " + str(lemmatized_size - len(usual_words)))
    filtered_words = []  # save memory
    # POS-based filtering: keep content words, fold inflected forms into
    # their base form where a simple rule applies.
    tag_filtered_words_wt_freq = {}
    words_wt_tags = nltk.pos_tag(usual_words)
    for (word, tag) in words_wt_tags:
        if (tag not in ['EX', 'DET', 'CNJ', 'FW', 'MD', 'NP', 'NUM', 'PRO', 'P', 'TO', 'UH', 'WH', 'WP', 'NNP', 'MOD']):
            if (en.is_adverb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('ADV,' + word)
            elif (en.is_adjective(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('ADJ,' + word)
            elif (en.is_verb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('VB,' + word)
            elif (en.is_noun(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('N,' + word)
            else:
                # Unknown to "en": fold simple inflections into a base form.
                if (tag in ['VBZ', 'NNS']):
                    if word.endswith('s'):
                        new_word = word[:-1]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                        #print (word , new_word,tag)
                elif (tag == 'VBG'):
                    new_word = en.verb.infinitive(word)
                    if new_word != None and word != new_word:
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                elif (tag == 'JJS'):
                    if word.endswith('est'):
                        new_word = word[:-3]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                else:
                    tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                    #print (word,tag)
        else:
            out_file.write(word + ',unwanted pos,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering unwanted pos:' + str(len(tag_filtered_words_wt_freq.keys())) + " diff: " + str(len(usual_words) - len(tag_filtered_words_wt_freq.keys())))
    lexical_diversity_for_freq(tag_filtered_words_wt_freq.values())
    lemmatized_words_wt_freq = {}  # save memory
    usual_words = []  # save memory
    # Drop words that belong to the basic-English vocabulary.
    basic_english_vocab = en.basic.words
    non_basic_words = set(tag_filtered_words_wt_freq.keys()).difference(basic_english_vocab)
    non_basic_words_wt_freq = {}
    for non_basic_word in non_basic_words:
        non_basic_words_wt_freq[non_basic_word] = tag_filtered_words_wt_freq[non_basic_word]
    words_in_both = set(tag_filtered_words_wt_freq.keys()).intersection(basic_english_vocab)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words,' + str(tag_filtered_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering basic words: ' + str(len(non_basic_words_wt_freq.keys())) + " diff: " + str(len(tag_filtered_words_wt_freq.keys()) - len(non_basic_words_wt_freq.keys())))
    lexical_diversity_for_freq(non_basic_words_wt_freq.values())
    tag_filtered_words_wt_freq = {}  # save memory
    # Finally, drop words from the user's personal basic_words.csv list.
    fh = open(os.path.join(base.app_root(), 'etc\\basic_words.csv'), 'r')
    my_words = [word.lower() for line in fh for word in line.strip().split()]
    fh.close()
    new_words = set(non_basic_words).difference(my_words)
    words_in_both = set(non_basic_words).intersection(my_words)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words.mine,' + str(non_basic_words_wt_freq.get(word)) + '\n')
    new_words_wt_freq = {}
    for new_word in new_words:
        new_words_wt_freq[new_word] = non_basic_words_wt_freq[new_word]
    logger.debug('# words after filtering my words: ' + str(len(new_words_wt_freq.keys())) + " diff: " + str(len(non_basic_words_wt_freq.keys()) - len(new_words_wt_freq.keys())))
    lexical_diversity_for_freq(new_words_wt_freq.values())
    # Emit the surviving lexicon sorted by (frequency, word).
    sorted_words = sorted(new_words_wt_freq.items(), key=itemgetter(1, 0))
    for (word, frequency) in sorted_words:
        out_file.write(word + ',lexicon,' +
                       str(frequency) + '\n')
    out_file.close()
    return new_words_wt_freq
import re
import en

if __name__ == "__main__":
    # Quick smoke test of the "en" linguistics library: POS predicates
    # and the sentence tagger/matcher.
    print(en.is_adjective("accomplished"))
    print(en.is_noun("wizard"))
    print(en.is_verb("accomplish"))
    # Tag every token of the sentence with its part of speech.
    print(
        en.parser.sentence_tag(
            "The day after today, before yesterday. And in pase years, later"))
    # Find adjective-noun ("JJ NN") patterns in the same sentence.
    en.parser.matches(
        "The day after today, before yesterday. And in pase years, later",
        "JJ NN")
import en # This file just runs some tests to see if en is working. # To run it, cd to the directory just above en, then # python2 < _en-test.py # LEXICAL CATEGORIZATION ############################################################ # Returns True when the given value is a number. print(1, en.is_number(12)) print(2, en.is_number("twelve")) # Returns True when the given string is a noun. # You can also check for is_verb(), is_adjective() and is_adverb(). print(3, en.is_noun("banana")) # Returns True when the given string is a tag, # for example HTML or XML. print(4, en.is_tag("</a>")) # Return True when the string is a HTML tag, # for example <a> or <body>. print(5, en.is_html_tag("</person>")) # COMMONSENSE ####################################################################### # Returns True if the given word expresses a basic emotion: # anger, disgust, fear, joy, sadness, surprise. print(6, en.is_basic_emotion("cheerful"))
def pluralize(term):
    """Add the plural of *term* to its entry in the module-level
    `variations` map when *term* is a noun."""
    if en.is_noun(term):
        pterm = en.noun.plural(term)
        # BUG FIX: "is not" compared object identity, so a distinct-but-equal
        # string object was still added; compare by value instead.
        if pterm != term:
            variations[term].add(pterm)
def valid_pos(word):
    """True when *word* has a recognised part of speech, or is at least
    7 characters long."""
    has_pos = (is_noun(word) or is_verb(word)
               or is_adjective(word) or is_adverb(word))
    return has_pos or len(word) >= 7
def singularize(term):
    """Add the singular of *term* to its entry in the module-level
    `variations` map when *term* is a noun."""
    if en.is_noun(term):
        sterm = en.noun.singular(term)
        # BUG FIX: "is not" compared object identity, so a distinct-but-equal
        # string object was still added; compare by value instead.
        if sterm != term:
            variations[term].add(sterm)
def pluralize(term):
    """Add the plural of *term* to its entry in the module-level
    `variations` map when *term* is a noun."""
    if en.is_noun(term):
        pterm = en.noun.plural(term)
        # BUG FIX: "is not" compared object identity, so a distinct-but-equal
        # string object was still added; compare by value instead.
        if pterm != term:
            variations[term].add(pterm)
def toverb(term):
    """Add the verb lemma of *term* to its entry in the module-level
    `variations` map when *term* is a noun."""
    if en.is_noun(term):
        vterm = lmtzr.lemmatize(term, 'v')
        # BUG FIX: "is not" compared object identity, so a distinct-but-equal
        # string object was still added; compare by value instead.
        if vterm != term:
            variations[term].add(vterm)
def singularize(term):
    """Add the singular of *term* to its entry in the module-level
    `variations` map when *term* is a noun."""
    if en.is_noun(term):
        sterm = en.noun.singular(term)
        # BUG FIX: "is not" compared object identity, so a distinct-but-equal
        # string object was still added; compare by value instead.
        if sterm != term:
            variations[term].add(sterm)
import en

# Smoke test of assorted "en" library features (Python 2 print statements).
print en.is_basic_emotion("anxious")
print en.is_persuasive("money")
print en.noun.is_emotion("anger")
print en.adjective.is_emotion("anxious", boolean=False)
# NOTE(review): "comptuer" looks intentionally misspelled to exercise the
# spelling checker below -- confirm before "fixing" it.
print en.is_noun("comptuer")
print en.spelling.suggest("computer")[0]
print en.verb.is_emotion("love", boolean=False)
print en.verb.infinitive("announced")
print en.verb.infinitive("dont")
print en.is_verb("went")
a = en.verb.infinitive("dont")
print en.verb.is_emotion(a, boolean=False)
print en.is_noun("people")
print en.is_noun(en.noun.singular("adore"))
# lexname: the WordNet lexicographer category of a word.
print en.noun.lexname("book")
print en.noun.lexname("music")
print en.noun.lexname("water")
print en.noun.lexname("fear")
print en.noun.lexname("love")
print en.noun.lexname("like")
print en.noun.lexname("hate")
print en.noun.lexname("overcome")
print en.adverb.lexname("actually")
def toverb(term):
    """Add the verb lemma of *term* to its entry in the module-level
    `variations` map when *term* is a noun."""
    if en.is_noun(term):
        vterm = lmtzr.lemmatize(term, 'v')
        # BUG FIX: "is not" compared object identity, so a distinct-but-equal
        # string object was still added; compare by value instead.
        if vterm != term:
            variations[term].add(vterm)
def __init__(self, w, isTop):
    """Count the parts of speech in the word list *w*, storing per-instance
    counts and accumulating into the module-level top/bot totals depending
    on *isTop*."""
    #maybe add time of post, what subreddit it came from?
    self.words = w
    # Per-instance counters.
    self.verbCount = 0;
    self.nounCount = 0;
    self.adjCount = 0;
    self.connectiveCount = 0;
    self.other = 0
    # Module-level aggregate counters (top vs bottom posts).
    global topVerb
    # NOTE(review): duplicate declaration below is harmless but redundant.
    global topVerb
    global topNoun
    global topAdj
    global topCon
    global topOther
    global topCount
    global botVerb
    global botNoun
    global botAdj
    global botCon
    global botOther
    global botCount
    self.count = 0
    for word in self.words:
        self.count += 1
        fixedWord = unicode(word).lower()
        # First matching category wins: verb, noun, adjective, connective.
        if en.is_verb(fixedWord):
            if(isTop):
                topVerb += 1
            else:
                botVerb += 1
            self.verbCount += 1
        elif en.is_noun(fixedWord):
            if(isTop):
                topNoun += 1
            else:
                botNoun += 1
            self.nounCount += 1
        elif en.is_adjective(fixedWord):
            if(isTop):
                topAdj += 1
            else:
                botAdj += 1
            self.adjCount += 1
        elif en.is_connective(fixedWord):
            if(isTop):
                topCon += 1
            else:
                botCon += 1
            self.connectiveCount += 1
        else:
            if(isTop):
                topOther += 1
            else:
                botOther += 1
            self.other += 1
    # Total word count feeds the global tally for the relevant bucket.
    if isTop:
        topCount += self.count
    else:
        botCount += self.count