def compare_pos(file_name_1, file_name_2): tokens_1 = make_tokens(file_name_1) tokens_2 = make_tokens(file_name_2) tri_tokens_1 = trigrams(tokens_1) tri_tokens_2 = trigrams(tokens_2) dist_1 = nltk.FreqDist(tri_tokens_1) dist_2 = nltk.FreqDist(tri_tokens_2) diff_1 = dist_1 - dist_2 diff_2 = dist_2 - dist_1 with open("common_pos_mt.txt", "w") as file: for word, freq in diff_1.most_common(20): line = str(word) + " " + str(freq) + '\n' print(line) file.write(line) with open("common_pos_hmn.txt", "w") as file: for word, freq in diff_2.most_common(20): line = str(word) + " " + str(freq) + '\n' print(line) file.write(line) """
def train(self, tweets):
    """Fit the n-gram vocabulary, vectorizer, label encoder and classifier.

    ``tweets`` is a sized sequence of ``(tweet_tokens, label)`` pairs, where
    ``tweet_tokens`` is iterated as ``(word, tag)`` pairs.
    """
    # 1st step: build the bag-of-words model
    print('Computing the trainset vocabulary of n-grams')
    vocabulary = set()
    for tagged_tweet, _label in tweets:
        words = [word.lower() for word, _tag in tagged_tweet]
        vocabulary.update(words)
        vocabulary.update('_'.join(pair) for pair in bigrams(words))
        vocabulary.update('_'.join(triple) for triple in trigrams(words))
        # skip-grams: first and third word with a wildcard in the middle
        vocabulary.update(first + '_*_' + third
                          for first, _middle, third in trigrams(words))
    self.bag_of_words = vocabulary

    # 2nd step: extract one feature dict per tweet
    total_tweets = len(tweets)
    features_list = []
    for position, (tagged_tweet, _label) in enumerate(tweets, 1):
        print('Training for tweet n. {}/{}'.format(position, total_tweets))
        features_list.append(self.extract_features(tagged_tweet))

    # 3rd step: vectorize and fit the classifier
    print('Vectorizing the features')
    data = self.vectorizer.fit_transform(features_list)
    target = self.encoder.fit_transform([label for _tokens, label in tweets])
    print('Building the model')
    self.classifier.fit(data, target)
def main():
    """Print character- and word-level n-gram statistics for ``holmes.txt``.

    In order: the character types (1A), the word types (1B), the character
    n-grams (1C) and word n-grams (1D) reported by the ``most_common`` helper,
    the first 20 bigram collocations scored by PMI and by chi-square (2), and
    the Spearman correlation computed from those two scored lists.
    """
    # Read through a context manager so the file handle is always released.
    with open('holmes.txt') as book:
        text = book.read()
    tokens = nltk.wordpunct_tokenize(text)
    charList = [char for word in tokens for char in word]

    fDistChars = nltk.FreqDist(charList)
    fDistWords = nltk.FreqDist(tokens)
    print("Answer to 1A, there are {} character types in the book, namely: \n{}".format(len(fDistChars),sorted(fDistChars)))
    print("\nAnswer to 1B, there are {} word types in the book, namely: \n{}".format(len(fDistWords),sorted(fDistWords)))

    bigramChars = nltk.bigrams(charList)
    trigramChars = nltk.trigrams(charList)
    print("\nAnswer to 1C, the 20 most common characters are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(charList), most_common(bigramChars), most_common(trigramChars)))

    bigramWords = nltk.bigrams(tokens)
    trigramWords = nltk.trigrams(tokens)
    print("\nAnswer to 1D, the 20 most common words are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(tokens), most_common(bigramWords), most_common(trigramWords)))

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    scoredPMI = finder.score_ngrams(bigram_measures.pmi)
    scoredCHI = finder.score_ngrams(bigram_measures.chi_sq)
    print("\nAnswer to 2, the 20 most likely collocations are:\nPMI:\n{} \nChi's square\n{}" .format(scoredPMI[:20],scoredCHI[:20]))
    # NOTE(review): the raw (ngram, score) lists are passed straight to
    # spearman_correlation -- confirm that scores (rather than ranks) are the
    # input it expects.
    print("\nSpearmans correlation = {}".format(nltk.metrics.spearman.spearman_correlation(scoredPMI, scoredCHI)))
def extract_features(self, tweet_tokens):
    """Build the feature dictionary for one tagged tweet.

    ``tweet_tokens`` is iterated as ``(token, tag)`` pairs.  The returned dict
    holds ``has_<ngram>`` flags for n-grams present in ``self.bag_of_words``,
    one ``num_<tag>`` count per tag in the list below, and the negation,
    elongation, punctuation, all-caps and lexicon features described inline.
    """
    if len(self.bag_of_words) == 0:
        print('Bag-of-Words empty!')

    unigrams = [w.lower() for w, t in tweet_tokens]
    # Work on a copy: the original aliased ``tokens`` to ``unigrams``, so each
    # ``+=`` also grew ``unigrams`` and the trigrams were computed over
    # unigrams *plus* the joined bigram strings, unlike in train().
    word_bigrams = ['_'.join(b) for b in bigrams(unigrams)]
    word_trigrams = list(trigrams(unigrams))
    tokens = list(unigrams)
    tokens += word_bigrams
    tokens += ['_'.join(t) for t in word_trigrams]
    tokens += [t1 + '_*_' + t3 for t1, t2, t3 in word_trigrams]

    tweet_tags = [tag for token, tag in tweet_tokens]
    feature_set = {}

    # 1st set of features: bag-of-words
    for token in set(tokens).intersection(self.bag_of_words):
        feature_set['has_'+token] = True

    # 2nd set of features: the count for each tag type present in the message
    # Tweet_nlp tagset. Info:
    # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
    for tag in ['N','O','^','S','Z','V','A','R','!','D','P','&','T','X','#','@','~','U','E','$',',','G','L','M','Y']:
        feature_set['num_'+tag] = tweet_tags.count(tag)

    # 3rd feature: negation is present?
    negators = set(LexiconClassifier().read_negation_words())
    if len(negators.intersection(set(tokens))) > 0:
        feature_set['has_negator'] = True

    # 4th feature: character ngrams (a lowercase letter repeated 3+ times)
    regexp = re.compile(r"([a-z])\1{2,}")
    feature_set['has_char_ngrams'] = any(
        regexp.search(token) for token, tag in tweet_tokens)

    # 5th feature: punctuation ngrams ('!' or '?' repeated 3+ times)
    regexp = re.compile(r"([!\?])\1{2,}")
    feature_set['has_punct_ngrams'] = any(
        regexp.search(token) for token, tag in tweet_tokens)

    # 6th feature: the number of all upper cased words
    feature_set['num_all_caps'] = sum(
        1 for token, tag in tweet_tokens if token.isupper() and len(token) >= 3)

    # 7th and 8th feature: the positive and negative score from lexicon
    # classifier (i.e., number of positive and negative words from lexicon)
    positive_score, negative_score = self.lexicon_classifier.classify(tweet_tokens)
    feature_set['pos_lexicon'] = positive_score
    feature_set['neg_lexicon'] = -1 * negative_score
    return feature_set
def jacquard_trigram(query):
    """Print the ten entries of ``enwiktionary.a.list`` most similar to ``query``.

    Similarity is the Jaccard coefficient between the sets of character
    trigrams of ``query`` and of each (right-stripped) line of the file.
    Pairs whose trigram sets are both empty score 0.0 instead of raising
    ZeroDivisionError.
    """
    # Loop-invariant: the query's trigram set does not depend on the entry.
    query_trigrams = set(nltk.trigrams(query))
    scored = []
    # ``open`` inside ``with`` replaces the Python-2-only ``file()`` call and
    # guarantees the handle is closed.
    with open('enwiktionary.a.list') as entries:
        for entry in entries:
            entry = entry.rstrip()
            entry_trigrams = set(nltk.trigrams(entry))
            union = query_trigrams | entry_trigrams
            if union:
                similarity = float(len(query_trigrams & entry_trigrams)) / len(union)
            else:
                similarity = 0.0
            scored.append([entry, similarity])
    scored.sort(key=lambda pair: pair[1], reverse=True)
    print(scored[:10])
def main():
    """Compare two hypotheses per input line and write a verdict per line.

    Every line of ``../data/test.hyp1-hyp2-ref`` is split on `` ||| `` into
    hypothesis 1, hypothesis 2 and the reference.  The unigram, bigram and
    trigram counts of each are pooled, scored through the ``precision``,
    ``recall`` and ``meteor`` helpers, and ``../output.txt`` receives ``-1``
    when hypothesis 1 scores higher, ``0`` on a tie and ``1`` otherwise.
    """
    def ngram_counts(words):
        """Pool the unigram, bigram and trigram counts of ``words``."""
        return (Counter(words)
                + Counter(nltk.bigrams(words))
                + Counter(nltk.trigrams(words)))

    # The output is opened (and truncated) once instead of being reopened in
    # append mode for every sentence; both files are closed deterministically.
    with open("../output.txt", "w") as out:
        with open("../data/test.hyp1-hyp2-ref", "r") as source:
            content = source.read()
        # The slice drops the piece after the final newline.
        for sent in content.split("\n")[:-1]:
            fields = sent.split(" ||| ")
            h1 = fields[0].split(" ")
            h2 = fields[1].split(" ")
            ref = fields[2].split(" ")

            # NOTE(review): the results of process() were never used by the
            # original; the calls are kept in case process() has side effects.
            process(h1)
            process(h2)
            process(ref)

            h1_allc = ngram_counts(h1)
            h2_allc = ngram_counts(h2)
            ref_allc = ngram_counts(ref)

            h1_precision = precision(h1_allc, ref_allc)
            h2_precision = precision(h2_allc, ref_allc)
            h1_recall = recall(h1_allc, ref_allc)
            h2_recall = recall(h2_allc, ref_allc)
            h1_meteor = meteor(h1_precision, h1_recall)
            h2_meteor = meteor(h2_precision, h2_recall)

            if h1_meteor > h2_meteor:
                out.write("-1\n")
            elif h1_meteor == h2_meteor:
                out.write("0\n")
            else:
                out.write("1\n")
def calc_probabilities(training_corpus):
    """Return base-2 log-probability dicts for unigrams, bigrams and trigrams.

    ``training_corpus`` must be a sized collection of sentence strings
    (``len()`` is taken).  Unigrams are keyed by the bare token; bigrams and
    trigrams by tuples.  Sentences are padded with one START_SYMBOL for
    bigrams, two for trigrams, and STOP_SYMBOL at the end.
    """
    unigram_c = collections.Counter()
    bigram_c = collections.Counter()
    trigram_c = collections.Counter()

    for sentence in training_corpus:
        words = sentence.strip().split()
        unigram_c.update(words + [STOP_SYMBOL])
        bigram_c.update(nltk.bigrams([START_SYMBOL] + words + [STOP_SYMBOL]))
        trigram_c.update(
            nltk.trigrams([START_SYMBOL, START_SYMBOL] + words + [STOP_SYMBOL]))

    unigrams_len = sum(unigram_c.values())
    unigram_p = {}
    for token, count in unigram_c.items():
        unigram_p[token] = math.log(float(count) / unigram_len_guard(unigrams_len), 2) \
            if False else math.log(float(count) / unigrams_len, 2)

    # calc P(W2|W1) = P(W2,W1) / P(W1) = C(W2,W1) / C(W1)
    # START_SYMBOL is not counted as a unigram above, so its context count
    # (one per sentence) is filled in only now, after unigram_p is built.
    unigram_c[START_SYMBOL] = len(training_corpus)
    bigram_p = {}
    for pair, count in bigram_c.items():
        bigram_p[pair] = math.log(float(count) / unigram_c[pair[0]], 2)

    # Same trick for the (START, START) context of sentence-initial trigrams.
    bigram_c[(START_SYMBOL, START_SYMBOL)] = len(training_corpus)
    trigram_p = {}
    for triple, count in trigram_c.items():
        trigram_p[triple] = math.log(float(count) / bigram_c[triple[:2]], 2)

    return unigram_p, bigram_p, trigram_p
def ngramify(self, word_list):
    """
    Transforms word_list into unigrams, bigrams or trigrams.

    input: list of (word, pos) pairs

    What each item contributes is chosen by ``self.include_word`` and
    ``self.inclued_pos``: the lower-cased word, the POS tag, or a
    ``(word, pos)`` tuple when both are set.  ``self.mode`` selects
    "unigrams", "bigrams" or "trigrams"; any other mode returns
    ``word_list`` unchanged.

    Raises ValueError when neither flag is set (the original failed with an
    UnboundLocalError in that case).
    """
    mode = self.mode
    pos = self.inclued_pos
    word = self.include_word

    if word and pos:
        selection = [(w.lower(), p) for w, p in word_list]
    elif word:
        selection = [w.lower() for w, p in word_list]
    elif pos:
        selection = [p for w, p in word_list]
    else:
        raise ValueError("ngramify needs include_word and/or inclued_pos to be set")

    if mode == "unigrams":
        word_list = selection
    elif mode == "bigrams":
        word_list = nltk.bigrams(selection)
    elif mode == "trigrams":
        word_list = nltk.trigrams(selection)
    return word_list
def calc_trigrams(brown_tags):
    '''
    Calculate the log-probabilities of tag trigrams.

    P(Wi | Wi-2, Wi-1) = c(Wi-2, Wi-1, Wi) / c(Wi-2, Wi-1), reported as a
    base-2 logarithm.

    The original also computed unigram and bigram log-probabilities that were
    never returned; the unigram one divided by ``len - 32491`` and therefore
    crashed on any corpus with 32491 tokens or fewer.  That dead code is gone.

    :param brown_tags: List of 'sentence tags List' [ [], [] .. ]
    :return: tag trigram probability dictionary keyed by 3-tuples
    '''
    # Flatten brown tags since it's a list of tag lists.
    # NOTE(review): flattening means n-grams are also formed across sentence
    # boundaries -- confirm that is intended.
    brown_tags_flat = [item for sublist in brown_tags for item in sublist]

    bigram_c = Counter(nltk.bigrams(brown_tags_flat))
    trigram_c = Counter(nltk.trigrams(brown_tags_flat))

    trigram_p = {}
    for trigram, count in trigram_c.items():
        trigram_p[trigram] = math.log(count / float(bigram_c[trigram[:2]]), 2)
    return trigram_p
def linearscore(unigrams, bigrams, trigrams, corpus):
    """Score each sentence with equally weighted linear interpolation.

    For every trigram of the padded sentence, the trigram, bigram and unigram
    log-probabilities are converted back to probabilities, mixed with weight
    1/3 each, and the base-2 log of the mixture is added to the sentence
    score.  A missing entry falls back to MINUS_INFINITY_SENTENCE_LOG_PROB.
    See http://web.stanford.edu/~jurafsky/slp3/4.pdf paragraph 4.4.3
    """
    def lookup(table, key):
        # Same KeyError-based fallback as the original try/except blocks.
        try:
            return table[key]
        except KeyError:
            return MINUS_INFINITY_SENTENCE_LOG_PROB

    # Equal weights so that the three lambdas sum up to 1.
    weight = 1.0 / 3
    results = []
    for sentence in corpus:
        padded = [START_SYMBOL] + [START_SYMBOL] + sentence.strip().split() + [STOP_SYMBOL]
        total = 0
        for gram in nltk.trigrams(padded):
            p3 = lookup(trigrams, gram)
            p2 = lookup(bigrams, gram[1:3])
            p1 = lookup(unigrams, gram[2])
            total += math.log(
                weight * (2 ** p3) + weight * (2 ** p2) + weight * (2 ** p1), 2)
        results.append(total)
    return results
def _find_names_in_tokens(self, tokens):
    """Returns tuple

    Takes a list of all tokens from a document and returns the result of
    ``self._generate_output()``.  Per the original description this is a
    tuple: first element is an alphabetised list of unique names, second --
    names in the order of their occurrence in the document, third -- offsets
    for each mention of the name in the document.

    Arguments:
    tokens -- list with all tokens from the searched document
    """
    self._index_dict = create_index(tokens)
    token_string = " ".join(tokens)
    if len(tokens) == 2:
        if (self._is_like_binomial(tokens[0], tokens[1])
                and self._is_a_name(token_string, tokens, 0, 1)):
            self._names_list.append(token_string)
    elif len(tokens) == 1:
        if (len(tokens[0]) > 2
                and tokens[0][0].isupper()
                and tokens[0].isalpha()
                and self._is_not_in_black_list(tokens[0])
                and self._is_a_name(tokens[0], tokens, 0, 0)):
            self._names_list.append(tokens[0])
    elif tokens:
        # Materialise the trigrams: they are walked once and then indexed,
        # and nltk.trigrams returns a one-shot generator in NLTK 3.
        # The ``elif tokens`` guard skips an empty document, for which
        # ``trigrams[-1]`` used to raise IndexError.
        trigrams = list(nltk.trigrams(tokens))
        self._walk_trigrams(trigrams, tokens)
        self._check_last_bigram_unigram(trigrams[-1], tokens)
    return self._generate_output()
def ngramify(self, word_list, stop):
    """Create n-grams from ``word_list`` based on the class settings.

    ``word_list`` holds ``(word, pos)`` pairs.  When ``stop`` is true, pairs
    whose lower-cased word is an English stopword are dropped first ("not" is
    deliberately kept).  ``self.include_word`` / ``self.inclued_pos`` choose
    whether each item is the lower-cased word, the POS tag or a
    ``(word, pos)`` tuple; ``self.mode`` selects "unigrams", "bigrams" or
    "trigrams", and any other mode returns ``word_list`` unchanged.

    Raises ValueError when neither flag is set (previously an
    UnboundLocalError).
    """
    mode = self.mode
    pos = self.inclued_pos
    word = self.include_word

    pairs = word_list
    if stop:
        # Only load the stopword list when it is actually needed.
        stopset = set(stopwords.words("english"))
        stopset.discard("not")  # discard(): no KeyError if "not" is absent
        pairs = [(w, p) for w, p in word_list if w.lower() not in stopset]

    if word and pos:
        selection = [(w.lower(), p) for w, p in pairs]
    elif word:
        selection = [w.lower() for w, p in pairs]
    elif pos:
        selection = [p for w, p in pairs]
    else:
        raise ValueError("ngramify needs include_word and/or inclued_pos to be set")

    if mode == "unigrams":
        word_list = selection
    elif mode == "bigrams":
        word_list = nltk.bigrams(selection)
    elif mode == "trigrams":
        word_list = nltk.trigrams(selection)
    return word_list
def getTriGramsFromComments(text):
    """Return a frequency distribution of the word trigrams in ``text``.

    The text is word-tokenised, single-character tokens are dropped and the
    remaining tokens are lower-cased before the trigrams are formed.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        if len(token) > 1:
            kept.append(token.lower())
    return nltk.FreqDist(trigrams(kept))
def exercise2(category):
    """Print four tag statistics for one category of the ``bn`` corpus.

    Part 1: number of distinct (word, tag) pairs tagged 'JJ'.
    Part 2: first ten (word, tag) pairs tagged 'VBZ', 'NNPS' or 'NNS'.
    Part 3: the three most frequent lower-cased word triples whose tags start
            with 'IN', 'AT' and 'NN' respectively.
    Part 4: counts of words matching the masculine / feminine pronoun patterns.
    """
    print("")
    print("For Category: " + category)
    print("Part 1")
    print("Words with the tag 'JJ':")
    words = bn.tagged_words(categories = category)
    wordlist = bn.words(categories = category)
    words_JJ = set((word, tag) for (word, tag) in words if tag == 'JJ')
    print(len(words_JJ))
    print("")

    print("Part 2")
    print("Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:")
    words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words
                          if tag in ('VBZ', 'NNPS', 'NNS')]
    print(words_VBP_NNPS_NNS[:10])
    print("")

    print("Part 3")
    print("The 3 most frequent 3-word prepositional phrases are:")
    # Count the phrases directly.  The original glued them together with "."
    # and split again, which broke any phrase containing a "." and counted a
    # spurious empty string.
    phrases = [
        w1.lower() + " " + w2.lower() + " " + w3.lower()
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words)
        if t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN')
    ]
    fd = nltk.FreqDist(phrases)
    v = fd.most_common(3)
    print(v)
    print("")

    print("Part 4")
    print("Ratio of Masculine to Feminine is:")
    male_pattern = re.compile(r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b')
    female_pattern = re.compile(r'\bshe\b|\bher\b|\bhers\b|\bherself\b')
    male_pronouns = sum(1 for w in wordlist if male_pattern.search(w.lower()))
    female_pronouns = sum(1 for w in wordlist if female_pattern.search(w.lower()))
    print("Male : Female is -> %d : %d" % (male_pronouns, female_pronouns))
    print("")
def trigrams(self, unigrams):
    """
    Build the trigrams of a sequence of unigram words.

    @param unigrams: unigram words
    @type unigrams: C{list}
    @return: the result of C{nltk.trigrams} for C{unigrams}
    """
    word_triples = nltk.trigrams(unigrams)
    return word_triples
def calcSentProb(sent, NGramProbDict, n):
    '''
    Return the average log-probability of the tag n-grams of ``sent``.

    ``sent`` is a list of (word, tag) pairs.  Each tag n-gram is looked up in
    ``NGramProbDict``; when it is missing a default log-probability is used.
    Sentences with fewer than two items get the fixed score -12.  Sentences
    with exactly two items, or any sentence when ``n == 2``, are scored with
    tag bigrams; ``n == 3`` uses tag trigrams.

    Raises ValueError for any other ``n`` (previously a ZeroDivisionError).
    '''
    if len(sent) < 2:
        return -12.0

    prob = 0.0
    count = 0
    if len(sent) < 3 or n == 2:
        # NOTE(review): the bigram branch falls back to tri_default_prob and
        # the trigram branch to bi_default_prob -- kept as in the original,
        # but the names look swapped; confirm.
        for (w1, t1), (w2, t2) in nltk.bigrams(sent):
            # dict.get is a single hashed lookup; ``in d.keys()`` scans a
            # list on Python 2.
            prob += NGramProbDict.get((t1, t2), tri_default_prob)
            count += 1
    elif n == 3:
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sent):
            prob += NGramProbDict.get((t1, t2, t3), bi_default_prob)
            count += 1
    else:
        raise ValueError("calcSentProb supports n == 2 or n == 3, got %r" % (n,))
    return float(prob) / count
def filter_input_file(self, input_file):
    """Print ``input_file`` with every word accepted by ``self.query`` masked.

    The file is word-tokenised; every trigram, bigram (both joined with a
    space) and single word for which ``self.query`` is truthy is collected.
    The text is then rebuilt word by word, replacing collected words with
    ``****``, wrapped at 140 columns and printed.

    Returns the list of collected trigrams, bigrams and words, in that order.
    """
    with open(input_file) as f:
        full_text = f.read()

    # Break apart file into list of single words, bigrams, and trigrams
    words = nltk.word_tokenize(full_text)
    trigram_texts = [' '.join(tgram) for tgram in nltk.trigrams(words)]
    bigram_texts = [' '.join(bgram) for bgram in nltk.bigrams(words)]
    filtered_trigrams = [text for text in trigram_texts if self.query(text)]
    filtered_bigrams = [text for text in bigram_texts if self.query(text)]
    filtered_words = [word for word in words if self.query(word)]
    filtered = filtered_trigrams + filtered_bigrams + filtered_words

    # Set membership keeps the masking pass linear in the number of words.
    masked = set(filtered)
    new_text = "".join(
        "**** " if word in masked else word + " " for word in words)

    for line in textwrap.wrap(new_text, 140):
        print(line)
    return filtered
def ngrams_freq(tokens):
    """Return a plain dict mapping each trigram of ``tokens`` to its count."""
    distribution = nltk.FreqDist(nltk.trigrams(tokens))
    return dict(distribution.items())
def pos_tagger(self):
    """Tag the original tweets and record their tag n-grams per tweet.

    Every tweet in ``self.tweet_original`` is reduced to ASCII and passed to
    ``CMUTweetTagger.runtagger_parse``.  For the i-th tagged tweet the tag
    unigram, bigram and trigram sets are stored under ``self.tweet_id[i]`` in
    ``self.tweet_unigram`` / ``tweet_bigram`` / ``tweet_trigram``, and
    ``self.tweet_feature_list`` ends up holding every distinct n-gram once.
    """
    tweets = []
    for tw in self.tweet_original:
        try:
            tw = tw.decode('unicode_escape').encode('ascii', 'ignore')
        except UnicodeError:
            # Was a bare ``except:``; only a failed escape decode is expected
            # here.  Strip the backslashes and retry.
            tw = re.sub(r'\\+', '', tw)
            tw = tw.decode('unicode_escape').encode('ascii', 'ignore')
        tweets.append(tw)

    sent_tags = CMUTweetTagger.runtagger_parse(tweets)
    for i, sent in enumerate(sent_tags):
        tweet_key = self.tweet_id[i]
        # Second element of each tagger tuple is used as the tag.
        unigrams = [tag_tuple[1] for tag_tuple in sent]
        bigrams = set(nltk.bigrams(unigrams))
        trigrams = set(nltk.trigrams(unigrams))
        self.tweet_unigram[tweet_key] = set(unigrams)
        self.tweet_bigram[tweet_key] = bigrams
        self.tweet_trigram[tweet_key] = trigrams
        self.tweet_feature_list.extend(unigrams)
        self.tweet_feature_list.extend(bigrams)
        self.tweet_feature_list.extend(trigrams)
    self.tweet_feature_list = list(set(self.tweet_feature_list))
def get_classification(self, text):
    """Score every training answer against ``text``.

    The cleaned text is tokenised into unigrams, bigrams and trigrams.  For
    each ``(ques, ans)`` pair of ``self.training_set`` a weighted sum of
    global and question-specific n-gram relative frequencies is accumulated,
    using the six weights in ``self.lambda_pi``.

    Returns a dict mapping ``self.training_orig.get(ans, ans)`` to its score.
    """
    text = ut.clean(text)
    uni = nltk.tokenize.word_tokenize(text)
    # Materialise the n-grams: nltk returns one-shot generators in NLTK 3,
    # which would be exhausted after the first training pair.
    bi = list(nltk.bigrams(uni))
    tri = list(nltk.trigrams(uni))
    temp_lambda = self.lambda_pi

    # Map to store answer to its divergence pairs
    list_of_ans = dict()
    for (ques, ans) in self.training_set:
        fin_val = 0.0
        for t in uni:
            fin_val += temp_lambda[5] * (float(self.unigram_tot_dict.get(t, 0)) / self.len)
            fin_val += temp_lambda[4] * (float(self.unigram_dict.get((ques, t), 0)) / len(ques))
        for t in bi:
            # The unigram dicts are read with plain-string keys above, so the
            # context is t[0]; the original t[:1] built a 1-tuple that could
            # never match and silently forced the denominator to 1.
            fin_val += temp_lambda[3] * (float(self.bigram_tot_dict.get(t, 0)) / self.unigram_tot_dict.get(t[0], 1))
            fin_val += temp_lambda[2] * (float(self.bigram_dict.get((ques, t), 0)) / self.unigram_dict.get((ques, t[0]), 1))
        for t in tri:
            fin_val += temp_lambda[1] * (float(self.trigram_tot_dict.get(t, 0)) / self.bigram_tot_dict.get(t[:2], 1))
            fin_val += temp_lambda[0] * (float(self.trigram_dict.get((ques, t), 0)) / self.bigram_dict.get((ques, t[:2]), 1))
        list_of_ans[self.training_orig.get(ans, ans)] = fin_val

    # Return Weighted list of responses
    return list_of_ans
def calc_trigrams(brown_tags):
    """Return base-2 log-probabilities of tag trigrams.

    ``brown_tags`` is a sized collection of tag sequences.  Each trigram count
    is divided by the count of its leading bigram; the (START, START) bigram
    count is set to the number of sequences.
    """
    bigram_c = collections.Counter()
    trigram_c = collections.Counter()
    for stags in brown_tags:
        bigram_c.update(list(nltk.bigrams(stags)))
        trigram_c.update(list(nltk.trigrams(stags)))

    # Context count for sentence-initial trigrams: one per tag sequence.
    bigram_c[(START_SYMBOL, START_SYMBOL)] = len(brown_tags)

    q_values = {}
    for triple, count in trigram_c.items():
        q_values[triple] = math.log(float(count) / bigram_c[triple[:2]], 2)
    return q_values
def calc_probabilities(training_corpus):
    """Return base-2 log-probability dicts for unigrams, bigrams and trigrams.

    Every sentence is split on whitespace.  Unigram statistics use one
    START_SYMBOL and one STOP_SYMBOL per sentence; bigram and trigram
    statistics use two START_SYMBOLs.  Unigram keys are 1-tuples, the others
    are 2- and 3-tuples.

    Fixes over the original: STOP_SYMBOL is appended as its own token even
    when a sentence has no trailing whitespace, and the corpus is traversed
    only once so a generator can be passed.
    """
    unigram_freq = Counter()
    context_freq = Counter()
    bigram_freq = Counter()
    trigram_freq = Counter()
    total_unigram = 0

    for line in training_corpus:
        single_padded = [START_SYMBOL] + line.split() + [STOP_SYMBOL]
        unigram_freq.update(single_padded)
        total_unigram += len(single_padded)

        double_padded = [START_SYMBOL] + single_padded
        # NOTE(review): START_SYMBOL is counted twice per sentence here, so
        # every P(w | START) is divided by twice the number of sentences --
        # kept as in the original; confirm it is intended.
        context_freq.update(double_padded)
        bigram_freq.update(nltk.bigrams(double_padded))
        trigram_freq.update(nltk.trigrams(double_padded))

    unigram_p = {}
    for key in unigram_freq:
        unigram_p[(key,)] = math.log(unigram_freq[key] / float(total_unigram), 2)

    bigram_p = {}
    for key in bigram_freq:
        bigram_p[key] = math.log(bigram_freq[key] / float(context_freq[key[0]]), 2)

    trigram_p = {}
    for key in trigram_freq:
        trigram_p[key] = math.log(trigram_freq[key] / float(bigram_freq[key[0], key[1]]), 2)

    return unigram_p, bigram_p, trigram_p
def linearscore(unigrams, bigrams, trigrams, corpus):
    """Score each sentence with equally weighted linear interpolation.

    For every trigram (a, b, c) of the padded sentence, the probabilities of
    (a, b, c), (b, c) and (c,) are mixed with weight 1/3 each and the base-2
    log of the mixture is added to the sentence score.  If any of the three
    entries is missing, the sentence scores
    MINUS_INFINITY_SENTENCE_LOG_PROB.

    Returns one score per sentence of ``corpus``, in order.

    Fixes over the original: sentences are padded with two START_SYMBOLs (it
    used STOP_SYMBOL + START_SYMBOL, so the first trigram could never be
    found), a missing n-gram ends the sentence instead of being accumulated
    over, exactly one score is appended per sentence, and only KeyError is
    caught.
    """
    scores = []
    lamb = float(1) / 3
    for line in corpus:
        tokens = [START_SYMBOL, START_SYMBOL] + line.split() + [STOP_SYMBOL]
        prob = 0
        for a, b, c in nltk.trigrams(tokens):
            try:
                mixture = (pow(2, trigrams[(a, b, c)])
                           + pow(2, bigrams[(b, c)])
                           + pow(2, unigrams[(c,)]))
            except KeyError:
                prob = MINUS_INFINITY_SENTENCE_LOG_PROB
                break
            prob = prob + math.log(lamb * mixture, 2)
        scores.append(prob)
    return scores
def calc_trigrams(brown_tags):
    """Return base-2 log-probabilities of tag trigrams.

    For each tag sequence in ``brown_tags`` the bigrams and trigrams are
    counted; every trigram maps to log2(count(trigram) / count(first two tags)).
    """
    bigram_count = {}
    trigram_count = {}
    for tag_list in brown_tags:
        for pair in list(nltk.bigrams(tag_list)):
            bigram_count[pair] = bigram_count.get(pair, 0) + 1
        for triple in list(nltk.trigrams(tag_list)):
            trigram_count[triple] = trigram_count.get(triple, 0) + 1

    q_values = {}
    for triple, occurrences in trigram_count.items():
        context = (triple[0], triple[1])
        q_values[triple] = math.log(float(occurrences) / bigram_count[context], 2)
    return q_values
def calc_trigrams(brown_tags):
    """Return base-2 log-probabilities of tag trigrams.

    Each trigram maps to log2(count(trigram)) - log2(count(leading bigram)),
    with counts taken over every tag sequence in ``brown_tags``.
    """
    pair_totals = {}
    triple_totals = {}
    for sequence in brown_tags:
        for pair in nltk.bigrams(sequence):
            pair_totals.setdefault(pair, 0)
            pair_totals[pair] += 1
        for triple in nltk.trigrams(sequence):
            triple_totals.setdefault(triple, 0)
            triple_totals[triple] += 1

    q_values = {}
    for triple, total in triple_totals.items():
        # Difference of logs, exactly as before (not the log of the ratio).
        q_values[triple] = math.log(total, 2) - math.log(pair_totals[triple[:2]], 2)
    return q_values
def calc_trigrams(brown_tags):
    """Return base-2 log-probabilities of the trigrams of ``brown_tags``.

    Trigrams and bigrams are taken over ``brown_tags`` itself; each trigram
    maps to log2(count(trigram) / count(leading bigram)).
    """
    trigram_tags_count = {}
    for triple in list(nltk.trigrams(brown_tags)):
        trigram_tags_count[triple] = trigram_tags_count.get(triple, 0) + 1

    bigram_tags_count = {}
    for pair in list(nltk.bigrams(brown_tags)):
        bigram_tags_count[pair] = bigram_tags_count.get(pair, 0) + 1

    q_values = {}
    for triple, occurrences in trigram_tags_count.items():
        context_total = float(bigram_tags_count[triple[:2]])
        q_values[triple] = math.log(occurrences / context_total, 2)
    return q_values
def score(ngram_p, n, data):
    """Return the summed n-gram log-probability of every sentence in ``data``.

    Each sentence is padded with ``n - 1`` leading "*" tokens and a trailing
    "STOP", tokenised with ``nltk.word_tokenize``, and the ``ngram_p`` values
    of its n-grams are added up.  Keys are always tuples, including the
    1-tuples used for ``n == 1``.  For ``n`` other than 1, 2 or 3 an empty
    list is returned.

    Raises KeyError when an n-gram is missing from ``ngram_p``.

    Fix over the original: "STOP" is now separated from the sentence by a
    space, so a sentence without trailing whitespace no longer has its last
    word fused with the marker.
    """
    scores = []
    if n not in (1, 2, 3):
        return scores

    prefix = "* " * (n - 1)
    for sentence in data:
        tokens = nltk.word_tokenize(prefix + sentence + " STOP ")
        if n == 1:
            grams = [(token,) for token in tokens]
        elif n == 2:
            grams = nltk.bigrams(tokens)
        else:
            grams = nltk.trigrams(tokens)

        line_score = 0
        for gram in grams:
            line_score += ngram_p[gram]
        scores.append(line_score)
    return scores
def demo_findPOSpattern(words_tagged, num=20):
    # Interactive demo (Python 2): first lists up to `num` words that occur
    # with more than three different tags, then repeatedly asks for a
    # three-slot pattern and prints every word trigram that matches it.
    #
    # words_tagged: sequence of (word, tag) pairs.
    # num:          maximum number of ambiguous words to list.
    print "List the most {0} ambiguous words ...".format(num)
    i = 0
    # word -> frequency distribution of the tags seen with that word
    data = nltk.ConditionalFreqDist(words_tagged)
    for word in data.conditions():
        if len(data[word]) > 3:
            i += 1
            tags = data[word].keys()
            # Output is encoded as big5 (presumably for a big5 console).
            print word.encode('big5'), "=>", ', '.join(tags)
        if i >= num: break
    while True:
        inp = raw_input("Enter a 3-frame pattern (example:'把 N V', 0 to exit): ")
        if inp == '0': break
        inp = inp.decode('big5')
        # One pattern slot per space-separated field.
        # NOTE(review): fewer than three fields makes P[i] raise IndexError,
        # and ord() raises TypeError for a field longer than one character.
        P = inp.split(' ')
        for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(words_tagged):
            W = (w1, w2, w3); T = (t1, t2, t3); flag = 0
            # NOTE: this loop reuses `i`, the counter from the listing above.
            for i in range(len(W)):
                if len(P[i]) == 0: break # empty slot: stop checking this trigram
                if ord(P[i]) < 128: # an English tag name
                    # ASCII slot: matched as a prefix of the tag.
                    if T[i].startswith(P[i]): flag += 1
                # NOTE(review): indentation was lost in this copy; the elif is
                # assumed to pair with the ord() test (non-ASCII slot: matched
                # against the word itself) -- confirm.
                elif W[i] == P[i]: flag += 1
            # All three slots matched.
            if flag == len(W): print ', '.join(W)
def get_trigrams(sentence, stopwords, porter):
    """Return the trigrams of the normalised words of ``sentence``.

    The sentence is word-tokenised, then every word is lower-cased, passed
    through ``normalize_numeric``, ``normalize_stopword`` (with ``stopwords``)
    and ``porter.stem``, one full pass per step.
    """
    stages = (
        lambda word: word.lower(),
        normalize_numeric,
        lambda word: normalize_stopword(word, stopwords),
        porter.stem,
    )
    tokens = nltk.word_tokenize(sentence)
    for stage in stages:
        tokens = [stage(token) for token in tokens]
    return nltk.trigrams(tokens)
def _count_words(path):
    """Count the words, bigrams and trigrams of one file and dump them as JSON.

    The file is read, decoded as UTF-8, lower-cased and word-tokenised.  The
    counts go into a single mapping keyed by the space-joined n-gram and are
    written to ``data/text/counts/<date>.json`` under the key ``words``, where
    ``<date>`` is the first three '-'-separated fields of the third
    '/'-separated component of ``path``.
    """
    print(path)
    with open(path, 'r') as f:
        tokens = nltk.word_tokenize(f.read().decode('utf-8').lower())

    word_count = defaultdict(int)
    for word, count in nltk.FreqDist(tokens).items():
        word_count[word] = count

    templates = (
        ('%s %s', nltk.bigrams(tokens)),
        ('%s %s %s', nltk.trigrams(tokens)),
    )
    for template, grams in templates:
        for gram, count in nltk.FreqDist(grams).items():
            word_count[template % gram] = count

    name_parts = path.split('/')[2].split('-')
    count_date = '%s-%s-%s' % (name_parts[0], name_parts[1], name_parts[2])
    with open('data/text/counts/%s.json' % count_date, 'w') as f:
        json.dump({
            'words': word_count
        }, f)
def getTrainingAndTestData(tweets, K, k, method, feature_set):
    # Preprocess the tweets, split them into train/test folds and wrap them in
    # lazily evaluated NLTK feature sets.
    #
    # tweets:      iterable of (text, sentiment, subject, query) tuples.
    # K, k:        fold selection -- items whose position i satisfies
    #              i % K == k form the test set, the rest the training set.
    # method:      '1step' returns (v_train, v_test); '2step' returns
    #              (v_train_obj, v_train_sen, v_test_obj, v_test_sen,
    #              test_truth); anything else returns the features of all
    #              tweets.
    # feature_set: dict; 'ngram' (default 1) is the highest n-gram order to
    #              use, 'negtn' (default False) enables negation features.

    add_ngram_feat = feature_set.get('ngram', 1)
    add_negtn_feat = feature_set.get('negtn', False)

    from functools import wraps

    import preprocessing

    procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent) \
                        for (text, sent, subj, quer) in tweets]

    stemmer = nltk.stem.PorterStemmer()

    all_tweets = []                                             #DATADICT: all_tweets = [ (words, sentiment), ... ]
    for (text, sentiment) in procTweets:
        # Words shorter than 3 characters are dropped; words starting with
        # '__' keep their case, all others are lower-cased.
        words = [word if(word[0:2]=='__') else word.lower() \
                    for word in text.split() \
                    if len(word) >= 3]
        words = [stemmer.stem(w) for w in words]                #DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets.append((words, sentiment))

    # train_tweets = all_tweets[:int(len(all_tweets)*ratio)]    #DATADICT: train_tweets = [ (words, sentiment), ... ]
    # test_tweets = all_tweets[int(len(all_tweets)*ratio):]     #DATADICT: test_tweets = [ (words, sentiment), ... ]
    train_tweets = [x for i, x in enumerate(all_tweets) if i % K != k]
    test_tweets = [x for i, x in enumerate(all_tweets) if i % K == k]

    # Frequency statistics over the training fold; they are only reported on
    # stderr below and are not used by the feature extractors.
    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1:
        n_grams_fd = nltk.FreqDist()

    for (words, sentiment) in train_tweets:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat >= 2:
            words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
            n_grams_fd.update(words_bi)

        if add_ngram_feat >= 3:
            words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
            n_grams_fd.update(words_tri)

    sys.stderr.write('\nlen( unigrams ) = ' + str(len(unigrams_fd.keys())))

    #unigrams_sorted = nltk.FreqDist(unigrams).keys()
    unigrams_sorted = unigrams_fd.keys()
    #bigrams_sorted = nltk.FreqDist(bigrams).keys()
    #trigrams_sorted = nltk.FreqDist(trigrams).keys()
    if add_ngram_feat > 1:
        sys.stderr.write('\nlen( n_grams ) = ' + str(len(n_grams_fd)))
        # n-grams seen more than once in the training fold
        ngrams_sorted = [k for (k, v) in n_grams_fd.items() if v > 1]
        sys.stderr.write('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)))

    def get_word_features(words):
        # Presence features: one 'has(<ngram>)' key per unigram and, depending
        # on add_ngram_feat, per bigram and trigram of `words`, each set to 1.
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]

        if (add_ngram_feat >= 2):
            words_bi = [ 'has(%s)' % ','.join(map(str, bg)) for bg in nltk.bigrams(words) ]
        else:
            words_bi = []

        if (add_ngram_feat >= 3):
            words_tri = [ 'has(%s)' % ','.join(map(str, tg)) for tg in nltk.trigrams(words) ]
        else:
            words_tri = []

        for f in words_uni + words_bi + words_tri:
            bag[f] = 1

        #bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    # Verbose regex: a whole word from the negation list, or any word
    # containing "n't".
    negtn_regex = re.compile( r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't
    """, re.X)

    def get_negation_features(words):
        # For every word, 'neg_l(word)' / 'neg_r(word)' hold a weight that is
        # 1.0 at a negation word and shrinks by 0.1 per following (left pass)
        # or preceding (right pass) word, never dropping below 0.0.
        # NOTE(review): indentation was lost in this copy; the weight
        # assignment and decay are assumed to run for every word, not only
        # for negation words -- confirm.
        INF = 0.0
        negtn = [bool(negtn_regex.search(w)) for w in words]

        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if (negtn[i]):
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)

        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if (negtn[i]):
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)

        return dict( zip(['neg_l(' + w + ')' for w in words] + ['neg_r(' + w + ')' for w in words], left + right))

    def counter( func ):    #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
        # Decorator: the wrapper counts its own calls in `tmp.count`.
        @wraps(func)
        def tmp(*args, **kwargs):
            tmp.count += 1
            return func(*args, **kwargs)
        tmp.count = 0
        return tmp

    @counter    #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    def extract_features(words):
        # Feature dict for one tweet: word/n-gram presence plus, when enabled,
        # negation weights.  Progress is reported on stderr.
        features = {}

        word_features = get_word_features(words)
        features.update(word_features)

        if add_negtn_feat:
            negation_features = get_negation_features(words)
            features.update(negation_features)

        sys.stderr.write('\rfeatures extracted for ' + str(extract_features.count) + ' tweets')
        return features

    extract_features.count = 0

    if ('1step' == method):
        # Apply NLTK's Lazy Map
        v_train = nltk.classify.apply_features(extract_features, train_tweets)
        v_test = nltk.classify.apply_features(extract_features, test_tweets)
        return (v_train, v_test)

    elif ('2step' == method):
        # Two data sets per fold: in the *_obj sets the labels 'neg' and 'pos'
        # are replaced by 'obj' and other labels are kept; the *_sen sets
        # contain only the tweets labelled 'neg' or 'pos'.
        isObj = lambda sent: sent in ['neg', 'pos']
        makeObj = lambda sent: 'obj' if isObj(sent) else sent

        train_tweets_obj = [(words, makeObj(sent)) for (words, sent) in train_tweets]
        test_tweets_obj = [(words, makeObj(sent)) for (words, sent) in test_tweets]

        train_tweets_sen = [(words, sent) for (words, sent) in train_tweets if isObj(sent)]
        test_tweets_sen = [(words, sent) for (words, sent) in test_tweets if isObj(sent)]

        v_train_obj = nltk.classify.apply_features(extract_features, train_tweets_obj)
        v_train_sen = nltk.classify.apply_features(extract_features, train_tweets_sen)
        v_test_obj = nltk.classify.apply_features(extract_features, test_tweets_obj)
        v_test_sen = nltk.classify.apply_features(extract_features, test_tweets_sen)

        # Original (unmapped) labels of the test fold.
        test_truth = [sent for (words, sent) in test_tweets]

        return (v_train_obj, v_train_sen, v_test_obj, v_test_sen, test_truth)

    else:
        return nltk.classify.apply_features(extract_features, all_tweets)
def n_gram_creator(tokens, top_n=20, n=2, freq_filter=None, window_size=None,
                   counts=False, show_freq=True, show_pmi=False, keep=None):
    """Helper creating 2-, 3- or 4-grams with a variety of options.

    With ``counts`` set (bigrams and trigrams only) the raw n-grams are
    returned immediately.  Otherwise a collocation finder is built over
    ``tokens`` (``window_size`` is used when it is an int, else ``n``),
    optionally frequency-filtered, and scored by raw frequency and by PMI.
    The top ``top_n`` rows are displayed per ``show_freq`` / ``show_pmi``.

    Returns the frequency scores for ``keep == 'score'``, the PMI scores for
    ``keep == 'pmi'``, an explanatory string for unsupported ``n``, and
    ``None`` otherwise.
    """
    import nltk.collocations as colloc
    from nltk import bigrams, trigrams

    ## Messaging for non-supported ngrams
    if n not in [2, 3, 4]:
        return f"{n}-grams are not supported. Try 2, 3, or 4."

    # label, raw n-gram builder (None: no raw mode), measures / finder names;
    # the classes are resolved lazily so only the requested one is touched.
    setups = {
        2: ('Bi', bigrams, 'BigramAssocMeasures', 'BigramCollocationFinder'),
        3: ('Tri', trigrams, 'TrigramAssocMeasures', 'TrigramCollocationFinder'),
        4: ('Quad', None, 'QuadgramAssocMeasures', 'QuadgramCollocationFinder'),
    }
    word, raw_builder, measures_name, finder_name = setups[n]

    ## Allowing for non-contiguous ngram creation
    window = window_size if isinstance(window_size, int) else n

    if counts and raw_builder is not None:
        return raw_builder(tokens)

    ngram_measures = getattr(colloc, measures_name)()
    ngram_finder = getattr(colloc, finder_name).from_words(
        tokens, window_size=window)

    ## Applying frequency filter to results if selected for
    if isinstance(freq_filter, int):
        ngram_finder.apply_freq_filter(freq_filter)

    ## Create ngram scores
    ngram_score = ngram_finder.score_ngrams(ngram_measures.raw_freq)
    ngram_pmi_score = ngram_finder.score_ngrams(ngram_measures.pmi)

    ## Optional display
    if show_freq:
        print(f'Top {top_n} {word}-grams by frequency')
        display(ngram_score[:top_n])
    if show_pmi:
        print(f'PMI score for {top_n} {word}-grams')
        display(ngram_pmi_score[:top_n])

    ## Optional return
    if keep == 'score':
        return ngram_score
    if keep == 'pmi':
        return ngram_pmi_score
# NOTE(review): `word` and `longest` are defined before this excerpt; the
# statement below is presumably the body of a loop over `text` -- confirm and
# re-indent accordingly.
if len(word) > len(longest): longest = word
print('longest word:{}'.format(longest))
# Equivalent code below, using two comprehensions;
# it finds all of the longest words
maxlen = max(len(word) for word in text)
print([word for word in text if len(word) == maxlen])
# 4.3.3 Typical uses of counters
# Use a loop variable to extract the successive overlapping 3-grams of a list
n = 3
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
print("3-grams= ", [sent[i:i + n] for i in range(len(sent) - n + 1)])
# Equivalent code using NLTK
print("3-grams= ", list(nltk.trigrams(sent)))
# The same for 2-grams
print("2-grams= ", list(nltk.bigrams(sent)))
# The same for 4-grams
print("4-grams= ", list(nltk.ngrams(sent, 4)))
import pprint
# Use loop variables to build a multidimensional structure
# with a nested list comprehension: m rows of n independent empty sets
m, n = 3, 7
array = [[set() for i in range(n)] for j in range(m)]
array[2][5].add('Alice')
pprint.pprint(array)
# With list multiplication, by contrast, the objects are shared rather than copied
def all_trigram_withcount(mylist):
    """Return ``(trigram, count)`` pairs for *mylist*, most frequent first."""
    trigram_list = list(trigrams(mylist))
    distribution = FreqDist(trigram_list)
    return distribution.most_common()
def respond(message, slack_words):
    """Reply with generated Markov text for each known trigram in the message.

    The text is lowercased and split on single spaces; every trigram whose
    space-joined form is in ``all_trigrams`` triggers one reply seeded with
    the trigram's first two words.
    """
    lowered_words = slack_words.lower().split(' ')
    for tri in trigrams(lowered_words):
        phrase = ' '.join(tri).lower()
        if phrase not in all_trigrams:
            continue
        seed_first, seed_second = tri[0], tri[1]
        message.reply(
            markov.generate_markov_text_with_words(seed_first, seed_second))
# Normalise every training sentence: drop punctuation tokens, lowercase the
# rest, and replace words missing from the unigram dictionary with '<unk>'.
brown_sents_train = []
for sent3 in brown_sent_train:
    sent3 = [
        x.lower() for x in sent3
        if x not in ("``", "''", "--", ".", ",", "!", ";", "(", ")", "?", ":")
    ]
    sent3 = [
        x if x in brown_unigram_dict_train.keys() else '<unk>' for x in sent3
    ]
    brown_sents_train.append(sent3)

# list of sentences (as lists): padded bigrams / trigrams per sentence
elist = [
    list(bigrams(sent, pad_left=True, pad_right=True,
                 left_pad_symbol='<s>', right_pad_symbol='</s>'))
    for sent in brown_sents_train
]
elist_10 = [
    list(trigrams(sent, pad_left=True, pad_right=True,
                  left_pad_symbol='<s>', right_pad_symbol='</s>'))
    for sent in brown_sents_train
]

# list of tuples containing bigrams, trigrams (flattened over sentences)
elist_2 = [t for l in elist for t in l]
elist_12 = [t for l in elist_10 for t in l]

# One ('<s>', '<s>') and one ('</s>', '</s>') bigram per sentence
elist_2 += [('<s>', '<s>'), ('</s>', '</s>')] * len(brown_sents_train)

brown_bigram_dict_train = FreqDist(elist_2)
brown_trigram_dict_train = FreqDist(elist_12)
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

# model[(w1, w2)][w3] -> how often w3 follows the pair (w1, w2)
model = defaultdict(lambda: defaultdict(int))

# Count frequency of co-occurrence over padded sentence trigrams
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1

# Turn the counts of each context into probabilities
for followers in model.values():
    total_count = float(sum(followers.values()))
    for w3 in followers:
        followers[w3] /= total_count

print(dict(model['today', 'the']))
#%% Test with English sentences import nltk import langdetect # Test package langdetect.detect('Hello,How are you?') # copy a test dataset data = data_valid.copy() data.sent = data.sent.str.lower() # lowercase tokenizor = nltk.tokenize.RegexpTokenizer( "[a-zA-Z'`éèî]+") # this re need to be edited token = list() trigram = list() for i in range(len(data)): token.append(tokenizor.tokenize(data['sent'][i + 1])) trigram.append(nltk.trigrams(token[i]))
# Build a character bigram/trigram frequency profile of a text file.
fileloc = " "
# Read-only access is enough; the context manager closes the handle.
with open(fileloc, "r") as fo:
    inp = fo.read()
print(inp)
inp = clean_text(inp)
#inp = input()

# Lowercase every token and wrap it in "|" so that n-grams capture
# word-boundary information.
tokens = ["|" + tok.lower() + "|" for tok in word_tokenize(inp)]

# Count every character bigram and trigram of every token in one profile.
# FreqDist yields 0 for unseen samples, so a plain += is sufficient.
profile = FreqDist()
for t in tokens:
    for cur_bigram in bigrams(list(t)):
        profile["".join(cur_bigram)] += 1
    for cur_trigram in trigrams(list(t)):
        profile["".join(cur_trigram)] += 1
# Horizontal bar chart of mention counts. `names`, `value` and `tokens`
# are defined outside this fragment.
names.reverse()
value.reverse()
val = value  # the bar lengths
pos = arange(15)+.5  # the bar centers on the y axis
pos
plt.figure(figsize=(9,9))
barh(pos,val, align='center',alpha=0.7,color='rgbcmyk')
yticks(pos, names)
xlabel('Mentions')
grid(True)

# Interactive exploration: the results of these expressions are discarded
# when run as a script.
list(nltk.bigrams(tokens))
list(nltk.trigrams(tokens))
sorted(w for w in set(tokens) if w.endswith('ing'))
[w.upper() for w in tokens]

# Classify each token by its casing; anything that is neither all-lowercase
# nor titlecase is reported as punctuation.
for token in tokens:
    if token.islower():
        print(token, 'is a lowercase word')
    elif token.istitle():
        print(token, 'is a titlecase word')
    else:
        print(token, 'is punctuation')
########################################################
# NOTE(review): this `return` is the tail of a function (apparently `idf`)
# whose header is outside the visible fragment.
return math.log(len(list_of_docs) / float(num_docs_containing(word, list_of_docs)))


def tf_idf(word, doc, list_of_docs):
    """Return the product of tf(word, doc) and idf(word, list_of_docs)."""
    return (tf(word, doc) * idf(word, list_of_docs))


#Compute the frequency for each term.
vocabulary = []
docs = {}
all_tips = []
for tip in (['documment 1', 'documment 2']):
    # NOTE(review): `tip` is a plain string here, so `tip.text` raises
    # AttributeError -- the literal list looks like a placeholder for
    # objects with a `.text` attribute; confirm.
    tokens = tokenizer.tokenize(tip.text)
    # n-grams are built from the raw tokens, before filtering below
    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)
    # Unigrams: keep tokens longer than 2 characters, lowercase them,
    # drop stop words
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]
    # Bigrams / trigrams: join with spaces, lowercase, drop any phrase
    # that is itself listed in `stopwords`
    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopwords]
    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopwords]
    # All term kinds are pooled into one token list per document
    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)
    docs[tip] = {'freq': {}, 'tf': {}, 'idf': {}, 'tf-idf': {}, 'tokens': []}
#compute word frequency in text fdist = FreqDist(words_nltk) # get the most two common words in the text fdist.most_common(2) """ stop_words = set(stopwords.words("english")) filtered_words=[] for w in words_nltk: if w not in stop_words: filtered_words.append(w) """ #generate trigram trigram_nltk = [t for t in trigrams(words_nltk)] #trigram_nltk = [t for t in trigrams(filtered_words)] print( "Note: We are running now NLTK Option 2! For text generation we'll use text generating function from Option 1 in the program." ) print( "Please enter an integer as the number of sentences you want to generate, preferably between {} and {}:>>" .format(len(trigram_nltk) // 10, 2 * (len(trigram_nltk) // 10))) number_sentences = input() print( "Please enter an integer as the lenght of the sentence in the generated text, preferably bigger than 2 and smaller than {}:>>" .format(len(trigram_nltk) // 10)) sentence_lenght = input()
def process(sentence):
    """Print every "V* TO V*" word triple found in a tagged sentence.

    *sentence* is iterated as ``(word, tag)`` pairs; a triple is printed
    when the first and third tags start with ``'V'`` and the middle tag
    is exactly ``'TO'``.
    """
    for triple in nltk.trigrams(sentence):
        (w1, t1), (w2, t2), (w3, t3) = triple
        if not t1.startswith('V'):
            continue
        if t2 != 'TO':
            continue
        if not t3.startswith('V'):
            continue
        print(w1, w2, w3)
def summaryGen(fileName,domain,gram=5,debug=False):
    """Rank the terms of one brand's reviews by tf-idf.

    Loads ``../datasets/<domain>.pickle``, concatenates the ``"review"``
    texts stored under ``fileName`` and scores unigrams, bigrams, trigrams
    and ``gram``-grams.

    Args:
        fileName: Key into the unpickled mapping.
        domain: Dataset name; lowercased to build the pickle path.
        gram: Size of the additional n-grams.
        debug: When true, print the intermediate tables with pauses.

    Returns:
        The terms sorted by descending tf-idf, or the string
        ``"Domain not in dataset"`` when the pickle file does not exist.

    Note:
        There is exactly one document and the document count is smoothed
        with +1, so every idf equals ``log(1/2)`` and all scores are <= 0.
    """
    from collections import Counter

    dataset_path = "../datasets/"+domain.lower()+".pickle"
    if not os.path.exists(dataset_path):
        print("Domain not in dataset")
        return "Domain not in dataset"

    # A set makes the many membership tests below O(1)
    stopwords = set(nltk.corpus.stopwords.words())
    tokenizer = RegexpTokenizer(r"[\w']+", flags=re.UNICODE)

    # NOTE(review): pickle.load can execute arbitrary code -- only use
    # with trusted dataset files.
    with open(dataset_path, "rb") as handle:
        brands_reviews = pickle.load(handle)
    review_data = brands_reviews[fileName]
    text = "".join(i["review"] for i in review_data)

    raw_tokens = tokenizer.tokenize(text)

    def phrases(grams):
        # Join each n-gram with spaces, lowercase it, drop stop-word hits
        joined = [' '.join(token).lower() for token in grams]
        return [token for token in joined if token not in stopwords]

    # Unigrams additionally require more than two characters; the n-grams
    # are built from the raw, unfiltered tokens.
    tokens = [token.lower() for token in raw_tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]

    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(phrases(bigrams(raw_tokens)))
    final_tokens.extend(phrases(trigrams(raw_tokens)))
    final_tokens.extend(phrases(ngrams(raw_tokens, gram)))

    #Compute the frequency for each term (one pass instead of list.count
    #per token).
    term_counts = Counter(final_tokens)
    total_terms = float(len(final_tokens))
    vocabulary = [term_counts]  # one Counter per document

    def idf(word):
        # Document count is smoothed with +1, as before
        containing = 1 + sum(1 for counts in vocabulary if counts[word] > 0)
        return math.log(len(vocabulary) / float(containing))

    docs = {}
    docs[0] = {'freq': {}, 'tf': {}, 'idf': {}, 'tf-idf': {},
               'tokens': final_tokens}
    for token, count in term_counts.items():
        #The frequency computed for each tip
        docs[0]['freq'][token] = count
        #The term-frequency (Normalized Frequency)
        docs[0]['tf'][token] = count / total_terms

    for doc in docs:
        for token, term_freq in docs[doc]['tf'].items():
            #The Inverse-Document-Frequency
            docs[doc]['idf'][token] = idf(token)
            #The tf-idf
            docs[doc]['tf-idf'][token] = term_freq * docs[doc]['idf'][token]

    #Now let's find out the most relevant words by tf-idf.
    words = {}
    for doc in docs:
        for token, score in docs[doc]['tf-idf'].items():
            if token not in words or score > words[token]:
                words[token] = score

    review_keywords = sorted(words.items(), key=lambda x: x[1], reverse=True)

    if debug:
        print("After tokenization....")
        sleep(1)
        print(final_tokens)
        sleep(1)
        print("After frequency computation....")
        sleep(1)
        print(docs[0]['freq'])
        sleep(1)
        print("After term frequency computation....")
        sleep(1)
        print(docs[0]['tf'])
        sleep(1)
        print("After Inverse-Document-Frequency computation....")
        sleep(1)
        # Fixed: this step used to print the tf-idf table
        print(docs[0]['idf'])
        sleep(1)
        print("After term-frequency Inverse-Document-Frequency computation....")
        sleep(1)
        print(docs[0]['tf-idf'])
        sleep(1)
        print("Scores.....")
        for i in review_keywords:
            print("%s - %s" % (i[0], i[1]))

    return [i[0] for i in review_keywords]
# Interactive exploration; bare expressions below only show output in a
# REPL/notebook. `single`, `tokens` and `unique` are defined outside this
# fragment.
len(single)
single

# Frequency distribution of the words
tokens.count('gluten')
fd = nltk.FreqDist(tokens)
fd.most_common(50)
fd.plot(50)

# How long are the words? (distribution over word lengths)
fd_wlen = nltk.FreqDist([len(w) for w in unique])
fd_wlen

# What about bigrams and trigrams? (first ten tokens only)
bigr = nltk.bigrams(tokens[:10])
trigr = nltk.trigrams(tokens[:10])
tokens[:10]
list(bigr)
list(trigr)

# Back to text preprocessing: remove punctuation.
# `in string.punctuation` is a substring test, so multi-character tokens
# are dropped only when they occur verbatim inside that string.
tokens_nop = [t for t in tokens if t not in string.punctuation]
print(tokens[:50])
print(tokens_nop[:50])
len(tokens)
len(tokens_nop)
len(set(tokens_nop))

# Convert all characters to lower case
tokens_lower = [t.lower() for t in tokens_nop]
print(tokens_lower[:50])
def all_trigrams(my_list):
    """Return the trigrams of *my_list* as a list."""
    return list(nltk.trigrams(my_list))
def data_cleaning() -> None:
    """Clean job postings, count keyword matches and upload the result to S3.

    Steps, as performed by the code below:

    1. Download ``all_job_posting.csv`` from the ``data-science-team1``
       bucket and load it into a DataFrame.
    2. Strip non-letters from each ``Description``, lowercase it, and
       lemmatize word by word; write ``descrlemmatizesdataframeclean.csv``.
    3. Download ``Final100Keywords.xlsx``, clean and lemmatize the keywords.
    4. For every description, count the unigrams, bigrams and trigrams
       that appear in the keyword list.
    5. Write ``job_posting_with_count.csv`` and upload it to the bucket.

    Returns nothing; all results are files and printed progress messages.
    """
    # have all the variables populated which are required below
    new_list_job_description = []
    lemmatized_description = []

    # Getting the all teams job posting file from Amazon S3
    aws_id = '***'
    aws_secret = '***'
    s3 = boto3.client('s3',
                      aws_access_key_id=aws_id,
                      aws_secret_access_key=aws_secret)
    obj = s3.get_object(Bucket='data-science-team1', Key='all_job_posting.csv')
    data = obj['Body'].read()
    all_job_data = pd.read_csv(io.BytesIO(data), encoding='ISO-8859-1')
    print('All job posting file read successful from amazon S3')

    # Copy the read csv into a new data frame
    # NOTE(review): this binds a second name to the same DataFrame, it does
    # not copy; the in-place reset_index below also affects all_job_data.
    all_job_data_copy = all_job_data
    all_job_data_copy.reset_index(drop=True, inplace=True)

    # Description column of the data frame is converted into a list
    list_job_description = all_job_data['Description'].tolist()

    # Removing special characters from the list and converting it to lowercase
    for i in list_job_description:
        a = re.sub('[^A-Za-z]+', ' ', str(i))
        a = a.lower()
        new_list_job_description.append(a)

    # Removing stop words
    # NOTE(review): the elements here are whole descriptions, not words, so
    # this only drops a description that is exactly one stop word. If any
    # element is dropped, the length no longer matches the DataFrame in the
    # 'Description' assignment further down -- confirm intent.
    new_list_job_description = [
        word for word in new_list_job_description
        if word not in stopwords.words('english')
    ]

    # Adding SPLITHEREAFTERLEMMATIZATION between descriptions so the joined
    # text can be split back into one string per posting after lemmatizing
    combined_description_data = " SPLITHEREAFTERLEMMATIZATION ".join(
        new_list_job_description)

    # lemmatizing words
    lmtzr = WordNetLemmatizer()
    a = combined_description_data.split(' ')
    for i in a:
        word_after_lammatize = lmtzr.lemmatize(i)
        lemmatized_description.append(word_after_lammatize)
    print('Job Posting lemmatization successful')
    lemmatized_description_join = " ".join(lemmatized_description)
    description_df = pd.DataFrame({
        "Job Description":
        lemmatized_description_join.split('SPLITHEREAFTERLEMMATIZATION')
    })

    # Attach the lemmatized text by index and drop the raw column
    descr_lemmatizes_data_frame = all_job_data_copy.join(description_df)
    descr_lemmatizes_data_frame = descr_lemmatizes_data_frame.drop(
        columns="Description")
    descr_lemmatizes_data_frame = descr_lemmatizes_data_frame.dropna()
    descr_lemmatizes_data_frame.to_csv("descrlemmatizesdataframeclean.csv")
    all_job_data['Description'] = new_list_job_description
    job_description_list_descr_lemmatizes_data_frame = descr_lemmatizes_data_frame[
        'Job Description'].tolist()

    # df = pd.read_excel("Words_for_Clustering - copy.xlsx")
    # df = df.dropna()

    # Converting the 100 words list to lowercase, removing special characters, lemmatizing
    s3 = boto3.client('s3',
                      aws_access_key_id=aws_id,
                      aws_secret_access_key=aws_secret)
    obj = s3.get_object(Bucket='data-science-team1',
                        Key='Final100Keywords.xlsx')
    data = obj['Body'].read()
    # NOTE(review): the `encoding` argument may not be accepted by newer
    # pandas read_excel -- verify against the installed version.
    word_list = pd.read_excel(io.BytesIO(data), encoding='ISO-8859-1')
    print('100 keywords file read successful from amazon S3')

    # lemmatized_clean_word_list: List of clean word list
    clean_word_list = []
    # NOTE(review): Series.iteritems() may be unavailable in newer pandas
    # (items() is the replacement) -- verify.
    for key, i in word_list['Keywords'].iteritems():
        a = re.sub('[^A-Za-z]+', ' ', str(i))
        a = a.lower()
        clean_word_list.append(a)
    lemmatizer = WordNetLemmatizer()
    lemmatized_clean_word_list = []
    for word in clean_word_list:
        # Lemmatize each word of a (possibly multi-word) keyword separately
        a = []
        for each in word.split(" "):
            a1 = lemmatizer.lemmatize(each)
            a.append(a1.lower())
        a = " ".join(a)
        lemmatized_clean_word_list.append(a)
    # print(lemmatized_clean_word_list)

    # Counting the numbers of words in each description : one-gram, bi-gram, tri-grams
    list_of_counts = []
    for list_ in job_description_list_descr_lemmatizes_data_frame:
        matched_words = {}
        words = nltk.word_tokenize(list_)
        words_set = set(words)
        bi_grams = nltk.bigrams(words)
        trigr = nltk.trigrams(words)
        bi_grams_pairs = [' '.join(pair) for pair in bi_grams]
        bi_grams_pairs_set = set(bi_grams_pairs)
        trigram_pairs = [' '.join(each) for each in trigr]
        trigram_pairs_set = set(trigram_pairs)
        # count = 0
        # Keep only the terms that are keywords, with their occurrence count
        matched_words.update({
            word: words.count(word)
            for word in words_set if word in lemmatized_clean_word_list
        })
        matched_words.update({
            bi: bi_grams_pairs.count(bi)
            for bi in bi_grams_pairs_set if bi in lemmatized_clean_word_list
        })
        matched_words.update({
            tri: trigram_pairs.count(tri)
            for tri in trigram_pairs_set if tri in lemmatized_clean_word_list
        })
        list_of_counts.append(matched_words)
    df_with_required_count = pd.DataFrame(list_of_counts)

    # with_count_data_frame: Final data frame with jobs and their count
    # NOTE(review): the left frame went through dropna() and keeps its
    # original index, while the counts frame is indexed 0..n-1; rows may
    # misalign in this index-based join if any row was dropped -- confirm.
    with_count_data_frame = descr_lemmatizes_data_frame.join(
        df_with_required_count)
    with_count_data_frame = with_count_data_frame.fillna(0)

    # appending all the remaining rows with the generated dataframe:
    # every keyword without a column yet gets an all-zero column
    remainingwords = []
    # Label slice; requires both "access" and "wealth management" columns
    duplicatedf = with_count_data_frame.loc[:, "access":"wealth management"]
    duplicatedf = duplicatedf.columns.values.tolist()
    word_list = word_list["Keywords"].tolist()
    for each in word_list:
        if each not in duplicatedf:
            remainingwords.append(each)
    for i in remainingwords:
        with_count_data_frame[i] = float(0)
    # with_count_data_frame.to_csv(r'A:\2nd Semester\Data Science\Assignment 2\Anurag\FINAL\word_count.csv')
    print('Job posting Word Count file successful')
    # return with_count_data_frame

    # Persist locally, then upload the file to the same S3 bucket
    with_count_data_frame.to_csv("job_posting_with_count.csv")
    client = boto3.client('s3',
                          aws_access_key_id='***',
                          aws_secret_access_key='***')
    transfer = S3Transfer(client)
    transfer.upload_file('job_posting_with_count.csv', 'data-science-team1',
                         'job_posting_with_count.csv')
    # transfer.upload_file('with_count_data_frame.csv', 'data-science-team1', 'with_count_data_frame' + "/" + 'with_count_data_frame')
    print('Job posting file transferred to S3 successful')
def to_trigrams(words):
    """Yield the padded trigrams of *words*, skipping all-padding ones.

    Both ends are padded (NLTK's default pad symbol is ``None``), so a
    trigram made only of padding is ``(None, None, None)``.
    """
    all_padding = (None, None, None)
    padded = nltk.trigrams(words, pad_left=True, pad_right=True)
    yield from (gram for gram in padded if gram != all_padding)
def common_strings(start, end):
    """Pick the most representative unigram, bigram or trigram in a range.

    The range ``start``..``end`` is tokenized with a separator token
    between strings. N-grams containing the separator are ignored, so a
    bigram or trigram never spans two different strings.

    Returns:
        ``(text, score)``: n-gram words joined with ``"_"`` and the n-gram
        count, or ``("", 0)`` when fewer than ``CS_THRESHOLD`` real tokens
        exist. A longer n-gram wins when its count is at least half the
        count of the next shorter one.
    """
    CS_THRESHOLD = 6
    sep = "tvlwz"
    tokens = string_range_tokenize(start, end, sep)

    # Real, separator-free copy. The previous `u_tokens = tokens` only
    # aliased the list, so deleting separators also removed them from
    # `tokens` and the n-gram filters below could never trigger.
    u_tokens = [tok for tok in tokens if tok != sep]

    print("common_strings tokens:")
    print(tokens)

    if len(u_tokens) < CS_THRESHOLD:
        #print "%08x - %08x : %s" % (start,end,"no string")
        return ("", 0)

    u_gram, u_gram_score = nltk.FreqDist(u_tokens).most_common(1)[0]

    # Bigrams that do not cross a separator
    bgs = [bg for bg in nltk.bigrams(tokens) if sep not in bg]
    if bgs:
        b_gram, b_gram_score = nltk.FreqDist(bgs).most_common(1)[0]
        b_str = b_gram[0] + "_" + b_gram[1]
    else:
        b_str = ""
        b_gram_score = 0

    # Trigrams that do not cross a separator
    tgs = [tg for tg in nltk.trigrams(tokens) if sep not in tg]
    if tgs:
        t_gram, t_gram_score = nltk.FreqDist(tgs).most_common(1)[0]
        t_str = t_gram[0] + "_" + t_gram[1] + "_" + t_gram[2]
    else:
        t_str = ""
        t_gram_score = 0

    #print "1: %s - %d 2: %s - %d 3: %s - %d\n" % (u_gram,u_gram_score,b_str,b_gram_score,t_str,t_gram_score)

    # Prefer the longer n-gram while it is at least half as frequent as
    # the next shorter one.
    if b_gram_score * 2 >= u_gram_score:
        if t_gram_score * 2 >= b_gram_score:
            ret = t_str
            ret_s = t_gram_score
        else:
            ret = b_str
            ret_s = b_gram_score
    else:
        ret = u_gram
        ret_s = u_gram_score

    #print "%08x - %08x : %s" % (start,end,ret)
    return (ret, ret_s)
def calc_trigrams(brown_tags):
    """Compute log2 trigram transition probabilities over tag sequences.

    Each sentence in *brown_tags* is a sequence of tags that is expected
    to begin with two ``'*'`` start symbols.

    For every trigram ``(a, b, c)`` the value is
    ``log2(count(a, b, c) / count(a, b))``. Bigram counts skip each
    sentence's first start symbol, and the count of the ``('*', '*')``
    prefix is taken to be the number of sentences.

    Args:
        brown_tags: Iterable of sliceable tag sequences.

    Returns:
        Dict mapping trigram tuples to their log2 probability.

    Raises:
        KeyError: If a trigram prefix other than ``('*', '*')`` was never
            counted as a bigram (e.g. a sentence without start symbols).
    """
    from collections import Counter

    trigram_counts = Counter()
    bigram_counts = Counter()
    sentence_count = 0  # keeps track of how many sentences there are

    for sentence in brown_tags:
        trigram_counts.update(zip(sentence, sentence[1:], sentence[2:]))
        # Bigrams are counted after removing the first start symbol
        context = sentence[1:]
        bigram_counts.update(zip(context, context[1:]))
        sentence_count += 1

    # Plain dict so that a missing prefix raises KeyError (a Counter
    # would silently yield 0 and turn it into a division by zero).
    bigram_lookup = dict(bigram_counts)

    q_values = {}
    for three_words, count_three_words in trigram_counts.items():
        prev_bigram = (three_words[0], three_words[1])
        if prev_bigram == ('*', '*'):
            # One ('*', '*') prefix per sentence
            count_prev_bigram = sentence_count
        else:
            count_prev_bigram = bigram_lookup[prev_bigram]
        q_values[three_words] = math.log(
            float(count_three_words) / float(count_prev_bigram), 2)

    return q_values
def triGrams(words):
    """Return the trigrams of *words*, each joined into one spaced string."""
    return [' '.join(item) for item in nltk.trigrams(words)]
NEG = scores.get('neg') NEU = scores.get('neu') RES = str() if POS > NEG: RES = 'Positive' elif NEG > POS: RES = 'Negative' elif NEU >= 0.5 or POS > NEU: RES = 'Positive' elif NEU < 0.5: RES = 'Negative' # -------------------------------------------------------- PATTERN ADVERB, ADVERB, ADJECTIVE (Down) tri_pairs = list() for (w1, tag1), (w2, tag2), (w3, tag3) in nltk.trigrams(PoS_TAGS): if tag1.startswith("RB") and tag2.startswith("RB") and tag3.startswith( "JJ"): tri_pairs.append((w1, w2, w3)) if tri_pairs[0] or tri_pairs[1] or tri_pairs[2] in D: print("[True]: Tri Pairs are found in Drought Rel. Term") # TRIGGER AREA for j in range(len(F)): if tri_pairs[0] or tri_pairs[1] or tri_pairs[2] in F[j]: print( "[True]: Tri Pairs are found in Frequent Wordset") if RES is "Positive": RES = "Highly Positive" FW = F[j] # fuzzy_df['FreqWord'].map(lambda x: next((y for y in x.split() if y in F), 'Not Found')) elif RES is "Negative":
def get_trigrams(self, doc_num):
    """Return the trigrams of document *doc_num*, split on whitespace."""
    document_words = self.texts[doc_num].split()
    return nltk.trigrams(document_words)
def __init__(self,
             rdgDir,
             general,
             working_dir='.',
             overwrite=False,
             rank_from_previous=False,
             background_cache_file='ranking.pkl',
             full_to_abbr=False):
    """Load the general (background) and related document groups.

    Args:
        rdgDir: Path of a text file listing one related-group document
            filename per line.
        general: Either the path of a text file listing one general
            document filename per line, or a corpus object with a
            ``words()`` method.
        working_dir: Not used by the code in this method.
        overwrite: Forwarded to every ``Document`` that is created.
        rank_from_previous: When true, loading of the general group is
            skipped.
        background_cache_file: Stored in the module-level
            ``bck_cache_file``.
        full_to_abbr: Not used by the active code in this method.
    """
    # available metrics
    global bck_cache_file
    bck_cache_file = background_cache_file
    self.metrics = {
        'DR': self._calDR,
        'DC': self._calDC,
        'DRDC': self._calDRDC,
        'IDF': self._calIDF,
        'TFIDF': self._calTFIDF,
        'TokenDRDC': self._calTokenDRDC,
        'TokenIDF': self._calTokenIDF,
        'Entropy': self._calEntropy,
        'KLDiv': self._calKLDiv,
        'Weighted': self._calWeighted,
        'TF': self._calTF
    }
    # used for restoring ranking
    # from previous
    self.rankingmap = {}
    # input files
    self.genDocs = Document(overwrite=overwrite)
    #for numBackDocs
    # updates by Y Gu 11/2018 for pkl file type compatibility
    self.genDocsNum = 0
    #filtfname = os.path.join(rdgDir, 'filter.save')
    #filtfname = os.path.join(working_dir, '.filter.save')
    # General document group is given as files in a directory
    if rank_from_previous:
        pass
    elif isinstance(general, str):
        logging.debug('Loading general documents from ' + general)
        # Read the file list eagerly so the handle is closed right away;
        # the Document objects are still created lazily, one per pass.
        with open(general) as listing:
            general_paths = listing.readlines()
        gen = (Document(filename=path.strip(), overwrite=overwrite)
               for path in general_paths)
        ## note that the iterator only lets us calculate this once
        ## this is OK because this is the initialization function
        # we only need the sum for the general class
        ## TrueTdf updates by Y Gu 6/2018
        ## Updated again by Y Gu 11/2018 for type compatibility
        for iterator in gen:
            self.genDocsNum += 1
            for w in iterator.counts:
                self.genDocs.counts[w] += iterator.counts[w]
                # updates by Y Gu 11/2018 for pkl file type compatibility
                self.genDocs.token_counts[w] += 1
    # General document group is given as a corpus
    else:
        logging.debug('Loading from general corpus...')
        # NGrams in lieu of NPs -- we are storing extra info
        words = general.words()
        logging.debug('Unigrams loading')
        bigrams = nltk.bigrams(words)
        logging.debug('Bigrams loading')
        trigrams = nltk.trigrams(words)
        logging.debug('Trigrams loading')
        #filters = ['abbreviation', 'case', 'stem']
        filters = Settings.getCorpusFilters()
        logging.debug('Filtering unigrams')
        for w in words:
            for filt in filters:
                # if filt == 'abbreviation':
                #     w = Filter.criteria[filt](w,full_to_abbr)
                # ## Somewhat of a kludge, the more general approach
                # ## would be to allow all filters to take multiple arguments.
                # ## If these get expanded, that would be the way to go.
                # else:
                w = Filter.criteria[filt](w)
            # A filter may return a falsy value to drop the term
            if w:
                self.genDocs.counts[w] += 1
                self.genDocs.token_counts[w] += 1
        logging.debug('Filtering bigrams')
        for gram in bigrams:
            w = ' '.join(gram)
            for filt in filters:
                w = Filter.criteria[filt](w)
            if w:
                self.genDocs.counts[w] += 1
        logging.debug('Filtering trigrams')
        for gram in trigrams:
            w = ' '.join(gram)
            for filt in filters:
                w = Filter.criteria[filt](w)
            if w:
                self.genDocs.counts[w] += 1
    logging.debug('done')
    # Related Document Group -- we need each document separately
    logging.debug('Loading RDG from ' + rdgDir + '...')
    #self.rdgDocs = [Document(rdgDir+rdgFile) for rdgFile in os.listdir(rdgDir) if rdgFile[-4:]=='.txt']
    with open(rdgDir) as listing:
        rdg_paths = listing.readlines()
    ## rdgDocs needs to be a real list (it is used more than once)
    self.rdgDocs = [
        Document(filename=path.strip(), overwrite=overwrite)
        for path in rdg_paths
    ]
    logging.debug('done')
import collections

# Vocabulary: lowercase word frequencies over all lines
counter = collections.Counter()
for line in lines:
    counter.update(word.lower() for word in nltk.word_tokenize(line))

# Index words by descending frequency, starting at 1
word2idx = {
    w: rank
    for rank, (w, _) in enumerate(counter.most_common(), start=1)
}
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

# Training pairs: for each trigram (left, center, right) the center word is
# the input and both neighbours are targets. Per line, all (center, left)
# pairs come first, followed by all (center, right) pairs.
xs = []
ys = []
for line in lines:
    embedding = [word2idx[w.lower()] for w in nltk.word_tokenize(line)]
    triples = list(nltk.trigrams(embedding))
    w_lefts = [left for left, _, _ in triples]
    w_centers = [center for _, center, _ in triples]
    w_rights = [right for _, _, right in triples]
    xs.extend(w_centers + w_centers)
    ys.extend(w_lefts + w_rights)

print(len(word2idx))
vocab_size = len(word2idx) + 1  # indices start at 1, hence the extra slot

ohe = OneHotEncoder(n_values=vocab_size)
x_column = np.array(xs).reshape(-1, 1)
X = ohe.fit_transform(x_column).todense()
y_column = np.array(ys).reshape(-1, 1)
Y = ohe.fit_transform(y_column).todense()
# Remove single-character tokens (mostly punctuation) words = [word for word in words if len(word) > 1] # turn words into their roots words = [textblob.Word(w).lemmatize() for w in words] # extract freq of single words fdist = nltk.FreqDist(words) # print for preview for word, frequency in fdist.most_common(50): print(u'{};{}'.format(word, frequency)) # 2 and 3-ngrams bgs = nltk.bigrams(words) tgs = nltk.trigrams(words) fdist_bgs = nltk.FreqDist(bgs) fdist_tgs = nltk.FreqDist(tgs) # preview for word, frequency in fdist_bgs.most_common(50): print(u'{};{}'.format(word, frequency)) for word, frequency in fdist_tgs.most_common(50): print(u'{};{}'.format(word, frequency)) # sort by occurance and count sgs_sorted = fdist.most_common() bgs_sorted = fdist_bgs.most_common() tgs_sorted = fdist_tgs.most_common()
# Accumulators for the 'Sentiment' tag. The group*/stereotype* lists,
# `trainSet` and `taggedTweets` are defined outside this fragment.
sentimentWords = []
sentimentBigrams = []
sentimentTrigrams = []

# For every tag of every training tweet, collect the tag text's words,
# bigrams and trigrams into the lists of the matching tag type.
for tweet in trainSet.keys():
    for tag in taggedTweets[tweet]:
        # tag[0] is the tag type, tag[1]['text'] the tagged text
        if tag[0] == 'Group':
            # Split the text and clean it up by removing punctuation and
            # making each word lowercase
            text = [
                w.strip('"\\.,:/!?\'()').lower()
                for w in tag[1]['text'].split()
            ]
            groupWords.extend(text)
            groupBigrams.extend(list(nltk.bigrams(text)))
            groupTrigrams.extend(list(nltk.trigrams(text)))
        elif tag[0] == 'Stereotype':
            text = [
                w.strip('"\\.,:/!?\'()').lower()
                for w in tag[1]['text'].split()
            ]
            stereotypeWords.extend(text)
            stereotypeBigrams.extend(list(nltk.bigrams(text)))
            stereotypeTrigrams.extend(list(nltk.trigrams(text)))
        elif tag[0] == 'Sentiment':
            text = [
                w.strip('"\\.,:/!?\'()').lower()
                for w in tag[1]['text'].split()
            ]
            sentimentWords.extend(text)
            sentimentBigrams.extend(list(nltk.bigrams(text)))
            # NOTE(review): sentimentTrigrams is never filled in the visible
            # code -- the fragment appears to be cut off here; confirm.
def _rounded_ratio(count, total):
    """Return count / total rounded to 4 decimals, or 0.0 when total is 0."""
    if not total:
        return 0.0
    return round(float(count) / float(total), 4)


def extract_bias_features(text):
    """Build the dictionary of bias-related features for one piece of text.

    The text is lower-cased and tokenized with NLTK. Each lexicon then
    contributes a ``<name>_cnt`` feature (the value returned by the
    module's counting helper) and a ``<name>_rto`` feature (that count
    divided by the number of kept tokens, rounded to 4 decimals).
    Sentiment, subjectivity, modality, mood and reading-grade scores are
    computed from the original, un-lowered ``text``.

    Args:
        text: The statement to analyse. It is passed through ``str()`` for
            tokenization and handed unchanged to the scoring helpers.

    Returns:
        dict: Feature name -> value, with keys inserted in a fixed order.
        When no tokens survive filtering (empty or punctuation-only text)
        every ``*_rto`` feature is 0.0 instead of raising
        ZeroDivisionError.
    """
    features = {}
    txt_lwr = str(text).lower()
    words = nltk.word_tokenize(txt_lwr)
    # NOTE: `not in` against a string is a substring test, so this drops
    # every token that occurs inside the literal (single punctuation marks,
    # "'s", "s", ...), not only tokens equal to one of its characters.
    words = [w for w in words if len(w) > 0 and w not in '.?!,;:\'s"$']
    word_total = len(words)
    unigrams = sorted(set(words))
    # Unique n-grams re-joined into space-separated phrases so they can be
    # matched against multi-word lexicon entries.
    bigrams = [" ".join(pair) for pair in sorted(set(nltk.bigrams(words)))]
    trigrams = [
        " ".join(triple) for triple in sorted(set(nltk.trigrams(words)))
    ]

    # word count
    features['word_cnt'] = word_total
    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # Lexicons matched against words, bigrams and trigrams. The tuple order
    # fixes the insertion order of the resulting feature keys.
    phrase_lexicons = (
        ('cm', coherence),             # coherence markers
        ('dm', modifiers),             # degree modifiers
        ('hedge', hedges),             # hedge words
        ('factive', factives),         # factive verbs
        ('assertive', assertives),     # assertive verbs
        ('implicative', implicatives),  # implicative verbs
        ('bias', biased),              # bias words and phrases
        ('opinion', opinionLaden),     # opinion words
        ('subj_weak', subj_weak),      # weak subjective words
        ('subj_strong', subj_strong),  # strong subjective words
    )
    for prefix, lexicon in phrase_lexicons:
        count = count_feature_list_freq(lexicon, words, bigrams, trigrams)
        features[prefix + '_cnt'] = count
        features[prefix + '_rto'] = _rounded_ratio(count, word_total)

    # composite sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(
        text)['compound']
    features['vader_sentiment'] = compound_sentiment

    # subjectivity score using Pattern.en
    pattern_subjectivity = pattern_sentiment(text)[1]
    features['subjectivity'] = round(pattern_subjectivity, 4)

    # modality (certainty) score and mood using
    # http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentenceObj = Sentence(sentence)
    features['modality'] = round(modality(sentenceObj), 4)
    features['mood'] = mood(sentenceObj)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = textstat.flesch_kincaid_grade(text)

    # LIWC categories, matched against single words only.
    liwc_lexicons = (
        ('liwc_3pp', liwc_3pp),        # 3rd person pronouns (S/he and They)
        ('liwc_aux', liwc_aux),        # auxiliary verbs
        ('liwc_adv', liwc_adv),        # adverbs
        ('liwc_prep', liwc_prep),      # prepositions
        ('liwc_conj', liwc_conj),      # conjunctions
        ('liwc_discr', liwc_discr),    # discrepancy words
        ('liwc_tent', liwc_tent),      # tentative words
        ('liwc_cert', liwc_cert),      # certainty words
        ('liwc_causn', liwc_causn),    # causation words
        ('liwc_work', liwc_work),      # work words
        ('liwc_achiev', liwc_achiev),  # achievement words
    )
    for prefix, lexicon in liwc_lexicons:
        count = count_liwc_list_freq(lexicon, words)
        features[prefix + '_cnt'] = count
        features[prefix + '_rto'] = _rounded_ratio(count, word_total)

    return features
# -*- coding: utf-8 -*-
"""Unigram, Biagram And Trigram Code .ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1fz3ji7JokGo0IDMD78uL4FBfQWh8-Wzg
"""

import nltk as nlp
# nlp.download()

# Sample text for the n-gram demo. The literal used to open with four quote
# characters, which put a stray '"' at the start of the text and therefore
# into the first bigram and trigram; it now starts with the text itself.
paragrapha = (
    " Machine learning is a form of AI that enables a system to learn from "
    "data rather than through explicit programming. However, machine "
    "learning is not a simple process. As the algorithms ingest training "
    "data, it is then possible to produce more precise models based on that "
    "data. A machine-learning model is the output generated when you train "
    "your machine-learning algorithm with data. After training, when you "
    "provide a model with an input, you will be given an output. For "
    "example, a predictive algorithm will create a predictive model. Then, "
    "when you provide the predictive model with data, you will receive a "
    "prediction based on the data that trained the model. illustration of "
    "robot solving puzzle Iterative learning Machine learning enables "
    "models to train on data sets before being deployed. Some machine- "
    "learning models are online and continuous. This iterative process of "
    "online models leads to an improvement in the types of associations "
    "made between data elements. Due to their complexity and size, these "
    "patterns and associations could have easily been overlooked by human "
    "observation. After a model has been trained, it can be used in real "
    "time to learn from data. The improvements in accuracy are a result of "
    "the training process and automation that are part of machine "
    "learning. "
)

# Sentence- and word-level tokenizations of the sample text.
sentence = nlp.sent_tokenize(paragrapha)
word = nlp.word_tokenize(paragrapha)

# Print every adjacent word pair, then every adjacent word triple.
print(list(nlp.bigrams(word)))
print(list(nlp.trigrams(word)))
# print(sentence)
# Take the first child of the first <p> element, lower-case it and replace
# every non-word character with a space.
text = soup.p.contents[0]
text_1 = text.lower()
# Raw string: '\W' in a normal literal is an invalid escape sequence.
text_2 = re.sub(r'\W', ' ', text_1)

from nltk import word_tokenize
from nltk import bigrams
from nltk import trigrams
from nltk import ngrams

# Tokens plus their 2-, 3- and 4-gram sequences.
text_3 = word_tokenize(text_2)
text_3_bi = bigrams(text_3)
text_3_tri = trigrams(text_3)
text_3_n = ngrams(text_3, 4)

# The response body is bytes on Python 3, so decode it before splitting on
# '\n'; splitting the raw bytes with a str separator raises TypeError.
stop_words = urlopen(
    'http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop'
).read().decode('utf-8').split('\n')

# Drop the stop words from the token list. A set gives constant-time
# membership tests; `stop_words` itself stays a list for any later use.
_stop_word_set = set(stop_words)
text_4 = [x for x in text_3 if x not in _stop_word_set]

# The tokens that were removed, in their original order.
_kept_tokens = set(text_4)
text_rem = [x for x in text_3 if x not in _kept_tokens]