def _get_bigram_scores(self, posdata, negdata):
    """Score every unigram and top bigram by chi-square informativeness.

    posdata/negdata are iterables of tokenized documents (lists of words).
    Returns a dict mapping each word or bigram tuple to the sum of its
    positive and negative chi-square scores (higher = more informative).
    """
    pos_words = list(itertools.chain(*posdata))
    neg_words = list(itertools.chain(*negdata))
    pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
    neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
    pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    # Candidate features: unigrams plus the 5000 best bigrams per class.
    pos = pos_words + pos_bigrams
    neg = neg_words + neg_bigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: FreqDist.iteritems() is Python 2 / old NLTK only; items()
    # works on both Python 2 and 3.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    """Chi-square information scores for all unigrams plus best bigrams.

    Loads segmented/filtered positive and negative corpora from fixed
    paths and returns {word_or_bigram: combined chi-square score}.
    """
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    bigram_finderr = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finderr.nbest(BigramAssocMeasures.chi_sq, 350000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # BUG FIX: FreqDist.inc() was removed in NLTK 3; use indexed increment.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only; items() works everywhere.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords, n = 5000):
    """Score unigrams plus the n best bigrams per class by summed
    chi-square informativeness across both classes."""
    # (posWords,negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    finder = BigramCollocationFinder.from_words(posWords)
    posbigrams = finder.nbest(BigramAssocMeasures.chi_sq, n)
    finder = BigramCollocationFinder.from_words(negWords)
    negbigrams = finder.nbest(BigramAssocMeasures.chi_sq, n)
    posWords = posWords + posbigrams
    negWords = negWords + negbigrams
    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    # Count each candidate feature globally and per class.
    for label, feats in (('pos', posWords), ('neg', negWords)):
        for feat in feats:
            wordfd[feat] += 1
            conditionwordfd[label][feat] += 1
    pos_total = conditionwordfd['pos'].N()
    neg_total = conditionwordfd['neg'].N()
    grand_total = pos_total + neg_total
    for feat, freq in wordfd.items():
        wordscores[feat] = (
            BigramAssocMeasures.chi_sq(conditionwordfd['pos'][feat],
                                       (freq, pos_total), grand_total)
            + BigramAssocMeasures.chi_sq(conditionwordfd['neg'][feat],
                                         (freq, neg_total), grand_total)
        )
    return wordscores
def create_word_bigram_scores():
    """Chi-square scores for unigrams plus the 5000 best bigrams per
    class, loaded from pickled pre-segmented review corpora."""
    # BUG FIX: pickles must be opened in binary mode ('rb'); also use
    # context managers so the file handles are closed.
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
        posdata = pickle.load(pos_file)
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
        negdata = pickle.load(neg_file)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # BUG FIX: both finders were bound to the same name, so the "positive"
    # bigrams were actually mined from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords, score_method = BigramAssocMeasures.chi_sq):
    '''Measure word informativeness using unigrams plus bigram collocations.'''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only; items() works on Python 3 too
    # (the rest of this function already uses Python 3 idioms).
    for word, freq in word_fd.items():
        pos_score = score_method(cond_word_fd['pos'][word],
                                 (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word],
                                 (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords):
    """Chi-square scores for stringified unigrams + top-2000 bigrams."""
    # BUG FIX: a single finder name was rebound, so the "positive" bigrams
    # were actually mined from the negative word list.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Features are keyed by str(word) so bigram tuples become strings.
    for word in pos:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300):
    """Collect informative bigrams from tagged sentences.

    sents_tagged is an iterable of (tag, sentence) pairs; tag 1 marks a
    positive sentence and -1 a negative one.  Returns the union of the
    n best bigrams from each class, keeping only bigrams whose words are
    both longer than 3 characters and not in the `ex` exclusion list.
    """
    sents_pos = []
    sents_neg = []
    # Single pass: split sentences by their tag.
    for tag, sent in sents_tagged:
        if tag == 1:
            sents_pos.append(sent)
        elif tag == -1:
            sents_neg.append(sent)

    def _lowered_words(sentences):
        # Tokenize, drop punctuation tokens, lowercase the rest.
        return [tok.lower() for s in sentences for tok in word_tokenize(s)
                if tok not in string.punctuation]

    words_pos = _lowered_words(sents_pos)
    words_neg = _lowered_words(sents_neg)
    # Best collocations per class, then merged without duplicates.
    best_pos = BigramCollocationFinder.from_words(words_pos).nbest(score_fn, n)
    best_neg = BigramCollocationFinder.from_words(words_neg).nbest(score_fn, n)
    bigrams_all = list(set(best_pos).union(set(best_neg)))
    return [bg for bg in bigrams_all
            if len(bg[0]) > 3 and len(bg[1]) > 3
            and bg[0] not in ex and bg[1] not in ex]
def create_word_bigram_scores():
    """Chi-square information scores from Excel-loaded sentiment corpora."""
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # BUG FIX: a single finder name was rebound, so positive bigrams were
    # mined from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    # BUG FIX: FreqDist.inc() was removed in NLTK 3; use indexed increment.
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1
    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word],
                                               (freq, pos_word_count),
                                               totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word],
                                               (freq, neg_word_count),
                                               totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_bigram_scores():
    """Chi-square information scores for the top-8000 bigrams only
    (no unigrams), from Excel-loaded sentiment corpora."""
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # BUG FIX: a single finder name was rebound, so positive bigrams were
    # mined from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    pos = posBigrams
    neg = negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # BUG FIX: FreqDist.inc() was removed in NLTK 3.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_words_bigrams_scores():
    """Chi-square scores for unigrams plus the 5000 best bigrams per class."""
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # BUG FIX: a single finder name was rebound, so positive bigrams were
    # mined from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    """Score module-level posWords/negWords plus their best 5000 bigrams
    via get_scores()."""
    # BUG FIX: a single finder name was rebound, so the "positive" bigrams
    # were actually mined from the negative word list.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    return get_scores(pos, neg)
def create_word_bigram_scores():
    """Three-way (pos/neg/obj) chi-square information scores for unigrams
    plus the 5000 best bigrams of each corpus."""
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    objWords = list(itertools.chain(*objdata))
    # BUG FIX: one finder name was rebound three times, so every nbest()
    # call mined bigrams from the objective corpus.  Use one finder each.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    obj_finder = BigramCollocationFinder.from_words(objWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    objBigrams = obj_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    obj = objWords + objBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    # BUG FIX: this loop iterated objWords, silently skipping objBigrams
    # (the pos/neg loops iterate words + bigrams).
    for word in obj:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count
    word_scores = {}
    # BUG FIX: iteritems() is Python 2 only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word],
                                               (freq, obj_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score
    return word_scores
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    # Recompute only if the cached result was built with other parameters.
    cache_valid = (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    )
    if not cache_valid:
        self._num = num
        self._window_size = window_size
        # print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(
            lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(
            bigram_measures.likelihood_ratio, num)
    colloc_strings = [' '.join(pair) for pair in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
def get_bigram(self, features_list):
    """Return the best bigrams of features_list, each paired with True.

    The number kept is controlled by self.bigram_threshold; ranking uses
    the chi-square association measure.
    """
    finder = BigramCollocationFinder.from_words(features_list)
    top_bigrams = finder.nbest(BigramAssocMeasures.chi_sq, self.bigram_threshold)
    return [(pair, True) for pair in top_bigrams]
def get_frequencies(self, desc):
    # Print and return the top unigrams, bigrams and trigrams of `desc`.
    # NOTE: Python 2 syntax (print statements) — do not run under Python 3.
    stopset = set(stopwords.words('english'))
    # Drop tokens shorter than 3 characters or in the stopword set.
    filter_stops = lambda w: len(w) < 3 or w in stopset
    words = word_tokenize(desc)
    print '------gram--------'
    words_to_count = [word for word in words if word not in stopset]
    words_to_count = [word for word in words_to_count if not len(word) < 3]
    c = Counter(words_to_count)
    single = c.most_common(20)
    print single
    print '------bigram--------'
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter_stops)
    bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
    print bigrm
    print '------trigram--------'
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)  #only keep those that appear more than 3 times
    trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
    print trigrm
    # Returns [top-20 unigrams with counts, top-15 bigrams, top-10 trigrams].
    matches = [single,bigrm,trigrm]
    return matches
def get_bigrams1(tweet, score_fn=BigramAssocMeasures.chi_sq, n=200): bigramslist = [] bigram_finder = BigramCollocationFinder.from_words(tweet) bigrams = bigram_finder.nbest(score_fn, n) for bigram in bigrams: bigramslist.append(' '.join(str(i) for i in bigram)) print bigramslist
def best_bigram_word_features(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Feature dict of the n best bigrams merged with the best unigrams."""
    finder = BigramCollocationFinder.from_words(words)
    feats = {pair: True for pair in finder.nbest(score_fn, n)}
    feats.update(best_word_features(words))
    return feats
def bigram_word_features(words, limit, stop, stopset, word_score_placeholder,
                         score_fn=BigramAssocMeasures.chi_sq):
    """True-valued unigram + bigram features; optionally drops stopwords
    from the words before mining bigrams."""
    if stop:
        words = [tok for tok in words if tok not in stopset]
    top_bigrams = BigramCollocationFinder.from_words(words).nbest(score_fn, limit)
    return {feat: True for feat in itertools.chain(words, top_bigrams)}
def tweet_features(tweet): tweet_words = word_tokenize(tweet) bigram_finder = BigramCollocationFinder.from_words(tweet_words) score_fn=BigramAssocMeasures.chi_sq bigrams = bigram_finder.nbest(score_fn, 200) print bigrams return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
def get_bag_of_bigrams_words(
        word_list, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Bag-of-words features over the words plus their n best bigrams."""
    top_bigrams = BigramCollocationFinder.from_words(word_list).nbest(score_fn, n)
    return get_bag_of_words(word_list + top_bigrams)
def get_collocations(self):
    """Top-40 likelihood-ratio collocations of self.text_array, skipping
    short words and English stopwords."""
    ignored_words = stopwords.words('english')
    finder = BigramCollocationFinder.from_words(self.text_array, 2)
    finder.apply_freq_filter(3)  # require at least 3 occurrences
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in ignored_words)
    measures = BigramAssocMeasures()
    return finder.nbest(measures.likelihood_ratio, 40)
def bigram_words(words, score_fn = BigramAssocMeasures.chi_sq, n=200):
    """Bag of words combined with the n highest-scoring bigrams."""
    top_bigrams = BigramCollocationFinder.from_words(words).nbest(score_fn, n)
    return bag_of_words(words + top_bigrams)
def converter(tokens, label=None, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Build a (features, label) pair from a token list.

    Features map every token and each of the n best bigrams to True.
    BUG FIX: `score_fn`, `n` and `label` were free (undefined) names in
    the original, which raised NameError when called at module level;
    they are now defaulted parameters, so existing converter(tokens)
    calls keep working.
    NOTE(review): if this was originally a closure inside another
    function, wire the enclosing values through these parameters.
    """
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(score_fn, n)
    return (
        {ngram: True for ngram in itertools.chain(tokens, bigrams)},
        label
    )
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Drop English stopwords, then return a True-valued feature dict of
    the remaining words plus their n best bigrams.'''
    stopset = set(stopwords.words('english'))
    kept = [tok for tok in tokenize(text) if tok not in stopset]
    finder = BigramCollocationFinder.from_words(kept)
    best = finder.nbest(score_fn, n)
    return {ngram: True for ngram in itertools.chain(kept, best)}
def collaction_discovery(self):
    """Return the 10 bigrams with the highest likelihood ratio.

    Side effect: self.corpus is replaced by its lower-cased token list.
    Bigrams containing short words or stopwords are filtered out.
    """
    self.corpus = nltk.word_tokenize(self.corpus.lower())
    finder = BigramCollocationFinder.from_words(self.corpus)
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w in self.stopwords_)
    return finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
def get_bigrams(self, tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return the top-n bigrams of the tokenized tweet as space-joined
    strings, e.g. ['you dude', 'Hi How', 'How are', 'are you']."""
    finder = BigramCollocationFinder.from_words(tweet)
    return [' '.join(str(part) for part in pair)
            for pair in finder.nbest(score_fn, n)]
def demo_collocations(self, num=40, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    @seealso: L{find_collocations}
    @param num: The maximum number of collocations to print.
    @type num: C{int}
    @param window_size: The number of tokens spanned by a collocation (default=2)
    @type window_size: C{int}
    """
    # NOTE: Python 2 code (print statements).  Results are memoised on
    # self._collocations and rebuilt only when num/window_size change.
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        print "Building collocations list"
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        from nltk.collocations import BigramCollocationFinder
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)  # drop bigrams that occur only once
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        from nltk.metrics import f_measure, BigramAssocMeasures
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1+u' '+w2 for w1, w2 in self._collocations]
    print "List {0} collocations".format(num)
    print tokenwrap(colloc_strings, separator=u'; ')
def ShowCollocations():
    # Tkinter callback: tokenize the contents of the results box, drop
    # stopwords, and list the top bigram collocations in the text widget.
    text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
    import nltk
    from nltk.collocations import BigramCollocationFinder
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures
    from nltk.metrics import TrigramAssocMeasures
    # Verbose regex: acronyms, hyphenated/apostrophe words, numbers,
    # ellipses, and assorted punctuation.
    pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
    data = resultsbox.get(1.0,END)
    rawtext=nltk.regexp_tokenize(data, pattern)
    prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
    text.delete(1.0, END)
    text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
    text.insert(END, "\nBigram Collocations:\n")
    bigram = BigramAssocMeasures()
    bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
    bigramfinder.apply_freq_filter (3)
    # NOTE(review): despite the banner text above, this takes the 10
    # highest-PMI bigrams rather than applying a PMI >= 10 threshold —
    # confirm which behaviour is intended.
    bigrams=bigramfinder.nbest(bigram.pmi, 10)
    for item in bigrams:
        first = item[0]
        second = item[1]
        text.insert(END, first)
        text.insert(END, " ")
        text.insert(END, second)
        text.insert(END, "\n")
def bigram(ytcomments, drug):
    """Return the top-30 PMI bigrams (window size 20) that mention `drug`.

    ytcomments is a flat token sequence; the ngram filter keeps only
    pairs where one side equals the drug term.
    """
    bi = BigramAssocMeasures()
    bi_finder = BigramCollocationFinder.from_words(ytcomments, window_size=20)
    # FIX: the original also computed an unfiltered nbest() list that was
    # never used; that dead computation has been removed.
    bi_finder.apply_ngram_filter(lambda w1, w2: w1 != drug and w2 != drug)
    top_bi = bi_finder.nbest(bi.pmi, 30)
    return top_bi
def get_bigrams(self, rawText, score_fn=BigramAssocMeasures.pmi, n=40):  # TODO configuration value
    """Top-n bigrams of the cleaned text, ranked by score_fn (default PMI).

    rawText is cleaned at level 3 via TokensCleaner before mining.
    """
    clean_text = TokensCleaner.clean(self, rawText, cleaning_level=3)
    bigram_finder = BigramCollocationFinder.from_words(clean_text['3'])
    # BUG FIX: score_fn was accepted but ignored — PMI was hard-coded,
    # so callers could never change the ranking measure.  Behaviour is
    # unchanged for the default argument.
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
def create_features(X, user_data=None):
    # Build one feature dict per (date, comment, user) triple in X.
    # NOTE(review): Python 2 / NLTK 2 era code — nltk.clean_html() was
    # removed in NLTK 3, filter() here is relied on to return a list, and
    # the `/` divisions assume the ratios end up as floats via the -1.0
    # fallback.  Indentation was reconstructed from a flattened source;
    # verify block boundaries against the original repository.
    res = []
    for date, comment, user in X:
        feat = {}
        has_hate_word = has_drug_word = has_cult_word = has_occult_word = has_porn_word = 0
        has_fwenzel_word = 0
        # Checked against the raw comment, before lowercasing/HTML strip.
        has_swastika = swastika in comment
        comment = comment.lower()
        comment = parse_text(comment)
        comment = nltk.clean_html(comment)
        sents = sent_tokenize(comment)
        doc = []
        for sent in sents:
            # Tokenize each sentence.
            doc += wordtokenizer.tokenize(sent)

        def repl_filter(x):
            return x.lower() not in ["nl", "nl2", "nbsp", "nbsp2", "dummyhtml"]
        # Remove stopwords and replacement tokens.
        doc = filter(repl_filter, doc)
        # Normalise each token (bad-word masking, stemming, lemmatising)
        # and raise the per-category dictionary flags.
        for i, word in enumerate(doc):
            if doc[i] in bad_words:
                doc[i] = '_badword_'
            doc[i] = ps.stem(doc[i])
            doc[i] = wnl.lemmatize(doc[i])
            if doc[i] in bad_words:
                doc[i] = '_badword_'
            if doc[i] in hate_words:
                has_hate_word = 1
            if doc[i] in drug_words:
                has_drug_word = 1
            if doc[i] in cult_words:
                has_cult_word = 1
            if doc[i] in occult_words:
                has_occult_word = 1
            if doc[i] in porn_words:
                has_porn_word = 1
            if doc[i] in fwenzel_words:
                has_fwenzel_word = 1
        # Unigram + top-5 chi-square bigram presence features.
        bigram_finder = BigramCollocationFinder.from_words(doc)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n=5)
        bigram = dict([(ngram, True) for ngram in itertools.chain(doc, bigrams)])
        feat.update(bigram)
        # Ratio of alphabetic tokens not found in the English vocabulary.
        text_vocab = set(w for w in doc if w.isalpha())
        unusual = text_vocab.difference(english_vocab)
        unusual_ratio = len(unusual) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0
        unusual2 = unusual.difference(set("_badword_"))
        unusual_ratio2 = len(unusual2) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0
        if user_data is not None:
            user_info = user_data[user]
        # for/else: has_bad_word stays True only if the loop breaks.
        has_bad_word = True
        for word in bad_words:
            if word in comment.lower():
                break
        else:
            has_bad_word = False

        def n_none(x):
            return int(x) if x is not None else 0

        def c_none(x):
            return x if x is not None else "__None__"
        # Readability metrics computed on the cleaned comment text.
        readability = ReadabilityTool(comment)
        read_feat = {}
        for f, val in readability.analyzedVars.items():
            if f != 'words':
                read_feat["_" + f] = val
        for test, val in readability.tests_given_lang['eng'].items():
            read_feat["__" + test] = val(readability.text)
        feat['_always_present'] = True
        feat['_word_num'] = len(doc)
        feat['_sent_num'] = len(sents)
        feat['_word_var'] = len(set(doc)) / len(doc) if len(doc) != 0 else -1.0
        feat['_sent_var'] = len(set(sents)) / len(sents)
        feat['_unusual_ratio'] = unusual_ratio
        feat['_unusual_ratio2'] = unusual_ratio2
        if user_data is not None:
            feat['_username'] = user
            feat['_user_subcount'] = int(user_info['SubscriberCount'])
            feat['_user_friends'] = int(user_info['FriendsAdded'])
            feat['_user_favs'] = int(user_info['VideosFavourited'])
            feat['_user_videorates'] = int(user_info['VideosRated'])
            feat['_user_videouploads'] = int(user_info['VideosUploaded'])
            feat['_user_videocomments'] = int(user_info['VideosCommented'])
            feat['_user_videoshares'] = int(user_info['VideosShared'])
            feat['_user_usersubs'] = int(user_info['UserSubscriptionsAdded'])
            feat['_user_gender'] = c_none(user_info['Gender'])
            feat['_user_age'] = n_none(user_info['Age'])
            feat['_user_closed'] = user_info['UserAccountClosed']
            feat['_user_suspended'] = user_info['UserAccountSuspended']
            feat['_user_has_gender'] = 1 if user_info[
                'Gender'] is not None else 0
            feat['_user_has_school'] = 1 if user_info[
                'School'] is not None else 0
            feat[
                '_user_has_books'] = 1 if user_info['Books'] is not None else 0
            feat['_user_has_movies'] = 1 if user_info[
                'Movies'] is not None else 0
            feat[
                '_user_has_music'] = 1 if user_info['Music'] is not None else 0
            feat['_user_has_location'] = 1 if user_info[
                'Location'] is not None else 0
            feat['_user_has_hometown'] = 1 if user_info[
                'Hometown'] is not None else 0
            # feat['_user_last'] = user_info['LastWebAccess']
        # Dictionary features
        feat['_has_bad_word'] = has_bad_word
        # feat['_has_hate_word'] = has_hate_word
        # feat['_has_drug_word'] = has_drug_word
        feat['_has_cult_word'] = has_cult_word
        feat['_has_swastika'] = has_swastika
        # feat['_has_occult_word'] = has_occult_word
        # feat['_has_has_fwenzel_word'] = has_fwenzel_word
        feat['_has_porn_word'] = has_porn_word
        feat['_has_swastika'] = has_swastika
        feat.update(read_feat)
        # print feat
        res.append(feat)
    return res
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=500):
    """Return the top-n bigrams of `words`, each pair concatenated into a
    single string (ranked by chi-square by default)."""
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    newBigrams = [u + v for (u, v) in bigrams]
    # BUG FIX: the list was built but never returned, so the function
    # always yielded None.
    return newBigrams
def take_bigram(self, text, stop_words):
    """Return the top-15 likelihood-ratio bigrams of `text`.

    `stop_words` is accepted for interface compatibility but not used.
    """
    finder = BigramCollocationFinder.from_words(text)
    return finder.nbest(BigramAssocMeasures.likelihood_ratio, 15)
def bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Tokenize `text` and return a True-valued feature dict of the
    tokens plus their n best bigrams under the given measure.'''
    toks = tokenize(text)
    best = BigramCollocationFinder.from_words(toks).nbest(score_fn, n)
    return {ngram: True for ngram in itertools.chain(toks, best)}
def bigrams(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Bag-of-words features built from the n best bigrams of `words`."""
    best = BigramCollocationFinder.from_words(words).nbest(score_fn, n)
    return bag_of_words(best)
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    """Pair the tokens into bigram collocations, keep the top n by the
    chi-square measure, and return them as a bag-of-words feature dict."""
    finder = BigramCollocationFinder.from_words(words)
    top_pairs = finder.nbest(score_fn, n)
    return bag_of_words(top_pairs)
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Best-bigram features merged with the best single-word features."""
    finder = BigramCollocationFinder.from_words(words)
    feats = {pair: True for pair in finder.nbest(score_fn, n)}
    feats.update(best_word_feats(words))
    return feats
#nltk.download('averaged_perceptron_tagger') #nltk.download('wordnet') text = 'Mary had a little lamb. Her fleece was white as snow. Lamb little ' sents = sent_tokenize(text) #print(sents) #words = word_tokenize(text) words = [word_tokenize(t) for t in sents] #print(words) customstopwords = set(stopwords.words('english') + list(punctuation)) #print(customstopwords) wordsstop = [ word for word in word_tokenize(text) if word not in customstopwords ] #print(wordsstop ) bm = BigramAssocMeasures() finder = BigramCollocationFinder.from_words(wordsstop) # can do trigrams too. #print(sorted(finder.ngram_fd.items())) text2 = 'Mary closed closer in close' st = LancasterStemmer() # reduces to root form. stemw = [st.stem(i) for i in word_tokenize(text2)] #print(set(stemw)) #print(nltk.pos_tag(word_tokenize(text2))) # part of speech tagging for ss in wordnet.synsets('bass'): pass #print(ss,ss.definition()) sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), 'bass') #print(sense1,sense1.definition())
# Aggregate the 10 best likelihood-ratio bigrams of every README under
# TEXT_DIR into a count dict and dump the 1000 most common to JSON.
# NOTE: Python 2 only — the tuple-parameter lambda below is a SyntaxError
# on Python 3.
TEXT_DIR = "./_TEXT"
READMES = sorted(
    [f for f in listdir_nohidden(TEXT_DIR) if isfile(join(TEXT_DIR, f))])
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
bi_dict = dict()
for README in READMES:
    readme_file_name = TEXT_DIR + "/" + README
    with open(readme_file_name, "r") as readme_file:
        readme_contents = onlyLetters(readme_file.read())
        words = readme_contents.split(" ")
        removeStopwords(words)
        bi_finder = BigramCollocationFinder.from_words(words)
        # Ten best bigrams per README, scored by likelihood ratio.
        bi_collocations = bi_finder.nbest(bigram_measures.likelihood_ratio, 10)
        for collocation in bi_collocations:
            if len(collocation[0]) + len(collocation[1]) > 1:
                incrementDict(" ".join(collocation), bi_dict)
if " " in bi_dict:
    bi_dict.pop(" ")
# Sort by count (descending), then by key; Python 2 tuple-unpack lambda.
bi_dict_sorted = OrderedDict(
    sorted(bi_dict.items(), reverse=True, key=lambda (k, v): (v, k)))
bi_dict_json = json.dumps(take(1000, bi_dict_sorted))
with open("bigram_words.json", "w") as bigram_file:
    bigram_file.write(bi_dict_json)
# Term-frequency summary of the tweet/term matrix, then bigram collocation
# scoring of the flattened token stream (Thai stopwords filtered out).
t_df = pd.DataFrame(t_array, columns = range(len(cleaned_tweets)), index = list_vocab)
sum_df = t_df.sum(axis = 1, skipna = True)
sum_df = pd.DataFrame(sum_df, columns = ['Frequency'])
sum_df = sum_df.sort_values(by = 'Frequency', ascending = False)
print(sum_df.head(50))
print(sum_df.sum())
#-----------------------------#
# Comma-separated tokens -> sparse count matrix.
cvec = CountVectorizer(analyzer=lambda x:x.split(','))
c_feat = cvec.fit_transform(split_words_j)
# vocabs = [w for w in cvec.vocabulary_.keys()]
flattened_split_words = [y for x in split_words for y in x]
biagram_collocation = BigramCollocationFinder.from_words(flattened_split_words)
th_stop = get_th_stop()
filter_stops = lambda w: len(w) < 3 or w in th_stop
biagram_collocation.apply_word_filter(filter_stops)
biagram = biagram_collocation.score_ngrams(BigramAssocMeasures.likelihood_ratio)
# Group scored bigrams by their first word, best continuation first.
prefix_keys = collections.defaultdict(list)
for key, scores in biagram:
    prefix_keys[key[0]].append((key[1], scores))
for key in prefix_keys:
    prefix_keys[key].sort(key = lambda x: -x[1])
n_words = int(sys.argv[2])  # number of words requested on the command line
def bigrams_words_features(words, nbigrams, measure=BigramAssocMeasures.chi_sq):
    """True-valued features for every word plus the nbigrams best bigrams."""
    best = BigramCollocationFinder.from_words(words).nbest(measure, nbigrams)
    return {ngram: True for ngram in itertools.chain(words, best)}
def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    """All words together with the n most informative bigram collocations,
    returned as bag-of-words features."""
    top_pairs = BigramCollocationFinder.from_words(words).nbest(score_fn, n)
    return bag_of_words(words + top_pairs)
def bigram(words,score_fn=BigramAssocMeasures.chi_sq,n=1000):
    """Bag-of-words built from the n best bigrams, each pair concatenated
    into a single token."""
    finder = BigramCollocationFinder.from_words(words)
    joined = [first + second for (first, second) in finder.nbest(score_fn, n)]
    return bag_of_words(joined)
# NOTE(review): the first three statements are the tail of a function whose
# `def` line lies outside this chunk; the indentation below is reconstructed
# and must be re-anchored to the enclosing function when merged.
    l.append(z)
    l = sorted(l,key=itemgetter(1),reverse=True)
    return(l[0:300])
top_words_quadcounter(job_text)
# Clean the job text: strip punctuation, lowercase, replace special chars,
# drop stopwords, then print raw bigram frequencies of what remains.
special_chars = ['--','...','\n','•','®']
a = ' '.join(job_text)
a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
for char in special_chars:
    a = a.replace(char, ' ') #replace special char with a space
resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
text = ' '.join(resultwords)
finder = BigramCollocationFinder.from_words(word_tokenize(text))
for k,v in finder.ngram_fd.items():
    print(k,v)
##deep copy. save a copy.
a = ' '.join(job_text)
a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
a = a.replace('\n', ' ') #replace \n with a space
a = a.replace('•', ' ')
resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
def main(in_dir: Path, out_dir: Path, num_corpus_chunks: int, min_frequency: int, conserve_RAM: bool = False) -> None:
    """Normalize a pickled tokenized corpus, mine frequent bigrams, and
    re-tokenize multi-word expressions with underscores.

    Reads `tokenized_{i}.pickle` chunks from in_dir; writes `vocab.txt`,
    `bigrams.txt`, and `MWE_underscored.pickle` to out_dir. Tokens occurring
    fewer than min_frequency times are folded into '<UNK>'. With
    conserve_RAM, raw/normalized token lists are deleted as soon as they are
    no longer needed.
    """
    Path.mkdir(out_dir, parents=True, exist_ok=True)
    preview = open(out_dir / f'vocab.txt', 'w')
    corpus: List[LabeledDoc] = []
    for part_index in tqdm(range(num_corpus_chunks), desc='Loading cache'):
        with open(in_dir / f'tokenized_{part_index}.pickle', 'rb') as in_file:
            corpus += pickle.load(in_file)

    # Lowercase, discard punctuations, replace numbers, deduplicate
    number = re.compile(r'\d')
    starts_with_letter = re.compile(r"^\w")
    # NOTE(review): '.com' here is a regex where '.' matches any char, so this
    # also rejects e.g. 'xcom' — confirm whether r'\.com' was intended.
    select_punctuations = re.compile(r"[@#&:]|.com")
    norm_freq: Counter[str] = Counter()
    existed: Set[Tuple[str, ...]] = set()
    duplicates = 0
    for doc in tqdm(corpus, desc='Normalizing tokens'):
        for sent in doc.sentences:
            for token in sent.tokens:
                # Skip tokens that don't start with a word character or that
                # contain selected punctuation; map digit-bearing tokens to <NUM>.
                if not starts_with_letter.search(token):
                    continue
                if select_punctuations.search(token):
                    continue
                if number.search(token):
                    norm_token = '<NUM>'
                else:
                    norm_token = token.lower()
                sent.normalized_tokens.append(norm_token)
                norm_freq[norm_token] += 1
            if conserve_RAM:
                del sent.tokens
            # all_norm_tokens += sent.normalized_tokens
            # Count exact duplicate sentences by their normalized-token tuple.
            hashable = tuple(sent.normalized_tokens)
            if hashable not in existed:
                existed.add(hashable)
            else:
                duplicates += 1
        # NOTE(review): this filter keys on raw sent.tokens while `existed`
        # holds normalized tuples, and every sentence's normalized tuple is in
        # `existed` by now — so it likely removes only sentences whose raw
        # tokens equal their normalized form; with conserve_RAM, sent.tokens
        # has been deleted and this would raise. Confirm intended behavior.
        doc.sentences = [  # Filter out duplicate sentences
            sent for sent in doc.sentences
            if tuple(sent.tokens) not in existed
        ]
    print(f'Number of duplicate sentences = {duplicates:,}')

    # Fold rare unigrams into a single <UNK> bucket.
    UNK_filtered_freq: Counter[str] = Counter()
    for key, val in norm_freq.items():
        if val >= min_frequency:
            UNK_filtered_freq[key] = val
        else:
            UNK_filtered_freq['<UNK>'] += val
    print(f'Number of filtered unigrams = {len(UNK_filtered_freq):,}')
    print(f'Number of filtered unigrams = {len(UNK_filtered_freq):,}', file=preview)

    # One flat token stream over the (deduplicated) corpus for the finder.
    all_norm_tokens: List[str] = [
        nt for doc in corpus for sent in doc.sentences
        for nt in sent.normalized_tokens
    ]
    special_tokens = {'<UNK>', '<NUM>', "n't", "n’t"}
    print('Finding bigrams...')
    bigram_finder = BigramCollocationFinder.from_words(all_norm_tokens)
    num_tokens = len(all_norm_tokens)
    bigram_finder.apply_freq_filter(min_frequency)
    stop_words = set(stopwords.words('english')).union(special_tokens)
    bigram_finder.apply_word_filter(lambda word: word in stop_words)
    # Ranked by raw relative frequency; PMI alternative kept for reference.
    bigrams = bigram_finder.score_ngrams(BigramAssocMeasures().raw_freq)
    # bigrams = bigram_finder.score_ngrams(BigramAssocMeasures().pmi)
    print(f'Number of filtered bigrams = {len(bigrams):,}')
    print(f'Number of filtered bigrams = {len(bigrams):,}', file=preview)
    with open(out_dir / 'bigrams.txt', 'w') as bigram_file:
        for bigram, relative_freq in bigrams:
            # raw_freq scores are relative; scale back to absolute counts.
            absolute_freq = relative_freq * num_tokens
            bigram_str = ' '.join(bigram)
            # bigram_file.write(f'{relative_freq:.4f}\t{bigram_str}\n')  # for PMI
            bigram_file.write(f'{absolute_freq:.0f}\t{bigram_str}\n')

    # print('Finding trigrams...')
    # trigram_finder = TrigramCollocationFinder.from_words(all_norm_tokens)
    # trigram_finder.apply_freq_filter(min_frequency)
    # trigram_finder.apply_word_filter(lambda word: word in stop_words)
    # # trigram_finder.apply_ngram_filter(
    # #     lambda w1, w2, w3: (w1 in stop_words) or (w3 in stop_words) or (w2 in special_tokens))
    # trigrams = trigram_finder.score_ngrams(TrigramAssocMeasures().raw_freq)
    # print(f'Number of filtered trigrams = {len(trigrams):,}')
    # print(f'Number of filtered trigrams = {len(trigrams):,}', file=preview)
    # with open(out_dir / 'trigrams.txt', 'w') as trigram_file:
    #     for trigram, relative_freq in trigrams:
    #         absolute_freq = relative_freq * num_tokens
    #         trigram_str = ' '.join(trigram)
    #         trigram_file.write(f'{absolute_freq:.0f}\t{trigram_str}\n')
    del all_norm_tokens

    # Multi-Word Expression tokenize to underscored
    underscorer = MWETokenizer([bi for bi, _ in bigrams
                                ])  # maybe add affordable care act
    # underscorer = MWETokenizer(
    #     [tri for tri, _ in trigrams] + [bi for bi, _ in bigrams])
    vocab: Counter[str] = Counter()
    for doc in tqdm(corpus, desc='Underscoring multi-phrase expressions'):
        for sent in doc.sentences:
            sent.underscored_tokens = underscorer.tokenize(
                sent.normalized_tokens)
            vocab.update(sent.underscored_tokens)
            if conserve_RAM:
                del sent.normalized_tokens
    print('Pickling...')
    with open(out_dir / 'MWE_underscored.pickle', 'wb') as out_file:
        pickle.dump(corpus, out_file)
    # Write the surviving vocabulary (count: token) to the preview file.
    for key, val in vocab.most_common():
        if val >= min_frequency:
            print(f'{val:,}:\t{key}', file=preview)
    preview.close()
print(len(sentiment_list))
# Sentiment counts per speech. NOTE(review): the Pence speech is labeled
# "Trump 2020 " and the Harris speech "Biden 2020 " — possibly deliberate
# grouping by ticket, possibly a copy-paste slip; confirm.
trump20_sent = sentiment_ct(trump_speech, "Trump 2020 ")
biden20_sent = sentiment_ct(biden_speech, "Biden 2020 ")
pence20_sent = sentiment_ct(pence20_speech, "Trump 2020 ")
harris20_sent = sentiment_ct(harris20_speech, "Biden 2020 ")
trump16_sent = sentiment_ct(trump16_speech, "Trump 2016 ")
clinton16_sent = sentiment_ct(clinton16_speech, "Clinton 2016 ")
################################################################################################
# Bigrams
# 2020 POTUS
dnc_finder = BigramCollocationFinder.from_words(biden_tokens)
# NOTE(review): chi_sq here vs raw_freq for the RNC finder below — the plots
# use raw_freq for both, so this asymmetry only affects this nbest call.
dnc_finder.nbest(BigramAssocMeasures.chi_sq, 30) # top 30 DNC bigrams
dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30] # bigrams with scores
# plot barchart
plot_word_freqs(dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 'b',
                "Top 30 bigrams in Biden's 2020 DNC Speech", "Frequency Score")
# plot network
visualize_bigram(dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], .6,
                 "Top 30 bigrams in Biden's 2020 DNC Speech") # democrat network
rnc_finder = BigramCollocationFinder.from_words(trump_tokens)
rnc_finder.nbest(BigramAssocMeasures.raw_freq, 30) # top 30 RNC bigrams
rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30] # bigrams with scores
# plot barchart
plot_word_freqs(rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 'r',
                "Top 30 bigrams in Trump's 2020 RNC Speech", "Frequency Score")
# plot network
visualize_bigram(rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 0.6,
                 "Top 30 bigrams in Trump's 2020 RNC Speech") # republican network
def extract_bigram(words):
    """Return the 5 bigrams in *words* with the highest PMI score."""
    return BigramCollocationFinder.from_words(words).nbest(bigram_measures.pmi, 5)
# Display the bigram word cloud.
plt.figure(figsize = (50,25))
plt.imshow(bigram_wordcloud,interpolation = 'bilinear')
plt.axis("off")
plt.show()
# =============================================================================
#
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
# Top-10 likelihood-ratio bigrams in the (no-attachment) email tokens.
finder=BigramCollocationFinder.from_words(email_wc_na)
a=finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
print(a)
# BUG FIX: the two separator lines below were bare '=====...' text (a
# SyntaxError); they are now proper comments.
# =============================================================================
# pos tagging
# =============================================================================
import nltk
nltk.download('averaged_perceptron_tagger')
token=nltk.word_tokenize(email_wc_a)
a=list(nltk.pos_tag(token))
from nltk import pos_tag
from nltk import RegexpParser
@author: issfz
"""
import string
from nltk.corpus import reuters
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures #bigram associations
from nltk.corpus import stopwords
# Tokenized documents per category (reuters.words is already tokenized).
grain_tok = [reuters.words(f) for f in reuters.fileids('grain') ] #return values are already tokenised
trade_tok = [reuters.words(f) for f in reuters.fileids('trade')]
# Flatten the grain documents into one lowercase token stream.
words = [w.lower() for f in grain_tok for w in f] #lower case to prevent case sensitivity
bcf = BigramCollocationFinder.from_words( words) # will give words but not matrix
# Top 100 candidate collocations by likelihood ratio.
top100 = bcf.nbest( BigramAssocMeasures.likelihood_ratio, 100 ) #will give top n no of best candidates based on certain criteria(likelihood ration we can use to start with)
# Keep only pairs where neither member is a punctuation character.
top = [(t1, t2) for (t1, t2) in top100
       if t1 not in string.punctuation and t2 not in string.punctuation]
stopset = set(stopwords.words('english'))
# Filter out short tokens and stopwords before re-ranking.
filter_stops = lambda w: len( w ) < 3 or w in stopset # filter stop words , more filtering required, prepare filter pattern first , we use lambda function
bcf.apply_word_filter(filter_stops)
# Compare the top-10 collocations under several association measures.
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 10)
bcf.nbest(BigramAssocMeasures.chi_sq, 10)
bcf.nbest(BigramAssocMeasures.jaccard, 10)
bcf.nbest(BigramAssocMeasures.mi_like, 10)
# Tail of prepareStopWords() (its `def` line is in an earlier chunk):
# append a few extra single/double-letter noise tokens, then return the list.
stopwordsList.append('x')
stopwordsList.append('z')
stopwordsList.append('Pp')
stopwordsList.append('Pq')
return stopwordsList
stopwords = prepareStopWords()
# fdist = FreqDist(text)
# fdist_no_punc_no_stopwords = nltk.FreqDist(dict((word, freq) for word, freq in fdist.items() if word not in stopwords and word.isalpha()))
# Bigrams: rank by PMI, then again after requiring frequency >= 3.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.nbest(bigram_measures.pmi, 10)
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)
# Word cloud for the most frequent bigrams
stopWords = stopwords
# Strip punctuation/whitespace characters from each token, drop stopwords
# and empties, lemmatize, then rebuild the finder on the cleaned stream.
text_content = [
    ''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
    for word in text
]
text_content = [word for word in text_content if word not in stopWords]
text_content = [s for s in text_content if len(s) != 0]
text_content = [WNL.lemmatize(t) for t in text_content]
finder = BigramCollocationFinder.from_words(text_content)
# Load pre-tokenized messages and tabulate bigram/trigram frequencies.
data_token = pd.read_csv(processed_path + "processed+tokenized.csv")
# SECURITY NOTE: eval() on a CSV column executes arbitrary expressions from
# the file; ast.literal_eval would be the safe equivalent for list literals.
data_token['message'] = data_token['message'].apply(eval)
#########################Entire Dictionary#####################################
# Flatten the per-message token lists into one stream (comprehension
# replaces the previous manual append loop).
flat_list = [item for sublist in data_token['message'] for item in sublist]
######################finds top bigrams and trigrams###########################
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()
trigramfinder = TrigramCollocationFinder.from_words(flat_list)
bigramfinder = BigramCollocationFinder.from_words(flat_list)
# Raw n-gram counts, sorted descending, written out as CSV tables.
bigram_freq = bigramfinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq),
                               columns=['bigram', 'freq']).sort_values(by='freq', ascending=False)
trigram_freq = trigramfinder.ngram_fd.items()
trigramFreqTable = pd.DataFrame(list(trigram_freq),
                                columns=['trigram', 'freq']).sort_values(by='freq', ascending=False)
bigramFreqTable.to_csv(raw_path + "bigramFreqTable.csv")
trigramFreqTable.to_csv(raw_path + "trigramFreqTable.csv")
# Top likelihood-ratio bigrams in the NLTK 'pirates.txt' webtext corpus,
# after removing short tokens and English stopwords.
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords
textWord = [w.lower() for w in webtext.words('pirates.txt')]
finder = BigramCollocationFinder.from_words(textWord)
#print(finder.nbest(BigramAssocMeasures.likelihood_ratio,10))
ignored_word = set(stopwords.words('english'))
print(ignored_word)
# Drop tokens shorter than 3 chars or in the stopword set.
filterStpos = lambda w: len(w) < 3 or w in ignored_word
finder.apply_word_filter(filterStpos)
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))
    # Tail of a syllable-counting helper (its `def`/`try` lie in an earlier
    # chunk): hyphenation points + 1 = syllable count.
    # NOTE(review): the bare `except:` swallows everything and returns 1 —
    # consider narrowing once the full function is in view.
    num = len(dic.positions(word)) + 1
except:
    return 1
return num
# Instantiate dictionary used to count syllables
dic = pyphen.Pyphen(lang='en')
# Instantiate corpus reader for word selection
ignoredWords = set(stopwords.words("english"))
filterStops = lambda w: len(w) < 3 or w in ignoredWords
# Load the brown corpus, get the collocations for each word and scores based on the likelihood of occurrence
bigramMeasures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(nltk.corpus.brown.words())
scored = finder.score_ngrams(bigramMeasures.likelihood_ratio)
# Create dictionary of lists to associate keys with all their bigram pairs (word, likelihood ratio)
dictList = collections.defaultdict(list)
for key, score in scored:
    dictList[key[0]].append((key[1], score))
# Get words from picture and assess for suitability — draws three distinct
# random tags (removing the first two so they can't repeat).
first = choice(tags)
tags.remove(first)
second = choice(tags)
tags.remove(second)
third = choice(tags)
# Create lists to hold words, syllables and max syllables for each line
def word_features(words, score_fn=BAM.chi_sq, n=200):
    """Feature dict over all words plus their n best-scoring bigrams."""
    finder = BigramCollocationFinder.from_words(words)
    best_pairs = finder.nbest(score_fn, n)
    return {feature: True for feature in chain(words, best_pairs)}
def bigram_feats(text, score_fn=BigramAssocMeasures.pmi, n_best=200):
    """Build a boolean feature dict from the n_best highest-scoring bigrams.

    Args:
        text: sequence of tokens.
        score_fn: NLTK association measure used to rank bigrams (default PMI).
        n_best: how many top bigrams to keep.

    Returns:
        dict mapping each selected bigram tuple to True. Note: only bigrams
        are included — the individual tokens are not.
    """
    bigram_finder = BigramCollocationFinder.from_words(text)
    n_grams = bigram_finder.nbest(score_fn, n_best)
    # Dict comprehension instead of dict([...]) — no intermediate pair list.
    return {n_gram: True for n_gram in n_grams}
def findBigrams(self, tweet):
    """Return the 20 bigrams in *tweet* with the highest likelihood ratio.

    Args:
        tweet: iterable of tokens.

    Returns:
        list of up to 20 (word, word) tuples ranked by likelihood ratio.
    """
    # list(tweet) materializes the iterable directly; the previous
    # `[w for w in tweet]` was a redundant element-by-element copy.
    bigrams = BigramCollocationFinder.from_words(list(tweet))
    return bigrams.nbest(BigramAssocMeasures.likelihood_ratio, 20)
# Print the 50 strongest likelihood-ratio bigrams for each talk transcript.
import nltk
from nltk.collocations import BigramCollocationFinder
from utils import tokenize_transcripts, get_files
# a list of tokens for each of the talks
transcript_tokens = tokenize_transcripts(stem=True)
# built in bigram metrics are in here
bigram_measures = nltk.collocations.BigramAssocMeasures()
# compute top bigrams and output results to console
for i, file in enumerate(get_files()):
    finder = BigramCollocationFinder.from_words(transcript_tokens[i])
    # score_ngrams returns (bigram, score) pairs sorted by descending score.
    bigrams = finder.score_ngrams(bigram_measures.likelihood_ratio)
    print(file)
    for [tokens, value] in bigrams[0:50]:
        print('{},{}'.format(" ".join(tokens), value))
    print('---------\n')
# Render and save the French word cloud.
wordcloud = WordCloud(max_font_size=40).generate(word_cloud_fr)
import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('word_cloud_fr.png')
#plt.show()
#English bigrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
bcf = BigramCollocationFinder.from_words(words_en)
from nltk.corpus import stopwords
# `sw` is a stopword set prepared earlier in the file.
stopset = sw
# Drop tokens shorter than 3 chars or in the stopword set.
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
# Top 20 bigrams by likelihood ratio, joined into space-separated strings.
bcf_list = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)
bcf_joint_list = []
for words in bcf_list:
    bcf_joint_list.append(' '.join(words))
#save list in txt file
with open("bigrams_en.txt", "w") as output:
    output.write(str(bcf_joint_list))
#English trigrams
#Continuing to work from NLP for hackers. import nltk from nltk import word_tokenize from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder #Load ulysees into a variable. with open('messages_only.txt', 'r', encoding="utf-8") as myfile: text = myfile.read() #tokenize the text tokens = word_tokenize(text) bigram_measures = BigramAssocMeasures() trigram_measures = TrigramAssocMeasures() #compute length-2 collocations finder = BigramCollocationFinder.from_words(tokens) finder.apply_freq_filter(5) print(finder.nbest(bigram_measures.pmi, 20)) finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens) #only trigrams that appear 5+ times finder.apply_freq_filter(5) #return the 50 trigrams with the highest PMI print(finder.nbest(trigram_measures.pmi, 20))
def main(): # parser = ArgumentParser() # parser.add_argument("--folder", type=str, dest="folder") # args = parser.parse_args() # hotelname = args.folder # Load the tokenized reviews (sentences) # infile = args.folder+"/"+hotelname+"_NB_trainingdata.senttokens_sel.pyvar" infile = "NB_data/NB_trainingdata.senttokens.pyvar" infile = open(infile, 'r') print infile word_tokens_byreviewid = pickle.load(infile) infile.close() # Load the training reviewids # infile = args.folder+"/"+hotelname+"_NB_trainingdata.labels.pyvar" infile = "NB_data/NB_trainingdata.labels.pyvar" infile = open(infile, 'r') keepreviewids = pickle.load(infile) infile.close() word_tokens_byreviewid_expanded = {} negtags = []; postags = [] for reviewid in word_tokens_byreviewid: sents = word_tokens_byreviewid[reviewid] for sent_idx in range(0, len(sents)): tag = (reviewid, str(sent_idx)) word_tokens_byreviewid_expanded[tag] = sents[sent_idx] if reviewid in keepreviewids['1']: negtags.append(tag) if reviewid in keepreviewids['5']: postags.append(tag) print "neg sents: %d\t pos sents: %d" %(len(negtags), len(postags)) # # Stem the words in the sentences # # Separate each sentence into a unique entry # word_tokens_byreviewid_expanded = {} # negtags = []; postags = [] # tag_expanded = [] # for reviewid in word_tokens_byreviewid: # sents = word_tokens_byreviewid[reviewid] # for sent_idx in range(0, len(sents)): # tag = (reviewid, str(sent_idx)) # tag_expanded.append(tag) # # print tag # # print sents[sent_idx] # word_tokens_byreviewid_expanded[tag] = sents[sent_idx] # if reviewid in keepreviewids[0]: # negtags.append(tag) # # print "negtag : " # # print tag # # print negtags # if reviewid in keepreviewids[1]: # postags.append(tag) # print "neg sents: %d\t pos sents: %d" %(len(negtags), len(postags)) # print negtags, len(negtags), len(set(negtags)) # print # # print postags # print word_tokens_byreviewid_expanded # print tag_expanded # print word_tokens_byreviewid_expanded[tag_expanded] # all_words = [] # # 
Get all words to analyze frequency of unigrams and bigrams # for t in tag_expanded : # print tag # tag = t # word = word_tokens_byreviewid_expanded[tag] # token=nltk.word_tokenize(word) # # print len(token) # for i in range (0, len(token)): # all_words.append(token[i]) # # all_words = [word for tag in word_tokens_byreviewid_expanded for word in word_tokens_byreviewid_expanded[tag]] # # all_words =word_tokens_byreviewid_expanded[negtags] # # Get all the stop words # stopwords = get_bad_words() # # print all_words # Get all words to analyze frequency of unigrams and bigrams all_words = [word for tag in word_tokens_byreviewid_expanded for word in word_tokens_byreviewid_expanded[tag]] # Get all the stop words stopwords = get_bad_words() # dispersion_plot(all_words,postags) # Trigrams trigram_finder = TrigramCollocationFinder.from_words(all_words) trigram_finder.apply_ngram_filter(lambda w1, w2, w3: w1 in stopwords or w3 in stopwords) trigram_finder.apply_freq_filter(10) trigrams = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, 2000) print "Number trigrams: %d" %len(trigrams) # print trigrams[:100] # Bigrams bigram_finder = BigramCollocationFinder.from_words(all_words) bigram_finder.apply_freq_filter(20) bigram_finder.apply_word_filter(lambda stopword: stopword in stopwords) bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 2000) print "Number bigrams: %d" %len(bigrams) # print bigrams[:100] # Unigrams word_freq_dist = DataFrame(dict(FreqDist(all_words)).items(), columns = ['word','count']) word_freq_dist = word_freq_dist[word_freq_dist['count'] > 20] # print word_freq_dist good_features = list(set(word_freq_dist['word']) - stopwords) print "Number unigrams: %d" %len(good_features) good_features.extend(bigrams) good_features.extend(trigrams) # print good_features # Output the features in the model # outfile = args.folder+"/"+ args.folder+"_NB_sentiment.model.features.pyvar" outfile = "NB_data/NB_sentiment.model.features.pyvar" outfile = open(outfile, 'w') 
pickle.dump(good_features, outfile) outfile.close() # Calculate the features negfeatures = [(get_sent_features(word_tokens_byreviewid_expanded[fid], good_features), 'neg') for fid in negtags] posfeatures = [(get_sent_features(word_tokens_byreviewid_expanded[fid], good_features), 'pos') for fid in postags] # print negfeatures # # Shuffle and balance the two classes # n_min = min([len(negfeatures), len(posfeatures)]) # random.shuffle(negfeatures) # negfeatures = negfeatures[:n_min] # random.shuffle(posfeatures) # posfeatures = posfeatures[:n_min] # # Define training and testing data # numfolds = 10 # foldsize = n_min/numfolds # negfolds = make_folds(negfeatures, foldsize) # posfolds = make_folds(posfeatures, foldsize) negfolds = cross_validation.StratifiedKFold(negfeatures, n_folds=10) print negfolds posfolds = cross_validation.StratifiedKFold(posfeatures, n_folds=10) print posfolds # 10 fold cross validation outfile = "NB_data/NB_sentiment.model.performance.tab" outfile = open(outfile, 'w') outfile.write("Fold\taccuracy\tpos_precision\tpos_recall\tneg_precision\tneg_recall\n") for fold in range(0, numfolds): outfile.write("%d\t" %fold) testdata = negfolds[fold] + posfolds[fold] traindata = [] for i in range(0, numfolds): if i != fold: traindata += negfolds[i] traindata += posfolds[i] print 'train on %d instances, test on %d instances' % (len(traindata), len(testdata)) result = eval_classifier(traindata, testdata) accuracy, posprecision, posrecall, negprecision, negrecall = result print result outfile.write("%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n"%(accuracy, posprecision, posrecall, negprecision, negrecall)) outfile.close() # Save the classifier trained using all data classifier = NaiveBayesClassifier.train(negfeatures + posfeatures) # outfile = args.folder+"/"+ args.folder+"_NB_sentiment.model.pyvar" outfile = "NB_data/NB_sentiment.model.pyvar" outfile = open(outfile, 'w') pickle.dump(classifier, outfile) outfile.close()