def _get_bigram_scores(self, posdata, negdata):
    pos_words = list(itertools.chain(*posdata))
    neg_words = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
    neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
    pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = pos_words + pos_bigrams
    neg = neg_words + neg_bigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():  # iteritems() is Python 2 only
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
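# A minimal sketch (not from the source) of how scores like the ones above are
# typically consumed: keep the N highest-scoring unigrams/bigrams as the
# feature vocabulary. `find_best_features` is an illustrative name.
def find_best_features(word_scores, n=1000):
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:n]
    return set(w for w, score in best)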
def get_unibigram_features(all_words, uni_feanum, bi_feanum):
    word_fd = nltk.FreqDist(all_words)
    bigram_fd = nltk.FreqDist(nltk.bigrams(all_words))

    # Clamp the requested feature counts to what is actually available.
    if uni_feanum == 'max':
        uni_feanum = len(word_fd)
    elif uni_feanum > len(word_fd):
        uni_feanum = len(word_fd)
    if bi_feanum == 'max':
        bi_feanum = len(bigram_fd)
    elif bi_feanum > len(bigram_fd):
        bi_feanum = len(bigram_fd)

    finder = BigramCollocationFinder(word_fd, bigram_fd)
    bigrams = finder.nbest(BigramAssocMeasures.chi_sq, bi_feanum)

    print("the number of unigram features is", uni_feanum)
    print("the number of bigram features is", bi_feanum)

    featuples = word_fd.most_common(uni_feanum)
    selected_words = [featuples[i][0] for i in range(uni_feanum)]

    features = list(itertools.chain(selected_words, bigrams))
    return features
def create_word_bigram_scores(posWords, negWords, score_method=BigramAssocMeasures.chi_sq):
    '''Score the informativeness of features using both words and bigrams.'''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d"
          % (pos_word_count, neg_word_count))

    word_scores = {}
    for word, freq in word_fd.items():  # iteritems() is Python 2 only
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Use a separate finder per class; the original reused one variable, so
    # posBigrams were silently drawn from the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1   # FreqDist.inc() was removed in NLTK 3
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords, n=5000):
    # (posWords, negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    bigramfinder = BigramCollocationFinder.from_words(posWords)
    posbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    bigramfinder = BigramCollocationFinder.from_words(negWords)
    negbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)

    posWords = posWords + posbigrams
    negWords = negWords + negbigrams

    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    for word in posWords:
        wordfd[word] += 1
        conditionwordfd['pos'][word] += 1
    for word in negWords:
        wordfd[word] += 1
        conditionwordfd['neg'][word] += 1

    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count

    for word, freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word],
                                               (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word],
                                               (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
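# Hypothetical usage of create_word_bigram_scores above, with toy tokenized
# reviews standing in for real data (illustration only).
pos_reviews = [['good', 'phone'], ['really', 'good', 'battery']]
neg_reviews = [['bad', 'screen'], ['battery', 'really', 'bad']]
scores = create_word_bigram_scores(pos_reviews, neg_reviews, n=10)
print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:5])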
def create_word_bigram_scores():
    # Pickled files must be opened in binary mode ('rb'), not text mode.
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Keep one finder per class so posBigrams are not drawn from the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1          # FreqDist.inc() was removed in NLTK 3
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores(posWords, negWords):
    # One finder per class; reusing the same variable would score the positive
    # bigrams against the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300):
    sents_pos = []
    sents_neg = []

    # Separate positive and negative sentences.
    for tag, sent in sents_tagged:
        if tag == 1:
            sents_pos.append(sent)
        elif tag == -1:
            sents_neg.append(sent)

    # Extract lowercased, punctuation-free words from each class.
    words_pos = [word.lower() for s in sents_pos for word in word_tokenize(s)
                 if word not in string.punctuation]
    words_neg = [word.lower() for s in sents_neg for word in word_tokenize(s)
                 if word not in string.punctuation]

    # Find the best bigrams for positive sentences based on informative collocations.
    bigram_finder1 = BigramCollocationFinder.from_words(words_pos)
    bigrams_best_pos = bigram_finder1.nbest(score_fn, n)

    # Find the best bigrams for negative sentences based on informative collocations.
    bigram_finder2 = BigramCollocationFinder.from_words(words_neg)
    bigrams_best_neg = bigram_finder2.nbest(score_fn, n)

    bigrams_all = list(set(bigrams_best_pos).union(set(bigrams_best_neg)))

    # Keep only bigrams where both words are longer than three characters and
    # neither appears in the exclusion list `ex` (assumed defined elsewhere).
    bigrams_best = [bigram for bigram in bigrams_all
                    if len(bigram[0]) > 3 and len(bigram[1]) > 3
                    and bigram[0] not in ex and bigram[1] not in ex]
    return bigrams_best
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Separate finders so each class's bigrams come from its own corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1      # FreqDist.inc() was removed in NLTK 3
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word],
                                               (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word],
                                               (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1      # FreqDist.inc() was removed in NLTK 3
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Separate finders so each class's bigrams come from its own corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    # posWords / negWords / get_scores are assumed to be defined at module level.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    return get_scores(pos, neg)
def create_word_bigram_scores():
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    objWords = list(itertools.chain(*objdata))

    # One finder per class; the original reused a single variable, which drew
    # every class's bigrams from the objective corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    obj_bigram_finder = BigramCollocationFinder.from_words(objWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    objBigrams = obj_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams
    obj = objWords + objBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in obj:  # was objWords, which silently dropped the obj bigrams
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word],
                                               (freq, obj_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score
    return word_scores
def get_bigrams1(tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigramslist = []
    bigram_finder = BigramCollocationFinder.from_words(tweet)
    bigrams = bigram_finder.nbest(score_fn, n)
    for bigram in bigrams:
        bigramslist.append(' '.join(str(i) for i in bigram))
    print(bigramslist)
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    ):
        self._num = num
        self._window_size = window_size

        # print("Building collocations list")
        from nltk.corpus import stopwords

        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
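# Hedged usage sketch for the Text.collocations() method above; requires the
# NLTK 'gutenberg' and 'stopwords' corpora to be downloaded.
import nltk
from nltk.corpus import gutenberg
emma = nltk.Text(gutenberg.words('austen-emma.txt'))
emma.collocations(num=10)  # prints strings like "Frank Churchill; Miss Woodhouse; ..."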
def best_bigram_word_features(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_features(words))
    return d
def bigram_word_features(words, limit, stop, stopset, word_score_placeholder,
                         score_fn=BigramAssocMeasures.chi_sq):
    if stop:
        words = [w for w in words if w not in stopset]
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, limit)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
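# Illustrative call to bigram_word_features above; the argument values are
# assumptions matching the parameter names, not values from the source.
from nltk.corpus import stopwords as sw
tokens = ['this', 'movie', 'was', 'really', 'great', 'really', 'great', 'fun']
feats = bigram_word_features(tokens, limit=5, stop=True,
                             stopset=set(sw.words('english')),
                             word_score_placeholder=None)
print(list(feats)[:5])  # mix of unigram keys and (w1, w2) bigram keys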
def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    bigram_finder = BigramCollocationFinder.from_words(tweet_words)
    score_fn = BigramAssocMeasures.chi_sq
    bigrams = bigram_finder.nbest(score_fn, 200)
    print(bigrams)
    # The original chained an undefined name `words`; it should be tweet_words.
    return dict([(ngram, True) for ngram in itertools.chain(tweet_words, bigrams)])
def get_bag_of_bigrams_words(word_list, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(word_list)
    bigrams = bigram_finder.nbest(score_fn, n)
    return get_bag_of_words(word_list + bigrams)
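# get_bag_of_words is not shown in this snippet; a minimal sketch consistent
# with its use here is the conventional NLTK "token => True" mapping:
def get_bag_of_words(words):
    return dict((word, True) for word in words)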
def demo_collocations(self, num=40, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    @seealso: L{find_collocations}
    @param num: The maximum number of collocations to print.
    @type num: C{int}
    @param window_size: The number of tokens spanned by a collocation (default=2)
    @type window_size: C{int}
    """
    if not ('_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        from nltk.collocations import BigramCollocationFinder
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        from nltk.metrics import f_measure, BigramAssocMeasures
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + u' ' + w2 for w1, w2 in self._collocations]
    print("List {0} collocations".format(num))
    print(tokenwrap(colloc_strings, separator=u'; '))
def collocation_discovery(self):
    self.corpus = nltk.word_tokenize(self.corpus.lower())
    bigram_finder = BigramCollocationFinder.from_words(self.corpus)
    filter_bigram = lambda w: len(w) < 3 or w in self.stopwords_
    bigram_finder.apply_word_filter(filter_bigram)
    top_10_bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
    return top_10_bigrams
def converter(tokens, label=None, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # score_fn, n, and label were free (undefined) names in the original;
    # they are lifted into parameters here so the function is self-contained.
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(score_fn, n)
    return (
        {ngram: True for ngram in itertools.chain(tokens, bigrams)},
        label,
    )
def get_frequencies(self, desc):
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset

    words = word_tokenize(desc)

    print('------gram--------')
    words_to_count = [word for word in words if word not in stopset]
    words_to_count = [word for word in words_to_count if not len(word) < 3]
    c = Counter(words_to_count)
    single = c.most_common(20)
    print(single)

    print('------bigram--------')
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter_stops)
    bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
    print(bigrm)

    print('------trigram--------')
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)  # only keep trigrams that appear at least 3 times
    trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
    print(trigrm)

    matches = [single, bigrm, trigrm]
    return matches
def get_bigrams(self, rawText, score_fn=BigramAssocMeasures.pmi, n=40):
    # TODO configuration value
    clean_text = TokensCleaner.clean(self, rawText, cleaning_level=3)
    bigram_finder = BigramCollocationFinder.from_words(clean_text['3'])
    # The original hard-coded PMI here, silently ignoring score_fn.
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
def get_bigrams(self, tweet, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigramslist = []
    bigram_finder = BigramCollocationFinder.from_words(tweet)
    bigrams = bigram_finder.nbest(score_fn, n)
    for bigram in bigrams:
        bigramslist.append(' '.join(str(i) for i in bigram))
    # Returns a list, e.g. ['you dude', 'Hi How', 'How are', 'are you']
    return bigramslist
def get_collocations(self):
    ignored_words = stopwords.words('english')
    finder = BigramCollocationFinder.from_words(self.text_array, 2)
    finder.apply_freq_filter(3)
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    bigram_measures = BigramAssocMeasures()
    return finder.nbest(bigram_measures.likelihood_ratio, 40)
def ShowCollocations():
    text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
    import nltk
    from nltk.collocations import BigramCollocationFinder
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures
    from nltk.metrics import TrigramAssocMeasures

    pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
    data = resultsbox.get(1.0, END)
    rawtext = nltk.regexp_tokenize(data, pattern)
    prepcolloc = [word.lower() for word in rawtext
                  if word not in stopwords and word.isalpha()]

    text.delete(1.0, END)
    # The finder keeps bigrams occurring at least 3 times and reports the top 10 by PMI.
    text.insert(END, "Collocations (occurring at least 3 times, top 10 by PMI)\n")
    text.insert(END, "\nBigram Collocations:\n")
    bigram = BigramAssocMeasures()
    bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
    bigramfinder.apply_freq_filter(3)
    bigrams = bigramfinder.nbest(bigram.pmi, 10)
    for first, second in bigrams:
        text.insert(END, first)
        text.insert(END, " ")
        text.insert(END, second)
        text.insert(END, "\n")
def get_bigram(self, features_list):
    # The best self.bigram_threshold bigrams are selected by chi-square score.
    score = BigramAssocMeasures.chi_sq
    all_bigrams = BigramCollocationFinder.from_words(features_list)
    best_bigrams = all_bigrams.nbest(score, self.bigram_threshold)
    selected_bigrams = [(bigram, True) for bigram in best_bigrams]
    return selected_bigrams
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Removes the stopwords and computes the best bigrams.'''
    stopset = set(stopwords.words('english'))
    words = [word for word in tokenize(text) if word not in stopset]
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
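# stopword_filtered_bigrams above relies on an unshown tokenize() helper;
# assuming it is nltk.word_tokenize, a quick call looks like this:
from nltk import word_tokenize as tokenize
feats = stopword_filtered_bigrams("the movie was not bad at all and the cast was great")
print(len(feats), list(feats)[:5])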
def bigram(ytcomments, drug):
    bi = BigramAssocMeasures()
    bi_finder = BigramCollocationFinder.from_words(ytcomments, window_size=20)
    top_general = bi_finder.nbest(bi.pmi, 30)
    # Keep only bigrams that mention the drug of interest.
    bi_finder.apply_ngram_filter(lambda w1, w2: w1 != drug and w2 != drug)
    top_bi = bi_finder.nbest(bi.pmi, 30)
    return top_bi
def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    # All words plus the most informative bigram collocations serve as features.
    return bag_of_words(words + bigrams)
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)  # turn the text into bigram collocations
    bigrams = bigram_finder.nbest(score_fn, n)  # select the top n bigrams by chi-square score
    return bag_of_words(bigrams)
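# Several of these snippets call a bag_of_words helper that is not shown; the
# conventional implementation they rely on is simply:
def bag_of_words(words):
    return dict((word, True) for word in words)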
def get_bigrams(self, words):
    bigram_finder = BigramCollocationFinder.from_words(words)
    self.bigrams = bigram_finder.nbest(self.bigram_score_funcion, self.top_ngram_count)
    return self.bigrams
def flatten_corpus(corpus):
    return ' '.join([document.strip() for document in corpus])

def get_top_ngrams(corpus, ngram_val=1, limit=5):
    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)
    ngrams = compute_ngrams(tokens, ngram_val)  # compute_ngrams assumed defined elsewhere
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(), key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]
    return sorted_ngrams

print(get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10))

finder = BigramCollocationFinder.from_documents([item.split() for item in norm_alice])
bigram_measures = BigramAssocMeasures()
print(finder.nbest(bigram_measures.raw_freq, 10))

# Now using gensim
print("Sentence: ", norm_alice[2])
key_words = keywords(norm_alice[2], ratio=1.0, scores=True, lemmatize=True)
print([(item, round(score, 3)) for item, score in key_words][:25])
import nltk
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder

def bi(text):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(word_tokenize(text))
    finder.apply_freq_filter(5)  # drop bigrams seen fewer than 5 times
    # Note: nbest() here computes the top-5 PMI bigrams but discards them;
    # the function actually returns the filtered frequency counts.
    finder.nbest(bigram_measures.pmi, 5)
    return finder.ngram_fd.items()
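# Example invocation of bi() above (assumes the nltk 'punkt' data). The input
# must repeat bigrams at least five times to survive the frequency filter.
sample_text = "to be or not to be " * 6
print(list(bi(sample_text)))  # e.g. [(('to', 'be'), 12), (('be', 'or'), 6), ...]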
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words))
    return d
def get_bigrams(tokens, freq_filter=None):
    finder = BigramCollocationFinder.from_words(tokens)
    if freq_filter:
        finder.apply_freq_filter(freq_filter)
    # ngram_fd.items() yields ((w1, w2), freq) pairs; join each word pair.
    return list(' '.join(b[0]) for b in finder.ngram_fd.items())
def bigram(collat_data):
    df_co = pd.DataFrame.to_string(collat_data, columns=['lemmatization']).split(',')
    bcf = BigramCollocationFinder.from_words(df_co)
    top20 = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)
    return top20
def find_bigrams(sentences, n_ngrams):
    cf = BigramCollocationFinder.from_documents(sentences)
    fng = cf.nbest(BigramAssocMeasures.likelihood_ratio, n_ngrams)
    return fng
modelkmeans = KMeans(init='k-means++', max_iter=200, n_init=100)
modelkmeans.fit(X)
order_centroids = modelkmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(modelkmeans.n_clusters):
    print("Cluster {}:".format(i))
    for ind in order_centroids[i, :10]:
        print("{}".format(terms[ind]))

s = all_text_docs[name]
tokens = word_tokenize(s)
text = nltk.Text(tokens)
text.collocations()
text.concordance('social')

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word_tokenize(s))
finder.nbest(bigram_measures.pmi, 10)

######
NUM_TOPICS = 10
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
data_vectorized = vectorizer.fit_transform(train_clean_sentences)

# Build a Latent Dirichlet Allocation model
# (scikit-learn >= 0.21 renamed n_topics to n_components)
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online')
def bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
with open("D:/Python/Consumer Complaints/Consumer_Complaints_CreditCard.csv", 'r') as file:
    complaints = list(csv.reader(file))
# The with-block closes the file automatically; no explicit close() is needed.

compClean = []
for i in range(len(complaints)):
    tokens = re.sub("[^A-Za-z0-9()'.]+", " ", complaints[i][5])
    tokens = re.sub('!', ".", tokens)
    compClean.append(tokens)

from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = [w.lower() for w in webtext.words('D:/Python/Consumer Complaints/complaintsDump.txt')]
bcf = BigramCollocationFinder.from_words(words)

# from nltk.collocations import TrigramCollocationFinder
# from nltk.metrics import TrigramAssocMeasures
# tcf = TrigramCollocationFinder.from_words(words)
# tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)

from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
collocations = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 50)

newText = 'a credit card is issued to me'
tokens = re.sub(" ".join(collocations[1]), "-".join(collocations[1]), newText)
def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # Finds words that often occur together.
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
def analyze_text(text, filename, stopwords, min_length, freq, total_ngrams,
                 min_measure, bigrams_only, trigrams_only):
    print(len(text), filename)
    words = [
        w.lower() for w in text
        if w not in string.punctuation
        if w.lower() not in stopwords and len(w) >= min_length
    ]

    bigrams = None
    b_prefix_keys = None
    trigrams = None
    t_prefix_keys = None

    # What follows could totally be generalized.
    if not trigrams_only:
        # Bigrams
        print("Generating bigrams from", filename)
        b_finder = BigramCollocationFinder.from_words(words)
        b_finder.apply_freq_filter(freq)
        # if stopwords:
        #     b_finder.apply_word_filter(lambda w: w in stopwords)
        bigrams = b_finder.nbest(BigramAssocMeasures.pmi, total_ngrams)
        b_scored = b_finder.score_ngrams(BigramAssocMeasures.pmi)
        b_prefix_keys = collections.defaultdict(list)
        for key, scores in b_scored:
            if scores > min_measure:
                b_prefix_keys[key[0]].append((key[1], scores))

    # Trigrams
    if not bigrams_only:
        print("Generating trigrams from", filename)
        t_finder = TrigramCollocationFinder.from_words(words)
        t_finder.apply_freq_filter(freq)
        # if stopwords:
        #     t_finder.apply_word_filter(lambda w: w in stopwords)
        trigrams = t_finder.nbest(TrigramAssocMeasures.pmi, total_ngrams)
        t_scored = t_finder.score_ngrams(TrigramAssocMeasures.pmi)
        t_prefix_keys = collections.defaultdict(list)
        for key, scores in t_scored:
            if scores > min_measure:
                t_prefix_keys[key[0]].append((key[1], key[2], scores))

    if bigrams_only:
        ret = {
            'bigrams': bigrams,
            'b_prefix': b_prefix_keys,
            'b_fd': b_finder.ngram_fd
        }
    elif trigrams_only:
        ret = {
            'trigrams': trigrams,
            't_prefix': t_prefix_keys,
            't_fd': t_finder.ngram_fd
        }
    else:
        ret = {
            'bigrams': bigrams,
            'b_prefix': b_prefix_keys,
            'b_fd': b_finder.ngram_fd,
            'trigrams': trigrams,
            't_prefix': t_prefix_keys,
            't_fd': t_finder.ngram_fd
        }
    return ret
def main():
    # Stopwords to filter out for collocations.
    stopwords_eng = set(stopwords.words("english"))
    stopwords_eng.add(b'et')
    stopwords_eng.add(b'al')

    # Bigram association measures from nltk.
    bigram_measures = nltk.collocations.BigramAssocMeasures()

    # tf-idf vectorizer (from scikit-learn, not nltk).
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True,
                                       tokenizer=tokenize_and_stem, ngram_range=(1, 3))

    file = open('CultureRelatedDiaognosticIssues.txt', 'r')
    a = []
    names = []
    for line in file:
        miniList = line.split("|")
        names.append(int(miniList[0].strip()))
        a.append(miniList[1].strip())
    file.close()

    allvocab_stemmed = []
    allvocab_tokenized = []
    for element in a:
        stemmed_result = tokenize_and_stem(element)
        allvocab_stemmed.extend(stemmed_result)
        tokenized_result = tokenize_only(element)
        allvocab_tokenized.extend(tokenized_result)

    # Data frame that contains stems and tokenized words.
    vocab_frame = pd.DataFrame({'words': allvocab_tokenized}, index=allvocab_stemmed)

    # tf-idf matrix for the terms in the corpus.
    tfidf_matrix = tfidf_vectorizer.fit_transform(a)
    terms = tfidf_vectorizer.get_feature_names()

    # Number of clusters.
    num_clusters = 10

    # Fit the k-means algorithm and save it in a .pkl file.
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    joblib.dump(km, 'cluster.pkl')
    km = joblib.load('cluster.pkl')
    clusters = km.labels_.tolist()

    # Data frame that saves the chapter, the text, and the assigned cluster.
    dsm = {'chapter': names, 'text': a, 'cluster': clusters}
    frame = pd.DataFrame(dsm, index=[clusters], columns=['chapter', 'text', 'cluster'])

    # Group by cluster for aggregation purposes.
    grouped = frame['chapter'].groupby(frame['cluster'])

    # Strip all punctuation for the bigram measures used below.
    puncTokenizer = RegexpTokenizer(r'\w+')

    print("Top terms per cluster:")
    print()
    # Sort cluster centers by proximity to centroid.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')
        for ind in order_centroids[i, :6]:
            # DataFrame.ix is deprecated; modern pandas needs .loc/.iloc here.
            print(vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0]
                  .encode('utf-8', 'ignore'), end=',')
        print()
        print("Cluster %d titles:" % i, end='')
        for title in frame.ix[i]['chapter'].values.tolist():
            print(str(title) + " , ", end='')
        print()
        # This loop finds the most common pairs of words in each diagnosis.
        for text in frame.ix[i]['text'].values.tolist():
            data_tokens = puncTokenizer.tokenize(text)
            data_tokens = [x.lower() for x in data_tokens]
            tokens = [w for w in data_tokens if w not in stopwords_eng]
            finder = BigramCollocationFinder.from_words(tokens)
            print('Printing collocations in this chapter:')
            print(finder.nbest(bigram_measures.likelihood_ratio, 5))
            print()
        print()
        print()

    # Distribution of clusters.
    plt.hist(km.labels_, bins=num_clusters)
    plt.show()
def word_features(words, score_fn=BAM.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict((bg, True) for bg in chain(words, bigrams))
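# Quick check of word_features above; assumes the snippet's imports were
# `from nltk.metrics import BigramAssocMeasures as BAM` and
# `from itertools import chain`.
print(list(word_features("it was a good good movie it was fun".split(), n=3))[:6])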
def bigram_words(words, score_fn=BigramAssocMeasures.pmi, n=121):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
def create_features(X, user_data=None):
    res = []
    for date, comment, user in X:
        feat = {}
        has_hate_word = has_drug_word = has_cult_word = has_occult_word = has_porn_word = 0
        has_fwenzel_word = 0
        has_swastika = swastika in comment

        comment = comment.lower()
        comment = parse_text(comment)
        comment = nltk.clean_html(comment)  # removed in NLTK 3; use e.g. BeautifulSoup instead
        sents = sent_tokenize(comment)

        doc = []
        for sent in sents:
            # Tokenize each sentence.
            doc += wordtokenizer.tokenize(sent)

        def repl_filter(x):
            return x.lower() not in ["nl", "nl2", "nbsp", "nbsp2", "dummyhtml"]

        # Remove stopwords and replacement tokens (list() so we can assign by index below).
        doc = list(filter(repl_filter, doc))

        for i, word in enumerate(doc):
            if doc[i] in bad_words:
                doc[i] = '_badword_'
            doc[i] = ps.stem(doc[i])
            doc[i] = wnl.lemmatize(doc[i])
            if doc[i] in bad_words:
                doc[i] = '_badword_'
            if doc[i] in hate_words:
                has_hate_word = 1
            if doc[i] in drug_words:
                has_drug_word = 1
            if doc[i] in cult_words:
                has_cult_word = 1
            if doc[i] in occult_words:
                has_occult_word = 1
            if doc[i] in porn_words:
                has_porn_word = 1
            if doc[i] in fwenzel_words:
                has_fwenzel_word = 1

        bigram_finder = BigramCollocationFinder.from_words(doc)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n=5)
        bigram = dict([(ngram, True) for ngram in itertools.chain(doc, bigrams)])
        feat.update(bigram)

        text_vocab = set(w for w in doc if w.isalpha())
        unusual = text_vocab.difference(english_vocab)
        unusual_ratio = len(unusual) / len(text_vocab) if len(text_vocab) != 0 else -1.0
        unusual2 = unusual.difference(set("_badword_"))
        unusual_ratio2 = len(unusual2) / len(text_vocab) if len(text_vocab) != 0 else -1.0

        if user_data is not None:
            user_info = user_data[user]

        has_bad_word = True
        for word in bad_words:
            if word in comment.lower():
                break
        else:
            has_bad_word = False

        def n_none(x):
            return int(x) if x is not None else 0

        def c_none(x):
            return x if x is not None else "__None__"

        readability = ReadabilityTool(comment)
        read_feat = {}
        for f, val in readability.analyzedVars.items():
            if f != 'words':
                read_feat["_" + f] = val
        for test, val in readability.tests_given_lang['eng'].items():
            read_feat["__" + test] = val(readability.text)

        feat['_always_present'] = True
        feat['_word_num'] = len(doc)
        feat['_sent_num'] = len(sents)
        feat['_word_var'] = len(set(doc)) / len(doc) if len(doc) != 0 else -1.0
        feat['_sent_var'] = len(set(sents)) / len(sents)
        feat['_unusual_ratio'] = unusual_ratio
        feat['_unusual_ratio2'] = unusual_ratio2

        if user_data is not None:
            feat['_username'] = user
            feat['_user_subcount'] = int(user_info['SubscriberCount'])
            feat['_user_friends'] = int(user_info['FriendsAdded'])
            feat['_user_favs'] = int(user_info['VideosFavourited'])
            feat['_user_videorates'] = int(user_info['VideosRated'])
            feat['_user_videouploads'] = int(user_info['VideosUploaded'])
            feat['_user_videocomments'] = int(user_info['VideosCommented'])
            feat['_user_videoshares'] = int(user_info['VideosShared'])
            feat['_user_usersubs'] = int(user_info['UserSubscriptionsAdded'])
            feat['_user_gender'] = c_none(user_info['Gender'])
            feat['_user_age'] = n_none(user_info['Age'])
            feat['_user_closed'] = user_info['UserAccountClosed']
            feat['_user_suspended'] = user_info['UserAccountSuspended']
            feat['_user_has_gender'] = 1 if user_info['Gender'] is not None else 0
            feat['_user_has_school'] = 1 if user_info['School'] is not None else 0
            feat['_user_has_books'] = 1 if user_info['Books'] is not None else 0
            feat['_user_has_movies'] = 1 if user_info['Movies'] is not None else 0
            feat['_user_has_music'] = 1 if user_info['Music'] is not None else 0
            feat['_user_has_location'] = 1 if user_info['Location'] is not None else 0
            feat['_user_has_hometown'] = 1 if user_info['Hometown'] is not None else 0
            # feat['_user_last'] = user_info['LastWebAccess']

        # Dictionary features
        feat['_has_bad_word'] = has_bad_word
        # feat['_has_hate_word'] = has_hate_word
        # feat['_has_drug_word'] = has_drug_word
        feat['_has_cult_word'] = has_cult_word
        feat['_has_swastika'] = has_swastika
        # feat['_has_occult_word'] = has_occult_word
        # feat['_has_has_fwenzel_word'] = has_fwenzel_word
        feat['_has_porn_word'] = has_porn_word
        feat.update(read_feat)
        # print(feat)
        res.append(feat)
    return res
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    print(words, "\n")
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
from nltk.corpus import stopwords       # needed for stopwords.words() below
from string import punctuation          # needed for the punctuation list below
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.stem import LancasterStemmer

f = open('data.txt', 'r')
lines = f.readlines()
f.close()

custom_stopwords = set(stopwords.words('english') + list(punctuation))

tokenized_lines = []
for line in lines:
    tokenized_words = [word for word in word_tokenize(line)
                       if word not in custom_stopwords]
    tokenized_lines.append(tokenized_words)

bigram_measures = BigramAssocMeasures()
ngrams = []
for line in tokenized_lines:
    ngrams.append(sorted(BigramCollocationFinder.from_words(line).ngram_fd.items()))

st = LancasterStemmer()
stemmed = []
for line in tokenized_lines:
    stemmed_words = [st.stem(word) for word in line]
    stemmed.append(stemmed_words)

for stemmed_line in stemmed:  # renamed from `st` to avoid shadowing the stemmer
    print(stemmed_line)
def bag_of_bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=100):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_non_stopwords(words + bigrams)
import nltk
nltk.download('punkt')
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder

bi_dict = dict()
bg_measures = BigramAssocMeasures()

with open('text/text.txt', 'r') as file:
    text = file.read()

# Strip digits before tokenizing.
table = str.maketrans(dict.fromkeys('0123456789'))
textWithoutNumbers = text.translate(table)
words = nltk.word_tokenize(textWithoutNumbers)

bi_finder = BigramCollocationFinder.from_words(words, window_size=2)
bigram_measures = nltk.collocations.BigramAssocMeasures()
bi_finder.apply_freq_filter(2)

t = bi_finder.ngram_fd.items()
ngram = list(t)
ngram.sort(key=lambda item: item[-1], reverse=False)
for (k, v) in ngram:
    print(k, v)

bi_finder.score_ngrams(bigram_measures.pmi)
bi_collocs = bi_finder.nbest(bg_measures.likelihood_ratio, 10)
print(bi_collocs)

tri_finder = TrigramCollocationFinder.from_words(words)
tri_finder.apply_freq_filter(5)  # the original filtered bi_finder again here, likely a slip
t = tri_finder.ngram_fd.items()
ngram = list(t)
ngram.sort(key=lambda item: item[-1], reverse=False)
for (k, v) in ngram:
    print(k, v)  # the source broke off here; printing mirrors the bigram loop above
def ExtractCollocationFeatures(train_dataset, test_dataset, X_train_filename,
                               X_test_filename, window_size, n_features,
                               balance_dataset=False, remove_center_interval=None):
    # This method extracts collocations of two words within the given window
    # as features from the given train and test datasets. It returns the X, Y
    # matrices and a list with the feature names, and stores the X matrices in
    # txt files named X_train_filename / X_test_filename under /feature_matrices.
    # Tuneable parameters:
    # - window_size: size of the collocation window.
    # - n_features: number of features considered.
    # - balance_dataset: set to True to balance the training dataset.
    # - remove_center_interval: e.g. [-0.2, 0.2]; removes samples whose
    #   DW-Nominate score falls inside the interval.
    print("Reading datasets...")
    path_train = "../datasets/train/"
    train_dataset_df = pd.read_csv(path_train + train_dataset, sep="|",
                                   encoding="latin_1", header=None)
    train_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']

    # Remove rows with DW-Nominate scores close to 0.
    if remove_center_interval is not None:
        train_dataset_df['ideology'] = train_dataset_df['nominate_dim1'].apply(
            lambda x: 0 if (float(x) > remove_center_interval[0]
                            and float(x) < remove_center_interval[1]) else x)
        train_dataset_df = train_dataset_df[train_dataset_df['ideology'] != 0]

    # Binary ideology label from the first DW-Nominate dimension (must run
    # unconditionally: it is used below even when no interval is removed).
    train_dataset_df['ideology'] = train_dataset_df['nominate_dim1'].apply(
        lambda x: 1.0 if (float(x) >= 0) else -1.0)

    path_test = "../datasets/test/"
    test_dataset_df = pd.read_csv(path_test + test_dataset, sep="|",
                                  encoding="latin_1", header=None)
    test_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']

    if balance_dataset:
        positive_rows = len(train_dataset_df[train_dataset_df['ideology'] == 1.0])
        negative_rows = len(train_dataset_df[train_dataset_df['ideology'] == -1.0])
        if positive_rows > negative_rows:
            n = positive_rows - negative_rows
            indices = train_dataset_df[train_dataset_df['ideology'] == 1.0].index.values.tolist()
            drop_indices = random.sample(indices, n)
            train_dataset_df = train_dataset_df.drop(drop_indices)
        else:
            n = negative_rows - positive_rows
            indices = train_dataset_df[train_dataset_df['ideology'] == -1.0].index.values.tolist()
            drop_indices = random.sample(indices, n)
            train_dataset_df = train_dataset_df.drop(drop_indices)

    train_speeches = train_dataset_df['speech'].values.tolist()
    Y_train = train_dataset_df['ideology'].values.tolist()

    if remove_center_interval is not None:
        test_dataset_df['ideology'] = test_dataset_df['nominate_dim1'].apply(
            lambda x: 0 if (float(x) > remove_center_interval[0]
                            and float(x) < remove_center_interval[1]) else x)
        test_dataset_df = test_dataset_df[test_dataset_df['ideology'] != 0]

    test_dataset_df['ideology'] = test_dataset_df['nominate_dim1'].apply(
        lambda x: 1.0 if (float(x) >= 0) else -1.0)

    test_speeches = test_dataset_df['speech'].values.tolist()
    Y_test = test_dataset_df['ideology'].values.tolist()

    print("Extracting features from train dataset...")
    t_start = time.time()
    stop_words = set(stopwords.words('english'))
    total_bigrams = {}
    bigrams_per_speech_train = []
    t0 = time.time()
    print(len(train_speeches))
    for i in range(0, len(train_speeches)):
        if i % 1000 == 0:
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        speech = train_speeches[i].lower()
        speech = speech.translate(str.maketrans('', '', string.punctuation))
        words = speech.split()
        filtered_words = [w for w in words if w not in stop_words]
        bcf = BigramCollocationFinder.from_words(filtered_words, window_size=window_size)
        for item in bcf.ngram_fd.items():
            if item[0] not in total_bigrams:
                total_bigrams.update({item[0]: item[1]})
            else:
                total_bigrams[item[0]] += item[1]
        bigrams_per_speech_train.append(bcf.ngram_fd.items())

    print("Total bigrams found: ", len(total_bigrams))

    feature_names = []
    most_frequent_bigrams_sorted = sorted(total_bigrams.items(),
                                          key=lambda x: x[1], reverse=True)[:n_features]
    print("Number of features: ", len(most_frequent_bigrams_sorted))
    most_frequent_bigrams = dict(most_frequent_bigrams_sorted)
    for i in range(0, len(most_frequent_bigrams_sorted)):
        feature_names.append(most_frequent_bigrams_sorted[i][0])
    print(len(feature_names))

    order = list(range(0, len(feature_names)))
    collocation_order = dict(zip(feature_names, order))

    print("Computing X_train...")
    X_train_matrix = np.zeros((len(bigrams_per_speech_train), len(feature_names)))
    for i in range(0, len(bigrams_per_speech_train)):
        if i % 1000 == 0:
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        bigrams_per_speech_i = dict(bigrams_per_speech_train[i])
        for bigram in bigrams_per_speech_i:
            if bigram in most_frequent_bigrams:
                column = collocation_order[bigram]
                X_train_matrix[i][column] = bigrams_per_speech_i[bigram]

    print("Creating dataframe...")
    X_train_df = pd.DataFrame(X_train_matrix, columns=feature_names)
    t1 = time.time()
    print(str(t1 - t0) + " seconds")
    t0 = time.time()

    pathX = "../feature_matrices/"
    print("Saving X_train into a txt file...")
    X_train_df.to_csv(pathX + X_train_filename, header=feature_names, index=None, sep=',')
    print("Transforming X_train into a csr_matrix...")
    X_train = csr_matrix(X_train_df)

    print("Extracting bigrams from test dataset...")
    bigrams_per_speech_test = []
    t0 = time.time()
    print(len(test_speeches))
    for i in range(0, len(test_speeches)):
        if i % 1000 == 0:
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        speech = test_speeches[i].lower()
        speech = speech.translate(str.maketrans('', '', string.punctuation))
        words = speech.split()
        filtered_words = [w for w in words if w not in stop_words]
        bcf = BigramCollocationFinder.from_words(filtered_words, window_size=window_size)
        bigrams_per_speech_test.append(bcf.ngram_fd.items())

    print("Computing X_test...")
    X_test_matrix = np.zeros((len(bigrams_per_speech_test), len(feature_names)))
    for i in range(0, len(bigrams_per_speech_test)):
        if i % 1000 == 0:
            print(i)
            t1 = time.time()
            print(str(t1 - t0) + " seconds")
            t0 = time.time()
        bigrams_per_speech_i = dict(bigrams_per_speech_test[i])
        for bigram in bigrams_per_speech_i:
            if bigram in most_frequent_bigrams:
                column = collocation_order[bigram]
                X_test_matrix[i][column] = bigrams_per_speech_i[bigram]

    print("Creating dataframe...")
    X_test_df = pd.DataFrame(X_test_matrix, columns=feature_names)
    t1 = time.time()
    print(str(t1 - t0) + " seconds")
    t0 = time.time()
    print("Saving X_test into a txt file...")
    X_test_df.to_csv(pathX + X_test_filename, header=feature_names, index=None, sep=',')
    print("Transforming X_test into a csr_matrix...")
    X_test = csr_matrix(X_test_df)

    t_end = time.time()
    total_time = t_end - t_start
    print("Total time: ")
    print(str(total_time) + " seconds")

    return X_train, Y_train, X_test, Y_test, feature_names
    # Tail of a get_top_ngrams() helper: rank the n-gram frequency distribution.
    ngram_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngram_freq_dist.items(), key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]
    return sorted_ngrams

corpus, category = get_data()

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents([item.split() for item in corpus])
bigram_measures = BigramAssocMeasures()
print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents([item.split() for item in corpus])
trigram_measures = TrigramAssocMeasures()
print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print(get_top_ngrams(corpus, ngram_val=2, limit=10))
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    words_nopunc = [word for word in words if word not in string.punctuation]
    bigram_finder = BigramCollocationFinder.from_words(words_nopunc)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words_nopunc, bigrams)])
from nltk.corpus import stopwords
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Renamed from `set` to avoid shadowing the built-in set().
stopset = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stopset

tokens = [t.lower() for t in webtext.words('grail.txt')]
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_word_filter(stops_filter)
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns

# bigram_collocation / trigram_collocation are assumed to be aliases such as:
# from nltk.collocations import BigramCollocationFinder as bigram_collocation
# from nltk.collocations import TrigramCollocationFinder as trigram_collocation

df = pd.read_csv('../preprocessed_dataset.csv')
df.head()

# Count repeated bigrams per song; only bigrams repeated at least 3 times are kept.
bigram_score = []
for i in range(len(df.index)):
    mean_pmi = 0.0
    pmi_bigram = []
    text = df["Lyrics"][i].split()
    coll_bia = bigram_collocation.from_words(text)
    coll_bia.apply_freq_filter(3)
    bigram_freq = coll_bia.ngram_fd.items()
    bigramFreqTable = pd.DataFrame(list(bigram_freq),
                                   columns=['bigram', 'freq']).sort_values(by='freq', ascending=False)
    bigram_score.append(len(bigramFreqTable.index.values))

# Count repeated trigrams per song; only trigrams repeated at least 3 times are kept.
trigram_score = []
for i in range(len(df.index)):
    mean_pmi = 0.0
    pmi_trigram = []
    text = df["Lyrics"][i].split()
    coll_tri = trigram_collocation.from_words(text)
    # (The source breaks off here; the loop presumably mirrors the bigram one:)
    # coll_tri.apply_freq_filter(3)
    # trigram_score.append(len(coll_tri.ngram_fd))
plt.show()

fd = fdist_no_punc_no_stopwords

# The most common words
fd.most_common(50)

# Dispersion plots
text.dispersion_plot(["God", "mind", "knowledge"])
text.dispersion_plot(["power", "reason", "nature"])
# text.concordance("god")

# Bigrams
# from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.nbest(bigram_measures.pmi, 10)
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)  # same ranking, now with the frequency filter applied

# Word cloud for the most frequent bigrams
stopWords = stopwords
text_content = [''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
                for word in text]
text_content = [word for word in text_content if word not in stopWords]
text_content = [s for s in text_content if len(s) != 0]
# hpmor is assumed to be an open file handle, e.g. hpmor = open('hpmor.txt', encoding='utf-8')
corpus = []
while True:
    l = hpmor.readline()
    if l == '':
        break
    l = re.sub(r"[^а-яё \t-]", "", l.lower()).strip().split()
    if l:
        corpus.extend(l)

bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()

stop = set(stopwords.words('russian'))
stop.update(['гарри', 'поттер', 'профессор'])  # add the most frequent words of the text to the stop list

corpus_ = list(filter(lambda x: x not in stop, corpus))
finder = BigramCollocationFinder.from_words(corpus_)
finder3 = TrigramCollocationFinder.from_words(corpus_)

# Frequency filters and stopword filters
finder.apply_freq_filter(5)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop)
finder3.apply_freq_filter(5)
finder3.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop)

# Bigrams and trigrams
raw_bigrams = finder.nbest(bigram_measures.raw_freq, 100)
pmi_bigrams = finder.nbest(bigram_measures.pmi, 100)
raw_trigrams = finder3.nbest(trigram_measures.raw_freq, 100)
pmi_trigrams = finder3.nbest(trigram_measures.pmi, 100)
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=500):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)  # select the top n bigrams by chi-square score
    newBigrams = [u + v for (u, v) in bigrams]
    return newBigrams  # the original built newBigrams but never returned it
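# Toy call for bigram() above; with the return fix it yields the selected word
# pairs concatenated into single tokens (a common convention for Chinese text,
# where tokens join without spaces).
print(bigram("this is a test this is only a test".split(), n=5))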