def intersection_N_most_popular(text, text2, N, exclude):
    set_exclude = set(exclude)
    M = len(text)
    M2 = len(text2)
    # tokens = [w.lower() for w in text]
    # tokens2 = [w.lower() for w in text2]
    fd = FreqDist(text)
    new = []
    sort = sorted(fd.items(), key=itemgetter(1), reverse=True)
    j = 0
    while len(new) < N and j < len(sort):
        if not sort[j][0] in set_exclude:
            new.append(sort[j][0])
        j += 1
    fd2 = FreqDist(text2)
    new2 = []
    sort = sorted(fd2.items(), key=itemgetter(1), reverse=True)
    j = 0
    while len(new2) < N and j < len(sort):
        if not sort[j][0] in set_exclude:
            new2.append(sort[j][0])
        j += 1
    total = 0
    for word in new:
        if word in new2:
            print word, 1.0 * fd[word] / M, 1.0 * fd2[word] / M2
            total += 1
    print "%i words in the intersection" % total
def make_cutOff(flatList, bottomCutOff, topCutOff):
    '''
    INPUT: flatList is a 1-d list of all tokens in a set of tweets;
           bottomCutOff and topCutOff are integers
    OUTPUT: newVocab = a 1-d list of all tokens we want to keep
            thrownOut = a 1-d list of all tokens to throw out
    '''
    fd = FreqDist(flatList)
    newVocab = []
    thrownOut = []
    for item in fd.items()[:topCutOff]:
        # throw out the most common words
        thrownOut.append(item)
    for item in fd.items()[topCutOff:]:
        if item[1] > bottomCutOff:
            # keep good words
            newVocab.append(item[0])
        else:
            # throw out uncommon words
            thrownOut.append(item)
    print 'Cutoffs made...'
    return newVocab, thrownOut
def n_gram_nltk(terms):
    terms_bi_gram = bigrams(terms)
    terms_tri_gram = trigrams(terms)
    uni_gram_matrix = FreqDist(terms)
    bi_gram_matrix = FreqDist(terms_bi_gram)
    tri_gram_matrix = FreqDist(terms_tri_gram)
    return uni_gram_matrix.items(), bi_gram_matrix.items(), tri_gram_matrix.items()
def get_most_frequent(self, rawText, number=None, cleaning_level=3):
    cleaned_tokens_levels = TokensCleaner.clean(self, rawText, cleaning_level)
    freq_distributions_levels = dict()
    for level, cleaned_tokens in cleaned_tokens_levels.items():
        all_words = FreqDist(cleaned_tokens)
        if number is None:
            freq_distributions_levels[level] = all_words.items()
        else:
            freq_distributions_levels[level] = all_words.items()[:number]
    return freq_distributions_levels
def main():
    fileName = '../data/deals.txt'
    words, lines = get_filter(fileName)
    word_dist = FreqDist(words)  # get distribution, descending order
    print("Most Popular Term: ", word_dist.items()[0])    # question 1
    print("Least Popular Term: ", word_dist.items()[-1])  # question 2
    # solution 1 for question 3
    # print("Types of Guitars Found: ", len(count_guitar_types.count(lines)))
    # solution 2: better and more reasonable, but could be improved further
    print("Type of Guitars mentioned", count_guitar_types2.count(lines))
def __getTimelineFeatures(self, timeline):
    logger.info(u"Get timeline features")
    tweets = []
    self.__changePhase(PHASE["GET_TIMELINE_URLS"])
    for t in timeline:
        try:
            tweet = TweetText(t, self.__urlBuilder, self.__userBuilder)
        except:
            logger.exception(u"Error: \"" + unicode(t) + u"\"")
            raise ValueError(t)
        logger.debug(u"Tweet:" + unicode(tweet))
        tweets.append(tweet)
    urls = []
    ti = 0
    for tweet in tweets:
        for url in tweet.urls():
            self.__breakIfStopped()
            self.__urlResolver.addUrlToQueue(url)
            urls.append(url)
        logger.info(u"Tweet:" + unicode(tweet))
        ti += 1
        self.__proc = 100 * float(ti) / float(len(tweets))
    # Categories
    self.__changePhase(PHASE["GET_TIMELINE_FEATURES"])
    url2labels = {}
    ui = 0
    for url in urls:
        self.__breakIfStopped()
        if not url.isError():
            logger.debug(u"Classify " + unicode(url.getUrl()))
            url2labels[url.getExpandedUrl()] = self._classifier().classify(url.getText())
        ui += 1
        self.__proc = 100 * float(ui) / float(len(urls))
    labelsFreq = FreqDist()
    for labels in url2labels.values():
        for label in labels:
            labelsFreq.inc(label)
    self.__catFreq = labelsFreq.items()
    logger.info(u"Categories: " + unicode(labelsFreq.items()))
    labelsFreqValues = [(item[0], item[1]) for item in labelsFreq.items()
                        if item[0] not in ['short', 'medium', 'long']]
    # normalisation
    labelsFreqValues = {label: float(freq) / float(max([f for l, f in labelsFreqValues]))
                        for label, freq in labelsFreqValues}
    logger.info(u"Category factors: " + unicode(labelsFreqValues))
    # Languages
    langFreq = FreqDist()
    for u in urls:
        langFreq.inc(u.lang())
    self.__langFreq = langFreq.items()
    logger.info(u"Languages: " + unicode(langFreq.items()))
    return labelsFreqValues
def get_feats_counts(text):
    tokens = word_tokenize(text)
    t = Text(tokens)
    g1s = ngrams(t, 1)
    freq1 = FreqDist(g1s)
    g1s_list = [(g, count) for g, count in freq1.items()]
    g2s = ngrams(t, 2)
    freq2 = FreqDist(g2s)
    g2s_list = [(g, count) for g, count in freq2.items()]
    gs = g1s_list + g2s_list
    return dict(gs)
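# Hedged usage sketch (added for illustration, not part of the original sources).
# Shows how get_feats_counts above might be called; assumes NLTK and its 'punkt'
# tokenizer data are installed. The sample sentence is made up.
from nltk import FreqDist, Text, word_tokenize
from nltk.util import ngrams

sample_text = "the quick brown fox jumps over the lazy dog and the quick fox"
feature_counts = get_feats_counts(sample_text)
# feature_counts maps 1- and 2-token tuples to their counts, e.g. ('the',) -> 3
print(sorted(feature_counts.items(), key=lambda kv: -kv[1])[:5])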
def prepResult(title, array):
    arq_base = open("base.txt", "r")
    text = arq_base.read().lower().split()
    chrs = (78 - len(title)) / 2
    cont = 0
    enf = ""
    enf2 = "_"
    while cont < chrs:
        enf += "*"
        enf2 += "_"
        cont += 1
    result = ("\n//" + enf + " " + title + " " + enf + "\\\\\n\n"
              "| Palavra | | Frequência |\n\n")
    frequencia = FreqDist(text)
    frequencia_ord = ordereddict(sorted(frequencia.items(), key=lambda e: (-e[1], e[0])))
    for freq in frequencia_ord:
        if freq in array:
            lim = 84 / 2
            right = lim / 2 + len(freq)
            chrs = (78 - (len(freq)) + len(str(frequencia_ord[freq]))) / 4
            cont = 0
            enf = ""
            while cont < chrs:
                enf += " "
                cont += 1
            result += "|" + enf + freq + enf + " | " + enf + str(frequencia_ord[freq]) + enf + "|\n"
    result += "\n\\\\________________________________________________________________________________//\n\n"
    arq_base.close()
    return result
def __extract_bigram_words(self, bigrams, values):
    bigrams_number_per_value = self.__configuration_map["most_frequent_bigrams_number_per_value"]
    most_frequent_bigrams = {}
    for value in values:
        fdist = FreqDist(bigrams[value])
        most_frequent_bigrams[value] = fdist.items()[:bigrams_number_per_value]
    return most_frequent_bigrams
def freq_dist(input, filtering_functions=[], plot=False, limit=None, return_counts=False):
    """Takes a list of words (hashtags, keywords, anything) and plots a frequency distribution.

    filtering_functions is an ORDERED list of functions applied to the raw input list
    before the freq dist is built; that is, each item in input is run through f1, f2, ..., fn
    where filtering_functions = [f1, ..., fn].
    limit truncates the freq dist to the limit most common items.
    return_counts determines whether a list of (word, count) tuples is returned,
    or just a list of the limit most used words.
    """
    for f in filtering_functions + [str.lower, str.strip]:
        input = map(f, input)
    nltk_fdist = FreqDist(list(input))

    if plot:  # use nltk's built-in plotting function before destroying the data structure
        nltk_fdist.plot(limit) if limit else nltk_fdist.plot()

    fdist = sorted(nltk_fdist.items(), key=lambda x: (-x[1], x[0]))  # sort equally counted items alphabetically
    fdist = fdist[0:limit] if limit else fdist                       # apply limit (the original's "fidst" typo dropped it)
    fdist = [i[0] for i in fdist] if not return_counts else fdist    # remove counts if desired
    return fdist
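# Hedged usage sketch (added for illustration, not part of the original sources).
# Counts hashtags case-insensitively with the corrected freq_dist above; assumes
# FreqDist is imported from nltk. The tag list is made up.
tags = ["#NLP", "#nlp ", "#Python", "#data", "#python"]
print(freq_dist(tags, limit=2, return_counts=True))  # [('#nlp', 2), ('#python', 2)]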
def preprocess(content):
    stopset = set(stopwords.words('english'))
    # replace punctuation and tags with spaces
    tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower()))
    pos_list = pos_tag(tokens)
    s_tokens = list()
    # nouns only (the noun-and-verb variant is left commented out)
    for pos in pos_list:
        #print pos[1]
        #if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        if pos[1] in ['NN', 'NNS']:
            s_tokens.append(pos[0])
    wordfreq = FreqDist(s_tokens)
    stemfreq = dict()
    st = LancasterStemmer()
    for word, freq in wordfreq.items():
        # stopwords
        if word in stopset:
            del wordfreq[word]
            continue
        # tiny words
        if len(word) <= 2:
            del wordfreq[word]
            continue
        # stemmer
        stem = st.stem(word)
        try:
            stemfreq[stem] += freq
        except KeyError:
            stemfreq[stem] = freq
    return stemfreq
def ngrams_cloud(text, output_filepath):
    tokenizer = RegexpTokenizer(r'\w+')
    text = ' '.join(text)
    sent_words = tokenizer.tokenize(text)

    # Calculate the frequency distribution of bigrams
    freq_dist = FreqDist(bigrams(sent_words))

    # Sort highest to lowest based on the score.
    scoredList = sorted(freq_dist.items(), key=itemgetter(1), reverse=True)

    # word_dict is the dictionary we'll use for the word cloud.
    # Load dictionary with the FOR loop below.
    word_dict = {}

    # Get the bigram and make a contiguous string for the dictionary key.
    # Set the key to the scored value.
    listLen = len(scoredList)
    for i in range(listLen):
        word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]

    WC_max_words = 50
    wordcloud = WordCloud(
        max_words=WC_max_words,
        height=400,
        width=800,
        collocations=False,
        background_color='white',
        colormap='Set2').generate_from_frequencies(word_dict)
    # height=WC_height, width=WC_width, background_color='white')
    wordcloud.to_file(os.path.join(output_filepath, "bigrams_wordcloud.png"))
def get_word_bigram_scores(pos_words, neg_words):
    pos_words_plain = list(itertools.chain(*pos_words))
    neg_words_plain = list(itertools.chain(*neg_words))

    bigram_finder = BigramCollocationFinder.from_words(pos_words_plain)
    pos_bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    bigram_finder = BigramCollocationFinder.from_words(neg_words_plain)
    neg_bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = pos_words_plain + pos_bigrams  # words plus bigram collocations
    neg = neg_words_plain + neg_bigrams
    all_words = pos + neg

    pos_word_fd = FreqDist(pos)
    neg_word_fd = FreqDist(neg)
    word_fd = FreqDist(all_words)

    pos_word_count = pos_word_fd.N()  # number of positive words
    neg_word_count = neg_word_fd.N()  # number of negative words
    #total_word_count = pos_word_count + neg_word_count
    total_word_count = word_fd.N()

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(pos_word_fd[word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(neg_word_fd[word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def summarize(self, text):
    # get words from text
    words = word_tokenize(text)
    # filter out stop words and lower case
    words = [word.lower() for word in words if word not in self.stopwords]
    # filter non-alphameric chars from words
    words = [filter(unicode.isalnum, word) for word in words]
    words = filter(lambda w: len(w) > 0, words)  # Remove empty words
    # stemming
    words = [self.pst.stem(word) for word in words]
    word_frequencies = FreqDist(words)
    most_frequent = [word[0] for word in word_frequencies.items()[:self.top_words_count]]

    # get sentences
    sentences = sent_tokenize(text)
    sentence_score = defaultdict(int)
    for i in range(len(sentences)):
        sentence = sentences[i]
        sentence_words = word_tokenize(sentence)
        sentence_words = [self.pst.stem(word).lower() for word in sentence_words
                          if word not in self.stopwords]
        for sentence_word in sentence_words:
            if sentence_word in most_frequent:
                sentence_score[i] += 1
    sorted_wordcounts = sorted(sentence_score.iteritems(), key=operator.itemgetter(1),
                               reverse=True)[:self.number_of_sentences]
    summary = "\n".join([sentences[num] for num, count in sorted_wordcounts])
    return summary
def wordfreq(file):
    f = open(file, 'rU')
    raw = f.read()
    raw = raw.replace('\n', ' ')
    #raw = raw.decode('utf8')

    # tokenization
    tokens = nltk.word_tokenize(raw)
    #stopwords = stopwords.words('english')  # use the NLTK stopwords

    # lower everything
    words = [w.lower() for w in tokens]
    #words_nostop = [w.lower() for w in tokens]

    # remove numbers
    words = [w for w in words if w.isalpha()]
    #words_nostop = [w for w in words_nostop if w.isalpha()]

    # encode
    words = [w.encode('utf8') for w in words]
    #words_nostop = [w.encode('utf8') for w in words if w not in stopwords]

    # remove punctuation
    words = [w.translate(None, string.punctuation) for w in words]
    #words_nostop = [w.translate(None, string.punctuation) for w in words_nostop]

    freq = FreqDist(words)
    #freq_nostop = FreqDist(words_nostop)
    sorted_freq = sorted(freq.items(), key=lambda k: k[1], reverse=True)
    #sorted_freq_nostop = sorted(freq_nostop.items(), key=lambda k: k[1], reverse=True)
    return sorted_freq
def termfreq(storytext, filename):
    '''
    This function takes a speech/text/article, preprocesses it into tokens,
    removes stopwords, and outputs a csv of term counts and frequencies
    relative to the size of the speech/text/article.
    '''
    # Split into tokens, remove stopwords
    tokens = make.preprocess(storytext)
    stops = make.filter_stopwords(tokens)
    numstops = len(stops)

    # Create a FreqDist and turn it into a list of tuples
    freq = FreqDist(stops)
    data = freq.items()[:numstops]

    # Build a pandas DataFrame of that list
    df = pd.DataFrame(data)
    df.columns = ['word', 'count']

    # Add a 'relative frequency' column to the DataFrame
    a = []
    for i in df['count']:
        a.append(float(i) / numstops)  # float() avoids integer division under Python 2
    df['pct'] = a

    # Write the file to csv
    df.to_csv('%s.csv' % filename, sep=',')
    print df
    print 'Check your files for the csv!'
def count_pos(input, language):
    if language == 'english-nltk':
        words = word_tokenize(input)
        pos = pos_tag(words)
    elif language == 'english':
        s = pattern.en.parsetree(input, relations=True, lemmata=True)
        words = []
        pos = []
        for sentence in s:
            for w in sentence.words:
                words.append(w.string)
                pos.append((w.string, clean_text.clean_pos(w.type)))
    elif language == 'spanish':
        s = pattern.es.parsetree(input, relations=True, lemmata=True)
        words = []
        pos = []
        for sentence in s:
            for w in sentence.words:
                words.append(w.string)
                pos.append((w.string, clean_text.clean_pos(w.type)))
    elif language == 'dutch':
        words = word_tokenize(input, 'dutch')
        tagger = nltk.data.load('taggers/alpino_aubt.pickle')
        pos = tagger.tag(words)

    tags = FreqDist(tag for (word, tag) in pos)
    relative_frequency = []
    for item in tags.items():
        relative_frequency.append((item[0], float(item[1]) / tags.N()))
    return relative_frequency
def summary_corpus(data, column, language="english"):
    """
    Return summary info for the frequency of words in the corpus.

    Example:
        tokens, vocab, frequency_dist = summary_corpus(data=df, column='reviews', language="english")
    """
    tokens = [
        word for text in data[column]
        for word in word_tokenize(text, language=language)
    ]
    vocab = set(tokens)
    frequency_dist = FreqDist(tokens)

    keys, values = [], []
    for key, value in frequency_dist.items():
        keys.append(key)
        values.append(value)
    frequency_dist = {"word": keys, "frequency": values}
    frequency_dist = pd.DataFrame.from_dict(frequency_dist)
    frequency_dist.sort_values(by='frequency', ascending=False, inplace=True, axis=0)

    print('Number of tokens in the corpus :', len(tokens))
    print('Vocabulary size :', len(vocab))
    return tokens, vocab, frequency_dist
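# Hedged usage sketch (added for illustration, not part of the original sources).
# Assumes pandas as pd, nltk.word_tokenize and nltk.FreqDist are already imported for
# summary_corpus above, and that NLTK's 'punkt' tokenizer data is installed.
# The two-row DataFrame is made up.
df = pd.DataFrame({"reviews": ["Great phone, great battery.", "Battery died after a week."]})
tokens, vocab, frequency_dist = summary_corpus(data=df, column="reviews", language="english")
print(frequency_dist.head())  # words sorted by descending frequency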
def build_top_words(self):
    pos_reviews = [(review, c) for (review, c) in self.documents if c == 'pos']
    neg_reviews = [(review, c) for (review, c) in self.documents if c == 'neg']

    pos_words = [token for (review, c) in pos_reviews for token in review]
    neg_words = [token for (review, c) in neg_reviews for token in review]

    fd_all = FreqDist(pos_words + neg_words)
    pos_class_words = [('pos', word) for word in pos_words]
    neg_class_words = [('neg', word) for word in neg_words]
    cfd_pos = ConditionalFreqDist(pos_class_words)
    cfd_neg = ConditionalFreqDist(neg_class_words)

    pos_word_count = len(pos_words)
    neg_word_count = len(neg_words)
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for (word, freq) in fd_all.items():
        pos_score = BigramAssocMeasures.chi_sq(cfd_pos['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cfd_neg['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), reverse=True, key=lambda x: x[1])[:1000]
    self.top_words = set([w for w, s in best])
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for category, words in wordsInCategories:
        word_fd.update(words)
        label_word_fd[category].update(words)

    word_counts = {}
    for condition in label_word_fd.conditions():
        word_counts[condition] = label_word_fd[condition].N()

    total_word_count = 0
    for condition, count in word_counts.items():
        total_word_count += count

    word_scores = {}
    for word, freq in word_fd.items():
        score = 0
        for condition, count in word_counts.items():
            score += scoreFunction(label_word_fd[condition][word],
                                   (freq, word_counts[condition]),
                                   total_word_count)
        word_scores[word] = score

    best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
    return set([w for w, s in best])
def buildcollocgraph(corpus, settings):
    u = nx.Graph()
    wordsall = []
    window = settings['window']
    stem = settings['stem']
    kblack = settings['kblack']
    cgcutoff = settings['cgcutoff']

    ncorpus = normalise_text(corpus, settings)  # normalise corpus here
    for doc in ncorpus:
        words = [textacy.extract.ngrams(doc, 1, filter_stops=kblack)]
        words = [t.text for word in words for t in word]
        if len(words) > cgcutoff:
            g = textacy.network.terms_to_semantic_network(words,
                                                          normalize=stem,
                                                          window_width=window,
                                                          edge_weighting='cooc_freq')
            u.add_nodes_from(g.nodes(data=True))
            u.add_edges_from(g.edges(data=True))
        wordsall.append(words)

    wordsall = [w for wdlist in wordsall for w in wdlist]
    word_fd = FreqDist(wordsall)

    # test visualise
    #textacy.viz.network.draw_semantic_network(U, node_weights=word_fd, spread=3.0, draw_nodes=True, base_node_size=300, node_alpha=0.25, line_width=0.5, line_alpha=0.1, base_font_size=12, save=False)

    # convert networkx graph to json for d3
    for i, v in [k for k in word_fd.items()]:
        u.node[i]['freq'] = v
    graphdata = json_graph.node_link_data(u)
    graphdata['links'] = [
        {
            'source': graphdata['nodes'][link['source']]['id'],
            'target': graphdata['nodes'][link['target']]['id']
        }
        for link in graphdata['links']
    ]
    return graphdata
def generate_corpus(folder_name, top, n):
    '''Builds the corpus of words used as the vocabulary. If top is True, the corpus
    is restricted to the n most frequent tokens.'''
    lower = True  # activates lowercase tokens
    subfolders = [i for i in os.listdir(folder_name)]  # iterate through subfolders
    corpus_list = []
    for i in subfolders:
        for v in os.listdir(folder_name + "/" + i):
            text = open_text(i, folder_name, v, lower)
            corpus_list += [i for i in text]
    corpus_freqs = FreqDist(corpus_list)
    sorted_x = sorted(corpus_freqs.items(), key=operator.itemgetter(1), reverse=True)
    if top == True:
        topn_words = {}
        for i in sorted_x[:n]:
            topn_words[i[0]] = 0
        vocabulary = list(sorted(topn_words.keys()))
        # empty topn dictionary used to populate vectors, and vocabulary for columns
        return topn_words, vocabulary
    else:
        vocabulary = list(sorted(corpus_freqs.keys()))
        corpus = {str(i): 0 for i in sorted(vocabulary)}
        return corpus, vocabulary
def run(rawTokenListList, numTokens, freqThreshold):
    freqDistMap = {}
    retVal = ""
    total = 0

    # Build freqDistMap [key: n-gram, value: frequency].
    for rawTokenList in rawTokenListList:
        # Group each sentence's token list into units of numTokens to build the n-gram list.
        ngramList = ngrams(rawTokenList, numTokens)
        # Count the frequency of each n-gram.
        freqDist = FreqDist(ngramList)
        for key, value in freqDist.items():
            # If the n-gram already exists in the map, accumulate its frequency.
            if key in freqDistMap:
                freqDistMap[key] += value
            # Otherwise add the first occurrence of the n-gram to the map.
            else:
                freqDistMap[key] = value

    # Extract the n-grams whose frequency is at or above the threshold.
    for key in freqDistMap:
        freq = freqDistMap[key]
        if freq >= freqThreshold:
            for gram in key:
                retVal += (gram + " ")
            retVal += ("- %d\r\n" % freq)
            total += 1

    retVal = (("total: %d\r\nnumTokens: %d, freqThreshold: %d\r\n\r\n"
               % (total, numTokens, freqThreshold)) + retVal)
    return retVal
def get_notes_bigrams(data):
    # run after self.clean
    bigrams_list = list(bigrams(data))
    bigrams_fdist = FreqDist(bigrams_list)

    bigram_freqs = []
    for k, v in bigrams_fdist.items():
        bigram_freqs.append((k, v))
    sorted_bigram_freqs = sorted(bigram_freqs, key=lambda x: x[1], reverse=True)

    temp_dict = {}
    for bigram in sorted_bigram_freqs:
        if bigram[0] in temp_dict:
            temp_dict[bigram[0]] += int(bigram[1])
        else:
            temp_dict[bigram[0]] = int(bigram[1])

    # merge counts of a bigram with those of its reverse, e.g. (a, b) and (b, a)
    dict_copy = {}
    for key in temp_dict:
        if key not in dict_copy:
            dict_copy[key] = temp_dict[key]
        for k in temp_dict:
            if (k[1], k[0]) == key:
                dict_copy[key] += temp_dict[(k[0], k[1])]
                del dict_copy[key]

    mod_bigram_freqs = []
    for k, v in dict_copy.items():
        mod_bigram_freqs.append((k, v))
    mod_sorted_bigram_freqs = sorted(mod_bigram_freqs, key=lambda x: x[1], reverse=True)

    # self.sorted_bigrams = mod_sorted_bigram_freqs
    return mod_sorted_bigram_freqs
def summarize(self, article_text, num_sentences=DEFAULT_SUMMARIZATION_NUMBER):
    # Get words from article
    words = word_tokenize(article_text)
    # Filter non-alphanumeric chars from words
    words = [filter(unicode.isalnum, word) for word in words]
    words = filter(lambda w: len(w) > 0, words)  # Remove empty words
    # Now lemmatize all words
    words = [self.lemmatizer.lemmatize(word).lower() for word in words
             if word.lower() not in self.stopwords]
    word_frequencies = FreqDist(words)
    most_frequent = [word[0] for word in word_frequencies.items()[:100]]

    # Now get sentences
    sentences = self.sent_detector.tokenize(article_text)

    wordcountdict = defaultdict(int)
    for word in most_frequent:
        lem_word = self.lemmatizer.lemmatize(word).lower()
        for i in range(0, len(sentences)):
            if lem_word in sentences[i]:
                wordcountdict[i] += 1
    sorted_wordcounts = sorted(wordcountdict.iteritems(), key=operator.itemgetter(1),
                               reverse=True)[:num_sentences]
    return [sentences[num] for num, count in sorted_wordcounts]
def analyze(search_path, config_file):
    """
    analyze PATH tags.json: update syntax to just take a path, ignoring tags
    """
    # load config file, if provided
    if config_file:
        cfg = load_cfg(config_file)
        if 'tags' not in cfg:
            cfg['tags'] = []
    else:
        cfg = {'tags': []}

    word_list = []
    search_glob = "{}/**".format(search_path)
    for filename in glob.iglob(search_glob, recursive=True):
        if os.path.isfile(filename):
            stem = pathlib.Path(filename).stem.lower()
            word_list += [token for token in re.split(r'\W', stem) if len(token) > 1]

    # remove stopwords and tags
    filtered_words = [word for word in word_list if word not in stopwords.words('english')]
    filtered_words = [word for word in filtered_words if word not in cfg['tags']]

    raw = " ".join(filtered_words)
    bag = nltk.word_tokenize(raw)
    freqdist = FreqDist(bag)
    words_sorted = sorted(freqdist.items(), key=lambda kv: (kv[1], kv[0]))
    top_words = words_sorted[-30:]
    top_words.reverse()
    for word in top_words:
        print("{1}: {0}".format(*word))
def palavrasChaves(self):
    # NLTK function that returns the stopwords for English
    stopE = stopwords.words('english')
    # NLTK function that returns the stopwords for Portuguese
    stop = stopwords.words('portuguese')
    stopS = stopwords.words('spanish')

    palavrasChaves = []
    textoArtigo = []

    # strip punctuation from the text and split it into words
    for i in self.titulo.lower().replace(',', '').replace('.', '').replace('-', '').replace('(', '').replace(')', '').split():
        # drop Portuguese stopwords from the text of the article being presented
        if i not in stop:
            # drop English stopwords from the text of the article being presented
            if i not in stopE:
                # drop Spanish stopwords, and ignore words shorter than 3 characters
                # (this handles words such as the verb "é")
                if i not in stopS:
                    if len(i) > 2:
                        textoArtigo.append(i)

    # frequency of word repetitions in the body of the article
    freq = FreqDist(textoArtigo)

    # take the four most frequent words
    items = freq.items()[:4]

    # put the most frequent words of the text into palavrasChaves
    for i in range(0, len(items)):
        palavrasChaves.append(items[i][0])

    return palavrasChaves
def top_words_from_corpus(self, num_words, test_name):
    corpus_tokens = []
    for i in self.corpus_vars["corpus_member_ids"]:
        title = 'document_' + str(i)
        doc_tokens = Library.document_instances[title].metadata["tokenized_doc"]
        corpus_tokens += doc_tokens

    top_words = []
    fdist_corpus = FreqDist(corpus_tokens)
    fdist_list = fdist_corpus.items()
    if test_name == "Function Word PCA":
        function_pos = ['IN', 'TO', 'CC', 'DT', 'PDT', 'WDT']
        for i in fdist_list:
            top_words.append(i[0])
            if len(top_words) == num_words:
                tagged_top = nltk.pos_tag(top_words)
                for j, k in tagged_top:
                    if k not in function_pos:
                        top_words.remove(j)
            if len(top_words) == num_words:
                break
    elif test_name == "Burrows's Delta":
        for i in fdist_list:
            top_words.append(i[0])
            if len(top_words) == num_words:
                break
    return top_words
def read_all():
    f = open(filename, "r")
    raw = f.read()

    # generate tokens with jieba
    tokens = jieba.lcut(raw)

    # load Chinese stop words
    stopwords = []
    cfp = open('stopwords.txt', 'r+')
    for line in cfp:
        for word in line.split():
            stopwords.append(word)
    cfp.close()

    # remove tokens that are Chinese stop words or noise characters
    wordlist_N = []
    for word in tokens:
        if word not in stopwords:
            if word != '\n' and word != '―' and word != ' ' and word != '\u200b' and word != '##':
                wordlist_N.append(word)

    # generate a frequency dictionary for wordlist_N
    freq = FreqDist(wordlist_N)

    # sort the frequency list in descending order
    sorted_freq = sorted(freq.items(), key=lambda k: k[1], reverse=True)

    # write the result into a .txt file
    with open('withoutstopwords.txt', 'w') as f:
        for line in sorted_freq:
            if line[1] > 5:
                f.write(str(line[0]) + '\t' + str(line[1]) + '\n')
    f.close()
def create_vocab(self):
    All_Contents = []
    i = 0
    for rest in self.corpus:
        #for hotel in self.corpus:
        print("loading file :" + str(i + 1))
        for review in rest.get("Reviews"):
            #print review
            s = []
            try:
                for v in parse_to_sentence(review.get('Content'), self.stopwords):
                    s = v + s
                All_Contents = All_Contents + s
            except:
                print 'parsing error'
        i = i + 1
    term_freq = FreqDist(All_Contents)
    Vocab = []
    Count = []
    VocabDict = {}
    for k, v in term_freq.items():
        if v > 5:
            Vocab.append(k)
            Count.append(v)
    self.Vocab = np.array(Vocab)[np.argsort(Vocab)].tolist()
    self.Count = np.array(Count)[np.argsort(Vocab)].tolist()
    self.VocabDict = dict(zip(self.Vocab, range(len(self.Vocab))))
def posAnalysis(collection):
    reviews = collection.find(timeout=False)
    __reportProgress.counter = 0
    skip = 1
    for rev in reviews:
        if skip % 200 == 0:
            print 'skip' + str(skip)
        __reportProgress()
        if rev.has_key('tags'):
            skip += 1
            if rev['tags'].has_key('NN'):
                continue
        sents = sent_tokenize(rev['text'])
        tokens = [word for sent in sents for word in word_tokenize(sent)]
        pos = tagger.tag([tok for tok in tokens if tok not in ',.-$\" '])
        tag_fd = FreqDist(tag for (word, tag) in pos)
        tags = dict()
        for (key, value) in tag_fd.items():
            k = key.replace('$', 'S')
            out = key.translate(string.maketrans("", ""), string.punctuation)
            if len(out) > 0:
                tags[k] = value
        collection.update({'_id': rev['_id']}, {"$set": {"tags": tags}})
def N_keyword_evolution_by_date(corpus, D, N, exclude):
    set_exclude = set(exclude)
    files = sorted([(date_from_file_name(f), f) for f in corpus.fileids()])
    delta = datetime.timedelta(days=D)
    lower_bound = files[0][0]
    upper_bound = files[0][0] + delta
    keywords = []
    i = 0
    while lower_bound <= files[-1][0]:
        text = []
        while i < len(files) and files[i][0] < upper_bound:
            new_file = corpus.words(files[i][1])
            for j in new_file:
                text.append(j.lower())
            i += 1
        else:
            fd = FreqDist(text)
            new = []
            sort = sorted(fd.items(), key=itemgetter(1), reverse=True)
            j = 0
            # bound on j added to avoid an IndexError when the window has
            # fewer than N candidate words
            while len(new) < N and j < len(sort):
                if not sort[j][0] in set_exclude:
                    new.append(sort[j][0])
                j += 1
            keywords.append(new)
        lower_bound = upper_bound
        upper_bound = upper_bound + delta
    return keywords
def extract_doc_feats_counts(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import pdb
    import numpy

    doc_num = len(refactorized_documents)
    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)
    tokens = glob_freqs.samples()

    glob_features = [None] * doc_num  # per-document feature vectors (missing in the original)
    for i in range(0, doc_num):
        doc_features = [0] * len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        for (tok, freq) in doc_freqs.items():
            indx = tokens.index(tok)
            doc_features[indx] = freq * doc_freqs.N()
        f_tmp = numpy.asarray(doc_features)
        glob_features[i] = f_tmp.tolist()
    return (glob_features, tokens)
def N_most_freq_words(corpus, N):
    tokens = read_all_corpus(corpus)
    fdist = FreqDist([token.lower() for token in tokens])
    return [a for a, b in sorted(fdist.items(), key=itemgetter(1), reverse=True)[:N]]
def __init__(self, tokens):
    frequency = FreqDist(tuple(trigrams(tokens)))
    self.trigram_freq = defaultdict(dict)
    for head_n_tail, num in frequency.items():
        head1, head2, tail = head_n_tail
        self.trigram_freq[(head1, head2)][tail] = num
def trigramCalc(data):
    trigram = ngrams(data, 3)
    freqDistTrigram = FreqDist(trigram)
    trigramCount = {}
    for k, v in freqDistTrigram.items():
        trigramCount[k[0] + " " + k[1] + " " + k[2]] = v
    return trigramCount
def count_bigrams(words):
    bigram_counts = []
    wordFreq = FreqDist(bigrams(words))
    for bigram, count in wordFreq.items():
        printable_bigram = (str(bigram[0]) + " " + str(bigram[1])).replace(',', ' ')
        bigram_counts.append((printable_bigram, count))
    return bigram_counts
def bigramCalc(data):
    bigram = ngrams(data, 2)
    freqDistBigram = FreqDist(bigram)
    bigramCount = {}
    for k, v in freqDistBigram.items():
        bigramCount[k[0] + " " + k[1]] = v
    return bigramCount
def unigramCalc(data):
    unigram = ngrams(data, 1)
    freqDistUnigram = FreqDist(unigram)
    unigramCount = {}
    for k, v in freqDistUnigram.items():
        unigramCount[k[0]] = v
    return unigramCount
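# Hedged usage sketch (added for illustration, not part of the original sources).
# Calls the unigramCalc / bigramCalc / trigramCalc helpers above on a pre-tokenized
# word list; assumes NLTK is installed (ngrams and FreqDist come from nltk).
from nltk import FreqDist
from nltk.util import ngrams

tokens = ["to", "be", "or", "not", "to", "be"]
print(unigramCalc(tokens))  # {'to': 2, 'be': 2, 'or': 1, 'not': 1}
print(bigramCalc(tokens))   # {'to be': 2, 'be or': 1, 'or not': 1, 'not to': 1}
print(trigramCalc(tokens))  # {'to be or': 1, 'be or not': 1, 'or not to': 1, 'not to be': 1}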
def get_features(document):
    document = re.sub('[%s]' % re.escape(string.punctuation), '', document)  # removes punctuation
    document = document.lower()  # make everything lowercase
    all_words = [w for w in word_tokenize(document) if len(w) > 3 and len(w) < 16]
    p = PorterStemmer()
    all_words = [p.stem(w) for w in all_words]
    all_words_freq = FreqDist(all_words)
    print sorted(all_words_freq.items(), key=lambda (w, c): (-c, w))
    return all_words_freq
def get_probs(filename):
    """Read the given text and calculate the probabilities for all symbols."""
    with open(filename) as file_in:
        text = file_in.read()
    probs = FreqDist(text)
    count_sum = sum(v for v in probs.values())
    for k, v in probs.items():
        probs[k] = v * 1.0 / count_sum
    return probs
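# Hedged usage sketch (added for illustration, not part of the original sources).
# Writes a tiny temporary file and prints per-character probabilities; assumes
# FreqDist is imported from nltk for get_probs above. File name and contents are made up.
import os
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write("abba")
    sample_path = tmp.name
char_probs = get_probs(sample_path)
print(char_probs["a"], char_probs["b"])  # 0.5 0.5
os.remove(sample_path)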
def __extract_level_words(self, levels_db, level, values):
    words_number_per_value = self.__configuration_map["most_frequent_words_number_per_value"]
    most_freq_words = {}
    for value in values:
        fdist = FreqDist()
        for word_dist in levels_db[level][value]:
            fdist.inc(word_dist[0], count=word_dist[1])
        most_freq_words[value] = fdist.items()[:words_number_per_value]
    return most_freq_words
def return_freq_types(list_types, list_words):
    """
    Returns the 10 most frequent non-stopword words and the 10 most frequent types.
    """
    fd = FreqDist(list_types)
    agglomerated = defaultdict(int)
    for w in list_words:
        if not w.lower() in STOPWORDS:
            agglomerated[w] += 1
    sorted_dict = sorted(agglomerated.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_dict[:10], [t for t, freq in fd.items()[:10]]
def process_report():
    tokenizer = RegexpTokenizer(r'\w+')
    f = open('scraper.txt', 'r')
    textfile = unicode(f.read(), errors='ignore')
    words = tokenizer.tokenize(textfile)
    filtered_words = [w for w in words if not w in stopwords.words('english')]
    fdist = FreqDist(filtered_words)
    with open("report.csv", "wb") as fp:
        writer = csv.writer(fp, quoting=csv.QUOTE_ALL)
        writer.writerows(fdist.items())
    return "Wrote report"
def keywords(self, query):
    final_text = u' '.join(query)
    resultKE = self.key_extractor._fetch_text(final_text)
    resultFE = self.bg_extractor._fetch_text(final_text)
    keywordsFE = [u' '.join(w).lower() for w in resultFE
                  for idx in range(self.bg_extractor.n_best)]
    keywordsFE += resultKE
    keywords = FreqDist(w.lower() for w in keywordsFE)
    return {'response': {'language': '', 'keywords': keywords.items(), 'text': query}}
def Bigrams(self, i, words):
    Frecs_BG = self.FrecArray_bigrams
    # < bigrams > #
    bgs = nltk.bigrams(words)
    word_frequencies = FreqDist(bgs)
    most_frequent = [word[0] for word in word_frequencies.items()]
    for w in most_frequent:
        W = " ".join(w)
        if not Frecs_BG.has_key(W):
            Frecs_BG[W] = []
        idpub_occs = i + "*" + str(word_frequencies[w])
        Frecs_BG[W].append(idpub_occs)
def Monograms(self, i, words):
    lmtzr = WordNetLemmatizer()
    Frecs_MG = self.FrecArray_monograms
    # < monograms > #
    words = [lmtzr.lemmatize(word) for word in words]
    word_frequencies = FreqDist(words)
    most_frequent = [word[0] for word in word_frequencies.items()]  # [:10]
    for w in most_frequent:
        if not Frecs_MG.has_key(w):
            Frecs_MG[w] = []
        idpub_occs = i + "*" + str(word_frequencies[w])
        Frecs_MG[w].append(idpub_occs)
def frequency(in_file, out_file):
    """Input: a text file.
    Output: a table of word frequency with three columns for Word, Count and Percent frequency.
    """
    text = unicode(open(in_file, 'r').read(), errors='ignore')
    words = nltk.word_tokenize(text)
    frequency = FreqDist(words)
    total = float(frequency.N())
    output = open(out_file, 'w')
    output.write("Word\tCount\tPercent\n")
    for pair in frequency.items():
        output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total))
    output.close()
def bestWords():
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    reviews = product_reviews_1.reviews()
    reviewlines = []
    for review in reviews:
        for line in review.review_lines:
            reviewlines.append(line)

    featlines = [line for line in reviewlines if len(line.features) > 0]
    pluswords = []
    minuswords = []
    for line in featlines:
        plus = False
        minus = False
        for feat in line.features:
            if feat[1][0] == "+":
                plus = True
            elif feat[1][0] == "-":
                minus = True
        if plus:
            for word in line.sent:
                pluswords.append(word)
        if minus:
            for word in line.sent:
                minuswords.append(word)

    for word in pluswords:
        word_fd[word.lower()] += 1
        label_word_fd['+'][word.lower()] += 1
    for word in minuswords:
        word_fd[word.lower()] += 1
        label_word_fd['-'][word.lower()] += 1

    pos_word_count = label_word_fd['+'].N()
    neg_word_count = label_word_fd['-'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['+'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['-'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=(lambda s: s[1]), reverse=True)[:515]
    return set([w for w, s in best])
def mapper(key, value):
    sentence = value.split()
    for (index, tagtuple) in enumerate(sentence):
        token, tag = get_token_tag(tagtuple)
        if we_like(token, tag):
            fd = FreqDist()
            token = token.lower()
            window = sentence[index + 1:index + 5]
            for windowtuple in window:
                wtoken, wtag = get_token_tag(windowtuple)
                if we_like(wtoken, wtag):
                    wtoken = wtoken.lower()
                    fd.inc(wtoken)
            yield token, tuple(fd.items())
def keywords(self, query):
    if query.startswith('www') or query.startswith('http'):
        text = self.html_extractor.extract(query)['response']['text']
        result = self._fetch_text(text)
        for r in result:
            print r
        keywords = result
        return {'response': {'language': '', 'keywords': keywords, 'text': query}}
    else:
        result = self._fetch_text(query)
        keywords = result
        keywords = FreqDist(w.lower() for w in keywords)
        return {'response': {'language': '', 'keywords': keywords.items(), 'text': query}}
def FreqDistHit(self):
    db = self.getHitSet()
    fdist = FreqDist(db)
    print "FreqDistVote. The number"
    print "========================"
    print len(fdist.items())
    print "\n\n"
    print "FreqDistVote. Top votes"
    print "========================"
    print repr(fdist.items()[:50])
    print "\n\n"
    print "FreqDistVote. Most votes"
    print "========================"
    print repr(sorted(fdist.keys()[-50:], reverse=True))
    print "\n\n"
    print "FreqDistVote. Most votes and frequence"
    print "======================================"
    print repr(sorted(fdist.items()[-50:], reverse=True))
    print "\n\n"
def get_trigrams_freqdist(tokens):
    tri_grams = trigrams(tokens)
    print 'Returned trigrams'
    freq_dist_trigrams = FreqDist(tri_grams)
    print freq_dist_trigrams.most_common(10)
    freq_dist_trigrams_new = dict()
    for item in freq_dist_trigrams.items():
        temp_str = item[0]
        temp_key = temp_str[0] + ' ' + temp_str[1] + ' ' + temp_str[2]
        freq_dist_trigrams_new[temp_key] = item[1]
    freq_dist_trigrams_new = OrderedDict(sorted(freq_dist_trigrams_new.items(),
                                                key=lambda x: x[1], reverse=True))
    return freq_dist_trigrams_new
def __call__(self, key, value):
    sent = value.split()
    for idx, tagged in enumerate(sent):
        token, tag = self.split_tagged(tagged)
        if self.valid(token, tag):
            dist = FreqDist()
            window = sent[idx + 1:idx + 5]
            for wtagged in window:
                wtoken, wtag = self.split_tagged(wtagged)
                if self.valid(wtoken, wtag):
                    dist.inc(wtoken)
            yield token, tuple(dist.items())
def freq(tokens, n=None, prints=None):
    '''
    Takes a list of tokens and returns a list of the top n most frequent tokens.
    Also plots a cumulative frequency distribution of the top 50 tokens.
    '''
    fdist2 = FreqDist(tokens)
    fdist2.plot(50, cumulative=True)
    if prints == 'yes':  # '==' instead of 'is': identity comparison of strings is unreliable
        if n is None:
            print fdist2.items()[:20]
            return [i[0] for i in fdist2.items()[:20]]
        else:
            print fdist2.items()[:n]
            return [i[0] for i in fdist2.items()[:n]]
    else:
        if n is None:
            return [i[0] for i in fdist2.items()[:20]]
        else:
            return [i[0] for i in fdist2.items()[:n]]