def alpha(self):
    """Krippendorff 1980"""
    # check for degenerate cases
    if len(self.K) == 0:
        raise ValueError("Cannot calculate alpha, no data present!")
    if len(self.K) == 1:
        log.debug("Only one annotation value, alpha returning 1.")
        return 1
    if len(self.C) == 1 and len(self.I) == 1:
        raise ValueError(
            "Cannot calculate alpha, only one coder and item present!")

    all_valid_labels_freq = FreqDist([])
    total_do = 0.0  # Total observed disagreement for all items.
    for i, itemdata in self._grouped_data("item"):
        label_freqs = FreqDist(x["labels"] for x in itemdata)
        labels_count = sum(label_freqs.values())
        if labels_count < 2:
            # Ignore the item.
            continue
        all_valid_labels_freq += label_freqs
        total_do += self.Disagreement(label_freqs) * labels_count

    do = total_do / sum(all_valid_labels_freq.values())
    de = self.Disagreement(all_valid_labels_freq)  # Expected disagreement.
    k_alpha = 1.0 - do / de
    return k_alpha
def char_freq(lines):
    """Return a DataFrame of character frequencies, sorted in descending order."""
    # nltk.Text expects a flat sequence of tokens, not a list of strings
    corpus = nltk.Text(chain.from_iterable(lines))
    wc = FreqDist(corpus)
    df = pd.DataFrame({'word': list(wc.keys()), 'freq': list(wc.values())})
    # DataFrame.sort() was removed in newer pandas; use sort_values() instead
    df.sort_values('freq', ascending=False, inplace=True)
    df['idx'] = np.arange(len(wc))
    return df
def getFreq(self, text, normalize=True):
    stop_words = stopwords.words(self.detectLanguage(text))
    words = self.getTokens(text)
    clean_words = filter(
        lambda word: word not in stop_words and word not in punctuation,
        words)
    fdist = FreqDist(clean_words)
    # An equivalent approach would be to build the FreqDist manually:
    #   fdist = FreqDist()
    #   for word in word_tokenize(text):
    #       word = word.lower()
    #       if word not in stop_words and word not in punctuation:
    #           fdist[word] += 1
    # normalization by dividing by the maximum frequency
    if normalize:
        norm = float(max(fdist.values()))
        # iterate over a copy of the keys so entries can be deleted safely
        for word in list(fdist.keys()):
            fdist[word] = fdist[word] / norm
            # remove too frequent and too rare words
            if fdist[word] >= self._upper_bound or fdist[word] <= self._lower_bound:
                del fdist[word]
    return fdist
def entropy(alist):
    f = FreqDist(alist)
    ent = (-1) * sum([
        float(i) / len(alist) * math.log(float(i) / len(alist))
        for i in f.values()
    ])
    return ent
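# Minimal usage sketch for entropy() above (my own toy data, not from the source);
# it assumes the same FreqDist/math imports as the snippet. Because the natural
# logarithm is used, the result is measured in nats.
sample_tokens = ["a", "b", "b", "c"]
print(entropy(sample_tokens))  # ~1.04 nats for this toy distribution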
def get_top_words(directory, n, file):
    num_docs = 0.0
    flist = {}
    result = {}
    for f in os.listdir(directory):
        num_docs += 1
        rawContents = load_file_tokens(directory + '/' + f)
        fdist = FreqDist(rawContents)
        # normalize each term frequency by the maximum frequency in the document
        normalF = max(fdist.values())
        for key in fdist.keys():
            fdist[key] = float(fdist[key]) / normalF
        flist[directory + '/' + f] = fdist

    # tf-idf: weight each term of the target file by log(N / document frequency)
    for key in flist[file].keys():
        num_appear = 0
        for key_file in flist.keys():
            if key in flist[key_file]:
                num_appear += 1
        result[key] = flist[file][key] * math.log(num_docs / num_appear)

    # items() replaces the Python 2-only iteritems()
    sorted_x = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
    top_x = sorted_x[:n]
    result = []
    for item in top_x:
        result.append(item[0])
    return result
def main(self):
    TABLE = "article"
    cats = [
        'Economy', 'Art', 'Climate', 'Crime', 'Health', 'Politics',
        'Religion', 'Science', 'Sport', 'Tech'
    ]
    config = {
        "host": 'localhost',
        "user": '******',
        "password": '******',
        "db": 'mdac'
    }
    self.db_connect(config)
    print(u"processing records")
    self.corpora = dict()
    for cat in cats:
        bag = dict()
        for article in self.select(TABLE, cat):  # returns a tuple; [0] is the result
            article = purifier.purify(article[0])
            words = classic_tokenizer.tokenize(article)
            bag = FreqDist(words)
        self.corpora.update({cat: bag})
        print(u"Words of Cat:[{}] are: ({})".format(cat, str(len(bag.values()))))
def freq_func(input_text):
    # take tokenized text as input and return its word frequencies
    corpus = nltk.Text(input_text)
    fdist = FreqDist(corpus)
    w = list(fdist.keys())
    v = list(fdist.values())
    freqpd = pd.DataFrame({'word': w, 'freq': v})
    freqpd.sort_values(by='freq', ascending=False, inplace=True)
    freqpd['idx'] = np.arange(len(v))
    return freqpd
def freq_func(input_txt):
    corpus = nltk.Text(input_txt)
    fdist = FreqDist(corpus)
    w = list(fdist.keys())
    v = list(fdist.values())
    freqdf = pd.DataFrame({'word': w, 'freq': v})
    # DataFrame.sort() was removed in newer pandas; use sort_values() instead
    freqdf.sort_values('freq', ascending=False, inplace=True)
    freqdf['idx'] = np.arange(len(v))
    return freqdf
def word_count_func(data):
    # use the process_tweet function to tokenize, lowercase and remove stopwords
    process_data = list(map(process_tweet, data))

    # lemmatize the words
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    # process_data is a list of lists - loop over each list and then each word in it
    lemmatizer_tweets = []
    for l in process_data:
        new_row = []
        for w in l:
            new_row.append(lemmatizer.lemmatize(w))
        lemmatizer_tweets.append(new_row)

    # descriptive info - calculate the unique vocabulary of the subset
    overall_lem_vocab = set()
    for tweet in lemmatizer_tweets:
        overall_lem_vocab.update(tweet)
    print(f'Overall vocab of subset: {len(overall_lem_vocab)}')

    # flatten the lemmatized tweets (list of lists -> one list) for the frequency count
    flat_lemmatizer_tweets = [
        item for sublist in lemmatizer_tweets for item in sublist
    ]

    # apply NLTK's FreqDist to the flat list
    lem_freq = FreqDist(flat_lemmatizer_tweets)
    print('30 most common words in subset:')
    print(lem_freq.most_common(30))

    # report normalized word frequencies because the subsets have different N's
    total_words = sum(lem_freq.values())
    top_30 = lem_freq.most_common(30)
    print("Word \t\t Normalized Frequency")
    print()
    for word in top_30:
        normalized_frequency = word[1] / total_words
        print("{} \t\t {:.4}".format(word[0], normalized_frequency))

    # create a word cloud - the input is a dict mapping word -> number of occurrences
    word_dict = dict(top_30)
    from wordcloud import WordCloud
    wordcloud = WordCloud(colormap='Spectral').generate_from_frequencies(word_dict)

    # display the generated image with matplotlib
    plt.figure(figsize=(10, 10), facecolor='k')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
def yules_k(text):
    """Return Yule's K characteristic of the text.

    Keyword arguments:
    text: text
    """
    word_list = nltk.tokenize.word_tokenize(text)
    s1 = len(word_list)
    word_freq_dist = FreqDist(word_list)
    s2 = sum([freq ** 2 for freq in word_freq_dist.values()])
    K = 10000 * (s2 - s1) / (s1 ** 2)
    return K
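# Illustrative call of yules_k() above (toy sentence of my own, not from the
# source). Higher K indicates more repetition / lower vocabulary richness; for
# this sentence s1 = 13 tokens and K works out to roughly 947.
print(yules_k("the cat sat on the mat and the dog sat on the rug"))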
def createPDwithTeleport(readerWordlist, mergedWordList):
    # teleportation parameter with a value of 1 percent
    corpusPD = {}
    readerPD = {}
    unigramReaderWordList = FreqDist(readerWordlist)
    unigramCorpusWordList = FreqDist(mergedWordList)
    corpus_total = float(sum(unigramCorpusWordList.values()))
    reader_total = float(sum(unigramReaderWordList.values()))
    for word in unigramCorpusWordList.keys():
        corpusPD[word] = unigramCorpusWordList[word] / corpus_total
        if word in unigramReaderWordList:
            readerPD[word] = unigramReaderWordList[word] / reader_total
        else:
            readerPD[word] = 0
        # mix 99% of the reader distribution with 1% of the corpus distribution
        readerPD[word] = 0.99 * readerPD[word] + 0.01 * corpusPD[word]
    return readerPD
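# Toy example (my own word lists) showing how the 1% teleport mass keeps words
# the reader never used from getting zero probability.
reader = ["cats", "purr"]
corpus = ["cats", "purr", "dogs", "bark"]
pd_smoothed = createPDwithTeleport(reader, corpus)
print(pd_smoothed["dogs"])  # 0.01 * 0.25 = 0.0025 rather than 0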
def wordprefixsuffixsubstringsprobdist():
    for w in englishdicttxt:
        wtok = w.split()
        if len(wtok) > 0:
            computeprefixessuffixessubstrings(wtok[0])
            wordlist.append(wtok[0])
    prefixdict = FreqDist(prefixes)
    suffixdict = FreqDist(suffixes)
    # NOTE: this reuses the suffixes list; it presumably should be built from the
    # substrings collected by computeprefixessuffixessubstrings()
    substringsdict = FreqDist(suffixes)
    totalprefixes = sum(prefixdict.values())
    totalsuffixes = sum(suffixdict.values())
    totalsubstrings = sum(substringsdict.values())
    for pk, pv in prefixdict.items():
        prefixprobdict[pk] = float(pv) / float(totalprefixes)
    for pk, pv in suffixdict.items():
        suffixprobdict[pk] = float(pv) / float(totalsuffixes)
    for pk, pv in substringsdict.items():
        substringsprobdict[pk] = float(pv) / float(totalsubstrings)
    return (prefixprobdict, suffixprobdict, substringsprobdict)
def freq_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({'word': list(fdist.keys()),
                             'count': list(fdist.values())})
    # selecting the `terms` most frequent words and plotting them
    d = words_df.nlargest(columns="count", n=terms)
    plt.figure(figsize=(20, 5))
    ax = sns.barplot(data=d, x="word", y="count")
    ax.set(ylabel='Count')
    plt.show()
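# Example call for freq_words() above (sample documents are my own; the pandas,
# seaborn and matplotlib imports are assumed as in the snippet). `x` is expected
# to be an iterable of whitespace-separated document strings.
docs = ["the quick brown fox", "the lazy dog", "the fox jumps"]
freq_words(docs, terms=5)  # bar chart of the 5 most frequent words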
def remove_low_frequent_words(texts):
    """
    Function to remove low frequent words from texts
    """
    utils.log("Doc2Vec", "Remove low frequent words...")
    dictionary = FreqDist([item for sublist in texts for item in sublist])
    word_frequencies = list(dictionary.values())
    low_word_frequency_quantile = np.percentile(np.array(word_frequencies),
                                                LOW_WORD_FREQUENCY_QUANTILE)
    return [[
        word for word in text
        if dictionary[word] >= low_word_frequency_quantile
    ] for text in tqdm(texts)]
def plot_dist_productions_by_frequency(productions):
    f = FreqDist(productions)
    fdd = FreqDist(f.values())
    x = []
    y = []
    for k in fdd.keys():
        x.append(k)
        y.append(fdd[k])
    plt.plot(x, y, lw=2, color='b')
    plt.title('Productions by frequency')
    plt.xlabel('frequency')
    plt.ylabel('number of rules with frequency')
    plt.show()
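# Illustrative call with hand-made grammar productions (my own toy data, not
# from the source): the plot shows, for each frequency value, how many distinct
# rules occur that often.
toy_productions = ["S -> NP VP", "S -> NP VP", "S -> NP VP",
                   "NP -> DT NN", "NP -> DT NN", "VP -> V NP"]
plot_dist_productions_by_frequency(toy_productions)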
def _termInfo(self):
    info = []
    rawInfo = []
    # lemma frequency in referencedLemmas
    try:
        for bData in self.referencedLemmas:
            if self.lemma in bData:
                fdist = FreqDist(bData)
                freq = fdist[self.lemma]
                rawInfo.append(freq)
                lenTokenList = len(bData)
                if self.useWdfIdf:
                    metric = math.log(freq * 1.0 + 1.0, 2) / math.log(
                        lenTokenList + 1, 2)
                else:
                    metric = freq
                if DISPLAY:
                    app_logger.info(
                        u'[%s] Apariciones: %s Len: %s Max: %s Metric: %s' %
                        (self.lemma, fdist[self.lemma], len(bData),
                         max(fdist.values()), metric))
                info.append(metric)

        # lemma frequency in textLemmas
        freq = self.fdistLemmas[self.lemma]
        _lowerLimit, _median, upperLimit = getMedianDistributionInfo(rawInfo)
        self.rawScore = int(freq)
        self.upperLimit = max(settings.MANDATORY_TOKEN_MIN_QUANTITY,
                              int(upperLimit))
        lenTokenList = len(self.textLemmas)
        if self.useWdfIdf:
            termFreq = math.log(freq * 1.0 + 1.0, 2) / math.log(
                lenTokenList + 1, 2)
        else:
            termFreq = freq

        # mean/sigma of the lemma across referencedLemmas
        lowerLimit, _median, upperLimit = getMedianDistributionInfo(info)
        if not self.useWdfIdf:
            lowerLimit = math.ceil(lowerLimit)
            upperLimit = math.ceil(upperLimit)
        return termFreq, lowerLimit, upperLimit
    except Exception as ex:
        raise ex
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    token_count = hit_count = 0
    useful_contexts = set()
    fd = ConditionalFreqDist()
    tag_prob = FreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event. (FreqDist.inc() was removed in NLTK 3;
            # incrementing the counter directly is the equivalent.)
            token_count += 1
            tag_prob[tag] += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # For each context, calculate the entropy.  Only keep contexts whose
    # entropy is lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t, tag_prob[t] / total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
        # total_tags = float(sum(dd.values()))
        # tags_probs = [(t, dd[t] / total_tags) for t in dd.keys()]
        h = self.H(dd.keys(), tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print(list(most_high.keys()))

    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits

    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=" ")
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning))
def add_description_text_analysis(data):
    print("Adding description text analysis...")
    d = data.description
    d_words = d.apply(word_tokenize)
    d_words_count = pd.Series(d_words.apply(len))
    d_words_count.reset_index(d.index)
    d_words_count.rename("word_count", inplace=True)

    content = " ".join(d)
    distr = FreqDist(word_tokenize(content))
    distr_len = float(len(distr.values()))
    word_freqs = d_words.apply(lambda x: [distr[z] / distr_len for z in x])
    data['description_diversity'] = word_freqs.apply(np.mean)  # this introduces NaNs
    return data.join(d_words_count)
def get_pos_entropy(all_tokens):
    """Get part-of-speech entropy."""
    # Get all pos tags
    pos = [t.sim_pos for t in all_tokens]
    # Get frequencies
    pos_dist = FreqDist(pos)
    values = list(pos_dist.values())
    # Get probability array
    prob_array = np.array(values)
    prob_array_norm = prob_array / sum(prob_array)
    # Compute entropy
    entropy = np.sum(-1 * (prob_array_norm) *
                     np.nan_to_num(np.log2(prob_array_norm)))
    return entropy
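# Hedged usage sketch for get_pos_entropy() above: the function only assumes
# each token exposes a `sim_pos` attribute, so a tiny stand-in class (my own,
# not from the source) is enough to exercise it.
from collections import namedtuple
Token = namedtuple("Token", ["sim_pos"])  # hypothetical stand-in token type
sample = [Token("NOUN"), Token("VERB"), Token("NOUN"), Token("DET")]
print(get_pos_entropy(sample))  # 1.5 bits for this toy tag distribution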
def filterTokens(tokens, typefeatures=None):
    all_terms = FreqDist(tokens)
    if typefeatures == 'unigrams':
        minimal = 2
    elif typefeatures == 'bigrams':
        minimal = 2
    else:
        minimal = 1
    other = FreqDist()
    # most_common() yields terms in descending frequency order (the NLTK 2 code
    # relied on keys()/values() being frequency-sorted and on FreqDist.inc(),
    # both of which are gone in NLTK 3), so we can stop at the first term that
    # falls below the threshold.
    for term, freq in all_terms.most_common():
        if freq >= minimal:
            other[term] += freq
        else:
            break
    return other
def maxTF(text, normalize=True):
    lang = detectLanguage(text)
    stop_words = stopwords.words(lang) + [i for i in punctuation]
    words = simple_preprocess(text)
    clean_words = filter(lambda word: word not in stop_words, words)
    fdist = FreqDist(clean_words)
    # Maximum tf normalization, see:
    # http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
    if normalize:
        norm = float(max(fdist.values()))
        a = 0.5
        # iterate over a copy of the keys so entries can be deleted safely
        for word in list(fdist.keys()):
            fdist[word] = a + (1 - a) * (fdist[word] / norm)
            # remove too frequent and too rare words
            if fdist[word] >= 0.9 or fdist[word] <= 0.1:
                del fdist[word]
    return fdist
def common_words(self, wfilter, n_words):
    self.filter = wfilter
    self.n_words = n_words
    all_words = ' '.join([text for text in wfilter])
    all_words = all_words.split()

    # get word frequency
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })  # convert to a DataFrame

    # select the n_words most frequent words (plotting left commented out)
    d = words_df.nlargest(columns="count", n=self.n_words)
    # plt.figure(figsize=(20, 5))
    # ax = sns.barplot(data=d, x="word", y="count")
    # ax.set(ylabel='Count')
    # plt.show()
    return d
def getFreq(text, normalize=True):
    try:
        stop_words = stopwords.words(detectLanguage(text))
    except LookupError:
        import nltk
        nltk.download('stopwords')
        stop_words = stopwords.words(detectLanguage(text))
    words = getTokens(text)
    clean_words = filter(
        lambda word: word not in stop_words and word not in punctuation,
        words)
    fdist = FreqDist(clean_words)
    # normalization by dividing by the maximum frequency
    if normalize:
        norm = float(max(fdist.values()))
        # iterate over a copy of the keys so entries can be deleted safely
        for word in list(fdist.keys()):
            fdist[word] = fdist[word] / norm
            # remove too frequent and too rare words
            if fdist[word] >= upper_bound or fdist[word] <= lower_bound:
                del fdist[word]
    return fdist
def get_buzzwords(docs):
    buzzwords = []
    for doc in docs:
        freqdist = FreqDist(docs[doc])
        # take the 50 most frequent terms of each document; in NLTK 3 keys() is
        # neither sliceable nor frequency-ordered, so use most_common() instead
        buzzwords = buzzwords + [word for word, _ in freqdist.most_common(50)]
    buzzwords = set(buzzwords)
    freq_counts = {}
    for buzzword in buzzwords:
        print(buzzword)
        l = []
        for doc in docs:
            freqdist = FreqDist(docs[doc])
            t = (doc, freqdist[buzzword])
            l.append(t)
        freq_counts[buzzword] = l
    dump_content('freqs', freq_counts)
    return freq_counts
def txt_summry(text_data):
    text_data = text_data.lower()
    tokens = nltk.word_tokenize(text_data)
    fdist = FreqDist(tokens)
    maxfreq = max(fdist.values())
    for word in fdist:
        fdist[word] = (fdist[word] / maxfreq)

    sentence_list = nltk.sent_tokenize(text_data)
    sentence_scores = {}
    for sent in sentence_list:
        # consider each word in the sentence, in lowercase
        for word in nltk.word_tokenize(sent.lower()):
            # Check that the word exists in the frequency distribution.  This guard
            # matters when the frequencies are computed from a cleaned version of the
            # text (stop words, numbers, etc. removed) while the sentences come from
            # the raw text.
            if word in fdist.keys():
                # consider only sentences with fewer than 30 words
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        # first scored word of the sentence: initialise with its frequency
                        sentence_scores[sent] = fdist[word]
                    else:
                        # subsequent words: add their frequency to the sentence score
                        sentence_scores[sent] += fdist[word]

    # gather the 3 highest-scoring sentences into a list
    summary_sentences = heapq.nlargest(3, sentence_scores,
                                       key=sentence_scores.get)
    # join the sentences into a printable format
    summary = ''.join(summary_sentences)
    # print the generated summary
    print("Summarised version of the article: ")
    print()
    print(summary)
    return summary
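# Example invocation of txt_summry() above (my own toy text, not from the
# source): any multi-sentence string works, and the three highest-scoring
# sentences come back as the summary.
article = ("Cats are small domesticated mammals. They are valued for companionship. "
           "Cats hunt rodents. Many households keep cats as pets. "
           "Some cats live outdoors.")
txt_summry(article)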
# Interactive exploration of a FreqDist (one expression per line, as typed in a REPL session):
word_tokenize(text)
re.sub('\W', '', text)
re.sub('[^\w ]', '', text)
re.sub('[^\w ']', '', text)   # typo: unbalanced quote, raises SyntaxError
re.sub('[^\w \']', '', text)
nltk.bigrams(text)
big = nltk.bigrams(text)
next(big)
nltk.word_tokenize(text)
text.similar
fdist
fdist['delicious']
dir(fdist)
fdist.max
fdist.values
fdist.values()
fdist.values().sum()
sum(fdist.values())
fdist['delicious'] / sum(fdist.values())
fdist['disgusting'] / sum(fdist.values())
fdist['disgusting']
fdist['vegetarian']
fdist['old-timey']
fdist['healthy']
fdist['expensive']
print text
print(text)
fdist.freq('delicious')
fdist.freq('delicnotehu')
fdist.N()
fdist?
for r_list in tokenized:
    for word in r_list:
        freq_string = freq_string + word + " "

oneString_tokenize = nltk.word_tokenize(freq_string)
oneString_distribution = FreqDist(oneString_tokenize)
mostcommon_words = oneString_distribution.most_common(10)

keys = []
vals = []
for key in oneString_distribution.keys():
    keys.append(key)
for val in oneString_distribution.values():
    vals.append(val)

# Plotting charts
plt.figure(figsize=(80, 3))
plt.bar(keys, vals)
plt.title("Distribution of a sentence")
plt.xticks(rotation='vertical')
plt.ylabel("Counts ")
plt.xlabel("words")
plt.show()

# higher than 10 letters
print("Words which have more than 10 letters")
print("*************************************")
for i in tokenized:
class StylometryExtractor:
    DALE_CHALL_WORDS = _load_dale_chall_words()
    TOKENIZER = RegexpTokenizer(r"\w+'\w+|\w+")
    SPECIAL_CHAR = '@<:@'

    def __init__(self, text):
        self.raw_text = text
        self.raw_text_length = len(text)
        self.number_of_letters = len(
            [x for x in self.raw_text if x.isalpha() or x.isdigit()])
        self.words = StylometryExtractor.TOKENIZER.tokenize(self.raw_text)
        self.tokens = word_tokenize(self.raw_text)
        self.number_of_words = len(self.words)
        self.number_of_tokens = len(self.tokens)
        # self.text = Text(word_tokenize(self.raw_text))
        self.words_frequency = FreqDist(Text(self.words))
        self.tokens_frequency = FreqDist(Text(self.tokens))
        self.chars_counter = FreqDist(self.raw_text)
        self.lemmatizer = WordNetLemmatizer()
        self.lemmatized_words_frequency = FreqDist(
            Text([self.lemmatizer.lemmatize(word) for word in self.words]))
        self.sentences = sent_tokenize(self.raw_text)
        self.number_of_sentences = len(self.sentences)
        self.sentence_chars = [len(sent) for sent in self.sentences]
        self.sentence_word_length = [len(sent.split()) for sent in self.sentences]
        self.paragraphs = [
            p for p in self.raw_text.split("\n\n")
            if len(p) > 0 and not p.isspace()
        ]
        self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]
        self.all_trigrams = self._all_trigrams()
        self.all_fourgrams = self._all_fourgrams()
        self.ngram_string = self._to_ngram_string()
        self.features = self._to_dict()
        self.feature_names = list(self.features.keys())

    def _to_ngram_string(self):
        cleared_text = ' '.join([
            word for word in self.words
            if word not in stopwords.words('english')
        ])
        return StylometryExtractor.SPECIAL_CHAR.join(
            ''.join(ngram) for ngram in ngrams(cleared_text, 4)
            if ' ' not in ngram and '\n' not in ngram)

    def word_per_thousand(self, word):
        return self.words_frequency[word] * 1000 / self.words_frequency.N()

    def token_per_thousand(self, token):
        return self.tokens_frequency[token] * 1000 / self.tokens_frequency.N()

    def char_per_thousand(self, char):
        return self.chars_counter.freq(char) * 1000

    def chars_per_thousand(self, chars):
        return sum([self.char_per_thousand(char) for char in chars])

    def special_chars_per_thousand(self, special_chars):
        count = self.chars_counter.N()
        for char in special_chars:
            count -= self.chars_counter[char]
        return count / self.chars_counter.N() * 1000

    def upper_chars_per_thousand(self):
        return len(re.findall(r'[A-Z]', self.raw_text)) / self.raw_text_length * 1000

    def spaces_per_thousand(self):
        return len([x for x in self.raw_text if x.isspace()]) / self.raw_text_length * 1000

    def has_urls(self):
        return int(
            bool(
                re.search('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+',
                          self.raw_text)))

    def syllables_per_thousand(self):
        return self.get_number_syllables() / self.raw_text_length * 1000

    def get_number_syllables(self):
        dic = pyphen.Pyphen(lang='en')
        return sum([len(dic.inserted(word).split("-")) for word in self.words])

    def get_number_pollisyllable_words(self):
        dic = pyphen.Pyphen(lang='en')
        return len([
            word for word in self.words
            if len(dic.inserted(word).split("-")) >= 3
        ])

    def get_words_longer_than_X(self, x):
        return len([word for word in self.words if len(word) >= x])

    def mean_of_syllables_per_word(self):
        return self.get_number_syllables() / self.number_of_words

    def num_of_words_with_more_than_three_syllables_per_thousand(self):
        return self.get_number_pollisyllable_words() / self.number_of_words * 1000

    def get_flesch_reading_ease(self):
        # http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
        """
        90.0-100.0 - easily understood by an average 11-year-old student
        60.0-70.0  - easily understood by 13- to 15-year-old students
        0.0-30.0   - best understood by university graduates
        """
        return 206.835 - 1.015 * self.number_of_words / self.number_of_sentences \
            - 84.6 * self.get_number_syllables() / self.number_of_words

    def flesch_kincaid_grade_level(self):
        # http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
        """
        It is more or less the number of years of education generally required to
        understand this text.  The lowest grade level score in theory is -3.40.
        """
        return 0.39 * self.number_of_words / self.number_of_sentences \
            + 11.8 * self.get_number_syllables() / self.number_of_words - 15.59

    def get_coleman_liau_index(self):
        # http://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
        """
        It approximates the U.S. grade level thought necessary to comprehend the text.
        """
        return 5.89 * self.number_of_letters / self.number_of_words \
            - 29.6 * self.number_of_sentences / self.number_of_words - 15.8

    def get_gunning_fog_index(self):
        # http://en.wikipedia.org/wiki/Gunning_fog_index
        """
        The index estimates the years of formal education needed to understand the
        text on a first reading.
        """
        return 0.4 * (self.number_of_words / self.number_of_sentences +
                      100.0 * self.get_number_pollisyllable_words() / self.number_of_words)

    def get_smog_index(self):
        # http://en.wikipedia.org/wiki/SMOG
        """
        Simple Measure of Gobbledygook (SMOG) is a simplification of Gunning Fog,
        also estimating the years of formal education needed to understand a text.
        """
        return 1.043 * math.sqrt(self.get_number_pollisyllable_words() * 30.0 /
                                 self.number_of_sentences) + 3.1291

    def get_ari_index(self):
        # http://en.wikipedia.org/wiki/Automated_Readability_Index
        """
        It produces an approximate representation of the US grade level needed to
        comprehend the text.
        """
        return 4.71 * self.number_of_letters / self.number_of_words \
            + 0.5 * self.number_of_words / self.number_of_sentences - 21.43

    def get_lix_index(self):
        # http://en.wikipedia.org/wiki/LIX
        # http://www.readabilityformulas.com/the-LIX-readability-formula.php
        """
        Value interpretation:
        Very Easy      - 20, 25
        Easy           - 30, 35
        Medium         - 40, 45
        Difficult      - 50, 55
        Very Difficult - 60+
        """
        long_words = self.get_words_longer_than_X(6)
        number_of_periods = self.number_of_sentences \
            + self.tokens_frequency[':'] + self.tokens_frequency[';']
        return self.number_of_words / number_of_periods \
            + 100.0 * long_words / self.number_of_words

    def number_of_dale_chall_difficult_words(self):
        return len([
            word for word in self.words
            if word not in StylometryExtractor.DALE_CHALL_WORDS
        ])

    def get_dale_chall_score(self):
        # http://en.wikipedia.org/wiki/Dale%E2%80%93Chall_readability_formula
        """
        4.9 or lower   --- easily understood by an average 4th-grade student or lower
        5.0-5.9        --- easily understood by an average 5th or 6th-grade student
        6.0-6.9        --- easily understood by an average 7th or 8th-grade student
        7.0-7.9        --- easily understood by an average 9th or 10th-grade student
        8.0-8.9        --- easily understood by an average 11th or 12th-grade student
        9.0-9.9        --- easily understood by an average 13th to 15th-grade (college) student
        10.0 or higher --- easily understood by an average college graduate
        """
        return 15.79 * self.number_of_dale_chall_difficult_words() / self.number_of_words \
            + 0.0496 * self.number_of_words / self.number_of_sentences

    def get_dale_chall_known_fraction(self):
        """
        Computes the fraction of easy words in the text, i.e., the fraction of words
        that could be found in the Dale-Chall list of 3,000 easy words.
        """
        return 1.0 - self.number_of_dale_chall_difficult_words() / self.number_of_words

    def yule_vocabulary_richness(self):
        M2 = sum([
            len(list(g)) * (freq ** 2)
            for freq, g in groupby(sorted(self.lemmatized_words_frequency.values()))
        ])
        M1 = float(sum(self.lemmatized_words_frequency.values()))
        return ((M2 - M1) / (M1 * M1)) * 10000

    def simpson_vocabulary_richness(self):
        result = 0
        for freq, g in groupby(sorted(self.lemmatized_words_frequency.values())):
            result += (len(list(g))) * freq * (freq - 1)
        n = sum(self.lemmatized_words_frequency.values())
        return (result / n / (n - 1))

    def mean_sentence_len(self):
        return np.mean(self.sentence_word_length)

    def std_sentence_len(self):
        return np.std(self.sentence_word_length)

    def mean_paragraph_len(self):
        return np.mean(self.paragraph_word_length)

    def std_paragraph_len(self):
        return np.std(self.paragraph_word_length)

    def mean_word_len(self):
        word_chars = [len(word) for word in self.words]
        return sum(word_chars) / len(word_chars)

    def unique_words_ratio(self):
        return len(set(self.words)) / self.number_of_words * 100

    # def get_byte_ngrams(self, number_of_bytes):

    @classmethod
    def to_pos_tags(cls, sentence):
        tokens = StylometryExtractor.TOKENIZER.tokenize(sentence)
        pos_tags = list(map(lambda x: x[1], pos_tag(tokens)))
        return ['__START__'] + pos_tags + ['__END__']

    @classmethod
    def pos_tag_trigrams(cls, sentence):
        pos_tags = StylometryExtractor.to_pos_tags(sentence)
        return [(x, y, z) for x, y, z in zip(pos_tags, pos_tags[1:], pos_tags[2:])]

    @classmethod
    def pos_tag_fourgrams(cls, sentence):
        pos_tags = StylometryExtractor.to_pos_tags(sentence)
        return [(p, l, m, n) for p, l, m, n in
                zip(pos_tags, pos_tags[1:], pos_tags[2:], pos_tags[3:])]

    def _all_trigrams(self):
        return Counter(
            trigram for sentence in self.sentences
            for trigram in StylometryExtractor.pos_tag_trigrams(sentence))

    def _all_fourgrams(self):
        return Counter(
            fourgram for sentence in self.sentences
            for fourgram in StylometryExtractor.pos_tag_fourgrams(sentence))

    def pos_tag_trigrams_percents(self):
        number_of_trigrams = sum(self.all_trigrams.values())
        return {
            '_'.join(trigram): self.all_trigrams[trigram] / number_of_trigrams * 1000
            for trigram in MOST_COMMON_POS_TAG_TRIGRAMS
        }

    def pos_tag_fourgrams_percents(self):
        number_of_fourgrams = sum(self.all_fourgrams.values())
        return {
            '_'.join(fourgram): self.all_fourgrams[fourgram] / number_of_fourgrams * 1000
            for fourgram in MOST_COMMON_POS_TAG_FOURGRAMS
        }

    def char_ngrams_tf_idf(self):
        return dict(
            zip(VECTORIZER.get_feature_names(),
                VECTORIZER.transform([self.ngram_string]).toarray()[0]))

    def to_dict(self):
        return self.features

    def to_vector(self):
        return list(self.features.values())

    def _to_dict(self):
        features = {
            'Lexical diversity': self.unique_words_ratio(),
            'Mean Word Length': self.mean_word_len(),
            'Mean Sentence Length': self.mean_sentence_len(),
            'STDEV Sentence Length': self.std_sentence_len(),
            'Mean paragraph Length': self.mean_paragraph_len(),
            'Flesch Reading Ease': self.get_flesch_reading_ease(),
            'Flesch Kincaid Grade': self.flesch_kincaid_grade_level(),
            'Coleman Liau Index': self.get_coleman_liau_index(),
            'Gunning Fog Index': self.get_gunning_fog_index(),
            'Smog Index': self.get_smog_index(),
            'Ari Index': self.get_ari_index(),
            'Lix Index': self.get_lix_index(),
            'Dale Chall Score': self.get_dale_chall_score(),
            'Dale Chall Known Fraction': self.get_dale_chall_known_fraction(),
            'Yule Vocabulary Richness': self.yule_vocabulary_richness(),
            'Simpson Vocabulary Richness': self.simpson_vocabulary_richness(),
            'Punctuation': self.chars_per_thousand(['.', ',', '!', ';', '?']),
            'Special characters': self.chars_per_thousand([
                '%', '#', ')', '(', '@', '$', '^', '&', '>', '<', '*', '_',
                '-', '=', '-', '+', '/', '\\', '\'', '`'
            ]),
            'Even more special characters': self.special_chars_per_thousand([
                "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
                "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
                "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
                "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
                "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
                '%', '#', ')', '(', '@', '$', '^', '&', '>', '<', '*', '_',
                '-', '=', '-', '+', '/', '\\', "'", '`', '"', '\n', '\r', ' ',
                '.', ',', '!', ';', '?', '[', ']', '{', '}', '\t', ':'
            ]),
            'Commas': self.token_per_thousand(','),
            'Semicolons': self.token_per_thousand(';'),
            'Quotations': self.token_per_thousand('"'),
            'Exclamations': self.token_per_thousand('!'),
            'Colons': self.token_per_thousand(':'),
            'Hyphens': self.token_per_thousand('-'),
            'Double Hyphens': self.token_per_thousand('--'),
            'Spaces': self.spaces_per_thousand(),
            'UpperCase Letters': self.upper_chars_per_thousand(),
            'Has URLs': self.has_urls(),
            'A': self.chars_per_thousand(['a', 'A']),
            'B': self.chars_per_thousand(['b', 'B']),
            'C': self.chars_per_thousand(['c', 'C']),
            'D': self.chars_per_thousand(['d', 'D']),
            'E': self.chars_per_thousand(['e', 'E']),
            'F': self.chars_per_thousand(['f', 'F']),
            'G': self.chars_per_thousand(['g', 'G']),
            'H': self.chars_per_thousand(['h', 'H']),
            'I': self.chars_per_thousand(['i', 'I']),
            'J': self.chars_per_thousand(['j', 'J']),
            'K': self.chars_per_thousand(['k', 'K']),
            'L': self.chars_per_thousand(['l', 'L']),
            'M': self.chars_per_thousand(['m', 'M']),
            'N': self.chars_per_thousand(['n', 'N']),
            'O': self.chars_per_thousand(['o', 'O']),
            'P': self.chars_per_thousand(['p', 'P']),
            'Q': self.chars_per_thousand(['q', 'Q']),
            'R': self.chars_per_thousand(['r', 'R']),
            'S': self.chars_per_thousand(['s', 'S']),
            'T': self.chars_per_thousand(['t', 'T']),
            'U': self.chars_per_thousand(['u', 'U']),
            'V': self.chars_per_thousand(['v', 'V']),
            'W': self.chars_per_thousand(['w', 'W']),
            'X': self.chars_per_thousand(['x', 'X']),
            'Y': self.chars_per_thousand(['y', 'Y']),
            'Z': self.chars_per_thousand(['z', 'Z']),
            'Numbers': self.chars_per_thousand(
                ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']),
            'Syllables': self.syllables_per_thousand(),
            'Mean syllables per word': self.mean_of_syllables_per_word(),
            'Words with >= 3 syllables':
                self.num_of_words_with_more_than_three_syllables_per_thousand(),
        }
        for stopword in stopwords.words('english'):
            features[stopword] = self.word_per_thousand(stopword)
        features.update(self.pos_tag_trigrams_percents())
        features.update(self.pos_tag_fourgrams_percents())
        features.update(self.char_ngrams_tf_idf())
        return OrderedDict(sorted(features.items(), key=lambda t: t[0]))
counts = data_vect.sum(axis=0).A1
top_idxs = (-counts).argsort()[:50]
top_idxs

import matplotlib.pyplot as plt
fdist.plot(30, cumulative=False)
plt.show()

ranks = range(1, len(fdist) + 1)  # range of 1-2-3-4-5-...-N
#freqs = list()
#for token in fdist.keys():
#    freqs.append(fdist[token])  # unsorted list of frequencies per word
#ranks = range(1, fdist.B() + 1)
freqs = list(fdist.values())
freqs.sort(reverse=True)  # sorted (= ranked) list of frequencies

plt.plot(ranks, freqs, '-')
plt.xscale('log')
plt.yscale('log')
plt.show()

import random
random.sample(unique_tokens, 20)

all_words = list()
for text in sections:
    words = text.split()
    all_words.extend(words)
def getDomainRanking(siteDomain, seoLibrary, queries):
    from nltk.probability import FreqDist

    queriesRankingInfo = {}
    domainList = []

    # Gather the ranking information for each query
    for query in queries:
        try:
            queriesRankingInfo[query] = getQueryRanking(
                query, seoLibrary.language, seoLibrary.country)
            domainList.extend(queriesRankingInfo[query].keys())
        except Exception as ex:
            print(ex)
            continue

    domainFreq = FreqDist(domainList)
    #lowerLimit = domainFreq[siteDomain]
    import numpy as np
    lowerLimit = np.percentile(list(domainFreq.values()), 25)

    # Merge the results per domain
    domainsInfo = {}
    for domain in domainFreq.keys():
        if domainFreq[domain] >= lowerLimit or domain == siteDomain:
            appearIn = {}
            notAppearIn = []
            for query, data in queriesRankingInfo.items():
                if domain in data:
                    appearIn[query] = data[domain]
                else:
                    notAppearIn.append(query)
            domainsInfo[domain] = DomainGoogleRankingInfo(
                domain, appearIn, notAppearIn)

    # Analysis of the site based on the queries.  If it is missing, that is bad;
    # it may be that we were given the domain without www and it redirects to www.
    try:
        siteDomainInfo = domainsInfo[siteDomain]
    except KeyError:
        # add or strip the www prefix
        p = urlparse.urlparse(siteDomain)
        netloc = p.netloc or p.path
        if not netloc.startswith('www.'):
            siteDomain = 'www.' + netloc
        else:
            siteDomain = netloc[4:]
        siteDomainInfo = domainsInfo[siteDomain]

    # Competition detected in the analysed queries
    del domainsInfo[siteDomain]
    domainCompetence = list(domainsInfo.values())
    # most appearances first
    ##domainCompetence.sort(key=lambda x: x.avgPos, reverse=False)
    domainCompetence.sort(key=lambda x: len(x.appearIn), reverse=True)
    domainCompetence = domainCompetence[:MAX_COMPETENCE_URLS]
    domainCompetence.sort(key=lambda x: x.avgPos, reverse=False)  # by position
    return siteDomainInfo, domainCompetence
    ngrams_most_common.append([k for (k, _) in fdist_ngrams.most_common(params.m)])

    outputname = "output_for_" + f.name.rsplit(os.sep, 2)[1]

    # Write out the distribution of words in the document
    with codecs.open("distributions-data/output/words_" + outputname, "w",
                     encoding=my_encoding) as out:
        for k, v in fdist_words.most_common():
            prozent = fdist_words.freq(k)
            out.write("{},{},{}\n".format(k, v, prozent))

    # Write out the distribution of ngrams in the document
    with codecs.open("distributions-data/output/letters_" + outputname, "w",
                     encoding=my_encoding) as out:
        for k, v in fdist_ngrams.most_common():
            prozent = v / (len(unigrams) if len(k) == 1 else len(bigrams))
            out.write("{},{},{}\n".format(k, v, prozent))

    # Write the size of bins of words that appear with the same frequency
    with codecs.open("distributions-data/bins/" + outputname, "w",
                     encoding=my_encoding) as out:
        for i in sorted(set(fdist_words.values())):
            bin_size = fdist_words.Nr(i)
            out.write("{},{}\n".format(i, bin_size))

print('Output distributions saved in \'output\' folder.')
print('Output bins saved in \'bins\' folder.')

# If there are many documents -> compare their most common words and ngrams
if len(params.files) > 1:
    print("Pairwise overlap between {} most frequent words:".format(params.n))
    short_names = [f.name[-15:] for f in params.files]
    for i, list1 in enumerate(words_most_common):
        for j, list2 in enumerate(words_most_common[i+1:]):
            print("{} | {} | ".format(short_names[i], short_names[i+j+1]), end="")
            overlap = len([w for w in list1 if w in list2])
            print(overlap)

    print("Pairwise overlap between {} most frequent letters and letter pairs:".format(params.m))
    short_names = [f.name[-15:] for f in params.files]
def zipfs(data):
    # create an empty summary_tokens list to hold tokens from every summary
    summary_tokens = []

    # boolean check to see if we have already gone through and tokenized everything
    files_exist = os.path.isfile("data/summary_tokens.txt")

    # if the data file doesn't exist, go through the process of creating it (takes ~3 minutes)
    if not files_exist:
        print("\nThe summary_tokens file doesn't exist, beginning tokenization process, grab some coffee...")

        # store the start time so we can keep track of how long this process takes
        t1 = time.time()

        # for Zipf's law we will take out punctuation, but not stop words
        noiseWords = [
            "{{Expand section}}", ",", ".", "(", "[", "{", ")", "]", "}",
            ":", ";", "&", "'", '"', "'s", "``", "''", "n't", "`", '’'
        ]

        # iterate through the dataset; this is largely the same structure as in
        # top_genres, so the comments are not repeated here
        for row in data.itertuples(index=True):
            # grab the summary string for tokenization
            summary_str = str(getattr(row, 'summary'))
            # tokenize the summary string
            tokens = word_tokenize(summary_str)
            # check each token against our noiseWords list...
            for token in tokens:
                # if it is not a noise word...
                if token not in noiseWords:
                    # then add the token to our summary_tokens list
                    summary_tokens.append(token)

        # grab the stop time and alert the user of progress
        t2 = time.time()
        print("Tokenization completed in " + str(t2 - t1) + " seconds.\n")

        # write our summary tokens to "summary_tokens.txt" so we don't have to do this again
        summary_file = open("data/summary_tokens.txt", "w")
        # go through each token in the summary_tokens list
        for token in summary_tokens:
            # write each token on a new line
            summary_file.write("%s\n" % token)
        # close our file to free memory
        summary_file.close()

    # the summary_tokens file already exists and we don't need to do any tokenization;
    # this should be the normal case
    else:
        print("\nThe summary_tokens file exists, beginning token loading...")
        # open the summary_tokens.txt file
        summary_file = open("data/summary_tokens.txt", "r")
        # iterate over each line in the file
        for index, line in enumerate(summary_file):
            # trim the newline character from the line
            trimmed_line = line.replace("\n", "")
            # append the line (token) to the summary_tokens list
            summary_tokens.append(trimmed_line)
        # close the summary file to free memory
        summary_file.close()

    # we are now done loading the summary file and can proceed with Zipf's law
    print("Done loading!\n")
    print("Creating frequency distribution from " + str(len(summary_tokens)) + " summary tokens...")

    # create the frequency distribution of our summary tokens
    summary_fdist = FreqDist(summary_tokens)
    print("Frequency distribution computed!")

    # calculate the frequencies from summary_fdist by grabbing the values and sorting
    # in descending order (the plot trends downwards); this is one axis of the plot
    freqs = sorted(summary_fdist.values(), reverse=True)

    # calculate the ranks from our frequencies; this is the other axis
    ranks = range(1, len(freqs) + 1)

    print("\nNow plotting the ranks/frequencies of the summary frequency distribution to demonstrate zipfs law...")
    print("A plot should be displayed shortly, close the plot to finish script execution.")

    # use a loglog plot from matplotlib/pyplot (log of both axes) of ranks vs. freqs
    plt.loglog(ranks, freqs)
    # label our x axis
    plt.xlabel('frequency (f)', fontsize=12, fontweight='bold')
    # label our y axis
    plt.ylabel('rank (r)', fontsize=12, fontweight='bold')
    # add a grid for visibility
    plt.grid(True)
    # display the plot
    plt.show()
#print(numpa)
termino = []        # words of the current document
termifrecdoc = []   # term-frequency-document matrix
m = 0               # counter
k = 1               # counter
for i in range(len(numpa)):
    n = numpa[i]  # number of words at position i of numpa
    for j in range(n):
        termino.append(palabras[m + j])
    # list of word frequencies
    termino.sort()
    #print(termino)
    #print(len(termino))
    fd = FreqDist(termino)  # frequency of each word of one document
    frecpalab = list(fd.values())  # list of frequencies of each word of the document
    #print(frecpalab)
    #print(len(frecpalab))

    # list of words without repetition
    p = 0  # counter
    listpalabras = []  # words without repetition
    for i in range(len(frecpalab)):
        listpalabras.append(termino[p])
        p = p + frecpalab[i]
    #print(listpalabras)

    # build the term-frequency-document matrix
    for i in range(len(listpalabras)):
        termifrecdoc.append([listpalabras[i], frecpalab[i], k])
    k = k + 1
    listpalabras = []
    termino = []
def summarize(text, n):
    sents = sent_tokenize(text)
    # assert n <= len(sents)
    wordSent = word_tokenize(text.lower())
    stopWords = set(stopwords.words('english') + list(punctuation))
    wordSent = [word for word in wordSent if word not in stopWords]
    freq = FreqDist(wordSent)
    # print(freq.items())       # (word, frequency) pairs
    # print(list(freq.keys()))  # words
    words = list(freq.keys())
    # print(list(freq.values()))  # frequencies
    frequency = list(freq.values())
    #print(frequency)
    # freq.plot(20, cumulative=False)  # graph plot of the words and frequencies

    dictlist = []
    for i in range(len(words)):
        dict1 = {'word': words[i], 'freq': frequency[i]}
        dictlist.append(dict1)
    # df = pd.DataFrame(dict)
    # print(df.head())
    # dataFrame = df

    # ============================== Feed data into MySQL database ==============================
    '''
    tableName = "project"
    sqlEngine = create_engine('mysql+pymysql://root:[email protected]/project', pool_recycle=3600)
    dbConnection = sqlEngine.connect()
    '''
    mydb = mysql.connector.connect(
        host="localhost",
        user="******",
        passwd="password",
        auth_plugin='mysql_native_password',
        database='project'
    )
    mycursor = mydb.cursor()
    for i in dictlist:
        word, count = i.values()
        sql = "INSERT INTO project (word, count) VALUES (%s, %s)"
        val = (word, count)
        mycursor.execute(sql, val)
    mydb.commit()
    '''
    try:
        frame = dataFrame.to_sql(tableName, dbConnection)
    except ValueError as vx:
        print(vx)
    except Exception as ex:
        print(ex)
    else:
        print("Table %s created successfully." % tableName)
    finally:
        dbConnection.close()
    '''
    # ================================ End of database connection ================================

    ranking = defaultdict(int)
    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i] += freq[w]
    sentsIDX = nlargest(n, ranking, key=ranking.get)
    return [sents[j] for j in sorted(sentsIDX)]
def entropy(alist):
    f = FreqDist(alist)
    ent = (-1) * sum(
        [i / len(alist) * math.log(i / len(alist), 2) for i in f.values()])
    return ent