from math import ceil
from numpy import arange, array
from numpy.random import choice, random_integers
from scipy.stats import binom
from nltk import FreqDist

def BootstrapFD(samp):
    fd = FreqDist(samp)
    f1 = float(fd.Nr(1))
    f2 = float(fd.Nr(2))
    N = float(fd.N())
    B = fd.B()
    # Undetected species & coverage
    if f2 > 0.0:
        f0 = ceil(((N - 1.0) / N) * (f1 ** 2.0) / (2.0 * f2))
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0 * f2)
    else:
        f0 = ceil(((N - 1.0) / N) * f1 * (f1 - 1.0) / 2.0)
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0)
    # Correct abundances
    probs = array(list(fd.values())) / N
    lambdah = (1 - C) / sum(probs * (1 - probs) ** N)
    probs = probs * (1 - lambdah * (1 - probs) ** N)
    # P for unseen
    # paux = (1-C)/f0
    yield list(fd.values())
    popO = arange(B)
    dist = binom(n=N, p=1 - C)
    probsA = probs / sum(probs)
    while True:
        ns2 = dist.rvs()
        ns1 = int(N) - ns2
        if ns1 > 0:
            samp1 = list(choice(popO, size=ns1, replace=True, p=probsA))
        else:
            samp1 = []
        if ns2 > 0:
            samp2 = list(random_integers(B, B + int(f0) - 1, ns2))
        else:
            samp2 = []
        yield list(FreqDist(samp1 + samp2).values())
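# Usage sketch (illustrative, not from the original source): the first value the
# generator yields is the observed abundance list; each later value is one
# bootstrap replicate drawn with the coverage-adjusted probabilities.
sample = ["a", "a", "b", "b", "b", "c", "d", "d", "e", "f"]
boot = BootstrapFD(sample)
observed = next(boot)      # observed species counts
replicate = next(boot)     # one bootstrap replicate of species counts
print(observed, replicate)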
def lesk_ratio(t, s, G=nx.Graph()):
    # Lesk overlap computed over G in the simplest possible way; testing whether
    # this minimal version works.
    t_def = FreqDist(G.nodes(data='ext_gloss')[t].split())
    s_def = FreqDist(G.nodes(data='ext_gloss')[s].split())
    intersection = t_def & s_def
    value = 0
    for i in intersection:
        value += t_def[i] * s_def[i]
    total = sum(t_def.values()) + sum(s_def.values())
    return value / total
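# Usage sketch (illustrative graph and glosses; the 'ext_gloss' node attribute
# name follows the function above).
import networkx as nx
G = nx.Graph()
G.add_node("bank_1", ext_gloss="the financial institution that accepts deposits")
G.add_node("bank_2", ext_gloss="the sloping land beside a body of water")
print(lesk_ratio("bank_1", "bank_2", G))   # gloss-overlap score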
def stat_freq(text):
    words = common.tokenizing(text)
    freq_dist = FreqDist(words)
    freq_list = []
    for word, count in freq_dist.items():
        freq_list.append([word, count])
    freq_arr = np.array(freq_list)
    return freq_arr
def extract_most_common_words(self, words, sentiment):
    word_freq = FreqDist(words)
    print("for the sentiment", sentiment)
    print("there are", len(word_freq.keys()), "different words")
    print("that were used", sum(word_freq.values()), "times")
    df = pd.DataFrame({
        f'{sentiment}_words': list(word_freq.keys()),
        f'{sentiment}_counts': list(word_freq.values())
    })
    df = df.nlargest(self.n_words, columns=f'{sentiment}_counts')
    df.reset_index(drop=True, inplace=True)
    return df, len(word_freq.keys()), sum(word_freq.values())
def append_terms(doc, terms, data, minterm_doc, vector):
    tf = FreqDist(terms)
    max_tf = max(tf.values() if len(tf) > 0 else [0])
    for term in tf.keys():
        normalize_tf = tf[term] / max_tf
        new_doc = {'tf': normalize_tf, 'weight': 0, 'minterm': minterm_doc}
        in_data = False
        for term_data in data:
            if term == term_data['key']:
                # update the existing posting for this term
                term_data['value']['documents'][doc] = new_doc
                in_data = True
                break
        if not in_data:
            # add a new posting for this term
            data.append({'key': term,
                         'value': {'idf': 0,
                                   'documents': {doc: new_doc},
                                   'index_in_vector': vector[term]}})
def freq_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({'word': list(fdist.keys()),
                             'count': list(fdist.values())})
    return words_df
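# Usage sketch (illustrative): x is any iterable of document strings; the
# returned frame has one row per distinct word with its corpus count.
docs = ["the cat sat on the mat", "the dog sat"]
print(freq_words(docs).nlargest(columns='count', n=5))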
def getLongTermsRanked(self, minLen=7.0, numberMostCommons=30, display=False):
    result = []
    resultDocuments = {}
    for seoDocument in self.seoLibrary.seoDocuments:
        tokenList = list(set(seoDocument.getTextTokens(removeSplitter=True, lemmatize=True)))
        for token in tokenList:
            if len(token) > minLen:
                result.append(token)
                if token not in resultDocuments:
                    resultDocuments[token] = [seoDocument.order]
                else:
                    resultDocuments[token].append(seoDocument.order)
    fdist = FreqDist(result)
    for token in fdist.keys():
        fdist[token] = fdist[token] * self.getRankingModifier(
            numpy.mean(resultDocuments[token])) * self.getLengthModifier(len(token), minLen)
    maxValue = max(fdist.values())
    return [(word, int(metric * 100.00 / maxValue))
            for word, metric in fdist.most_common(numberMostCommons)]
class BrownDataset(object):
    def __init__(self, include_start=True):
        self.words = brown.words()
        self.words = list(map(lambda x: x.lower(), self.words))
        # include START and END
        self.total_word_cnt = len(self.words) + 2 * len(brown.sents())
        if include_start:
            self.words.append(u'START')
            self.words.append(u'END')
        self.vocab = set(self.words)
        self.vocab_len = len(self.vocab)
        self.word_to_idx = dict(zip(list(self.vocab), range(self.vocab_len)))
        self.sentences = []
        self.bigrams = []
        self.unigrams = []
        for sent in brown.sents():
            sentence = list(map(lambda x: x.lower(), sent))
            if include_start:
                sentence.insert(0, u'START')
                sentence.append(u'END')
            self.sentences.append(sentence)
            self.bigrams.extend(list(ngrams(sentence, 2)))
            self.unigrams.extend(sentence)
        self.unigram_freq = dict(Counter(self.unigrams))
        self.num_sentences = len(self.sentences)
        self.bigram_cnt = FreqDist(self.bigrams)
        self.bigram_len = len(self.bigram_cnt)
        self.bigram_idx = dict(zip(self.bigram_cnt.keys(), range(self.bigram_len)))
        self.bigram_freq = np.asarray(list(self.bigram_cnt.values()))
        self.num_bigrams = len(self.bigram_cnt)
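# Usage sketch (requires the NLTK Brown corpus, e.g. nltk.download('brown')):
ds = BrownDataset(include_start=True)
print(ds.vocab_len, ds.num_sentences, ds.num_bigrams)
print(ds.bigram_cnt[(u'START', u'the')])   # count of sentences starting with "the"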
def frequent_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    freq_dist = FreqDist(all_words)
    x = transformer.transform(word.replace("_", " ") for word in freq_dist.keys())
    words_df = pd.DataFrame({
        'word': list(freq_dist.keys()),
        'count': list(freq_dist.values()),
        'vector': list(x)
    })
    good = []
    bad = []
    for i in range(1, len(words_df)):
        if nb.predict(words_df.at[i, 'vector']) == 5:
            good.append([words_df.at[i, 'count'],
                         words_df.at[i, 'word'].replace(" ", "_")])
        else:
            bad.append([words_df.at[i, 'count'],
                        words_df.at[i, 'word'].replace(" ", "_")])
    good = sorted(good, key=lambda x: x[0], reverse=True)
    bad = sorted(bad, key=lambda x: x[0], reverse=True)
    return format_result(good, bad, terms)
def texts_features(paras_list, sents_list, words_list, len_w_big=7, print_results=False):
    num_paras = len(paras_list)
    num_sents = len(sents_list)
    sperp = num_sents / num_paras
    tokens = FreqDist(words_list)
    words_count = sum(tokens.values())
    vocab = len(tokens)
    lexdiv = words_count / vocab
    words_big = [word for word in words_list if len(word) > len_w_big]
    w_big_num = len(words_big)
    # flash_k_ind = 0.4*(0.78*words_count/num_sents + 100*w_big_num/words_count)
    flash_k_ind = 0.4 * (words_count / num_sents + 100 * w_big_num / words_count)
    if print_results:
        print(("The text consists of {} paragraphs and {} sentences.\n"
               "{:0.3f} sentences per paragraph\n"
               "{} words in total, with a vocabulary of {} unique words\n"
               "Lexical diversity - {:0.3f}\n"
               "Fog index - {:0.3f}\n").format(num_paras, num_sents, sperp,
                                               words_count, vocab, lexdiv,
                                               flash_k_ind))
    statsdict = {'num_paras': num_paras, 'num_sents': num_sents,
                 'sperp': sperp, 'words_count': words_count, 'vocab': vocab,
                 'words{}+'.format(len_w_big): w_big_num,
                 'lexdiv': lexdiv, 'flash_k_ind': flash_k_ind}
    return statsdict
def get_common_values(cloud, feature, position=None, possible_values=None):
    """Return the most common values of the feature in the cloud."""
    if feature in all_features:
        # Get all the values of the specified feature at the specified position
        # in the cloud provided with the restrictions provided.
        value_list = get_all_values(cloud, feature, position, possible_values)
        f_type = feature_type(feature)
        # Get common values for categorical features.
        if f_type == 'categorical':
            # Find all values that are tied for the most frequent and return them.
            value_counts = FreqDist(value_list)
            top_values = {v for v in value_counts
                          if value_counts[v] == max(value_counts.values())}
            return top_values
        # Get common values for continuous features.
        elif f_type == 'continuous':
            # Return the average of the feature values.
            if len(value_list) > 0:
                return {mean(value_list)}
            else:
                return set()
    else:
        raise FeatureNotFoundError(feature)
def plot_postives(s=0, e=50):
    all_words = ' '.join([text for text in df['review']])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })
    # rank all words by frequency
    d = words_df.nlargest(columns="count", n=len(df['review']))
    d.reset_index(inplace=True)
    d['pos_perc'] = np.nan
    for tag in d['word'].values:
        ret = df[df['review'].str.contains(tag)]
        pos_perc = ret[ret['prediction'] == 'pos'].shape[0] / ret.shape[0] * 100
        neg_perc = 100 - pos_perc
        d.loc[(d['word'] == tag), 'pos_perc'] = pos_perc
    d = d.sort_values('pos_perc', ascending=False)
    plt.figure(figsize=(20, 5))
    sns.barplot(data=d[s:e], x='word', y='pos_perc')
    if (e - s > 60):
        plt.xticks(rotation=90)
    else:
        plt.xticks(rotation=45)
    plt.xticks()
    plt.title('Percentage of Positive Reviews per tag.')
    plt.show()
def analysis(dataset, topic_list):
    '''
    Start with some data analysis on Review Text and Review Title,
    applying the bag-of-words approach first.
    '''
    # remove stopwords, punctuation and symbols
    dataset['Review Text'] = dataset['Review Text'].str.replace("[^a-zA-Z#]", " ")
    # remove short words (length < 3)
    dataset['Review Text'] = dataset['Review Text'].apply(
        lambda x: ' '.join([w.lower() for w in x.split() if len(w) > 2]))
    all_reviews = [remove_stop_words(words.split(" "))
                   for words in dataset['Review Text']]
    lemmatizer = WordNetLemmatizer()
    all_words = ' '.join([lemmatizer.lemmatize(word) for word in all_reviews]).split()
    '''
    Plotting the top 30 words of highest frequency
    '''
    freq_dist = FreqDist(all_words)
    words_distribution = pd.DataFrame({
        'word': list(freq_dist.keys()),
        'count': list(freq_dist.values())
    })
    top_words_distribution = words_distribution.nlargest(
        columns='count', n=30)  # want to view top 30 words
    # plot the output
    plt.figure(figsize=(50, 10))
    ax = sns.barplot(data=top_words_distribution, x="word", y="count")
    ax.set(ylabel='Count')
    plt.show()
    return top_words_distribution, dataset
def tf_measure(word_tokens, query_tokens, N):
    tfscore = 0.0
    freq = FreqDist(word_tokens)
    try:
        wf = max(freq.values())
    except:
        wf = 0.0
    for token in query_tokens:
        try:
            tf = freq[token]
            tf = 1.0 + log(tf)
            # tfscore = 0.5 + (0.5 * (0.0 + tf))/(0.0 + wf)
        except:
            tf = 0.0
        tfscore += tf
    # IDF measures
    # print(tfscore)
    return tfscore
def __extract_most_common_words_by_class(self, list_of_words, class_value):
    word_freq = FreqDist(list_of_words)
    df = pd.DataFrame({
        f'{class_value}_words': list(word_freq.keys()),
        f'{class_value}_counts': list(word_freq.values())
    })
    df = df.nlargest(self.n_words, columns=f'{class_value}_counts')
    df.reset_index(drop=True, inplace=True)
    return df
def get_probs(filename):
    """Read the given text and calculate the probabilities for all symbols."""
    with open(filename) as file_in:
        text = file_in.read()
    probs = FreqDist(text)
    count_sum = sum(v for v in probs.values())
    for k, v in probs.items():
        probs[k] = v * 1.0 / count_sum
    return probs
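# Usage sketch (illustrative file path): the returned FreqDist maps each
# character to its relative frequency, so the values sum to 1.
symbol_probs = get_probs('example.txt')
print(symbol_probs.most_common(5))   # the five most probable symbols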
def word_frequency_vs_rank(text, as_probability=False):
    frequency_distribution = FreqDist(text)
    frequency = np.asarray(sorted(frequency_distribution.values(), reverse=True))
    if as_probability:
        frequency = frequency / len(text)
    rank = np.arange(1, len(frequency) + 1)
    return rank, frequency
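# Usage sketch: a quick Zipf-style log-log plot (assumes matplotlib is available).
import matplotlib.pyplot as plt
rank, freq = word_frequency_vs_rank("to be or not to be that is the question".split())
plt.loglog(rank, freq, marker="o")
plt.xlabel("rank")
plt.ylabel("frequency")
plt.show()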
def freq_words(x, terms=200):
    all_words = x.split()
    fdist = FreqDist(all_words)
    df = pd.DataFrame({'word': list(fdist.keys()),
                       'count': list(fdist.values())})
    d = df.nlargest(columns='count', n=terms)
    plt.figure(figsize=(15, 5))
    ax = sns.barplot(data=d, x="word", y="count")
    ax.set(ylabel="count")
    plt.show()
    return d
def word_frequence(al, rank):
    lst = LancasterStemmer()
    left = [lst.stem(word.lower()) for word in word_tokenize(al)
            if word.lower() not in stopwords.words('english') and len(word) > 2]
    final = FreqDist(left)
    sort = sorted(list(set(final.values())))
    sort = [i for i in sort[::-1]]
    for i in sort[:rank]:
        # list the top-ranked counts and the words that have them
        print([v for v, k in final.items() if k == i], i)
def freq_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({'word': list(fdist.keys()),
                             'count': list(fdist.values())})
    # selecting the top `terms` most frequent words
    d = words_df.nlargest(columns="count", n=terms)
    plt.figure(figsize=(20, 5))
    ax = sns.barplot(data=d, x="word", y="count")
    ax.set(ylabel='Count')
    plt.show()
def zipf(tokens):
    from nltk import FreqDist  # for the frequency distribution
    p = FreqDist(tokens)  # frequency distribution of the tokens
    freq = list(p.values())
    freq.sort(reverse=True)  # sort frequencies in decreasing order to get ranked values
    f = np.array(freq)
    r = np.arange(1, len(f) + 1)
    k = np.median(f * r)
    return k
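# Usage sketch (illustrative token list; assumes numpy is imported as np, as in
# the function above):
tokens = "the cat sat on the mat and the dog sat on the cat".split()
print(zipf(tokens))   # median of frequency * rank over the ranked vocabulary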
def generar_grafico2(self, lista_datos):
    import nltk
    from nltk import FreqDist
    lista_unica = ""
    for respuesta_encuesta in lista_datos:
        for respuesta_pregunta in respuesta_encuesta:
            for palabra in respuesta_pregunta:
                lista_unica += palabra + " "
    tokens = nltk.word_tokenize(lista_unica)
    fdist = FreqDist(tokens)
    print(fdist.keys())
    print(fdist.values())
    fdist.plot(30, cumulative=False)
def freq_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })
    # selecting the top `terms` most frequent words
    d = words_df.nlargest(columns="count", n=terms)
    return d
def freq_words(x, terms=30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })
    d = words_df.nlargest(columns="count", n=terms)
    plt.figure()
    ax = sns.barplot(data=d, x="count", y="word")
    ax.set(ylabel='word')
def freq(review, nlarge=30):
    all_sentence = [sent for sent in review]
    all_sentence = " ".join(all_sentence)
    all_words = all_sentence.split()
    word_freq = FreqDist(all_words)
    df_freqdist = pd.DataFrame({
        "word": list(word_freq.keys()),
        "count": list(word_freq.values())
    })
    d = df_freqdist.nlargest(columns="count", n=nlarge)
    plt.figure(figsize=(20, 10))
    # plt.bar(d["word"], d["count"])
    sns.barplot(x=d["word"], y=d["count"])
    plt.show()
def run(self):
    website = ["new+york+times", "bbc+news", "cnn", "daily+mail", "al+jazeera"]
    check = ['new york times', "bbc news", "cnn", "daily mail", "aljazeera"]
    my_stopwords = [
        'first', 'could', 'says', 'year', 'years', 'may', 'us', 'set', 'time',
        'new', 'trumps', 'one', 'say', 'times', 'city', 'day', 'top', 'making',
        'make', 'bbc', 'cnn', "two", "news", 'like', 'wont', 'get', 'run',
        'still', 'good', 'dont', 'take', 'days', 'im', 'gets', 'want', 'go',
        'finds', 'goes', 'gets'
    ]
    portion = [7, 11, 11, 4, 4]
    self.month_titles = []
    for site_num in range(5):
        temp_thread = []
        for page in range(portion[site_num]):
            sleep(0.5)
            temp_thread.append(
                threading.Thread(target=self.extract,
                                 args=(page, website[site_num], check[site_num])))
            temp_thread[-1].start()
        for i in temp_thread:
            i.join()
    print(len(self.month_titles))
    text = ' '.join([i[0] for i in self.month_titles])
    month_keyword = word_token(text, my_stopwords)
    with open(f"raw_text/{self.year}_{self.month}.csv", 'w', encoding='utf-8') as w:
        csv.writer(w).writerow(month_keyword)
    month_keyword, times = word_frequence(month_keyword, 20)
    recommand0 = [
        j[0] + '@' + j[1] for i in month_keyword for j in self.month_titles
        if i.lower() in j[0].lower()
    ]
    recommand0 = FreqDist(recommand0)
    sort_recommand = sorted(list(set(recommand0.values())), reverse=True)
    recommand = [
        k.split('@') for k, v in recommand0.items() if v in sort_recommand[:2]
    ]
    if len(recommand) > 15:
        # sample randomly, because alphabetical ordering would always drop the NYTimes entries
        recommand = random.sample(recommand, 15)
    final_data.append(
        (self.year, self.month, month_keyword, times, recommand))
def normalized_top_50(page):
    """Returns the 50 most common words from the page with normalized scores."""
    words_stopped = preprocess(page)
    freqdist = FreqDist(words_stopped)
    top_50 = freqdist.most_common(50)
    total = sum(freqdist.values())
    normalized = []
    for word in top_50:
        normalized_frequency = word[1] / total
        normalized.append((word[0], "{:.4}".format(normalized_frequency)))
    return normalized
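# Usage sketch (assumes the preprocess() helper used above is importable and
# that `page` is raw text; the file path is illustrative):
with open("page.txt") as fh:
    for word, score in normalized_top_50(fh.read())[:5]:
        print(word, score)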
def generate_word_counts_fig(x, terms=30):
    all_words = " ".join([text for text in x])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    words_df = pd.DataFrame({
        "word": list(fdist.keys()),
        "count": list(fdist.values())
    })
    # selecting the top `terms` most frequent words
    d = words_df.nlargest(columns="count", n=terms)
    plt.figure(figsize=(20, 5))
    ax = sns.barplot(data=d, x="word", y="count")
    ax.set(ylabel="Count")
def getFrequencyDistribution(words, num=20):
    fdist = FreqDist(words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })
    # selecting top 'num' most frequent words
    d = words_df.nlargest(columns="count", n=num)
    plt.figure(figsize=(20, 5))
    ax = sns.barplot(data=d, x="word", y="count")
    ax.set(ylabel='Count')
    plt.show()
    return words_df
def freq_words(self, x):
    all_words = " ".join([text for text in x])
    all_words = all_words.split()
    print("\n all_words in corpus --- ", len(all_words), all_words[:12])
    fdist = FreqDist(all_words)
    # print("\n fdist --- ", fdist)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values())
    })
    print("\n words counts in corpus --- \n ", words_df.head())
    # selecting top 20 most frequent words
    d = words_df.nlargest(columns='count', n=20)
    print("\n d --- ", d)
def word_frequence(left, rank):
    final = FreqDist(left)
    # deduplicate the counts (e.g. many words occur only once) and sort them
    sort = sorted(list(set(final.values())), reverse=True)
    count = 0
    keyword, times = [], []
    for i in sort[:rank]:
        # for each of the top counts, find the words that occur that many times
        key = [k for k, v in final.items() if v == i]
        count += len(key)
        # cap the maximum number of keywords
        if count > rank:
            break
        if i > 3:
            # only keep the most popular keywords
            print(key, i)
            keyword += key
            for y in range(len(key)):
                times.append(i)
    return keyword, times  # lists holding the top-`rank` keywords and their counts
def _pre_treate(self, records=None):
    MAX_SEQUENCE_LENGTH = self.MAX_SEQUENCE_LENGTH
    MAX_WORDS = self.MAX_WORDS
    MODEL_TYPE = self.MODEL_TYPE
    N_GRAM = self.N_GRAM
    CONFIG_PATH = self.CONFIG_DIR
    if os.path.exists(os.path.join(CONFIG_PATH, 'classifier_config.json')):
        word2id, id2label, label2id, class_weight, parameter = json.load(
            open(os.path.join(CONFIG_PATH, 'classifier_config.json'), 'r', encoding='utf-8'))
        if parameter['model'] != MODEL_TYPE or parameter['max_length'] != MAX_SEQUENCE_LENGTH:
            raise Exception("classifier_config error: inconsistent model type or sequence length, "
                            "please delete config.json and rerun the program")
        class_weight = {int(index): weight for index, weight in class_weight.items()}
    else:
        try:
            x = list(map(lambda x: x['content'], records))
            y = list(map(lambda x: x['label'], records))
            # create word2id dict
            if MODEL_TYPE != 'FastText':
                N_GRAM = 1
            vectorizer = CountVectorizer(token_pattern=r'[^\s]+', ngram_range=(1, N_GRAM),
                                         max_df=0.95, min_df=3, max_features=MAX_WORDS)
            vectorizer.fit(x)
            word2id = {word: index + 2 for word, index in vectorizer.vocabulary_.items()}
            # create id2label and label2id dict
            Binarizer = LabelEncoder()
            Binarizer.fit(y)
            id2label = {index: label for index, label in enumerate(Binarizer.classes_)}
            label2id = {label: index for index, label in enumerate(Binarizer.classes_)}
            # create class_weight dict
            label_freq = FreqDist(y)
            class_weight = {int(index): max(label_freq.values()) / label_freq[label]
                            for index, label in enumerate(Binarizer.classes_)}
            # store model parameter
            parameter = {'model': MODEL_TYPE, 'max_length': MAX_SEQUENCE_LENGTH, 'ngram': N_GRAM}
            # dump config file
            json.dump([word2id, id2label, label2id, class_weight, parameter],
                      open(os.path.join(CONFIG_PATH, 'classifier_config.json'), 'w', encoding='utf-8'))
        except Exception as e:
            sys.exit('Error: could not build the classifier config from the given records')
    return word2id, id2label, label2id, class_weight, parameter
print "building Text format" text = nltk.Text(tokens) print "building freqdist." fdist = FreqDist(text) print "freqdist done." # output result to csv file print "opening csv file." csvfile = file("/Users/Zhao/Documents/gone_with_the_wind.csv", "aw") writer = csv.writer(csvfile) print "writing csv file" # no repeated item arr = [] arr.append((1, fdist.keys()[1], fdist.values()[1], 1)) pre_value = fdist.values()[1] cur_num = 2 for i in xrange(0, 24903): # print i+1, fdist.keys()[i], fdist.values()[i] if fdist.values()[i] != pre_value: if pattern.match(fdist.keys()[i]) != None: item = (i + 1, fdist.keys()[i], fdist.values()[i], cur_num) pre_value = fdist.values()[i] cur_num += 1 arr.append(item) print i + 1, "done." arr.append((24903, fdist.keys()[-1], fdist.values()[-1], cur_num)) print arr
newtuple = (new_words[i], t[1])  # Each new tuple uses same POS tag (t[1])
text.insert(position + i, newtuple)
position += 1
#==============================================================================
text = [(w, p) for w, p in text if re.match(r"[\'a-z]", w[0])]
nonlemwords = [w for w, p in text]
#==============================================================================
# Create non-lemmatized version to use if the lemmatized version doesn't have
# matches (because of differences in POS tagging)
#==============================================================================
bigrams = FreqDist(zip(nonlemwords[:-1], nonlemwords[1:]))
unigram = FreqDist(nonlemwords)
sbig = float(sum(bigrams.values()))
suni = float(sum(unigram.values()))
nonlemassoc = {}
for b0, b1 in bigrams:
    p1 = unigram[b0] / suni
    p2 = unigram[b1] / suni
    p12 = bigrams[b0, b1] / sbig
    nonlemassoc[b0, b1] = log(p12) - log(p1) - log(p2)
#==============================================================================
# Write SBC Bigram association scores to file
#==============================================================================
f = open("/Users/heathersimpson/Documents/Dissertation/Articles/Chp3_IUvsClauseBoundaries/BigramStrength/SBC-nonlembigrams.txt", "w")
# Give it headers first
f.write("Word1\tWord2\tpwMI\n")
f.close()
# How many times does a word appear in the text?
text1.count("do")

# Percentage of the text occupied by a word, see E28 below for a better function.
from nltk.book import text5  # Chat conversations
100 * text5.count("call") / len(text5)
100 * text5.count("whatever") / len(text5)

# Frequency distribution
from nltk import FreqDist
fdist1 = FreqDist(text1)
vocabulary = fdist1.keys()
frequencies = fdist1.values()
fdist1['whale']

# Define a function that computes lexical diversity
def lexical_diversity(text):
    return len(text) / len(set(text))

# Note that our new function can be used on any text, even your own:
lexical_diversity(myText)

# You can combine two lists with text (the addition operator concatenates strings and lists):
myText1 = ["This", "is", "my", "text", "and"]
myText2 = ["there", "is", "nothing", "you", "can", "do", "about", "it", "!"]
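# Sketch completing the concatenation example above (illustrative names):
myText3 = myText1 + myText2
lexical_diversity(myText3)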
def buildcorpus(corpus, rootpath, filelimit = 0): #rootpath = corpus.rootpath fileids = os.listdir(rootpath) hugewordlist = [] hugewordlist.extend(corpus.words) # will contain distinct Word instances numoffiles = 0 corpus.set_corpusname(str(max(filelimit, len(fileids)))+"texts") for fileid in fileids: allwords = FreqDist() # will contain all words in this text doc_id = fileid.split(".")[0] # corpus.inserttext(doc_id) ##### ! text in kendisini gondermeli newtext = Text(doc_id) path = rootpath + os.sep + fileid #lines = readtextlines(path) #rawtext = texter.readtxtfile(path) rawtext = texter.readnewstext(path) lines = texter.splitToSentences(rawtext) sntindex = 0 # each line is a sentence for line in lines: words = [] # words in this sentence words = line.split() words = texter.eliminatepunctuation(words) words = [word for word in words if not word.isspace()] for word in words: allwords.inc(word) newword = Word(word) newword.insertsentenceid(doc_id+"_"+str(sntindex)) if allwords[word] <= 1: # if this was not added to the hugelist before, add it hugewordlist.append(newword) sentence = Sentence(sntindex) sntindex = sntindex + 1 # sentence'a Word mu wordindex mi atalim? for word in words: index = hugewordlist.index(Word(word)) hugewordlist[index].insertsentenceid(doc_id+"_"+str(sntindex-1)) sentence.insertword(index) newtext.insertsentence(sentence) if (not rawtext.isspace()) or (len(allwords) != 0): corpus.inserttext(newtext) print str(numoffiles)," : finished handling the words-snts-txts ",doc_id numofwords = reduce(lambda x,y : x+y, allwords.values()) for word in hugewordlist: cnt = allwords[word.literal] #freq = cnt / float(numofwords) word.assigntermfreq(cnt, numofwords, doc_id) #hugewordlist[index].toscreen() numoffiles = numoffiles + 1 if filelimit == numoffiles: break # end for - docs numofdocs = len(fileids) print "computing tf*idf" for word in hugewordlist: word.computeinvdocfreq(numofdocs) word.computeTFIDF() #word.toscreen() corpus.assignwords(hugewordlist) print "corpus length ",str(len(corpus.words))," words" print "huges length ",str(len(hugewordlist))," words" print "exiting buildcorpus()" print "pickle-dumping words" corpus.pickledumpwords()
import matplotlib
import string
from nltk import FreqDist

exclude = set(string.punctuation)

with open("YT_Comment_Output.txt", "r", encoding="utf-8") as f:
    lines = [line.rstrip() for line in f]
splits = [line.split() for line in lines]
some_upper = [item for sublist in splits for item in sublist]

# replace BOM w known stopword (the UTF-8 BOM decodes to U+FEFF)
BOM_gone = [word.replace(u'\ufeff', 'i') for word in some_upper]

punct_gone = []
for word in BOM_gone:
    punct_gone.append(''.join(ch for ch in word if ch not in exclude))
YT_comment_words = [word.lower() for word in punct_gone]

with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.rstrip() for line in f]

print(YT_comment_words[:10])
print(stopwords[:10])

filtered_words = [w for w in YT_comment_words if w not in stopwords]
print(filtered_words[:10])

fd = FreqDist(filtered_words)
print(list(fd.values())[:10])
print(fd)
fd.plot(30)
def get_frequency_distribution(words):
    fd = FreqDist(i.lower() for i in words)
    print(fd)
    sorted_fd = sorted(fd.values(), reverse=True)
    print(sorted_fd[0:10])
    return sorted_fd
# -*- coding: utf-8 -*-
from nltk import FreqDist
from nltk.corpus import reuters

yen = reuters.words(categories='yen')
fd1 = FreqDist(i.lower() for i in yen)
sfd1 = sorted(fd1.values(), reverse=True)
# ---
for i, v in enumerate(sfd1[0:100], 1):
    print('%d, %d, %d' % (i, v, i * v))
# ---
import pylab
pylab.plot(sfd1, color='red')
pylab.xscale('log')
pylab.yscale('log')
pylab.show()
# ---
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
yen_exclude_stops = [i for i in yen if i.lower() not in english_stopwords]
fd2 = FreqDist(i.lower() for i in yen_exclude_stops)
sfd2 = sorted(fd2.values(), reverse=True)
words = (word for word in words if word not in nltk.corpus.stopwords.words('english'))
# a = nltk.word_tokenize(word_list)
b = nltk.pos_tag(word_list)
c = nltk.ne_chunk(b, binary=True)
tokencount = tokencount + len(word_list)

fdist = FreqDist()
for x in c.subtrees():
    if x.label() == "NE":
        words = [w[0] for w in x.leaves()]
        name = " ".join(words)
        # print(name)
        fdist[name] += 1
        bigfdist[name] += 1
        nercount = nercount + 1

a = [f, tokencount, nercount, fdist.keys(), fdist.values()]
print(a)
# mycsv = csv.writer(ofile)
mycsv.writerow(a)

mycsv2 = csv.writer(namefile)
for word in bigfdist:
    # one row per name with its overall count
    mycsv2.writerow([word, str(bigfdist[word])])

ofile.close()
namefile.close()
in_str = sys.stdin.read(BUF_SIZE)
rest = ''
read_count = 0
while (rest + in_str).strip() != '':
    read_count += 1
    if read_count % 100 == 0:
        sys.stderr.write('.')
        sys.stderr.flush()
    tokens = (rest + in_str).split()
    rest = tokens.pop()
    if not tokens:
        vocab.update([rest])
        break
    else:
        vocab.update(tokens)
    in_str = sys.stdin.read(BUF_SIZE)

print()
counts = sorted(vocab.values(), reverse=True)
for i in [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000]:
    if i > len(counts):
        break
    print("vocab size %7d - cutoff = %d" % (i, counts[i]))
class Model(): def __init__(self ): self.__letters = {'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', u'\xe0', u'\xe1', u'\xe8', u'\xe9', u'\xec', u'\xed', u'\xf2', u'\xf3', u'\xf9', u'\xfa' } self.__encodings = ['utf8', 'iso-8859-1'] self.__defaultTag = '' self.__defTagger = nltk.DefaultTagger(self.__defaultTag) self.__tags = {'NOUN', 'ADV', 'ADJ', 'PRON', 'DPREP', 'VERB', 'NUM', 'PREP', 'ART', 'CONJ', 'PRONVERB', 'PUNCT', 'SPECIAL'} self.__manualTags = {tag: set() for tag in self.__tags} self.__tmpPath = os.path.dirname(os.path.abspath(__file__))+ '\\' self.parseSyntaxRules() def initFields(self): self.__allowedForeign = set(codecs.open(self.__tmpPath + 'allowedForeign.txt', encoding='utf-8').read().split()) self.__ignoredCommon = set(codecs.open(self.__tmpPath + 'commonIgnoredWords.txt', encoding='utf-8').read().split()) self.__ignoredColl = set(codecs.open(self.__tmpPath + 'wordsIgnoredInCollocations.txt', encoding='utf-8').read().split()) self.__concordanceIndex = None ################################ # # # GETTERS # # # ################################ ######### GENERAL DATA ######### def getSentences(self): return self.__sentences def getTokens(self): return self.__tokens def getRawText(self): return self.__rawText def getWordCount(self): return self.__wordCount def getWordTypesCount(self): return len(self.__freqDist.items()) def getAvgWordLength(self): return self.__avgWordLength def getAvgSentLength(self): return self.__avgSentLength def getLexicalDiversity(self): return self.__lexicalDiversity ######### FREQUENCY TAB ######### def getIgnoredCommon(self): return self.__ignoredCommon def setIgnoredCommon(self, value): self.__ignoredCommon = value def getMostCommon(self, count): out = [] i=0 while len(out)<count: if self.__freqDist.items()[i][0] not in self.__ignoredCommon: out.append(self.__freqDist.items()[i]) i+=1 return out def getHapaxes(self): return self.__freqDist.hapaxes() def getHapaxPercentage(self): return round(len(self.__freqDist.hapaxes())*100/float(len(self.__freqDist.items())), 2) ### ZIPF'S PLOT def getRelZipfError(self): return self.__relZipfError def getLogfreqDist(self): return self.__logfreqDist def getLogX(self): return self.__logx def getPoly(self, x): return np.poly1d(self.__polyFit)(x) def getPolyFit(self): return self.__polyFit ########## PATTERNS TAB ######### def setAllowedForeignWordSet(self, newSet): self.__allowedForeign = newSet def getAllowedForeignWordSet(self): return self.__allowedForeign def getForeignWords(self): return self.__foreignWords def getForeignWordsCount(self): return self.__foreignWordsCount def getForeignPercentage(self): return round(self.__foreignWordsCount*100/float(self.__wordCount), 2) def getPatternWords(self): return self.__patternWords def getPatternWordsCount(self): return self.__patternWordsCount def getPatternPercentage(self): return round(self.__patternWordsCount*100/float(self.__wordCount), 2) ### PARTS OF SPEECH TAGGING TAB def getTokensFromPOSCorpus(self): return self.__POStokens def getTokensCount(self): return len(self.__tokens) def getTaggedTokensCount(self): return len(self.__POStokens) def getTagCount(self): return self.__tagCount def getTagErrorCount(self): return self.__tagErrorCount def getTaggedCorpus(self): return self.__taggedCorpus def getWrongTags(self): return self.__wrongTags ####### COLLOCATIONS TAB ###### def getIgnoredColl(self): return self.__ignoredColl def setIgnoredColl(self, value): self.__ignoredColl = value def 
getCollocations(self): return self.__collocations ################################ # # # METHODS # # # ################################ def loadCorpus(self, path): for encoding in self.__encodings: try: self.__path = path fileName = codecs.open( self.__path,'r', encoding=encoding ) self.__rawText = fileName.read() break except UnicodeDecodeError: encoding = '' continue if encoding!='': self.initFields() #SENTENCES # more abbreviations with dots punkt_param = PunktParameters() punkt_param.abbrev_types = set(['dr', 'vs', 'n', 'v', 'etc', 'art', 'p', 'Cost', 'ss', 'pag']) punkt_param = PunktParameters() sentence_splitter = PunktSentenceTokenizer(punkt_param) text = re.sub(ur'[\'\<\>`’]', ' ', self.__rawText) #text = re.sub('(\d+)', r' \1 ', text) sentences = sentence_splitter.tokenize(text) #TOKENS self.__tokens = [[token, ''] for token in list(itertools.chain(*[ customWordtokenize(sent) for sent in sentences]))] wordTokenizer = RegexpTokenizer('[a-zA-Z0-9\xe0\xe1\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa]+') #wordTokenizer = RegexpTokenizer('[\w]+') sentences = [wordTokenizer.tokenize(sent.lower()) for sent in sentences if len(wordTokenizer.tokenize(sent)) > 0] words = list(itertools.chain(*sentences)) self.__words = words self.__sentences = sentences self.__avgSentLength = round(np.mean( [len(sent) for sent in sentences]), 3) self.__avgWordLength = round(np.mean( [len(word) for word in words]), 3) self.__freqDist = FreqDist(words) self.__wordCount = len(words) self.__lexicalDiversity = round(len(self.__freqDist.items())/float(len(words)), 5) ### resetting members self.__concordanceIndex = None self.__bigrams = None return encoding def computeZipf(self, unit): if unit == 'word': self.__logx = np.array([math.log(i, 10) for i in range(1, len(self.__freqDist.values())+1) ] ) self.__logfreqDist = np. array([math.log(i, 10) for i in self.__freqDist.values() ]) if unit == 'bigram': bigramFreqDist = dict() for first in self.__letters: for second in self.__letters: bigramFreqDist[first+second] = 0 for token in self.__freqDist.items(): for ii in range(len(token[0])-1): try: bigram = token[0][ii]+token[0][ii+1] bigramFreqDist[bigram] += token[1] except KeyError: print "Key error on token: ", token self.__sortedBigrams = sorted([x for x in bigramFreqDist.items() if x[1]>0], key=itemgetter(1)) self.__sortedBigrams.reverse() self.__logx = np.array([math.log(i, 10) for i in range(1, len(self.__sortedBigrams)+1) ] ) self.__logfreqDist = np. array([math.log(i[1], 10) for i in self.__sortedBigrams]) if unit == 'letter': letterFreqDist = dict() for letter in self.__letters: letterFreqDist[letter] = 0 for token in self.__freqDist.items(): for ii in range(len(token[0])): try: letter = token[0][ii] letterFreqDist[letter] += token[1] except KeyError: print "Key Error on token: ", token self.__sortedLetters = sorted([x for x in letterFreqDist.items() if x[1]>0], key=itemgetter(1)) self.__sortedLetters.reverse() self.__logx = np.array([math.log(i, 10) for i in range(1, len(self.__sortedLetters)+1) ] ) self.__logfreqDist = np. 
array([math.log(i[1], 10) for i in self.__sortedLetters]) self.__polyFit = np.polyfit(self.__logx, self.__logfreqDist, 1) poweredPoly = [np.power(10, self.getPoly( self.__logx[i] ) ) for i in range(len(self.__logx))] relativeErrors = [ abs( self.__freqDist.values()[i] - poweredPoly[i] ) / float( self.__freqDist.values()[i] ) for i in range(len(self.__logx)) ] self.__relZipfError = np.mean( relativeErrors ) * 100 def prepareFreqDist(self, areBigramsChecked): if areBigramsChecked: return self.__sortedBigrams else: return self.__sortedLetters ######### PATTERNS TAB ########### def findForeignWords(self, rules): foreignWords = [] if 'consonant' in rules: cond = re.compile('.*[qrtpsdfghlzcvbnm]$') foreignWords += [item for item in self.__freqDist.items() if cond.match(item[0]) and len(item[0])>2] if 'wyjkx' in rules: cond = re.compile('.*[wyjkx].*') foreignWords += [item for item in self.__freqDist.items() if cond.match(item[0]) and len(item[0])>2] self.__foreignWords = [item for item in foreignWords if item[0] not in self.__allowedForeign] self.__foreignWordsCount = sum([word[1] for word in self.__foreignWords]) def findPatternWords(self, pattern): try: cond = re.compile(unicode(pattern)) self.__patternWords = [item for item in self.__freqDist.items() if cond.match(item[0])] self.__patternWordsCount = sum([word[1] for word in self.__patternWords]) return 0 except re.error: return -1 ######## PARTS OF SPEECH TAGGING TAB ########## def loadPOSCorpus(self, path): for encoding in self.__encodings: try: POSfile = codecs.open(path,'r',encoding=encoding) POScorpus = [] for line in POSfile.readlines(): words = line.split() if len(words) > 1: if words[1] in {'NOUN', 'ADV', 'ADJ', 'PRON', 'DPREP', 'VERB', 'NUM', 'PREP', 'ART', 'CONJ', 'PRONVERB', 'PUNCT', 'SPECIAL'}: POScorpus.append((words[0], words[1])) else: print 'Unknown tag!: ' + words[1] POSfile.close() break except UnicodeDecodeError: print 'UnicodeDecodeError' except UnicodeEncodeError: print 'UnicodeEncodeError' self.__taggedCorpus = POScorpus def applyTaggers(self, taggers, fromPOSCorpus = False): self.resetTags(fromPOSCorpus) for tagger in taggers: if tagger == 'manual': self.applyManualTagger(self.__POStokens if fromPOSCorpus else self.__tokens) if tagger == 'regex': self.applyRegexTagger(self.__POStokens if fromPOSCorpus else self.__tokens) if tagger == 'syntax': self.applySyntaxTagger(self.__POStokens if fromPOSCorpus else self.__tokens) if tagger == 'probability': self.applyProbabilityTagger(self.__POStokens if fromPOSCorpus else self.__tokens) tagCount = 0 notTagged = [] self.__wrongTags = [] if fromPOSCorpus: errorCount = 0 #wrongTags = [] for i in range(len(self.__POStokens)): if self.__POStokens[i][1]!=self.__defaultTag: tagCount+=1 if self.__POStokens[i][1] != self.__taggedCorpus[i][1]: errorCount+=1 #wrongTags.append((i, self.__POStokens[i][0], self.__POStokens[i][1], self.__taggedCorpus[i][1])) self.__wrongTags.append([str(i+1), self.__POStokens[i][0], self.__POStokens[i][1], self.__taggedCorpus[i][1]]) else: notTagged.append(self.__POStokens[i][0]) self.__tagErrorCount = errorCount else: for token in self.__tokens: if token[1]!=self.__defaultTag: tagCount+=1 else: notTagged.append(token[0]) self.__tagCount = tagCount def resetTags(self, fromPOSCorpus=False): if fromPOSCorpus: self.__POStokens = self.__defTagger.tag([token[0] for token in self.__taggedCorpus]) else: self.__tokens = self.__defTagger.tag([token[0] for token in self.__tokens]) def applyManualTagger(self, tokens): for line in codecs.open(self.__tmpPath + 
'manualTaggingRules.txt', encoding='utf-8').readlines(): if len(line)>4 and line[0]!= '#': words = line.split() self.__manualTags[words[0]] = self.__manualTags[words[0]].union(set(words[1:])) for i in range(len(tokens)): if tokens[i][1] == self.__defaultTag: for tag in self.__manualTags: if tokens[i][0].lower() in self.__manualTags[tag]: tokens[i] = (tokens[i][0], tag) def applyRegexTagger(self, tokens): self.__regexTagRules = dict() for line in set(codecs.open(self.__tmpPath + 'regexpTaggingRules.txt', encoding='utf-8').readlines()): if len(line)>4 and line[0]!= '#': words = line.split() self.__regexTagRules[re.compile(unicode(words[1]))] = (words[0], words[2:]) for i in range(len(tokens)): if tokens[i][1] == self.__defaultTag: for rule in self.__regexTagRules: if tokens[i][1] == self.__defaultTag: word = tokens[i][0].lower() if word not in self.__regexTagRules[rule][1] and rule.match(word): tokens[i] = (tokens[i][0], self.__regexTagRules[rule][0]) def parseSyntaxRules(self): self.__syntaxTagRules = [] for line in set(codecs.open(self.__tmpPath + 'syntaxTaggingRules.txt', encoding='utf-8').readlines()): if line[0]!= '#': words = line.split() before = [] after = [] insertedTag = "" i = 0 for i in range(0, len(words)): if words[i][0] == '$': insertedTag = words[i][1:] break before.append(words[i]) for j in range(i+1, len(words)): after.append(words[j]) if (insertedTag!="" and (before!=[] or after!=[])): self.__syntaxTagRules.append(SyntaxTaggingRule(before, insertedTag, after)) def applySyntaxTagger(self, tokens): self.parseSyntaxRules() for i in range(len(tokens)): if tokens[i][1] == self.__defaultTag: for rule in self.__syntaxTagRules: #rule lenghts check if i >= len(rule.before) and len(tokens) - i >= len(rule.after): poniechaj = False tagsCount = len(rule.before) for before_it in range(tagsCount): if rule.before[before_it] in self.__tags: if tokens[i - tagsCount + before_it][1] != rule.before[before_it]: poniechaj = True else: if tokens[i - tagsCount + before_it][0] != rule.before[before_it]: poniechaj = True if (poniechaj): continue for after_it in range(len(rule.after)): if rule.after[after_it] in self.__tags: if rule.after[after_it] != tokens[i + 1 + after_it][1]: poniechaj = True else: if rule.after[after_it] != tokens[i + 1 + after_it][0]: poniechaj = True if (poniechaj): continue tokens[i] = (tokens[i][0], rule.tag) break def findMostCommonTag(self): tagsFreqDistMap = dict() for word in self.__taggedCorpus: if word[0] not in tagsFreqDistMap.keys(): tagsFreqDistMap[word[0]] = FreqDist([word[1]]) else: tagsFreqDistMap[word[0]].inc(word[1]) self.__mostCommonTagMap = dict() for word in tagsFreqDistMap.keys(): self.__mostCommonTagMap[word] = tagsFreqDistMap[word].keys()[0] def applyProbabilityTagger(self, tokens): self.findMostCommonTag() for i in range(len(tokens)): if tokens[i][1] == self.__defaultTag: if tokens[i][0] in self.__mostCommonTagMap.keys(): tokens[i] = (tokens[i][0], self.__mostCommonTagMap[tokens[i][0]]) def getTaggingRules(self, tagger): f = codecs.open(self.__tmpPath + tagger + 'TaggingRules.txt', encoding='utf-8') rules = f.read() f.close() return rules def setTaggingRules(self, tagger, rules): f = codecs.open(self.__tmpPath + tagger + 'TaggingRules.txt', 'w', encoding='utf-8' ) f.seek(0) f.write(rules) f.truncate() f.close() ########## COLLOCATIONS TAB ############ def collIgnoreListHasChanged(self): self.__collIgnoredListChanged = True def findCollocations(self, test, window, minFreq, count, searchedWord): if self.__bigrams == None or self.__collIgnoredListChanged 
or self.__collCurrentWindow != window or self.__collCurrentSearchedWord != searchedWord or self.__collCurrentMinFreq != minFreq: self.prepareBigrams(window, searchedWord) self.__collIgnoredListChanged = False self.__bigrams.apply_freq_filter(minFreq) self.__collCurrentWindow = window self.__collCurrentSearchedWord = searchedWord self.__collCurrentMinFreq = minFreq bfd = self.__bigrams.getBigramFd() scored_bigrams = [] bigram_measures = nltk.collocations.BigramAssocMeasures() if test == 'Raw Frequency': scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.raw_freq)[:count] if test == 'T Student Test': scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.student_t)[:count] if test == 'Pearson\'s Chi Square Test': scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.chi_sq)[:count] if test == 'Pointwise Mutual Information': scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.pmi)[:count] if test == 'Dice': scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.dice)[:count] if test == 'Jaccard': scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.jaccard)[:count] if test == 'Likelihood Ratio': scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.likelihood_ratio)[:count] if test == 'Variant of Mutual Information': scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.mi_like)[:count] if test == 'Poisson Stirling': scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.poisson_stirling)[:count] if test == 'Phi square': scored_bigrams = self.__bigrams.score_ngrams(bigram_measures.phi_sq)[:count] self.__collocations = [[unicode(x[0]+' '+x[1]), y, bfd[x]] for x,y in scored_bigrams] def prepareBigrams(self, window_size, word): wfd = FreqDist() bfd = FreqDist() if word == '': for sentence in self.__sentences: if len(sentence) > 1: for window in ingrams(sentence, window_size, pad_right=True): if window[0] not in self.__ignoredColl: w1 = window[0] try: window = window[:list(window).index(w1, 1)] except ValueError: pass wfd.inc(w1) for w2 in set(window[1:]): if w2 is not None and w2 not in self.__ignoredColl: bfd.inc((w1, w2)) else: for sentence in self.__sentences: if len(sentence) > 1: for window in ingrams(sentence, window_size, pad_right=True): if window[0] not in self.__ignoredColl: w1 = window[0] try: window = window[:list(window).index(w1, 1)] except ValueError: pass bigramOK = False for w2 in set(window[1:]): if w2 is not None and w2 not in self.__ignoredColl and (w1 == word or w2==word): bfd.inc((w1, w2)) bigramOK = True if bigramOK: wfd.inc(w1) self.__bigrams = MyBigramCollFinder(wfd, bfd) ######### CONTEXT TAB ########### def findWordContext(self, word, lines, wordCount): if not self.__concordanceIndex: self.__concordanceIndex = nltk.ConcordanceIndex([token[0] for token in self.__tokens], key=lambda s:s.lower()) contexts = [] offsets = self.__concordanceIndex.offsets(unicode(word)) if offsets: lines = min(lines, len(offsets)) for i in offsets: if lines <= 0: break left = (' '.join([token[0] for token in self.__tokens[i-wordCount:i]])) right = ' '.join([token[0] for token in self.__tokens[i+1:i+wordCount+1]]) contexts.append( left + ' ' + self.__tokens[i][0].upper() + ' ' + right) lines -= 1 return contexts