def __FreqFromCorpus(self):
    r"""Extract the frequencies from the corpus."""
    print("Computing bigrams...")
    bi = FreqDist(bigrams(self.words))
    print("Computing FreqDist...")
    wfr = FreqDist(self.words)
    print("Processing queue...")
    print()
    tot = len(bi)
    N = wfr.N()  # total token count is constant, so compute it once
    for i, eles in enumerate(bi):
        a = wfr[eles[0]]
        b = wfr[eles[1]]
        ab = bi[eles]
        try:
            self.__col_logl.append(
                nltk.tokenize.punkt.PunktTrainer()._col_log_likelihood(a, b, ab, N))
            print("element %d / %d \t -> \tlog-likelihood of %s %s \t\t -> %f"
                  % (i, tot, eles[0], eles[1], self.__col_logl[-1]))
        except UnicodeEncodeError:
            # swallow any encoding errors
            pass
def pmi(features):
    '''
    Compute the PMI value for all features
    '''
    dic = FreqDist()
    dic_pos = FreqDist()
    pos = 0.0
    N = 0.0
    for feature in features:
        N += 1
        for f in feature:
            if f[-1] == 1:  # positive instance
                pos += 1
                for t in f[:-3]:
                    dic_pos[t] += 1  # FreqDist.inc() was removed in NLTK 3
                    dic[t] += 1
            else:
                for t in f[:-3]:
                    dic[t] += 1
    # add-one smoothing over the vocabulary
    N += len(dic)
    pos += len(dic)
    pmi_pos = {}
    for t in dic:
        pmi_pos[t] = np.log(float((dic_pos[t] + 1) * N) / float((dic[t] + 1) * pos))
    pmi_pos = dict(sorted(pmi_pos.items(), key=itemgetter(1)))
    return pmi_pos
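# A minimal usage sketch for pmi(), assuming np (numpy), FreqDist, and
# itemgetter are imported in this module. The data layout is hypothetical:
# each instance is a list of tuples whose last three fields are metadata,
# with f[-1] holding the binary label.
sample_features = [
    [("good", "fun", 0.9, "doc1", 1)],   # positive instance
    [("bad", "dull", 0.2, "doc2", 0)],   # negative instance
]
print(pmi(sample_features))  # tokens sorted by ascending PMI with the positive class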
def wordprefixsuffixsubstringsprobdist():
    for w in englishdicttxt:
        wtok = w.split()
        if len(wtok) > 0:
            computeprefixessuffixessubstrings(wtok[0])
            wordlist.append(wtok[0])
    prefixdict = FreqDist(prefixes)
    suffixdict = FreqDist(suffixes)
    substringsdict = FreqDist(substrings)  # was FreqDist(suffixes), an apparent copy-paste slip
    totalprefixes = sum(prefixdict.values())
    totalsuffixes = sum(suffixdict.values())
    totalsubstrings = sum(substringsdict.values())
    for pk, pv in prefixdict.items():
        prefixprobdict[pk] = float(pv) / float(totalprefixes)
    for pk, pv in suffixdict.items():
        suffixprobdict[pk] = float(pv) / float(totalsuffixes)
    for pk, pv in substringsdict.items():
        substringsprobdict[pk] = float(pv) / float(totalsubstrings)
    # json.dump(prefixprobdict, open("WordPrefixesProbabilities.txt", "w"))
    # json.dump(suffixprobdict, open("WordSuffixesProbabilities.txt", "w"))
    return (prefixprobdict, suffixprobdict, substringsprobdict)
class VocabBuilder:
    """
    Creates a vocabulary after scanning a corpus.
    """

    def __init__(self, lang="english", min_length=3, cut_first=100):
        """
        Set the minimum length of words and which stopword list
        (by language) to use.
        """
        self._counts = FreqDist()
        self._stop = set(stopwords.words(lang))
        self._min_length = min_length
        self._cut_first = cut_first
        print("Using stopwords: %s ... " % " ".join(list(self._stop)[:10]))

    def scan(self, words):
        """
        Add a list of words as observed.
        """
        for ii in [x.lower() for x in words
                   if x.lower() not in self._stop and len(x) >= self._min_length]:
            self._counts[ii] += 1  # FreqDist.inc() was removed in NLTK 3

    def vocab(self, size=5000):
        """
        Return a list of the top words sorted by frequency.
        """
        # keys() is not frequency-ordered in NLTK 3, so rank via most_common()
        words = [w for w, _ in self._counts.most_common(self._cut_first + size)]
        if len(self._counts) > self._cut_first + size:
            return words[self._cut_first:]
        else:
            return words[:size]
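# A minimal usage sketch for VocabBuilder, assuming the NLTK stopwords
# corpus has been fetched with nltk.download('stopwords'):
vb = VocabBuilder(lang="english", min_length=3, cut_first=1)
vb.scan("the quick brown fox jumps over the lazy dog the fox".split())
print(vb.vocab(size=5))  # most frequent kept words, e.g. 'fox' first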
def get_most_common_ngrams(self, n, nb_ngrams=None):
    """
    Compute and return the set of the most common ngrams in the documents.
    This set is cached inside the object.

    Args:
        n: The number of grams. Must be a positive integer.
        nb_ngrams: The number of ngrams to return, i.e. quantifying the 'most'.

    Returns:
        A list of the most common ngrams.
    """
    try:
        # return cached value
        return self._most_common_ngrams[n]
    except KeyError:
        pass
    # compute all ngrams
    all_ngrams = []
    for document in self.training_set:
        all_ngrams.extend(self.compute_ngrams(document, n))
    # count them, then store and return the nb_ngrams most common
    # (most_common(None) returns all of them, sorted by frequency)
    freq = FreqDist(all_ngrams)
    self._most_common_ngrams[n] = [ng for ng, _ in freq.most_common(nb_ngrams)]
    return self._most_common_ngrams[n]
def get_bot_nouns_verbs(pos_tags, tagmap, n):
    # get_func_words('/home1/c/cis530/hw4/funcwords.txt')
    funcwords = get_func_words('funcwords.txt')
    fdNoun = FreqDist()
    fdVerb = FreqDist()
    for (word, tag) in pos_tags:
        if tagmap[tag] == "VERB" and word not in funcwords and wn.synsets(word):
            fdVerb[word] += 1
        elif tagmap[tag] == "NOUN" and word not in funcwords and wn.synsets(word):
            fdNoun[word] += 1
    # least frequent n of each: reverse the frequency-sorted lists
    bot_nouns = [w for w, _ in fdNoun.most_common()][::-1][:n]
    bot_verbs = [w for w, _ in fdVerb.most_common()][::-1][:n]
    return (bot_nouns, bot_verbs)
def get_all_nouns_verbs(tok_sents, tagmap):
    # get_func_words('/home1/c/cis530/hw4/funcwords.txt')
    funcwords = get_func_words('funcwords.txt')
    fdNoun = FreqDist()
    fdVerb = FreqDist()
    for sent in tok_sents:
        for tup in sent:
            if tagmap[tup[2]] == "VERB" and tup[1] not in funcwords and wn.synsets(tup[0]):
                fdVerb[tup[1]] += 1
            elif tagmap[tup[2]] == "NOUN" and tup[1] not in funcwords and wn.synsets(tup[0]):
                fdNoun[tup[1]] += 1
    # return all nouns and verbs, most frequent first
    return ([w for w, _ in fdNoun.most_common()],
            [w for w, _ in fdVerb.most_common()])
from functools import reduce  # reduce moved out of builtins in Python 3

class ExtractorOfWords():
    def __init__(self, pos_words, neg_words, type_of_Feature_extractor=0):
        self.pos_words_training = reduce(lambda words, review: words + review.words(), pos_words, [])
        self.neg_words_training = reduce(lambda words, review: words + review.words(), neg_words, [])
        if type_of_Feature_extractor == 1:
            formated_pos_words_training = self.Feature_extractor1(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor1(self.neg_words_training)
        elif type_of_Feature_extractor == 2:
            formated_pos_words_training = self.Feature_extractor2(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor2(self.neg_words_training)
        elif type_of_Feature_extractor == 3:
            formated_pos_words_training = self.Feature_extractor3(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor3(self.neg_words_training)
        elif type_of_Feature_extractor == 4:
            formated_pos_words_training = self.Feature_extractor4(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor4(self.neg_words_training)
        else:
            formated_pos_words_training = self.pos_words_training
            formated_neg_words_training = self.neg_words_training
        self.pos_words_freqdist = FreqDist(formated_pos_words_training)
        self.neg_words_freqdist = FreqDist(formated_neg_words_training)

    # Extract the n most frequent words
    def Extraxt_n_most_Freq_Words(self, n):
        return ([w for w, _ in self.pos_words_freqdist.most_common(n)],
                [w for w, _ in self.neg_words_freqdist.most_common(n)])

    # list of all words whose number of occurrences exceeds *number_count*
    def Extraxt_words_above_count(self, number_count):
        return ([word for word, count in self.pos_words_freqdist.items() if count > number_count],
                [word for word, count in self.neg_words_freqdist.items() if count > number_count])

    # PorterStemmer
    def Feature_extractor1(self, in_list):
        ps = PorterStemmer()
        return [ps.stem(w) for w in in_list]

    # lowercase versions of all the words
    def Feature_extractor2(self, in_list):
        return [w.lower() for w in in_list]

    # Replace all number tokens with "NUM"
    def Feature_extractor3(self, in_list):
        return ["NUM" if w.isdigit() else w for w in in_list]

    # lowercase alphabetic words with English stopwords removed
    def Feature_extractor4(self, in_list):
        return [w.lower() for w in in_list
                if w.isalpha() and w.lower() not in stopwords.words('english')]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    token_count = hit_count = 0
    useful_contexts = set()
    fd = ConditionalFreqDist()
    tag_prob = FreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            tag_prob[tag] += 1  # FreqDist.inc() was removed in NLTK 3
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)
    # For each useful context, compute the entropy of its tag distribution
    # and keep only contexts whose entropy is below `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t, tag_prob[t] / total_tags) for t in tag_prob]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
        h = self.H(list(dd.keys()), tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print(list(most_high.keys()))
    # Build the context_to_tag table -- for each surviving context, figure
    # out what the most likely tag is.
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=" ")
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning))
def find_stop_words(file, lower_bound, upper_bound):
    all_posts = []
    stop_words = []
    for raw_post in file:
        all_posts += raw_post[1]
    frequency = FreqDist(all_posts)
    total_count = frequency.B()  # number of distinct words
    lower_bound = total_count * lower_bound
    upper_bound = total_count * (1 - upper_bound)
    # words sorted from most to least frequent (keys() is not
    # frequency-ordered in NLTK 3)
    sorted_words = [w for w, _ in frequency.most_common()]
    stop_words += sorted_words[-int(lower_bound):]  # rarest words
    stop_words += sorted_words[:int(upper_bound)]   # most frequent words
    return stop_words
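# Usage sketch for find_stop_words, assuming each item pairs an id with
# a token list (the function only reads raw_post[1]):
posts = [("p1", ["the", "cat", "sat", "on", "the", "mat"]),
         ("p2", ["the", "dog", "sat"])]
print(find_stop_words(posts, 0.2, 0.8))  # rarest 20% plus most frequent 20%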
class termBasedConsiderBackgroundModel(AbstractGenerativeModel):
    def __init__(self, analyser, backgroundDistribution, probOfBackgroundModel):
        self.backGroundDistro = backgroundDistribution
        self.analyser = analyser
        self.ProbBackground = probOfBackgroundModel

    def generateProbabilityDistribution(self, document_list):
        tokens = []
        for doc in document_list:
            tokens += self.analyser(doc)
        self.freqDist = FreqDist(tokens)
        foreground_prob = 1 - self.ProbBackground
        prob_distro = {}
        # restrict the background distribution to the foreground vocabulary
        backDistro = FreqDist()
        for word in self.freqDist:
            backDistro[word] = self.backGroundDistro[word]
        for word in self.freqDist:
            if word not in self.backGroundDistro:
                prob_distro[word] = (1.0 / foreground_prob) * self.freqDist.freq(word)
            else:
                prob_distro[word] = (1.0 / foreground_prob) * (
                    self.freqDist.freq(word)
                    - self.ProbBackground * backDistro.freq(word))
        self.prob_distro = prob_distro
        # flag whether the result is a valid probability distribution
        flag = True
        for key in prob_distro:
            if prob_distro[key] < 0 or prob_distro[key] > 1:
                flag = False
                break
        return flag

    def getProbabilityDistribution(self):
        return self.prob_distro

    def probOfDocument(self, document):
        tokens = self.analyser(document)
        prob = 1.0
        for token in tokens:
            if token in self.prob_distro:
                prob *= self.prob_distro[token]
        if prob == 1.0:
            return 0.0
        return prob
def demo_similar(self, word, num=20):
    """
    Distributional similarity: find other words which appear in the
    same contexts as the specified word; list most similar words first.

    @param word: The word used to seed the similarity search
    @type word: C{str}
    @param num: The number of words to generate (default=20)
    @type num: C{int}
    @seealso: L{ContextIndex.similar_words()}
    """
    if '_word_context_index' not in self.__dict__:
        print('Building word-context index...')
        self._word_context_index = nltk.text.ContextIndex(
            self.tokens,
            filter=lambda x: x.isalpha(),
            key=lambda s: s.lower())
    # words = self._word_context_index.similar_words(word, num)
    while True:
        word = input('Enter a Chinese word such as "開心" (type 0 to exit): ')
        print("word='" + word + "'")
        if word == '0':
            break
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(words))
        else:
            print("No matches")
def getsimilar(self, word, num=20):
    """
    @param word: The word used to seed the similarity search
    @type word: C{str}
    @param num: The number of words to generate (default=20)
    @type num: C{int}
    @seealso: L{ContextIndex.similar_words()}
    """
    if '_word_context_index' not in self.__dict__:
        print('Building word-context index...')
        self._word_context_index = ContextIndex(self.tokens,
                                                filter=lambda x: x.isalpha(),
                                                key=lambda s: s.lower())
    # words = self._word_context_index.similar_words(word, num)
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                      if c in contexts and not w == word)
        # list of the num most similar words
        return [w for w, _ in fd.most_common(num)]
    else:
        print("No matches")
        return None
def get_word_features(wordlist):
    """Return the words ordered from most to least frequent."""
    wordlist = FreqDist(wordlist)
    # print(wordlist.keys(), "------->", wordlist.values())
    return [w for w, _ in wordlist.most_common()]
def getAllWords(lines, stop_words):
    try:
        # count every non-stopword token; counting the full token stream
        # (rather than a dict of unique words) makes most_common() meaningful
        all_words = FreqDist(word for line in lines for word in line.split()
                             if word not in stop_words)
        # removePunctuationFromList(all_words)
        print("All Words list length : ", len(all_words))
        # use top 20000 words
        return [w for w, _ in all_words.most_common(20000)]
    except Exception as e:
        print("type error: " + str(e))
        exit()
def run(self):
    # Preprocessing
    train_corpus = self.preprocess(self.trainData)
    self.test_corpus = self.preprocess(self.testData)
    # Generate dictionary: keep the 2000 most frequent words
    # (keys() is not frequency-ordered in NLTK 3)
    wordFreq = FreqDist([word for phrase in train_corpus + self.test_corpus
                         for word in phrase.split(" ")])
    self.vocabulary = [w for w, _ in wordFreq.most_common(2000)]
    # Extracting features
    self.extractFeatures(train_corpus)
    # Polarize matrix
    self.modifyMatrix()
    # pickle.dump(self.X, open("dataMatrix", "wb"))
    # self.X = pickle.load(open("dataMatrix", "rb"))
    self.trainX, self.testX, self.trainY, self.testY = train_test_split(
        self.X, self.y, test_size=0.1, random_state=0)
    del self.X
    # Classify
    classifier_name = "DNN"
    self.classify(classifier_name)
def tokenize_clean(text):
    """Return the unique alphabetic, non-stopword tokens from text."""
    tokens = word_tokenize(text.lower())
    fdist = FreqDist(tokens)
    stop = set(stopwords.words('english'))  # build the set once, not per word
    words = [w for w in fdist.keys() if w not in stop and w.isalpha()]
    return words
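# Usage sketch, assuming the punkt and stopwords NLTK data are downloaded:
print(tokenize_clean("The cats sat on the mat. The mat was warm."))
# -> ['cats', 'sat', 'mat', 'warm'] (unique, lowercased, stopword-free)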
class MyMarkovModel(MarkovModel):
    def __init__(self, order):
        self.order = order
        self.filename = NGRAM_FILES[self.order]
        if 3 >= self.order >= 2:
            self.backoff = MyMarkovModel(order - 1)
            self.cfd = ConditionalFreqDist()
            self.charset = self.backoff.charset
            for ngram, count in self.get_data():
                context, char = tuple(ngram[:-1]), ngram[-1]
                self.cfd[context][char] = count
        elif self.order == 1:
            self.backoff = None
            self.n = 0
            self.fd = FreqDist()
            for char, count in self.get_data():
                self.fd[char] = count
            self.charset = set(self.fd.keys())
        else:
            raise NotImplementedError  # `raise NotImplemented` is not an exception

    def get_data(self):
        with open(self.filename) as fp:
            for line in fp:  # iterate lazily; readlines() is unnecessary
                ngram, count = line.lower().split()
                yield ngram, int(count)
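# The model above reads one "<ngram> <count>" pair per line from each data
# file. A hypothetical NGRAM_FILES mapping and file contents might be:
NGRAM_FILES = {1: "unigrams.txt", 2: "bigrams.txt", 3: "trigrams.txt"}
# unigrams.txt:      bigrams.txt:
#   e 445155370        th 100272945
#   t 330535110        he 86697336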
def pkmpopana():
    # load and concatenate the cleaned tweet files
    frames = [pd.read_csv("pkm-19-clean.csv")]
    for i in range(20, 27):
        frames.append(pd.read_csv(f'pkm-{i}-clean.csv'))
    df = pd.concat(frames, ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    sen = ''.join(df['text']).lower()
    toker = RegexpTokenizer(r'\w+')
    words = toker.tokenize(sen)
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [w for w in words if w not in stop_words]
    fdist = FreqDist(filtered_sentence)
    pk = pd.read_csv('pokemon.csv')
    pk = pk[pk['id'] < 152]  # first-generation names only
    pkmname = list(pk['pokemon'])
    counts = {}  # renamed from `re`, which shadowed the re module
    for n in pkmname:
        if n in fdist:
            counts[n] = fdist[n]
    so = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    l, p = [], []
    tar = so[0:2]  # the two most-mentioned names
    for i in tar:
        l.append(i[1])
        p.append(i[0])
    plt.barh(list(range(len(tar))), width=l[::-1], align='center')
    plt.xlabel('count')
    plt.ylabel('name')
    plt.yticks(list(range(len(tar))), p[::-1])
    plt.show()
def getFreq(self, text, normalize=True):
    stop_words = stopwords.words(self.detectLanguage(text))
    words = self.getTokens(text)
    clean_words = [word for word in words
                   if word not in stop_words and word not in punctuation]
    fdist = FreqDist(clean_words)
    # normalization by dividing by the maximum frequency
    if normalize:
        norm = float(max(fdist.values()))
        for word in list(fdist.keys()):  # iterate over a copy: we delete below
            fdist[word] = fdist[word] / norm
            # remove too frequent and too rare words
            if fdist[word] >= self._upper_bound or fdist[word] <= self._lower_bound:
                del fdist[word]
    return fdist
def unigramAll():
    to_save_folder = "./#Unigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        fdist = FreqDist(w for w in words if len(w) > 1 and w != "``")
        keys = fdist.most_common()  # all (word, count) pairs, most frequent first
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]) + " , " + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[unigram].csv",
                      "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
def similar(self, word, num=20):
    """
    Distributional similarity: find other words which appear in the
    same contexts as the specified word; list most similar words first.

    :param word: The word used to seed the similarity search
    :type word: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.similar_words()
    """
    if '_word_context_index' not in self.__dict__:
        print('Building word-context index...')
        self._word_context_index = ContextIndex(
            self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower())
    # words = self._word_context_index.similar_words(word, num)
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                      if c in contexts and not w == word)
        # keys() is insertion-ordered in NLTK 3; rank by count instead
        words = [w for w, _ in fd.most_common(num)]
        print(tokenwrap(words))
    else:
        print("No matches")
def run(self):
    # Preprocessing
    train_corpus = self.preprocess(self.trainData)
    test_corpus = self.preprocess(self.testData)
    # Generate dictionary
    wordFreq = FreqDist([word for phrase in train_corpus + test_corpus
                         for word in phrase.split(" ")])
    self.vocabulary = list(wordFreq.keys())
    self.word2id = {word: i for i, word in enumerate(self.vocabulary)}
    # Extracting features
    self.X = self.extractFeatures(train_corpus)
    self.testData = self.extractFeatures(test_corpus)
    # Determine max sequence length: the 80th percentile of phrase lengths
    lenStats = sorted([len(phrase) for phrase in self.X + self.testData])
    maxLength = lenStats[int(len(lenStats) * 0.8)]
    # Pad sequences
    self.X = sequence.pad_sequences(np.array(self.X), maxlen=maxLength)
    self.testData = sequence.pad_sequences(np.array(self.testData), maxlen=maxLength)
    # Split validation set
    self.trainX, self.testX, self.trainY, self.testY = train_test_split(
        self.X, self.y, test_size=0.1, random_state=0)
    # Classify
    classifier_name = "XGBoost"
    self.classify(classifier_name)
def convert_real_data(df):
    docs = []
    for index, row in df.iterrows():
        uid = row['id']
        text = row['text']
        content = Content()
        content.raw = text
        content.tokens = nltk.word_tokenize(text)
        content.date = datetime.datetime.utcnow()  # call it: the original assigned the bare function
        fdist = FreqDist(content.tokens)
        freq_list = [(k, fdist[k]) for k in fdist.keys()]
        content.construct_word_freq_list(freq_list)
        doc = TestDocument(uid, "test_name", "test_name",
                           datetime.datetime.utcnow(), content, "no_url", 0)
        docs.append(doc)
    return docs
def answer_four():
    wordfreq = FreqDist(text1)
    freqAnswerFour = [w for w in wordfreq.keys()
                      if len(w) > 5 and wordfreq[w] > 150]
    return sorted(freqAnswerFour)
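# The answer_* exercises assume the NLTK book texts are in scope, e.g.
#   from nltk.book import text1   # text1 is Moby Dick
# moby_tokens (used in the answer_* snippets below) would likewise come
# from tokenizing the same text.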
def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()
    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)
    # Lemmatize it, merging the counts of terms that share a lemma
    word_freq = {}
    for term, val in freq_dist.items():
        lemmatize_term = wordnet.lemmatize(term)
        word_freq[lemmatize_term] = word_freq.get(lemmatize_term, 0) + val
    return word_freq
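# Usage sketch, assuming `wordnet` names a WordNetLemmatizer instance and
# the wordnet corpus is downloaded:
from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()
print(get_term_freq_dict("Cats cat dogs"))  # {'cat': 2, 'dog': 1}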
def write(file, final_list, labels, tokens):
    # frequency of each word over the whole corpus
    freq = FreqDist([word for sublist in final_list for word in sublist])
    # per-instance counts, in the same column order as `freq`
    num_tracker = []
    for instance in final_list:
        line_freq = FreqDist(instance)
        row_list = ['{}'.format(line_freq[word]) if word in line_freq else '0'
                    for word in freq]
        num_tracker.append(row_list)
    # write to csv file
    with open(file, 'w', encoding='utf8') as outfile:
        header = ','.join(freq.keys())
        outfile.write(header + ',label\n')
        for counter, num in enumerate(num_tracker):
            outfile.write(','.join(num) + ',' + labels[counter] + '\n')
def mostCommWords(self, tag, pos_tag_pattern):
    """
    This is a helper method for mostCommNouns and mostCommVerbs.
    Arguments:
        tag -- the hashtag whose co-occurring words we want to count
        pos_tag_pattern -- the regular expression used to match the POS tags
    Return:
        a list of the top 20 words associated with the input hashtag
    """
    words = {}
    topTwenty = []
    for line in self.lines:
        hasTag = False
        for t in self.tokenizer(line, hashtag_pattern):
            if t == tag:
                hasTag = True
                break
        if hasTag:
            counts = FreqDist()
            tokens = self.tokenizer(line, word_pattern)
            pos = nltk.pos_tag(tokens)
            for p in pos:
                if re.match(pos_tag_pattern, p[1]):
                    counts[p[0]] += 1  # FreqDist.inc() was removed in NLTK 3
            for n in counts:
                words[n] = words.get(n, 0) + counts[n]  # dict.has_key() is Python 2 only
    words_sorted_by_counts = sorted(words.items(), key=lambda x: x[1], reverse=True)
    for i in range(min(20, len(words_sorted_by_counts))):  # guard against < 20 words
        topTwenty.append(words_sorted_by_counts[i][0])
    return topTwenty
def get_top_words(directory, n, file):
    num_docs = 0.0
    flist = {}
    result = {}
    for f in os.listdir(directory):
        num_docs += 1
        rawContents = load_file_tokens(directory + '/' + f)
        fdist = FreqDist(rawContents)
        # term frequencies normalized by the most frequent term in the file
        normalF = max(fdist.values())
        for key in fdist.keys():
            fdist[key] = float(fdist[key]) / normalF
        flist[directory + '/' + f] = fdist
    # tf-idf weight for each term of `file`
    for key in flist[file].keys():
        num_appear = 0
        for key_file in flist.keys():
            if key in flist[key_file]:
                num_appear += 1
        result[key] = flist[file][key] * math.log(num_docs / num_appear)
    sorted_x = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
    return [item[0] for item in sorted_x[:n]]
def fun10():
    """frequency distribution"""
    fdist1 = FreqDist(text1)
    # print(fdist1)
    vocabulary1 = list(fdist1.keys())
    # print(vocabulary1[:50])
    fdist1.plot(50, cumulative=True)
def ngram4All():
    to_save_folder = "./#Ngram_4[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        valid_word = [w for w in words if len(w) > 1 and w != "``"]
        # build space-joined 4-grams
        nlist4 = [" ".join(valid_word[i:i + 4]) for i in range(len(valid_word) - 3)]
        fdist = FreqDist(nlist4)
        keys = fdist.most_common()
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]) + "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Ngram_4_Freq].csv",
                      "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
def get_frequent_pos_bigrams(all_tokens, mf_pos_bigrams):
    """Get frequent part-of-speech bigrams."""
    # Get all part-of-speech tags
    all_pos = [t.sim_pos_full for t in all_tokens]
    # Get bigrams and frequencies
    pos_fdist = FreqDist(nltk.bigrams(all_pos))
    # Count each bigram of interest (FreqDist returns 0 for unseen keys)
    pos_bigram_freq_dict = {bigram: pos_fdist[bigram] for bigram in mf_pos_bigrams}
    # Normalize frequencies by the number of distinct bigrams
    pos_bigram_freq = [pos_bigram_freq_dict[k] / len(pos_fdist)
                       for k in sorted(pos_bigram_freq_dict)]
    return pos_bigram_freq
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"))
        myBig = [bi[0] + " " + bi[1] for bi in big]
        fdist = FreqDist(myBig)
        keys = fdist.most_common()
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[bigram_Freq].csv",
                      "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
def trigramAll():
    to_save_folder = "./#Trigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        valid_word = [w for w in words if len(w) > 1 and w != "``"]
        # build space-joined trigrams
        tri_list = [" ".join(valid_word[i:i + 3]) for i in range(len(valid_word) - 2)]
        fdist = FreqDist(tri_list)
        keys = fdist.most_common()
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Trigram_Freq].csv",  # fixed "Triram" typo
                      "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
def answer_six():
    dist = FreqDist(moby_tokens)
    vocab = dist.keys()
    frequency = [(dist[w], w) for w in vocab if w.isalpha() and dist[w] > 2000]
    return sorted(frequency, reverse=True)
def answer_four():
    dist = FreqDist(moby_tokens)
    vocab = dist.keys()
    freqwords = [w for w in vocab if len(w) > 5 and dist[w] > 150]
    return sorted(freqwords)
def text_to_vector(docs, metric):
    """
    Create a frequency-based feature vector from text.

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # distribution over how many documents each word appears in
    tf_dists = []  # list of TF distributions per document

    # Create a freq dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist(doc)
        doc_freqs.update(fd.keys())  # count each word once per document
        tf_dists.append(fd)

    all_tokens = list(doc_freqs.keys())
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
                 for word in all_tokens]
        else:
            raise ValueError("No such metric: %s" % metric)  # the original referenced an undefined `feature_type`
        matrix[:, i] = v
    return matrix
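# Usage sketch (hypothetical surroundings: a `preprocess` module whose
# preprocess_text() tokenizes a document, numpy as np, math, and a
# FrequencyMetrics enum with TF and TF_IDF members):
docs = ["the cat sat", "the dog ran"]
m = text_to_vector(docs, FrequencyMetrics.TF_IDF)
print(m.shape)  # (number of distinct terms, number of documents)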
def get_word_features(wordlist):
    # print(wordlist)
    wordlist = FreqDist(wordlist)
    # keys() is not frequency-ordered in NLTK 3; rank via most_common()
    word_features = [w for w, _ in wordlist.most_common()]
    # print("Word frequency list\n")
    # pprint(wordlist)
    return word_features
def FREQ(self, threshold):
    tagged = []
    nouns = []
    noun_phrases = []
    sorted_fdist = []
    for s in self.tokens:
        temp = nltk.pos_tag(s)
        tagged.append(temp)
        # collect nouns (any NN* tag) and noun phrases
        nouns = nouns + [x for x in temp if "NN" in x[1]]
        noun_phrases = noun_phrases + self.get_noun_phrases(s)
        fdist = FreqDist(word.lower() for word in s)
        for x in fdist.keys():
            sorted_fdist.append((fdist[x], x))
    sorted_fdist.sort()
    nouns_r = set(x[0] for x in nouns)
    noun_phrases = set(noun_phrases)
    # keep nouns whose frequency reaches the threshold, plus all noun phrases
    t = [x for x in sorted_fdist if x[0] >= threshold and x[1] in nouns_r]
    t_r = [x[1] for x in t]
    result = t_r + list(noun_phrases)
    return set(result)
def char_freq(lines):
    """Return a DataFrame of character frequencies, sorted in descending order."""
    # needs one long character stream, not a list of strings
    corpus = nltk.Text(chain.from_iterable(lines))
    wc = FreqDist(corpus)
    df = pd.DataFrame({'word': list(wc.keys()), 'freq': list(wc.values())})
    df.sort_values('freq', ascending=False, inplace=True)  # DataFrame.sort was removed from pandas
    df['idx'] = np.arange(len(wc))
    return df
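# Usage sketch: each line is a string, so chaining yields a character stream
lines = ["abba", "aa"]
print(char_freq(lines))  # 'a' (freq 4) ranks above 'b' (freq 2)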
def tabulate_categorized_words(self, reader, number_of_cat):
    wordtypes = reader.words()
    print('\n%s %5s %7s %14s\n' % ('rank', 'fi', 'Fi*', 'wordtype'))
    fd = FreqDist(wordtypes)
    cumulative = 0.0
    rank = 0
    # most_common() replaces the frequency-sorted keys() of NLTK 2
    for word, count in fd.most_common(number_of_cat):
        rank += 1
        cumulative += count * 100.0 / fd.N()
        print("%4d %6d %4d%% %15s" % (rank, count, cumulative, word))
def get_top_words(path, n):
    files = get_all_files(path)  # returns [] if path is a file
    if len(files) == 0:
        fdist = FreqDist(load_file_tokens(path))
    else:
        fdist = FreqDist(load_collection_tokens(path))
    # the n most frequent words
    return [w for w, _ in fdist.most_common(n)]
def report(self, words, top_words=5):
    """Return the most likely translations of English words."""
    for ii in [x.lower().strip() for x in words]:
        probability = FreqDist()
        for jj in self._trans.vocab():
            probability[jj] += self._trans.score(ii, jj)
        for jj, score in probability.most_common(top_words):
            yield ii, jj, score
def construct_model(copusPath, modelPath):
    mr = CategorizedPlaintextCorpusReader(copusPath, r'(?!\.).*\.txt',
                                          cat_pattern=r'(\w+)/.*',  # the original r'*/.*' is not a valid regex
                                          encoding='iso-8859-1')
    stop = stopwords.words('french')
    documents = [([w for w in mr.words(i)
                   if w.lower() not in stop and w.lower() not in string.punctuation],
                  i.split('/')[0])
                 for i in mr.fileids()]
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = list(word_features.keys())
    numtrain = len(documents)  # train on the full corpus
    train_set = [({i: (i in tokens) for i in word_features}, tag)
                 for tokens, tag in documents[:numtrain]]
    classifier = nbc.train(train_set)
    mrtest = CategorizedPlaintextCorpusReader(os.path.abspath("corpus_test"),
                                              r'(?!\.).*\.txt',
                                              cat_pattern=r'(\w+)/.*',
                                              encoding='iso-8859-1')
    documentsTest = [([w for w in mrtest.words(i)
                       if w.lower() not in stop and w.lower() not in string.punctuation],
                      i.split('/')[0])
                     for i in mrtest.fileids()]
    word_features_test = FreqDist(chain(*[i for i, j in documentsTest]))
    word_features_test = list(word_features_test.keys())
    test_set = [({i: (i in tokens) for i in word_features_test}, tag)
                for tokens, tag in documentsTest]
    save_classifier(classifier, modelPath)
def collocations(words, defined_terms):
    # Count the words
    wfd = FreqDist([w[2] for w in words])
    # 7-token windows whose centre token matches the defined term
    tri = [tuple(x[1:] for x in words[i:i + 7]) for i in range(len(words) - 6)
           if tuple(x[1:] for x in words[i:i + 7])[3][1] == defined_terms]
    # count every pair of window members
    ite = [itertools.combinations(window, 2) for window in tri]
    bigrams_tag_fd = FreqDist([ff for it in ite for ff in it])
    # TreeTagger tag patterns (English above, French below)
    ADJ = r"JJ|VVG|VVN|VVD"
    NOM = r"NN|VV$"
    VER = r"VVP|VVZ"
    adj = "ADJ|VER:[(ppre|pper)]"
    nom = "NAM|NOM|VER:infi"
    ver = "VER:[^(ppre|pper|infi)]"
    # keep only verb/defined-term pairs
    pfd = {(a, b): bigrams_tag_fd[(a, b)]
           for (a, b) in sorted(bigrams_tag_fd.keys())
           if (b[1] == defined_terms and re.match(ver, a[0]))
           or (a[1] == defined_terms and re.match(ver, b[0]))}
    # score them
    scored = [((w1, w2), score(w1, w2, wfd, pfd)) for w1, w2 in pfd]
    scored.sort(key=itemgetter(1), reverse=True)
    return scored
def filter_words(words):
    new_words = FreqDist(words)
    stopwords = get_stop_words('ar')
    # iterate over a copy of the keys: we mutate the FreqDist below, and the
    # original's separate pop() calls could raise KeyError on a double match
    for word in list(new_words.keys()):
        if word in stopwords or len(word) <= 2:
            new_words.pop(word)
    return new_words
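# Usage sketch, assuming the `stop_words` package supplies get_stop_words:
print(filter_words(["كتاب", "في", "كتاب"]))  # keeps "كتاب" (book, x2), drops the short stopword "في"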