import hashlib

from bitarray import bitarray
from nltk import FreqDist
from nltk.tokenize import regexp_tokenize


def simhash(raw_text):
    """Compute the 128-bit simhash value for a string."""
    # Build a frequency distribution over the lower-cased tokens.
    fdist = FreqDist()
    for word in regexp_tokenize(raw_text, pattern=r'\w+([.,]\w+)*|\S+'):
        fdist[word.lower()] += 1

    # Accumulate a weighted vector: each bit of a token's MD5 hash adds
    # (bit set) or subtracts (bit clear) the token's frequency.
    v = [0] * 128
    for word in fdist:
        projection = bitarray()
        projection.frombytes(hashlib.md5(word.encode('utf-8')).digest())
        for i in range(128):
            if projection[i]:
                v[i] += fdist.get(word)
            else:
                v[i] -= fdist.get(word)

    # The simhash has a 1-bit wherever the accumulated weight is positive.
    hash_val = bitarray(128)
    hash_val.setall(False)
    for i in range(128):
        if v[i] > 0:
            hash_val[i] = True
    return hash_val

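# A minimal usage sketch, assuming the simhash() function above and the
# `bitarray` package: near-duplicate detection typically compares two
# simhash values by Hamming distance (the number of differing bits).
doc_a = "the quick brown fox jumps over the lazy dog"
doc_b = "the quick brown fox jumped over a lazy dog"

hash_a = simhash(doc_a)
hash_b = simhash(doc_b)

# XOR the fingerprints and count the set bits; a small distance suggests
# the two texts are near-duplicates.
hamming_distance = (hash_a ^ hash_b).count(True)
print(hamming_distance)
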
def add_documents(self, document_entries):
    """
    Add new documents to be indexed.
    :param document_entries: a set of objects from the class DocumentEntry
    """
    if document_entries:
        forward = {
            key: {(document_entries[key][0])}
            for key in document_entries.keys()
        }
        for key in document_entries.keys():
            freq_dist = FreqDist(document_entries[key][1])
            for token in document_entries[key][1]:
                if len(self.inverted_index) == 0 \
                        or self.__normalize(token) not in self.inverted_index.keys():
                    self.inverted_index[self.__normalize(token)] \
                        .add((freq_dist.get(token), key, freq_dist.freq(token)))
                else:
                    if freq_dist.get(token) is not None:
                        self.inverted_index[self.__normalize(token)] \
                            .add((freq_dist.get(token), key, freq_dist.freq(token)))
        self.forward_index.update(forward)
        self.tf_idf()
        self.dal.save(self.forward_index, 'forward_index.csv')
        self.dal.save(self.inverted_index, 'inverted_index.csv')

def replace_unknown(tokens, cut_off):
    """
    Replace tokens which appear (<= cut_off) times in the corpus with <UNK>.

    :param tokens: (list of str) the tokens comprising the corpus.
    :param cut_off: (int) the bound of cutting the token.
    :return: The same list of tokens with each rare token replaced by <UNK>.

    [Hint] use nltk.FreqDist when you build a vocab.

    [Example]
    tokens = ['<s>', "don't", 'put', 'off', 'until', 'tomorrow', 'what',
              'you', 'can', 'do', 'today']
    Suppose the counts (taken over the full corpus, which may be larger than
    this token list) are
    "don't" - 1, 'put' - 1, 'off' - 0, 'until' - 0, 'tomorrow' - 1, 'what' - 1,
    'you' - 1, 'can' - 1, 'do' - 1, 'today' - 1
    and cut_off = 0.
    Then the output should be
    ['<s>', "don't", 'put', <UNK>, <UNK>, 'tomorrow', 'what', 'you', 'can', 'do', 'today']
    """
    from nltk.probability import FreqDist
    fdist = FreqDist(tokens)
    tokens = [
        UNK if word != SOS and word != EOS and fdist.get(word) <= cut_off
        else word
        for word in tokens
    ]
    print(tokens)
    return tokens

def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()

    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)

    # Lemmatize it
    word_freq = {}
    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)

        # If it exists in word_freq, add the value
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val
        # Else, assign the value
        else:
            word_freq[lemmatize_term] = val

    return word_freq

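# A minimal usage sketch for get_term_freq_dict(), assuming the module-level
# helpers it relies on: `word_tokenize`/`FreqDist` from nltk and `wordnet`
# bound to a WordNetLemmatizer instance (the sample sentence is hypothetical).
from nltk import FreqDist, word_tokenize
from nltk.stem import WordNetLemmatizer

wordnet = WordNetLemmatizer()

counts = get_term_freq_dict("Cats chase mice. A cat chased a mouse.")
print(counts)  # maps each lemma to its summed frequency, e.g. counts['cat'] == 2
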
def FREQ(self, threshold):
    tagged = []
    nouns = []
    noun_phrases = []
    sorted_fdist = []
    result = []
    for s in self.tokens:
        print(s)
        temp = nltk.pos_tag(s)
        print(temp)
        tagged.append(temp)
        nouns = nouns + list(filter(lambda x: "NN" in x[1], temp))
        noun_phrases = noun_phrases + self.get_noun_phrases(s)
        fdist = FreqDist(word.lower() for word in s)
        for x in fdist.keys():
            sorted_fdist.append((fdist.get(x), x))
    sorted_fdist.sort()
    nouns_r = set([x[0] for x in nouns])
    noun_phrases = set(noun_phrases)
    print("=================================")
    print("NOUNS:", nouns)
    print("NOUNSPHRA:", noun_phrases)
    print("FREQ:", sorted_fdist)
    t = list(filter(lambda x: x[0] >= threshold and x[1] in nouns_r,
                    sorted_fdist))
    print(t)
    t_r = [x[1] for x in t]
    print("T_R", t_r)
    result = t_r + list(noun_phrases)
    print("RESULT", set(result))
    return set(result)

class UnigramModel:
    def __init__(self, text):
        # Initialize the list of unigrams by calling generate_ngram with n=1
        self.unigrams, self.string = generate_ngram(text, 1)
        # Dictionary to store each unigram and its corresponding probability
        self.unigram_probabilities = {}
        self.vocab_size = 0
        self.unigram_dist = FreqDist()

    def calc_unigram_probability(self, unigrams):
        # Create the unigram distribution using FreqDist()
        self.unigram_dist = FreqDist(unigrams)
        # Total count of unigrams in the training set
        total_count = len(unigrams)
        # Count of distinct unigrams (characters)
        self.vocab_size = len(set(unigrams))
        for unigram in set(unigrams):
            # Get the occurrence count for this particular unigram
            unigram_count = self.unigram_dist.get(unigram)
            # Add one to the numerator and V to the denominator (add-one smoothing)
            self.unigram_probabilities.update({
                unigram: Fraction(unigram_count + 1, total_count + self.vocab_size)
            })

def replace_unknown(tokens, cut_off):
    """
    Replace tokens which appear (<= cut_off) times in the corpus with <UNK>.

    :param tokens: (list of str) the tokens comprising the corpus.
    :param cut_off: (int) the bound of cutting the token.
    :return: The same list of tokens with each rare token replaced by <UNK>.

    [Hint] use nltk.FreqDist when you build a vocab.

    [Example]
    tokens = ['<s>', 'I', 'love', 'cake', 'I', 'like', 'cake', '</s>']
    Then the counts for each word in tokens are:
    'I': 2, 'love': 1, 'cake': 2, 'like': 1
    and cut_off = 1.
    Then the output should be
    ['<s>', 'I', UNK, 'cake', 'I', UNK, 'cake', '</s>']
    """
    from nltk.probability import FreqDist
    fdist = FreqDist(tokens)
    tokens = [
        UNK if word != SOS and word != EOS and fdist.get(word) <= cut_off
        else word
        for word in tokens
    ]
    return tokens

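# A minimal usage sketch for replace_unknown(), assuming the sentinel
# constants the function refers to are defined at module level as below.
SOS, EOS, UNK = '<s>', '</s>', '<UNK>'

corpus = ['<s>', 'I', 'love', 'cake', 'I', 'like', 'cake', '</s>']
print(replace_unknown(corpus, cut_off=1))
# -> ['<s>', 'I', '<UNK>', 'cake', 'I', '<UNK>', 'cake', '</s>']
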
def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()

    # Tokenize it
    tokens = word_tokenize(lower_data)

    # Clean each token and re-tokenize the cleaned text
    tokens2 = []
    for i in tokens:
        i = re.sub(r'[-]+', ' ', i)
        i = re.sub(r"[']+", '', i)
        i = re.sub(r'[com]$', '', i)
        i = re.sub(r'[0-9]+', '', i)
        i = re.sub(r'\*+', '', i)
        i = re.sub(r'[\n]+', '', i)
        i = re.sub(r'[_]+', ' ', i)
        i = re.sub(r'[\d.*?]+', '', i)
        a = word_tokenize(i)
        for x in a:
            tokens2.append(x)
    print(tokens2)
    tokens = tokens2

    freq_dist = FreqDist(tokens)

    # Lemmatize it
    word_freq = {}
    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)

        # If it exists in word_freq, add the value
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val
        # Else, assign the value
        else:
            word_freq[lemmatize_term] = val

    word_freq = remove_stop_words(word_freq)
    print(word_freq)
    return word_freq

class TrigramModel:
    def __init__(self, text):
        # Initialize the list of trigrams by calling generate_ngram;
        # self.string will be used to build bigram and unigram distributions later.
        self.trigrams, self.string = generate_ngram(text, 3)
        # Dictionary to store each trigram and its corresponding probability
        self.trigram_probabilities = {}
        self.vocab_size = 0
        self.bigram_dist = FreqDist()

    def calc_trigram_probability(self, string, trigrams):
        # Create the trigram distribution using FreqDist()
        trigram_dist = FreqDist(trigrams)
        bigrams, string = generate_ngram(string, 2)
        # Create the bigram distribution using FreqDist()
        self.bigram_dist = FreqDist(bigrams)
        unigrams, string = generate_ngram(string, 1)
        # The unigram distribution is only used to compute the vocabulary size.
        unigram_dist = FreqDist(unigrams)
        self.vocab_size = len(set(unigrams))
        for trigram in set(trigrams):
            # Get the occurrence count of the trigram
            trigram_count = trigram_dist.get(trigram)
            # Get the occurrence count of the preceding bigram
            bigram_count = self.bigram_dist.get(trigram[:-1])
            # Calculate the probability with add-one smoothing
            probability = Fraction(trigram_count + 1, bigram_count + self.vocab_size)
            # Store in a temp dictionary
            temp = {trigram[2]: probability}
            # Probabilities are kept in a three-level dictionary:
            # {trigram[0]: {trigram[1]: {trigram[2]: probability}}}
            if trigram[0] in self.trigram_probabilities:
                if trigram[1] in self.trigram_probabilities.get(trigram[0]):
                    self.trigram_probabilities.get(trigram[0])[trigram[1]].update(temp)
                else:
                    self.trigram_probabilities.get(trigram[0]).setdefault(trigram[1], {})
                    self.trigram_probabilities.get(trigram[0])[trigram[1]].update(temp)
            else:
                self.trigram_probabilities.setdefault(trigram[0], {})
                self.trigram_probabilities.get(trigram[0]).setdefault(trigram[1], {})
                self.trigram_probabilities.get(trigram[0])[trigram[1]].update(temp)

def featureList(corpus):
    featList = []
    for trFile in corpus.fileids():
        listItem = [0] * noFeat
        fileFreqDist = nltk.FreqDist(corpus.words(trFile))
        i = 0
        for key in trainKeys:
            if key in fileFreqDist:
                listItem[i] = fileFreqDist.get(key)
            i = i + 1
        featList.append(listItem)
    return featList

def featureList(corpus):
    featList = []
    for post in corpus:
        listItem = [0] * noFeat
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))
        i = 0
        for key in trainKeys:
            if key in fileFreqDist:
                listItem[i] = fileFreqDist.get(key)
            i = i + 1
        featList.append(listItem)
    return featList

def createFeatures(sentVect, ordList):
    noFeat = len(ordList)
    featList = []
    for post in sentVect:
        listItem = [0] * noFeat
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))
        i = 0
        for key in ordList:
            if key in fileFreqDist:
                listItem[i] = fileFreqDist.get(key)
            i = i + 1
        featList.append(listItem)
    return featList

def get_frequency(self, flag):
    fdist = {}
    if flag == "text":
        sentences = self.tokens_text
    else:
        sentences = self.tokens_corpus
    # Initialise every lower-cased token with a count of zero.
    for s in sentences:
        for w in s:
            fdist[w.lower()] = 0
    # Accumulate per-sentence frequencies into the overall dictionary.
    for s in sentences:
        t = FreqDist(word.lower() for word in s)
        for x in t.keys():
            fdist[x] += t.get(x)
    return fdist

def createProbDist(readerWordlist, writerUniqueWordlist):
    # Create a list holding the relative frequency of each term
    prob_dist = []
    # Use nltk to calculate the frequency of each word
    unigramWordList = FreqDist(readerWordlist)
    # Total number of words in that document
    datalen = len(readerWordlist)
    for word in writerUniqueWordlist:
        if word in unigramWordList:
            prob_dist.append(unigramWordList.get(word) / float(datalen))
        else:
            prob_dist.append(0)
    return prob_dist

class BigramModel:
    def __init__(self, text):
        # Initialize the list of bigrams by calling generate_ngram with n=2
        self.bigrams, self.string = generate_ngram(text, 2)
        # Dictionary to store each bigram and its corresponding probability
        self.bigram_probabilities = {}
        self.vocab_size = 0
        self.unigram_dist = FreqDist()

    def calc_bigram_probability(self, string, bigrams):
        # Create the bigram distribution using FreqDist()
        bigram_dist = FreqDist(bigrams)
        unigrams, string = generate_ngram(string, 1)
        # Create the unigram distribution using FreqDist()
        self.unigram_dist = FreqDist(unigrams)
        # The vocabulary size is the number of distinct characters in the dataset
        self.vocab_size = len(set(unigrams))
        for bigram in set(bigrams):
            # Get the count of the bigram and of its preceding unigram
            bigram_count = bigram_dist.get(bigram)
            unigram_count = self.unigram_dist.get(bigram[:-1])
            # Calculate the probability with add-one smoothing
            probability = Fraction(bigram_count + 1, unigram_count + self.vocab_size)
            # Probabilities are kept in a nested dictionary:
            # {bigram[0]: {bigram[1]: probability}}
            temp = {bigram[-1]: probability}
            if bigram[0] in self.bigram_probabilities:
                self.bigram_probabilities[bigram[0]].update(temp)
            else:
                # First time this character is seen: add it as a dict entry
                self.bigram_probabilities.setdefault(bigram[0], {})
                self.bigram_probabilities[bigram[0]].update(temp)

print("2.", len(cess_esp.words())) # 3 print("3.", len(cess_esp.sents())) # 4 from nltk.probability import FreqDist first_file = cess_esp.fileids()[0] cess_freq0 = FreqDist(cess_esp.words(first_file)) print("4.", cess_freq0.most_common(20)) # 5 print("5.", [w for w, k in cess_freq0.most_common()]) # 6 print("6.", [w for w, k in cess_freq0.items() if len(w) > 7 and k > 2]) # 7 print("7.", [k for w, k in cess_freq0.most_common()]) print("7b. Freq de aparición de la preposición a", cess_freq0.get("a", 0)) # 8 print("8. No de palabras que aparecen una sola vez:", len([w for w, k in cess_freq0.items() if k == 1])) # 9 print("9. La palabra más frecuente es", cess_freq0.max()) # 10 from nltk.corpus import PlaintextCorpusReader mycorpus = PlaintextCorpusReader("../res/", ".*") # 11 print("11.") for doc in mycorpus.fileids(): print(doc, len(mycorpus.words(doc)), len(set(mycorpus.words(doc))), len(mycorpus.sents(doc)))
print("% 5.2f" % durata + " milisecunde ") print("------------------------------------------------------------") start = timeit.timeit() medie = sum(len(cuv) for cuv in vocabT2) / len(vocabT2) print("Lungimea medie a cuvintelor este:") print(medie) end = timeit.timeit() durata = 1000 * abs(end - start) print("Procesarea a durat ", end=" ") print("% 5.2f" % durata + " milisecunde ") print("------------------------------------------------------------") start = timeit.timeit() fdist = FreqDist(text2) apare1 = [cv for cv in fdist.keys() if fdist.get(cv) == 1] print("Cuvintele care apar doar o dată:") print(apare1) end = timeit.timeit() durata = 1000 * abs(end - start) print("Procesarea a durat ", end=" ") print("% 5.2f" % durata + " milisecunde ") print("------------------------------------------------------------") start = timeit.timeit() bigram_measures = nltk.collocations.BigramAssocMeasures() finder = nltk.collocations.BigramCollocationFinder.from_words(text2) print("Primele 10 colocații (pentru bigrame) sînt:") print(finder.nbest(bigram_measures.pmi, 10)) end = timeit.timeit()
for (token, is_gene) in goldstandard_words:
    dist = gene_substrings if is_gene else notgene_substrings
    dist.update(substrings(token))

difference = defaultdict(float)
size1 = float(gene_substrings.N())
size2 = float(notgene_substrings.N())

# `difference` is a hash table whose keys are substrings and whose values are
# differences in their relative frequency. Positive means: frequent in gene
# names; negative means: rare in gene names.
for substr in (gene_substrings + notgene_substrings).keys():
    v1 = gene_substrings.get(substr) or 0
    v2 = notgene_substrings.get(substr) or 0
    difference[substr] = v1 / size1 - v2 / size2

r = range(-10, 10 + 1)
A = array([array(r), zeros_like(r)])
A.dtype = dtype('float32')

# for (i, e) in enumerate(range(-10, 10 + 1)):
#     if e != 0:
#         epsilon = 0.2 / e
#         print("epsilon = ", epsilon)
#         klassi = OrClassifier([
#             DictionaryClassifier(given_genes, stopwords),
#             DifferenceClassifier(difference),

token_collection.append(token)
root = get_root(token)
if root != '' and root is not None:
    lemmatised_sentences[len(lemmatised_sentences) - 1] = \
        lemmatised_sentences[len(lemmatised_sentences) - 1].replace(token, root)
    lemmatised_tokens.append(root)
else:
    lemmatised_tokens.append(token)

if len(token_collection) != 0:
    fdist = FreqDist(token_collection)
    for key in list(fdist.keys()):
        print('word: ', key)
        print(fdist.get(key))
        out_file.write(key + ',' + str(fdist.get(key)) + '\n')

if len(lemmatised_tokens) != 0:
    fdist = FreqDist(lemmatised_tokens)
    for key in list(fdist.keys()):
        print('word: ', key)
        print(fdist.get(key))
        out_lemma_file.write(key + ',' + str(fdist.get(key)) + '\n')

# tf-idf - lemmas?
# instantiate CountVectorizer()
cv_words = CountVectorizer()
cv_lemmas = CountVectorizer()

# this step generates word counts for the words in your docs
word_count_vector = cv_words.fit_transform(sentences)

text = nltk.Text(tokens)
dist = FreqDist(tokens)

top_ten = dist.most_common(20)
top_ten = pd.DataFrame(dist.most_common(10), columns=['Word', 'Count'])
top_ten.plot.bar(x='Word', y='Count')

a = nltk.pos_tag([a[0] for a in dist.most_common(1000)])
word = []
for k, pos in a:
    if pos not in ['NN', 'IN', 'CD', 'JJ', 'JJS', 'VBN', 'VB', 'VBD']:
        word.append((k, dist.get(k)))

c = 0
for k, v in dist.most_common(20):
    print(k, "--->", v)
    c += 1
    if c == 5:
        break

all_chunks = []

from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

# Read a training corpus from file
conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))
# List of the 12 POS tags
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')

# Number of sentences
sentslen = len(conllreader.tagged_sents())
# Frequency of each tag
tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())
# Frequency of sentence-initial tags
firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())

# Initial-state probabilities
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

# Tag-transition probabilities
TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words) - 1):
    TagPair.append((words[i][1], words[i + 1][1]))
TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: x / tagfdist.get(k[0]) for k, x in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

# Emission probabilities
TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist({k: x / tagfdist.get(k[1]) for k, x in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
BiwMLE = MLEProbDist(TagWordfdist)

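# A hypothetical spot-check of the distributions estimated above (assumes the
# training file "de-train.tt" was read successfully; the word 'Haus' is only
# an illustrative sample, not from the original script).
print(A0jLap.prob('NOUN'))            # smoothed probability that a sentence starts with NOUN
print(AijLap.prob(('DET', 'NOUN')))   # smoothed probability of the tag bigram (DET, NOUN)
print(BiwLap.prob(('Haus', 'NOUN')))  # smoothed probability of the (word, tag) pair
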
def main():
    parser = argparse.ArgumentParser(
        description='Categorize (nouns, verbs, etc.) all words in a text')
    parser.add_argument('file', type=argparse.FileType('rU'),
                        help='text file to categorize')
    parser.add_argument('--include-stopwords', action='store_true',
                        default=DEFAULT_STOPWORDS,
                        help='include very common words in results (default: {})'
                        .format(DEFAULT_STOPWORDS))
    parser.add_argument('--sort', type=str, default=DEFAULT_SORT,
                        choices=[SORT_ALPHA, SORT_OCCURRENCES, SORT_LENGTH, SORT_CLASS],
                        help='''how to sort output; by word alphabetically, number of
                        occurrences, word length, or word class (default: {})'''
                        .format(DEFAULT_SORT))
    args = parser.parse_args()

    if not args.file:
        sys.exit('You must specify a text file to categorize')

    # Download required nltk data if needed.
    nltk.download('maxent_treebank_pos_tagger')
    if not args.include_stopwords:
        nltk.download('stopwords')

    # Create tokenized lists of words from the text.
    text = args.file.read()
    text = text.lower()
    tokens = nltk.word_tokenize(text)

    # Only keep words consisting entirely of letters (i.e. remove punctuation,
    # numbers, etc.)
    tokens = [word for word in tokens if word.isalpha()]

    # Save number of occurrences for each word for later reference, which could
    # as well be through a collections.Counter but FreqDist will cover more
    # cases if expanded on later.
    fdist = FreqDist(tokens)

    # Remove stopwords, i.e. very common ones.
    tokens = set(tokens)
    if not args.include_stopwords:
        tokens -= STOPWORDS

    print('Found {} unique words'.format(len(tokens)))

    # Make a list of (word, occurrences, length, word_class) tuples for final
    # output. `word` is the word itself, `occurrences` the number of times it
    # occurs in its source text, `length` the length of the word, and
    # `word_class` what type of word it is, e.g. noun, verb, etc.
    matches = [(word, fdist.get(word), len(word), word_class(word))
               for word in tokens]
    headers = ['word', 'occurrences', 'length', 'word_class']

    # Sort output accordingly.
    if args.sort == SORT_ALPHA:
        matches = sorted(matches, key=itemgetter(0))
    elif args.sort == SORT_OCCURRENCES:
        matches = sorted(matches, key=itemgetter(1, 0), reverse=True)
    elif args.sort == SORT_LENGTH:
        matches = sorted(matches, key=itemgetter(2, 0), reverse=True)
    elif args.sort == SORT_CLASS:
        matches = sorted(matches, key=itemgetter(3, 0))

    # Finally, print results in a nice table.
    print(tabulate(matches, headers=headers))

fdist_wla = FreqDist(would_like_abl_words)
fdist_wla.plot(30, cumulative=False)
plt.show()
plt.tight_layout()

# Plot the "would like to be able to" grams
fdist_wlag = FreqDist(would_like_abl_gram)
fdist_wlag.plot(30, cumulative=False)
plt.show()
plt.tight_layout()
'''

# Remove anything with a frequency below 5, and also the top 5 most common words
escalation_comment_high_freq = []
high_freq_words = [
    key for key in fdist.keys()
    if fdist.get(key) > 4
    and key not in [item[0] for item in list(fdist.most_common(5))]
]
for s in escalation_comment_filtered:
    comment = []
    for w in s:
        if w in high_freq_words:
            comment.append(w)
    escalation_comment_high_freq.append(comment)
escalation_comment_filtered = escalation_comment_high_freq

############################################### Topic Modelling ##############################
# Create Dictionary
id2word = corpora.Dictionary(escalation_comment_filtered)

# Create Corpus

def frequency(self, word):
    text1 = self.txt
    freq = FreqDist(text1)
    return freq.get(word, 0)

word_pair = [' '.join([pair[0], pair[1]]) for pair in bigrams(corpus_tokens)]
pair_fdist = FreqDist(word_pair)

pmi = {}
tokens_num = len(corpus_tokens)
for pair in word_pair:
    # Get the freq of the pair
    w1w2_freq = pair_fdist.get(pair)
    # Only consider bigrams that occur with frequency above that threshold
    if w1w2_freq:
        pair_split = pair.split(' ')
        # Get the freq of each of the words in the pair
        w1_freq = fdist.get(pair_split[0])
        w2_freq = fdist.get(pair_split[1])

        # Compute the unigram probabilities in the corpus
        p_w1 = w1_freq / tokens_num
        p_w2 = w2_freq / tokens_num

        # Compute the bigram probability
        p_w1w2 = w1w2_freq / w1_freq

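# A minimal, self-contained sketch (an assumed continuation, not part of the
# fragment above) of the PMI formula the loop appears to build toward: with the
# conditional probability p_w1w2 = P(w2|w1) and the unigram probability p_w2,
# PMI(w1, w2) = log2(P(w1, w2) / (P(w1) * P(w2))) = log2(P(w2|w1) / P(w2)).
import math


def pmi_score(p_w2_given_w1, p_w2):
    # Pointwise mutual information of a word pair, computed from its
    # conditional bigram probability and the unigram probability of the
    # second word.
    return math.log2(p_w2_given_w1 / p_w2)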