def make_model(stats_infile, picklefile, smoothingparam=0.001, min_freq=3, protocol=-1):
    """Train a probability model on a korp statistics file and save it as a pickle file.

    The model is a LidstoneProbDist (NLTK) which has tuples (wordform, MSD-tag)
    as keys and smoothed probabilities as values.

    Args:
        stats_infile: Path to a tab-separated, UTF-8 statistics file. Column 0
            is read as the word form, column 1 as the MSD tag and column 4 as
            the occurrence count.
        picklefile: Path the pickled model is written to.
        smoothingparam: Gamma value passed to LidstoneProbDist.
        min_freq: Reading stops at the first row whose count is below this.
        protocol: Pickle protocol passed to ``pickle.dump``.
    """
    fdist = FreqDist()
    with open(stats_infile, encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would truncate the count of a final line that has no newline.
            fields = line.rstrip('\n').split('\t')
            word = fields[0]
            count = int(fields[4])
            # NOTE(review): `break` (rather than `continue`) presumably relies
            # on the file being sorted by descending frequency -- confirm.
            if count < min_freq:
                break
            # Get rid of all urls
            if word.startswith("http://"):
                continue
            # Keep only the part of the MSD tag before the first dot.
            simple_msd = fields[1].split('.', 1)[0]
            fdist[(word, simple_msd)] += count

    pd = LidstoneProbDist(fdist, smoothingparam, fdist.B())

    # Save probability model as pickle
    with open(picklefile, "wb") as p:
        pickle.dump(pd, p, protocol=protocol)
def make_model(nst_infile, picklefile, protocol=-1):
    """Train a POS probability model on the NST lexicon and save it as a pickle file.

    The model is a LidstoneProbDist (NLTK) which has compounded POS tags (SUC set)
    as keys (e.g. "NN+NN") and smoothed probabilities as values.

    Args:
        nst_infile: Path to the tab-separated, UTF-8 NST lexicon. Column 0 is
            read as the word, column 3 as the compound analysis and column 4
            as the POS tag.
        picklefile: Path the pickled model is written to.
        protocol: Pickle protocol passed to ``pickle.dump``.
    """
    # Collect all compounds from nst data
    nst_full_compounds = set()
    with open(nst_infile, encoding='UTF-8') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would truncate the POS field of a final line with no newline.
            fields = line.rstrip('\n').split('\t')
            word = fields[0]
            comp = fields[3].replace("!", "")
            pos = fields[4]
            is_full_compound = (
                "+" in comp
                and "_" not in word
                and not comp.startswith(("+", "-"))
            )
            if is_full_compound:
                nst_full_compounds.add((word, comp, pos))

    # Build POS probability model
    pos_fdist = FreqDist()
    for _w, _c, pos in nst_full_compounds:
        # NOTE(review): only tags containing '+' are counted, matching the
        # "compounded POS tags" contract above -- confirm against the
        # original layout of this loop.
        if '+' in pos:
            pos = re.sub(r"\+LN", "", pos)
            pos_fdist[pos] += 1

    pd = LidstoneProbDist(pos_fdist, 0.001, pos_fdist.B())

    # Save probability model as pickle
    with open(picklefile, "wb") as f:
        pickle.dump(pd, f, protocol=protocol)
def main():
    """Demonstrate bigram-based spelling correction on the English Europarl corpus.

    Downloads the corpus, trains a Laplace-smoothed bigram model on the first
    80% of the sentences, injects one random word substitution and one random
    letter substitution into every remaining sentence, and prints the Viterbi
    correction of corrupted test sentence 25 next to the original.
    """
    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')
    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent
    # words replaced by the token "<unknown>" and sentence boundary markers
    # added at both ends.
    sentences = [['start0'] + [
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] + ['end0'] for sentence in english.sents()]

    # create train and test dataset
    split = int(len(sentences) * 0.8)
    train = sentences[:split]
    test = sentences[split:]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        # A sentence holding only the two boundary markers has no word to
        # corrupt; randrange(1, 1) would raise ValueError on it.
        if len(sentence) < 3:
            continue
        # Replace one random word (never a boundary marker) by a random
        # vocabulary entry.
        position = random.randrange(1, len(sentence) - 1)
        sentence[position] = random.choice(vocabulary)
        # Replace one random letter of one random word.
        position = random.randrange(1, len(sentence) - 1)
        target = sentence[position]
        letter = random.randrange(0, len(target))
        sentence[position] = (target[:letter] +
                              random.choice(lower_case_letters) +
                              target[letter + 1:])

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
def paper_title_NLP(title_corpus):
    """Print per-year title word statistics and plot the frequency of "deep".

    Args:
        title_corpus: Iterable of records where index 1 is the paper title,
            index 3 the two-digit year and index 4 the month, e.g. a record
            with (19, 1) at indices 3 and 4 means 2019/01.

    For every year the total and distinct token counts and the 50 most
    frequent tokens are printed; finally the relative frequency of
    "Deep"/"deep" is plotted against the year.
    """
    # reference: https://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
    pattern = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*      # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
        | \.\.\.            # ellipsis
        | [][.,;"'?():_`-]  # these are separate tokens; includes ], [
        '''
    tokenizer = RegexpTokenizer(pattern)

    # (year, month) -> list of tokenized titles
    title_dict = {}
    for t in title_corpus:
        title_dict.setdefault((t[3], t[4]), []).append(tokenizer.tokenize(t[1]))

    # year -> list of tokenized titles, all months merged
    title_years = {}
    for (year, _month), titles in title_dict.items():
        title_years.setdefault(year, []).extend(titles)

    years = []
    deep_freq = []
    # Sort chronologically and derive the x-axis from the data, so the plot
    # stays correct for any range of years in the input.
    # NOTE(review): assumes the year field is a two-digit number (int or
    # numeric string), as the "20{}" print format implies -- confirm.
    for year in sorted(title_years, key=int):
        fd = FreqDist(word for title in title_years[year] for word in title)
        print('The keywords for year:20{}'.format(str(year)))
        print("Total number of words:{}".format(str(fd.N())))  # total number of samples
        print("Total number of unique words:{}".format(str(fd.B())))  # number of bins or unique samples
        fd.pprint(50)  # The maximum number of items to display, default is 10
        deep_freq.append(fd.freq('Deep') + fd.freq('deep'))
        years.append(2000 + int(year))

    print(deep_freq)
    plt.plot(years, deep_freq)
    plt.ylabel('frequency of deep word')
    plt.xlabel('years')
    plt.show()
def show():
    """Print the Gutenberg file ids and token statistics for Austen's "Persuasion".

    Prints, in order: the corpus file ids, the total number of tokens, the
    number of distinct tokens, and the ten most frequent tokens with their
    counts.
    """
    # print() calls with a single argument give the same output on Python 2
    # and Python 3; the former print statements were Python 2 only.
    print(gutenberg.fileids())

    # Instantiate the frequency distribution
    fd = FreqDist()
    for word in gutenberg.words('austen-persuasion.txt'):
        fd[word] += 1

    print(fd.N())
    print(fd.B())

    # Take the 10 most frequent words, sorted by frequency
    top_ten = sorted(fd.items(), key=lambda item: -item[1])[:10]
    for word, value in top_ten:
        print("{} {}".format(word, value))
def get_stop_words(input_file="data/tickets_word2vec.model",
                   threshold=0.02) -> List[str]:
    """
    Get a list of stop words based on relative frequency.

    The input can either be the raw CSV file or a word2vec model built with
    gensim. The input format is determined by the input_file extension
    <filename>.[csv|model].

    The most frequent ``threshold`` fraction of the distinct words is returned.
    By default the top 2% most frequent distinct words are considered stop
    words.

    >>> from canosp2020.preprocessing import Preprocess
    >>> stopwords = Preprocess.get_stop_words(input_file="data/tickets_word2vec.model", threshold=0.02)

    :param input_file: Path to tickets data csv file or gensim word2vec model.
    :param threshold: Fraction (0..1) of the distinct words, taken from the
        most frequent end, to return as stop words.
    :raises ValueError: If the file extension is neither ``.csv`` nor ``.model``.
    :rtype: A list of words.
    """
    _, extension = os.path.splitext(os.path.basename(input_file))

    if extension == ".csv":
        nlp = spacy.load("en_core_web_sm")

        # Load csv file and merge title and content column
        df = pd.read_csv(input_file)
        df[TITLE_CONTENT] = df["title"] + " " + df["content"]
        # Treat empty merged text like a missing value and drop those rows.
        df[TITLE_CONTENT] = df[TITLE_CONTENT].replace("", np.nan)
        df = df.dropna(subset=[TITLE_CONTENT])

        docs = nlp.pipe(df[TITLE_CONTENT],
                        disable=["tagger", "parser", "ner"])
        sents = [[token.text for token in doc] for doc in docs]

        # Build frequency distribution
        fdist = FreqDist(itertools.chain.from_iterable(sents))
    elif extension == ".model":
        model = Word2Vec.load(input_file)
        # NOTE(review): `wv.vocab` is the pre-4.0 gensim API -- confirm the
        # gensim version this project pins.
        counter = {
            word: vocab.count
            for word, vocab in model.wv.vocab.items()
        }
        counter = dict(
            sorted(counter.items(), key=lambda x: x[1], reverse=True))
        fdist = FreqDist(counter)
    else:
        # Previously fell through and failed later with an unbound `fdist`.
        raise ValueError(
            "Unsupported input file extension {!r}; expected '.csv' or "
            "'.model'".format(extension))

    top_n = int(threshold * fdist.B())
    return [word for word, _count in fdist.most_common(top_n)]
def __init__(self, data, vocab=None, min=10000):
    '''
    Set up the estimator from observed counts.

    data is a FreqDist; a plain Counter or dict is wrapped in a FreqDist.
    When no vocabulary size (vocab) is given it defaults to twice the
    number of observed items, but never less than min (10,000 by default).
    This is somewhat ad hoc, but considering Zipf's Law, we would expect
    vocabulary size to be infinite.
    '''
    # Exact-type test on purpose: a FreqDist passed in is used as-is.
    if type(data) in (Counter, dict):
        data = FreqDist(data)

    if vocab is None:
        observed_twice = 2 * len(data)
        vocab = max(observed_twice, min)

    # The vocabulary must be able to hold every observed sample type.
    assert vocab >= data.B()

    self._freqdist = data
    self._bins = vocab

    counts, counts_of_counts = self._r_Nr()
    self.find_best_fit(counts, counts_of_counts)
    self._switch(counts, counts_of_counts)
    self.log_renormalise(counts, counts_of_counts)
def save_parameters_to_file(read_filename, write_filename):
    """Derive character-model parameters from a text file and write them out.

    The text is loaded with ``read_file(read_filename, 0)``. Eight values are
    written to ``write_filename``, one ``str()`` representation per line, in
    this order: max decoder sequence length, number of decoder tokens, max
    encoder sequence length, number of encoder tokens, sorted input
    characters, sorted target characters, input character -> index map,
    target character -> index map.
    """
    extra_tokens = 2
    max_variance = 4

    corrected_text = read_file(read_filename, 0)
    corrected_tokens = [c for word in corrected_text for c in word]
    corrected_freq = FreqDist(corrected_tokens)

    longest = max(len(x) for x in corrected_text)
    max_decoder_seq_length = longest + extra_tokens
    num_decoder_tokens = corrected_freq.B() + extra_tokens
    max_encoder_seq_length = max_decoder_seq_length - extra_tokens + max_variance
    num_encoder_tokens = num_decoder_tokens

    # Both alphabets always contain newline and tab; the input alphabet
    # additionally contains everything in the module-level `chars`.
    control_chars = {'\n', '\t'}
    seen_chars = set(corrected_tokens)
    input_characters = sorted(seen_chars.union(chars).union(control_chars))
    target_characters = sorted(seen_chars.union(control_chars))

    input_token_index = {char: i for i, char in enumerate(input_characters)}
    target_token_index = {char: i for i, char in enumerate(target_characters)}

    parameters = (
        max_decoder_seq_length,
        num_decoder_tokens,
        max_encoder_seq_length,
        num_encoder_tokens,
        input_characters,
        target_characters,
        input_token_index,
        target_token_index,
    )
    with open(write_filename, 'w') as file:
        for value in parameters:
            file.write(str(value))
            file.write("\n")
from nltk.corpus import gutenberg
from nltk import FreqDist
import matplotlib.pyplot as plt

# Token statistics for the King James Bible.
fd = FreqDist(gutenberg.words('bible-kjv.txt'))
print(fd.N())
print(fd.B())
for word in list(fd.keys()):
    print(word, fd[word])

# Token counts over every text of the Gutenberg sample.
fd2 = FreqDist()
for text in gutenberg.fileids():
    fd2.update(gutenberg.words(text))

# Rank 1 is the most frequent token. Iterating the FreqDist directly yields
# insertion order, not frequency order, so rank via most_common().
ranks = []
freqs = []
for rank, (_word, freq) in enumerate(fd2.most_common(), start=1):
    ranks.append(rank)
    freqs.append(freq)

# Ranks are on the x-axis and frequencies on the y-axis.
plt.loglog(ranks, freqs)
plt.xlabel('rank(r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency(f)', fontsize=14, fontweight='bold')
class PosNgram:
    """Frequency table of (token n-gram, POS n-gram) pairs from a Gutenberg text.

    ``train_data`` maps ``(tokens, poses)`` -- two equally long tuples -- to the
    number of times that pair was seen in the training part of the text.
    """

    def __init__(self, deg=1):
        # Orders below 1 are illegal arguments and fall back to 1.
        self.order = 1 if deg < 1 else deg
        self.__sentence = ""
        # storing tokens and frequency
        self.train_data = FreqDist()
        self.test_sents = None

    def poses2tokens(self, pos_terms, include_freq=False, default_dict=None):
        """Yield the token tuples recorded with exactly the POS tuple ``pos_terms``.

        Args:
            pos_terms: POS tuple to look up.
            include_freq: When true, yield ``(tokens, freq)`` pairs instead.
            default_dict: Mapping ``(tokens, poses) -> freq`` to search;
                defaults to ``self.train_data``.

        NOTE(review): the previous docstring required the term to come from an
        n-gram model one order smaller than this one; nothing here checks it.
        """
        if default_dict is None:
            default_dict = self.train_data
        for (tokens, poses), freq in default_dict.items():
            if pos_terms == poses:
                yield (tokens, freq) if include_freq else tokens

    def tokens2poses(self, token_terms, include_freq=False, default_dict=None):
        """Yield the POS tuples recorded with exactly the token tuple ``token_terms``.

        Args:
            token_terms: Token tuple to look up.
            include_freq: When true, yield ``(poses, freq)`` pairs instead.
            default_dict: Mapping ``(tokens, poses) -> freq`` to search;
                defaults to ``self.train_data``.

        NOTE(review): the previous docstring required the term to come from an
        n-gram model one order smaller than this one; nothing here checks it.
        """
        if default_dict is None:
            default_dict = self.train_data
        for (tokens, poses), freq in default_dict.items():
            if token_terms == tokens:
                yield (poses, freq) if include_freq else poses

    def pre_process(self, file_id, training_size=90):
        """Rebuild ``train_data`` from the first ``training_size`` percent of a text.

        The remaining sentences are stored in ``self.test_sents``. Progress,
        the resulting table size and the elapsed time are printed.
        """
        start_processing = time.time()
        self.train_data = FreqDist()
        sents = gutenberg.sents(file_id)
        t_size = floor((training_size / 100) * len(sents))
        train_sents = sents[:t_size]
        self.test_sents = sents[t_size:]
        p_title = "file_id = <{}>, ngram's order = {}, split_ratio = {}-{}"
        print(
            p_title.format(file_id, self.order, training_size,
                           100 - training_size))
        with ICB('Processing...',
                 max=len(train_sents),
                 suffix='%(percent)d%%') as bar:
            for sent in train_sents:
                bar.next()
                # The n-gram properties below read the sentence from here.
                self.__sentence = " ".join(sent)
                self.train_data.update(self._token_pos_pairs)
        print('dict_size = {}'.format(self.train_data.B()))
        print("loading time = {}".format(time.time() - start_processing))

    def _is_subcontent(self, w1, w2):
        """Return True when every item of ``w1`` occurs in ``w2``, counting duplicates."""
        assert len(w1) <= len(w2)
        remaining = list(w2)
        for item in w1:
            if item not in remaining:
                return False
            # Consume the match so a repeated item needs a second occurrence.
            remaining.remove(item)
        return True

    def fetch_if(self, cond, term, pos_is_target=True, include_pair=False):
        """Collect the frequencies of all entries whose target side matches ``term``.

        Args:
            cond: One of ``ng_prefix`` (everything but the last element equals
                term), ``ng_suffix`` (the trailing elements equal term),
                ``ng_contain`` (term is contained, see ``_is_subcontent``) or
                ``ng_equal``. Any other value falls back to ``ng_prefix``.
            term: Tuple to compare against.
            pos_is_target: Match on the POS tuple when true, on the token
                tuple otherwise.
            include_pair: Key the result by ``(target, other)`` instead of by
                the target side alone.

        Returns:
            A FreqDist with the summed frequencies of the matching entries.
        """
        predicates = {
            ng_prefix: lambda seq: seq[:-1] == term,
            ng_suffix: lambda seq: seq[-len(term):] == term,
            ng_contain: lambda seq: self._is_subcontent(term, seq),
            ng_equal: lambda seq: seq == term,
        }
        if cond not in predicates:
            # The fallback used to reference an undefined name `prefix`.
            cond = ng_prefix
        matches = predicates[cond]

        tmp_freq_dist = FreqDist()
        for (token, pos), freq in self.train_data.items():
            target, other = (pos, token) if pos_is_target else (token, pos)
            if matches(target):
                key = (target, other) if include_pair else target
                tmp_freq_dist.update({key: freq})
        return tmp_freq_dist

    @property
    def _token_pos_pairs(self):
        """
        Yield ``(tokens, poses)`` tuples for each n-gram of the current sentence
        (The previous version's name was phi1)
        """
        for elems in self._ngram_tokens_pos:
            tokens = tuple(elem[0] for elem in elems)
            poses = tuple(elem[1] for elem in elems)
            yield (tokens, poses)

    @property
    def _sent2pos_tag(self):
        """POS-tag the tokens of the current sentence."""
        tokens = word_tokenize(self.__sentence)
        return pos_tag(tokens)

    @property
    def _ngram_tokens_pos(self):
        """N-grams of ``self.order`` over the (token, POS) pairs of the sentence."""
        return ngrams(self._sent2pos_tag, self.order)
def main():
    """Train and evaluate bigram and trigram language models on English Europarl.

    Steps: download the corpus; train Laplace-smoothed bigram and trigram
    models on the first 80% of the sentences; compute the average sentence
    log-probability per n-gram count on the held-out 20% and on random word
    sequences; save a scatter plot to ``logprob``; print 30-word text
    generated by each model from the seed "this"; print cross-entropy and
    perplexity of both models on the held-out data.
    """
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt

    def score_by_length(ngram_sentences, cpd):
        # Total log-probability of each sentence, grouped by its n-gram count.
        # The last element of an n-gram is the predicted word, the rest is
        # the conditioning context.
        scores = defaultdict(list)
        for sentence in ngram_sentences:
            total = sum(cpd[tuple(ngram[:-1])].logprob(ngram[-1])
                        for ngram in sentence)
            scores[len(sentence)].append(total)
        return scores

    def average_by_length(scores):
        # Mean log-probability for every n-gram count.
        return {
            length: sum(values) / float(len(values))
            for length, values in scores.items()
        }

    def generate(cpd, model_name, seed='this', max_words=30):
        # Extend the seed word by word until the model has no prediction.
        text = seed
        for _ in range(max_words):
            newword = predict_word(cpd, text, model_name)
            if newword is None:
                break
            text += ' ' + newword
        return text

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')
    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent
    # words replaced by the token "<unknown>"
    sentences = [[
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] for sentence in english.sents()]

    # create train and test dataset
    split = int(len(sentences) * 0.8)
    train = sentences[:split]
    test = sentences[split:]
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist(
        ((f, s), t) for f, s, t in trigrams_train)

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist,
                                      vocabulary_length)

    # Materialise the test n-grams: they are traversed again for the
    # cross-entropy below, which would find a one-shot iterator exhausted.
    bigrams_test = list(ngrams_sentences(test, 2))
    trigrams_test = list(ngrams_sentences(test, 3))

    bigram_length_probabilities = score_by_length(bigrams_test, cpd_bigram)
    trigram_length_probabilities = score_by_length(trigrams_test, cpd_trigram)
    average_bigram_length_probabilities = average_by_length(
        bigram_length_probabilities)
    average_trigram_length_probabilities = average_by_length(
        trigram_length_probabilities)

    # One random word sequence for every observed bigram count.
    word_count = len(words)
    random_sentences = [[
        words[random.randint(0, word_count - 1)].lower() for _ in range(key)
    ] for key in bigram_length_probabilities.keys()]

    # The random bigram baseline is scored with the bigram model (it used to
    # be scored with the trigram model, whose conditions are word pairs), and
    # both baselines are averaged per length like the test-set series.
    random_bigram_length_probabilities = average_by_length(
        score_by_length(ngrams_sentences(random_sentences, 2), cpd_bigram))
    random_trigram_length_probabilities = average_by_length(
        score_by_length(ngrams_sentences(random_sentences, 3), cpd_trigram))

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()),
                         color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()),
                          color='blue')
    random_bigram = plt.scatter(
        list(random_bigram_length_probabilities.values()),
        list(random_bigram_length_probabilities.keys()),
        color='green')
    random_trigram = plt.scatter(
        list(random_trigram_length_probabilities.values()),
        list(random_trigram_length_probabilities.keys()),
        color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(ymin=0)
    # The figure is saved rather than shown interactively.
    plt.savefig('logprob')

    print(
        'Given the seed word "this", the bigram model produced this text of length 30: {}'
        .format(generate(cpd_bigram, 'bigram')))
    print(
        'Given the seed word "this", the trigram model produced this text of length 30: {}'
        .format(generate(cpd_trigram, 'trigram')))

    test_bigrams = list(chain.from_iterable(bigrams_test))
    bigram_entropy, bigram_perplexity = centropy_perplexity(
        cpd_bigram, test_bigrams)
    print(
        'Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'
        .format(bigram_entropy, bigram_perplexity))

    test_trigrams = list(chain.from_iterable(trigrams_test))
    trigram_entropy, trigram_perplexity = centropy_perplexity(
        cpd_trigram, test_trigrams)
    print(
        'Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'
        .format(trigram_entropy, trigram_perplexity))
def recordStatsData(corpusname, csvwritter):
    """Write one row of token and utterance statistics for a corpus.

    ``getTextFileNames(corpusname, filtered=False)`` must return at least four
    paths, read in this order: posts without mention, posts with mention,
    comments without mention, comments with mention. Files that do not exist
    contribute zeros.

    Args:
        corpusname: Corpus name; written to the "Subreddit" column.
        csvwritter: Object whose ``writerow`` receives the statistics dict.
    """
    # (N key, B key) of the per-file columns, in file order.
    per_file_keys = (
        ("N-Post", "B-Post"),
        ("N-Post with Mention", "B-Post with Mention"),
        ("N -Comment", "B -Comment"),
        ("N -Comment with Mention", "B -Comment with Mention"),
    )

    totalFQ = FreqDist()
    processed_corpus_texts = getTextFileNames(corpusname, filtered=False)

    # Existence is decided once per file; the result is reused when the row
    # is assembled, so a file appearing in between cannot leave a frequency
    # table undefined.
    freqs = [None] * len(per_file_keys)
    numcomments = [0] * len(per_file_keys)
    for index in range(len(per_file_keys)):
        filename = processed_corpus_texts[index]
        if not path.exists(filename):
            continue
        print("reading: " + filename)
        freqs[index] = collectFreqData(filename)
        totalFQ = totalFQ + freqs[index]
        _junk, numcomments[index] = collectAudienceFreqData(filename)

    print("writing")
    numcomments_pnm, numcomments_pm, numcomments_cnm, numcomments_cm = numcomments
    towrite = {
        "Subreddit": corpusname,
        "N": totalFQ.N(),
        "B": totalFQ.B(),
        "Num Utterences": (numcomments_pnm + numcomments_pm +
                           numcomments_cm + numcomments_cnm),
        "Num Utterences - Post NM": numcomments_pnm,
        "Num Utterences - Post M": numcomments_pm,
        "Num Utterences - Comment": numcomments_cnm,
        "Num Utterences - Comment M": numcomments_cm,
    }
    for (n_key, b_key), freq in zip(per_file_keys, freqs):
        towrite[n_key] = freq.N() if freq is not None else 0
        towrite[b_key] = freq.B() if freq is not None else 0

    csvwritter.writerow(towrite)
# Tokenize every comment and keep its label, shuffle the order, close the db.
comments = [(word_tokenize(c[0]), c[1]) for c in cur]
random.shuffle(comments)
db.close()

# Gather all words from both labels: tokens starting with "//" are skipped,
# asterisks are stripped and everything is lower-cased.
all_words = []
for tokens, _label in comments:
    for token in tokens:
        if token[:2] == "//":
            continue
        all_words.append(token.replace('*', '').lower())

all_words = FreqDist(all_words)
print(all_words.B())

# Use the first 4000 distinct words as features. They come in first-seen
# order, which follows the shuffled comments.
word_features = list(all_words.keys())[:4000]


def find_features(document):
    """Map every feature word to whether it occurs in ``document``.

    Returns a dict of the form {"example": True, "word": False} with one
    entry per word in ``word_features``.
    """
    present = set(document)
    return {feature: (feature in present) for feature in word_features}


# One (feature dict, label) pair per comment.
featuresets = [(find_features(comment), label)
               for (comment, label) in comments]
from nltk import FreqDist
from common.books import text1

# Frequency distribution over the tokens of the first sample text.
t1 = text1()
fdist1 = FreqDist(t1)
print(fdist1)

# Total token count and number of distinct tokens.
token_total = fdist1.N()
distinct_total = fdist1.B()
print("moby dick has {0} words, {1} unique ones.".format(
    token_total, distinct_total))

# A dict_keys view of the vocabulary; wrap it in list() to slice it.
voc1 = fdist1.keys()

# The 50 most frequent tokens with their counts.
print(fdist1.most_common(50))

whale_count = fdist1['whale']
print("'whale' has {0} occurences.".format(whale_count))

# Cumulative frequency plot of the 50 most frequent tokens.
fdist1.plot(50, cumulative=True)
#!/usr/bin/env python3
from nltk.corpus import gutenberg
from nltk import FreqDist

# Frequency distribution over every token of austen-persuasion.txt from the
# Gutenberg collection.
list_of_words = gutenberg.words("austen-persuasion.txt")
fd = FreqDist(list_of_words)

# Summary lines: total tokens (98171) and distinct tokens (6132).
summary = (
    ("Total number of tokens: ", fd.N()),
    ("Number of unique tokens: ", fd.B()),
)
for label, value in summary:
    print(label + str(value))

# Ten most frequent tokens, one "token<TAB>count" line each; the third most
# common token is `to`.
print("Top 10 tokens:")
for token, freq in fd.most_common(10):
    print(token, freq, sep="\t")
if args.stop_punctuation:
    # Punctuation characters are byte strings on Python 2 and text on
    # Python 3; decode only when there is something to decode.
    stoplist += [
        x.decode('UTF8') if isinstance(x, bytes) else x
        for x in set(punctuation)
    ]
    # Curly quotes, apostrophe and em dash, plus the ASCII double dash.
    stoplist += [u'\u201d', u'\u201c', u'\u2019', u'\u2014']
    stoplist.append('--')

# Set membership keeps the filter linear in the number of tokens.
stop_set = set(stoplist)
words = [word for word in word_tokenize(text) if word not in stop_set]

if args.stem:
    st = LancasterStemmer()
    words = [st.stem(word) for word in words]

freq_dist = FreqDist(words)

print('Total words: ' + str(orig_freq_dist.N()))
print('Total after filter: ' + str(freq_dist.N()))

# B() gives the number of distinct words (bins)
print('Unique words: ' + str(freq_dist.B()))
# Guard the ratio: every token may have been filtered out.
filtered_total = freq_dist.N()
unique_ratio = (float(freq_dist.B()) / float(filtered_total)
                if filtered_total else 0.0)
print('Unique words ratio: ' + str(unique_ratio))
print('\n')

if args.words:
    # Count and relative frequency of every word asked for explicitly.
    for word in args.words:
        print(word + ': ' + str(freq_dist[word]))
        print(word + ' freq: ' + str(freq_dist.freq(word)))
    print('\n')

# Show the `args.num_words` most frequent words
print('Top ' + str(args.num_words) + ' words:')
freq_dist.tabulate(args.num_words)
#!/local/bin/python3
from nltk.corpus import gutenberg
from nltk import FreqDist

# Token frequencies for Jane Austen's "Persuasion".
list_of_words = gutenberg.words("austen-persuasion.txt")
fd = FreqDist(list_of_words)

total_tokens = fd.N()  # 98171
unique_tokens = fd.B()  # 6132
print("Total number of tokens: " + str(total_tokens))
print("Number of unique tokens: " + str(unique_tokens))

# The ten most frequent tokens as "token<TAB>count" lines.
print("Top 10 tokens:")
top_ten = fd.most_common(10)
for token, freq in top_ten:
    print("\t".join((token, str(freq))))
sWordFreq = FreqDist(word_tokenize(i)) # 每一句的词频数字典 for j in sWordFreq: if j in unigramsDist: unigramsDist[j] += sWordFreq[j] else: unigramsDist[j] = sWordFreq[j] # 加入未登录词 for i in testset: word = word_tokenize(i) # 每一句的词频数字典 for j in word: if j not in unigramsDist: unigramsDist[j] = 0 # 频数转化为频率 使用加一平滑法 unigramsDist.B()表示每个词都加一后的增加量 s = unigramsDist.N() + unigramsDist.B() unigramsFreq = FreqDist() for i in unigramsDist: unigramsFreq[i] = (unigramsDist[i] + 1) / s X = sum(unigramsFreq.values()) ppt = [] for sentence in testset: logprob = 0 wt = 0 for word in word_tokenize(sentence): if word in unigramsFreq: logprob += log(unigramsFreq[word], 2) wt += 1 if wt > 0:
removed_stopword_count = all_word_count - interesting_word_count removed_stopword_percentage = round( (100 * removed_stopword_count) / all_word_count, 2) print("Removed {} stopwords from the corpus ({}%)".format( removed_stopword_count, removed_stopword_percentage)) removed_vocab_count = all_vocab_count - interesting_vocab_count removed_stopword_vocab_percentage = round( (100 * removed_vocab_count) / all_vocab_count, 2) print("Removed {} stopwords from the vocab ({}%)".format( removed_vocab_count, removed_stopword_vocab_percentage)) # Get a frequency distribution for the interesting words. fd = FreqDist(interesting_word_list) print("Number of words: {}".format(fd.N())) print("Number of distinct words: {}".format(fd.B())) # The most common words. fd.most_common(10) freq_list = [] for word in fd.keys(): freq_list.append([word, fd[word]]) # Sort the words by frequency, from high to low. sorted_freq_list = sorted(freq_list, key=lambda t: t[1], reverse=True) # Create a Words Rank Frequency list save each elements as a sublist. rank = 1 freq_rank_list = [] for word in sorted_freq_list:
from nltk.corpus import gutenberg
from nltk import FreqDist
import matplotlib
import matplotlib.pyplot as plt

# Count every token of every text in the Gutenberg sample, reporting
# progress per file.
fd = FreqDist()
for text in gutenberg.fileids():
    print(text, end=' ')
    fd.update(gutenberg.words(text))
    print("......done")

# most_common() sorts by descending count, so position i holds rank i + 1.
samples = fd.most_common()
freqs = [freq for _, freq in samples]
ranks = list(range(1, fd.B() + 1))

# Zipf plot: ranks on the x-axis, frequencies on the y-axis, both log-scaled.
plt.loglog(ranks, freqs)
plt.xlabel('rank(r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency(f)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
words = reduce(word_split, corpus) #计算词频,索引 fd = FreqDist(words) index = bidict() pos = 0 for k, c in fd.items(): index[k] = pos pos = pos + 1 #=====利用nltk的biggrams函数,建立gram矩阵========================== grams = list(bigrams(words)) gc = np.zeros((fd.B(), fd.B()), dtype=np.int32) #统计gram次数 for p1, p2 in grams: gc[index[p1], index[p2]] += 1 #统计gram概率 gp = np.zeros((fd.B(), fd.B())) #平滑系数 ratio = 0.9 for row in range(0, fd.B()): for col in range(0, fd.B()): gp[row, col] = ratio * (gc[row, col] / fd[index.inv[row]]) + (
# Which texts are in this collection?
print(gutenberg.fileids())
# ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

# Import the FreqDist class
from nltk import FreqDist

# Instantiate the frequency distribution
fd = FreqDist()

# Count the tokens of the text. FreqDist.inc() no longer exists in NLTK 3;
# incrementing the item is the supported way to count.
for word in gutenberg.words('austen-persuasion.txt'):
    fd[word] += 1

print(fd.N())  # total number of samples
# 98171
print(fd.B())  # number of bins or unique samples
# 6132

# The 10 most frequent words, sorted by frequency. A keys() view cannot be
# sliced on Python 3 and is not frequency-ordered, so ask most_common().
for word, count in fd.most_common(10):
    print(word, count)

# ================ elapsed run time ================
run_time = time.time() - start_time
if run_time < 60:  # seconds with two decimals
    print("耗时:{:.2f}秒".format(run_time))
elif run_time < 3600:  # whole minutes and seconds
    print("耗时:{:.0f}分{:.0f}秒".format(run_time // 60, run_time % 60))
else:  # whole hours, minutes and seconds
    print("耗时:{:.0f}时{:.0f}分{:.0f}秒".format(run_time // 3600,
                                           run_time % 3600 // 60,
                                           run_time % 60))