def unigramtrainingset(a):
    """Estimate the probability of each tag given each word in a tagged corpus.

    a -- sequence of (word, tag) pairs making up the training set.

    Returns a list with one ``(word, tag_probabilities)`` entry per unique
    word, where ``tag_probabilities`` is a list of ``(tag, probability)``
    pairs covering every tag in the fixed tag set below, in that order.
    """
    # all possible tags
    tags = [
        'NOUN', 'ADP', 'ADV', 'NUM', 'VERB', '.', 'PRON', 'DET', 'ADJ', 'PRT',
        'CONJ'
    ]
    # frequency distribution of the (word, tag) pairs
    pair_dist = FreqDist(a)
    # frequency distribution of the words alone
    word_dist = FreqDist([pair[0] for pair in a])

    out = []
    for word in unique_list([pair[0] for pair in pair_dist]):
        word_freq = word_dist.freq(word)
        # relative frequency of the pair divided by relative frequency of
        # the word gives the share of this word's occurrences with the tag
        tag_probs = [(tag, pair_dist.freq((word, tag)) / word_freq)
                     for tag in tags]
        out.append((word, tag_probs))
    return out
def _bucketed_distribution(values, start):
    """Return the relative frequency of each integer start..RANGE in *values*,
    plus a final bucket holding whatever probability mass is left over."""
    freq_dist = FreqDist(values)
    buckets = [freq_dist.freq(i) for i in range(start, RANGE + 1)]
    buckets.append(1 - sum(buckets))
    return buckets


def load_book_features(file_name):
    """Compute a stylometric feature vector for the text stored in a file.

    file_name -- path of the text file to analyse.

    Returns a flat list made of, in order:
      * share of non-stopword types occurring exactly once (hapax),
      * share of non-stopword types occurring exactly twice,
      * vocabulary richness (unique non-stopwords / total non-stopwords),
      * distribution of sentence lengths in words (1..RANGE, then the rest),
      * distribution of word lengths in characters (1..RANGE, then the rest),
      * distribution of nominative pronouns per sentence (0..RANGE, rest),
      * distribution of conjunctions per sentence (0..RANGE, rest).

    The three ratio features are 0.0 when the text has no non-stopword
    tokens, instead of raising ZeroDivisionError.
    """
    # NOTE(review): no explicit encoding, so the platform default is used.
    with open(file_name, 'r') as file_handler:
        text = file_handler.read()

    morph = pymorphy2.MorphAnalyzer()
    usual_book_words = []
    sentences_length_dist = []
    words_length_dist = []
    pron_dist = []
    conj_dist = []

    for sentence in sent_tokenize(text):
        if sentence == ".":
            continue
        pron_count = 0
        conj_count = 0
        sentence_words = re.findall(r"[\w]+", sentence)
        sentences_length_dist.append(len(sentence_words))
        for word in sentence_words:
            words_length_dist.append(len(word))
            if word in NOMINATIVE_PRONOUNS:
                pron_count += 1
            # only the first (top-ranked) parse decides the part of speech
            if morph.parse(word)[0].tag.POS == 'CONJ':
                conj_count += 1
            if word not in STOPWORDS:
                usual_book_words.append(word)
        conj_dist.append(conj_count)
        pron_dist.append(pron_count)

    # lengths start at 1, per-sentence counts start at 0
    sentences_length_dist = _bucketed_distribution(sentences_length_dist, 1)
    words_length_dist = _bucketed_distribution(words_length_dist, 1)
    pron_dist = _bucketed_distribution(pron_dist, 0)
    conj_dist = _bucketed_distribution(conj_dist, 0)

    words_freq_dist = FreqDist(usual_book_words)
    num_unique_words = len(words_freq_dist)
    num_total_words = len(usual_book_words)
    if num_unique_words:
        hapax = len(words_freq_dist.hapaxes()) / num_unique_words
        dis = sum(1 for count in words_freq_dist.values()
                  if count == 2) / num_unique_words
        richness = num_unique_words / num_total_words
    else:
        # nothing left after stopword removal: avoid dividing by zero
        hapax = dis = richness = 0.0

    return [hapax, dis, richness, *sentences_length_dist, *words_length_dist,
            *pron_dist, *conj_dist]
def paper_title_NLP(title_corpus):
    """Print per-year word statistics for paper titles and plot the relative
    frequency of the word "deep" ("Deep" or "deep") per year.

    title_corpus -- iterable of records in which index 1 is the title text,
        index 3 the two-digit year and index 4 the month; a (year, month)
        key like (19, 1) means 2019/01.

    Years are processed in ascending order and the plot's x-axis is derived
    from the years actually present in the data (2000 + two-digit year).

    Tokenizer pattern reference:
    https://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
    """
    # The pattern is a verbose regex: each alternative sits on its own line
    # so the trailing "#" comments end at the line break.
    pattern = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''
    tokenizer = RegexpTokenizer(pattern)

    # (year, month) -> list of tokenized titles
    title_dict = {}
    for t in title_corpus:
        title_dict.setdefault((t[3], t[4]), []).append(
            tokenizer.tokenize(t[1]))

    # year -> list of per-month lists of tokenized titles
    title_years = {}
    for (year, _month), titles in title_dict.items():
        title_years.setdefault(year, []).append(titles)

    plot_years = []
    deep_freq = []
    for year in sorted(title_years):
        fd = FreqDist()
        for month_titles in title_years[year]:
            for title_tokens in month_titles:
                for word in title_tokens:
                    fd[word] += 1
        print('The keywords for year:20{}'.format(str(year)))
        # total number of samples
        print("Total number of words:{}".format(str(fd.N())))
        # number of bins or unique samples
        print("Total number of unique words:{}".format(str(fd.B())))
        fd.pprint(50)  # The maximum number of items to display, default is 10
        plot_years.append(2000 + int(year))
        deep_freq.append(fd.freq('Deep') + fd.freq('deep'))

    print(deep_freq)
    # x values come from the data so they always line up with deep_freq
    plt.plot(plot_years, deep_freq)
    plt.ylabel('frequency of deep word')
    plt.xlabel('years')
    plt.show()
def vectorize_string_tfidf(doc, idf):
    """Turn a document into a sparse {word: weight} dictionary.

    doc -- raw document string.
    idf -- mapping from word to the value its relative frequency is divided
        by; words missing from it keep their plain relative frequency.

    Words are lowercased and those found in ``stops`` are dropped before
    counting.
    """
    tokens = [token.lower() for token in word_tokenize(doc)]
    tokens = [token for token in tokens if token not in stops]
    term_dist = FreqDist(tokens)
    # to address sparsity issues: currently uses dictionaries
    vector = {}
    for term in set(tokens):
        try:
            vector[term] = term_dist.freq(term) / idf[term]
        except KeyError:
            # no entry in idf for this word: keep the raw relative frequency
            vector[term] = term_dist.freq(term)
    return vector
def run(self):
    """Plot the five swear words with the highest average per-song relative
    frequency and save the bar chart to this task's output."""
    swear_words = set(utils.stem_words(corpus.swear_words()))
    with self.input().open('rb') as in_file:
        songs = pickle.load(in_file)

    # accumulate each swear word's relative frequency across all songs
    totals = {}
    for song in songs:
        song_dist = FreqDist(song['word_tokens'])
        for swear in swear_words:
            totals[swear] = totals.get(swear, 0) + song_dist.freq(swear)

    # divide the sums by the number of songs to get per-song averages
    normalized_word_frequencies = {}
    for swear, total in totals.items():
        normalized_word_frequencies[swear] = total / len(songs)

    df = pd.DataFrame.from_dict(normalized_word_frequencies, orient='index')
    title = 'Swear Word Frequency\n%s' % (self.artist)
    top_five = df.nlargest(5, 0)[0:5]
    word_freq = top_five.plot(kind='bar', title=title, legend=False)
    word_freq.set_xlabel("Swear Word")
    word_freq.set_ylabel("Distribution")
    with self.output().open('wb') as out_file:
        figure = word_freq.get_figure()
        figure.savefig(out_file, dpi='figure')
    clear_plots()
def test():
    """Run the trained network on the Gettysburg Address and print up to
    five predicted topic words that also occur in the address.

    Reads the module globals ``N``, ``words`` and ``network``.
    # presumably N is the input size and words the vocabulary list - confirm

    Candidates are visited from highest to lowest activation. Ranking by
    index keeps each activation paired with its own entry in ``words``
    and leaves the network's output untouched. If fewer than five
    candidates occur in the address, the loop simply ends.
    """
    global N, words, network
    print('In testing.')
    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. 
It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer(r'\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg)

    # lowercase the tokens and drop stop words / punctuation
    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)
    dist = FreqDist(samples)

    # input volume: relative frequency of every vocabulary word
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)
    pred = network.forward(V).w

    # NOTE(review): membership is tested against the original-case tokens,
    # as before - confirm whether the lowercased samples were intended.
    token_set = set(gettysburg_tokens)
    # stable sort: equal activations keep their original (lowest-first) order
    ranked = sorted(range(len(pred)), key=lambda idx: pred[idx],
                    reverse=True)
    topics = []
    for idx in ranked:
        if words[idx] in token_set:
            topics.append(words[idx])
            if len(topics) == 5:
                break
    print('Topics of the Gettysburg Address:')
    print(topics)
def proto(self, num, language, authors, token_vocab, token_df, lemma_vocab,
          pos_vocab, synset_vocab, stemmer):
    """Build and return a ``Document`` message for this document.

    num -- fallback id, used when ``self._id`` is not set.
    language -- must equal ``self.lang``.
    token_vocab, lemma_vocab, pos_vocab -- lookups from a word's surface
        form, lemma and POS to the values stored in the message.
    token_df -- object whose ``compute_tfidf(word, term_frequency)`` gives
        each word's tf-idf.
    authors, synset_vocab, stemmer -- accepted but not used here.
    """
    d = Document()
    assert language == self.lang
    d.id = self._id if self._id else num
    d.language = language
    d.title = self.title.strip()

    # sentences are keyed by index; the highest key fixes how many to emit
    num_sentences = max(self._sentences) + 1

    # document-wide token counts for the term-frequency part of tf-idf
    tf_token = FreqDist()
    for token in self.tokens():
        tf_token.inc(token)

    for sent_index in xrange(num_sentences):
        sentence = d.sentences.add()
        for entry in self._sentences[sent_index]:
            word = sentence.words.add()
            word.token = token_vocab[entry.word]
            word.lemma = lemma_vocab[entry.lemma]
            word.pos = pos_vocab[entry.pos]
            # NOTE(review): the relation is looked up in pos_vocab - confirm
            # this is intended.
            word.relation = pos_vocab[entry.rel]
            word.parent = entry.parent
            word.offset = entry.offset
            term_freq = tf_token.freq(entry.word)
            word.tfidf = token_df.compute_tfidf(entry.word, term_freq)
    return d
def max_dist(emoList):
    """Return the dominant item with the highest relative frequency.

    emoList -- mapping whose values are collections of samples.

    For each entry the most frequent sample is found together with its
    relative frequency within that entry; the sample with the largest such
    frequency is returned. When two entries share the same most frequent
    sample, the later entry's frequency replaces the earlier one.
    """
    top_freqs = {}
    for key in emoList:
        dist = FreqDist(emoList[key])
        dominant = dist.max()
        top_freqs[dominant] = dist.freq(dominant)
    return max(top_freqs, key=top_freqs.get)
def vectorize_string(doc):
    """Map each distinct non-stopword in *doc* to its relative frequency.

    doc -- raw document string.

    Tokens are lowercased and those found in ``stops`` are removed before
    the frequencies are computed.
    """
    lowered = (token.lower() for token in word_tokenize(doc))
    kept = [token for token in lowered if token not in stops]
    term_dist = FreqDist(kept)
    # to address sparsity issues: currently uses dictionaries
    return {term: term_dist.freq(term) for term in set(kept)}
class Models:
    """Unigram, bigram and trigram models built over the individual items
    of the first 1000 elements of a UDHR text."""

    # Constructor
    def __init__(self, corpura):
        raw_text = udhr.raw(corpura)
        # only the first 1000 elements are used for training
        self.TrainingSet = raw_text[0:1000]
        symbols = list(self.TrainingSet)
        self.Uni = symbols
        self.Bi = list(nltk.bigrams(symbols))
        self.Tri = list(nltk.trigrams(symbols))
        self.UniFreq = FreqDist(self.Uni)
        self.BiFreq = ConditionalFreqDist(self.Bi)
        # condition on the first two symbols, count the third
        self.TriFreq = ConditionalFreqDist(
            [((first, second), third) for first, second, third in self.Tri])

    # method to calculate Unigrams
    def CalUni(self, Words):
        """Return the product of the unigram relative frequencies of every
        character of the stripped, lowercased input."""
        probability = 1
        for char in Words.strip().lower():
            probability *= self.UniFreq.freq(char)
        return probability

    # method to calculate Bigrams
    def CalBi(self, Words):
        """Return the product, over consecutive character pairs, of the
        relative frequency of the second character given the first."""
        chars = list(Words.strip().lower())
        probability = 1
        for previous, current in zip(chars, chars[1:]):
            probability *= self.BiFreq[previous].freq(current)
        return probability

    # method to calculate Trigrams
    def CalTri(self, Words):
        """Return the product, over consecutive character triples, of the
        relative frequency of the third character given the first two."""
        chars = list(Words.strip().lower())
        probability = 1
        for first, second, third in zip(chars, chars[1:], chars[2:]):
            probability *= self.TriFreq[(first, second)].freq(third)
        return probability
def doc_tfidf(self, doc: str) -> Dict[Tuple[str, int], float]:
    """Given a document, create a dictionary representation of its tfidf
    vector.

    doc -- raw string of the document

    Each key pairs the vocabulary lookup of a token with the token itself;
    the value is that lookup's relative frequency among ``self.tokenize(doc)``
    times its inverse document frequency.
    """
    counts = FreqDist(self.tokenize(doc))
    # lazily pair every raw token with its vocabulary lookup
    looked_up = ((self.vocab_lookup(token), token)
                 for token in self._tokenizer(doc))
    return {
        (word, token): counts.freq(word) * self.inv_docfreq(word)
        for word, token in looked_up
    }
def extract_ngrams(text, low=1, high=2, lowercase=False,
                   filter_punctuation=True, binary=False, least_common=None,
                   most_common=None, normalize=False, sample=False):
    """Extract n-gram features from a text.

    text -- raw string to tokenize.
    low, high -- smallest and largest n-gram order (inclusive).
    lowercase -- lowercase the text before tokenizing.
    filter_punctuation -- drop tokens found in ``PUNCTUATION``.
    binary -- store ``True`` for every selected n-gram instead of a count.
    least_common -- optional proportion in (0, 1]; for each order, keep only
        the ``int(least_common * N)`` least frequent n-grams, where N is the
        total number of n-gram occurrences of that order.
    most_common -- accepted for interface compatibility; currently unused.
    normalize -- divide every stored value by the sum of all stored values.
    sample -- store relative frequencies instead of raw counts.

    Returns a dict mapping n-gram tuples to the chosen value.
    Raises AssertionError if both ``sample`` and ``binary`` are set or if
    ``least_common`` is outside (0, 1].
    """
    assert not (
        sample and binary
    ), "Please don't make sample and binary True. One or the other or neither pls"

    # Make lowercase
    tokens = word_tokenize(text.lower() if lowercase else text)

    # Remove Punctuation
    if filter_punctuation:
        words = [t for t in tokens if t not in PUNCTUATION]
    else:
        words = list(tokens)

    # Do the N Gram Thing
    ngram_counts = {}
    for n in range(low, high + 1):
        ngram_freqdist = FreqDist(ngrams(words, n))
        grams_to_consider = ngram_freqdist
        if least_common:
            assert least_common > 0.0 and least_common <= 1.0, \
                'Least common must be a proportion, not %.3f' % least_common
            num_least_common = int(least_common * ngram_freqdist.N())
            ranked = ngram_freqdist.most_common()
            # ranked[-0:] is the whole list, so a rounded-down count of zero
            # must select nothing explicitly
            tail = ranked[-num_least_common:] if num_least_common > 0 else []
            grams_to_consider = [gram for gram, _count in tail]
        for gram in grams_to_consider:
            if sample:
                ngram_counts[gram] = ngram_freqdist.freq(gram)
            elif binary:
                ngram_counts[gram] = True
            else:
                ngram_counts[gram] = ngram_freqdist[gram]

    if normalize:
        # float() keeps true division under Python 2 as well
        total_counts = float(sum(ngram_counts.values()))
        # nothing to scale when no n-grams were selected
        if total_counts:
            for gram in ngram_counts:
                ngram_counts[gram] = ngram_counts[gram] / total_counts
    return ngram_counts
def model_select(sentences, seq, vocab_size):
    """Create an 8-state MultinomialHMM for the given data.

    sentences -- collection whose length decides how the model is set up.
    seq -- observed symbols used to estimate emission frequencies.
    vocab_size -- number of distinct symbols (0 .. vocab_size - 1).

    With more than 50 sentences, every state's emission probabilities are
    preset to the symbols' relative frequencies in *seq* and only 's' and
    't' are left in ``init_params``. Otherwise the model is created with
    ``init_params='ste'`` and no preset.
    """
    if len(sentences) <= 50:
        return hmm.MultinomialHMM(n_components=8, init_params='ste')

    symbol_dist = FreqDist(seq)
    frequencies = np.fromiter(
        (symbol_dist.freq(symbol) for symbol in range(vocab_size)),
        dtype=np.float64)
    # one identical row of emission probabilities per hidden state
    emission_prob = np.stack([frequencies for _ in range(8)])
    model = hmm.MultinomialHMM(n_components=8, init_params='st')
    model.emissionprob_ = emission_prob
    return model
def calc_metrics_for_one_sample(id_):
    """Compute per-phrase log2 probability ratios between the "D" and "R"
    parties for one sample and pickle them.

    id_ -- sample identifier; counts are read from
        ``COUNTS_DIR + "counts-" + id_`` and the result is written to
        ``METRICS_DIR + "signals-unigrams-" + id_``.

    The counts file has a header line followed by ``party|phrase|count``
    lines; reading stops at the first blank line. The pickled DataFrame is
    indexed by "term" with columns "dmetric" (log2 of P_D / P_R) and
    "rmetric" (log2 of P_R / P_D).

    Raises AssertionError if a line's party is neither "D" nor "R".
    """
    # Adapted from the BitCounter's method get_freq_dists
    d_freq_dist = FreqDist()
    r_freq_dist = FreqDist()

    # the with-block guarantees the counts file is closed, even on error
    with open(COUNTS_DIR + "counts-" + id_, "r") as rfh:
        # Skip header
        rfh.readline()
        line = rfh.readline()
        while line.strip():
            party, phrase, count = line.strip().split("|")
            assert party in ("D", "R")
            count = int(count)
            if party == "D":
                d_freq_dist[phrase] += count
            else:
                r_freq_dist[phrase] += count
            line = rfh.readline()

    vocab = list(set(d_freq_dist.keys()).union(set(r_freq_dist.keys())))

    # L1 smoothing
    for phrase in vocab:
        d_freq_dist[phrase] += 1
        r_freq_dist[phrase] += 1

    # Adapted from the BitCounter's method get_signal
    # N.B. If denom == "q" is passed to get_signal, get_signal *should* be
    # redundant to get_log_odds with a change of base
    def signal(pi, qi):
        return math.log(pi / qi, 2)

    signals = []
    for phrase in vocab:
        d_prob = d_freq_dist.freq(phrase)
        r_prob = r_freq_dist.freq(phrase)
        signals.append((phrase, signal(d_prob, r_prob),
                        signal(r_prob, d_prob)))

    # naming the columns at construction also works for an empty vocabulary
    df = pd.DataFrame(signals, columns=["term", "dmetric", "rmetric"])
    df.set_index("term", inplace=True)
    df.to_pickle(METRICS_DIR + "signals-unigrams-" + id_)
def parse(self, response):
    """
    Yield a WordCount item for every non-stopword that occurs more than
    once in the response body.

    The lines below is a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url https://www.google.com/search?q=personal+nutrition
    @scrapes pages to depth<=3, using priority-score based BFS
    """
    doc = clean_html(response.body_as_unicode())
    words = [word.lower() for word in word_tokenize(doc)]
    words = [word for word in words if word not in self.stops]
    fdist = FreqDist(words)
    # Use the integer counts directly: rebuilding them as freq * N goes
    # through floating point and can truncate to one less than the count.
    for word, count in fdist.items():
        if count > 1:
            item = WordCount()
            item['word'] = word
            item['count'] = count
            yield item
    #for href in response.css("a::attr('href')"):
    #    url = response.urljoin(href.extract())
    #    yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
    """
    Yield a WordCount item for every non-stopword that occurs more than
    once in the response body.

    The lines below is a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url https://www.google.com/search?q=personal+nutrition
    @scrapes pages to depth<=3, using priority-score based BFS
    """
    doc = clean_html(response.body_as_unicode())
    words = [word.lower() for word in word_tokenize(doc)]
    words = [word for word in words if word not in self.stops]
    fdist = FreqDist(words)
    # Use the integer counts directly: rebuilding them as freq * N goes
    # through floating point and can truncate to one less than the count.
    for word, count in fdist.items():
        if count > 1:
            item = WordCount()
            item['word'] = word
            item['count'] = count
            yield item
    #for href in response.css("a::attr('href')"):
    #    url = response.urljoin(href.extract())
    #    yield scrapy.Request(url, callback=self.parse)
def bigramTags(a):
    """Return the relative frequency of every possible tag bigram.

    a -- sequence of (word, tag) pairs.

    Returns a list of ``((tag1, tag2), frequency)`` entries, one for every
    ordered pair of unique tags, where frequency is the bigram's share of
    all adjacent tag pairs in *a* (0 for pairs that never occur).
    """
    # list of all tags, in corpus order
    tags = [pair[1] for pair in a]
    # frequency distribution of adjacent tag pairs
    btagsf = FreqDist(zip(tags, tags[1:]))
    # computed once instead of once per outer tag
    unique_tags = unique_list(tags)
    return [((first, second), btagsf.freq((first, second)))
            for first in unique_tags
            for second in unique_tags]
class LangModel:
    """Unigram, bigram and trigram models over the individual items of the
    first 1000 elements of a UDHR text."""

    def __init__(self, file):
        text = udhr.raw(file)
        # training is limited to the first 1000 elements
        self.training_set = text[0:1000]
        symbols = list(self.training_set)
        self.unigram = symbols
        self.bigram = list(nltk.bigrams(symbols))
        self.trigram = list(nltk.trigrams(symbols))
        self.unigram_frequency = FreqDist(self.unigram)
        self.bigram_frequency = ConditionalFreqDist(self.bigram)
        # the first two symbols form the condition, the third is counted
        self.trigam_frequency = ConditionalFreqDist(
            [((x, y), z) for x, y, z in self.trigram])

    # Unigram model: probability of a string as a product over characters
    def cal_unigram(self, words):
        """Multiply together the unigram relative frequency of every
        character of the stripped, lowercased input."""
        result = 1
        for char in words.strip().lower():
            result = result * self.unigram_frequency.freq(char)
        return result

    # Bigram model: each character conditioned on the previous one
    def cal_bigram(self, words):
        """Multiply together, for each adjacent character pair, the relative
        frequency of the second character given the first."""
        chars = list(words.strip().lower())
        result = 1
        for before, char in zip(chars, chars[1:]):
            result = result * self.bigram_frequency[before].freq(char)
        return result

    # Trigram model: each character conditioned on the previous two
    def cal_trigram(self, words):
        """Multiply together, for each adjacent character triple, the
        relative frequency of the third character given the first two."""
        chars = list(words.strip().lower())
        result = 1
        for two_back, one_back, char in zip(chars, chars[1:], chars[2:]):
            context = (two_back, one_back)
            result = result * self.trigam_frequency[context].freq(char)
        return result
def extract_doc_feats(refactorized_documents):
    """Build length-normalised tf-idf feature vectors for a set of documents.

    refactorized_documents -- sequence of documents, each a sequence of
        tokens.

    Returns ``(glob_features, tokens)``: ``tokens`` is the list of distinct
    tokens over all documents, and ``glob_features`` is a numpy array with
    one row per document and one column per token. Each row holds
    ``(1 + log10(tf)) * log10(doc_num / doc_freq)`` values, scaled to unit
    Euclidean norm and then multiplied by the total number of tokens in the
    collection.
    """
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import numpy

    doc_num = len(refactorized_documents)

    # document frequency: in how many documents each token appears
    occurences = defaultdict(int)
    for doc in refactorized_documents:
        for tok in set(doc):
            occurences[tok] += 1

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)
    # keys() is available on every FreqDist version
    tokens = list(glob_freqs.keys())
    # token -> column, so each lookup is O(1) instead of a list scan
    token_index = dict((tok, col) for col, tok in enumerate(tokens))

    glob_features = []
    for doc in refactorized_documents:
        doc_features = [0] * len(tokens)
        for (tok, num) in FreqDist(doc).items():
            tf = 1 + math.log(num, 10)
            idf = math.log(float(doc_num) / float(occurences[tok]), 10)
            doc_features[token_index[tok]] = tf * idf
        f_tmp = numpy.asarray(doc_features)
        # eps keeps the division defined for an all-zero vector
        f_tmp = f_tmp / (numpy.linalg.norm(f_tmp) + numpy.finfo(float).eps)
        glob_features.append(f_tmp.tolist())

    glob_features = numpy.asarray(glob_features) * glob_freqs.N()
    print("Glob Freqs: %s" % glob_freqs.N())
    return (glob_features, tokens)
class NumTranslationsFeatureExtractor(FeatureExtractor):
    """Features counting how many translation probabilities of each source
    token reach a set of thresholds, plain and weighted by the token's
    relative corpus frequency."""

    # .f2e file
    def __init__(self, lex_prob_file, corpus_file):
        """Load lexical probabilities and corpus word frequencies.

        lex_prob_file -- whitespace-separated file; field 1 is the word the
            probabilities are grouped by and field 2 is the probability.
        corpus_file -- input handed to ``TextCorpus`` for word frequencies.
        """
        self.lex_prob = defaultdict(list)
        # the with-block closes the file; split() already discards the line
        # ending, so a final line without a newline is parsed in full
        with open(lex_prob_file) as lex_file:
            for line in lex_file:
                chunks = line.split()
                if not chunks:
                    continue  # skip blank lines
                self.lex_prob[chunks[1]].append(float(chunks[2]))
        corpus = TextCorpus(input=corpus_file)
        self.corpus_freq = FreqDist(
            [word for line in corpus.get_texts() for word in line])
        self.thresholds = [0.01, 0.05, 0.1, 0.2, 0.5]

    def get_features(self, context_obj):
        """Return the unweighted averages for every threshold followed by
        the frequency-weighted averages.

        context_obj -- mapping; ``context_obj['source_token']`` is the list
            of source words. If it is missing or empty, all features are 0.0.
        """
        if 'source_token' not in context_obj or len(
                context_obj['source_token']) == 0:
            return [0.0] * (len(self.thresholds) * 2)

        translations, translations_weighted = [], []
        for thr in self.thresholds:
            all_words, all_words_weighted = [], []
            for word in context_obj['source_token']:
                # number of translation probabilities at or above thr
                num_trans = len(
                    [fl for fl in self.lex_prob[word] if fl >= thr])
                all_words.append(num_trans)
                all_words_weighted.append(
                    num_trans * self.corpus_freq.freq(word))
            translations.append(np.average(all_words))
            translations_weighted.append(np.average(all_words_weighted))
        return translations + translations_weighted

    def get_feature_names(self):
        """Return the feature names in the order ``get_features`` emits them."""
        return [
            'source_translations_001_freq', 'source_translations_005_freq',
            'source_translations_01_freq', 'source_translations_02_freq',
            'source_translations_05_freq',
            'source_translations_001_freq_weighted',
            'source_translations_005_freq_weighted',
            'source_translations_01_freq_weighted',
            'source_translations_02_freq_weighted',
            'source_translations_05_freq_weighted'
        ]
def compute_features(self, s, count):
    """Compute the feature vector of one sentence.

    s -- the sentence string.
    count -- the sentence's position; used as ``1.0 / count`` and compared
        with 5.  # presumably 1-based - count == 0 would divide by zero

    Returns a numpy array ``[P, F5, LEN, LM, VS1, TFIDF, VB, NN, CT, Q]``:
    position features, scaled stop-word-filtered length, language-model
    score, cosine similarities to the headline (count and tf-idf vectors),
    verb and noun tag frequencies, and cosine similarities to the topic
    description and title. Any similarity that comes out as NaN is
    replaced by 0 and the document code and id are printed.
    """
    # preprocess
    tok_sent = nltk.tokenize.word_tokenize(s)
    stop_tok_sent = [x for x in tok_sent if x not in cachedStopWords]

    # location features
    P = 1.0 / count
    F5 = 1 if count <= 5 else 0
    LEN = len(stop_tok_sent) / 30.0

    # language modelling
    LM = LModel.score(s)

    # pos tagging features: map each tag once, folding filtered tags
    # into "OTHER"
    universal_tags = (map_tag("en-ptb", "universal", tag)
                      for (word, tag) in pos_tagger(tok_sent))
    tag_fd = FreqDist(
        utag if utag not in cachedStopPOStags else "OTHER"
        for utag in universal_tags)
    NN = tag_fd.freq("NOUN")
    VB = tag_fd.freq("VERB")

    # the sentence's vectors are reused by several similarities below
    sent_counts = self.father.cv.transform([s]).toarray()
    sent_tfidf = self.father.tv.transform([s]).toarray()

    def zero_if_nan(value):
        # security check: report the document and fall back to 0
        if math.isnan(value):
            print("%s %s" % (self.father.code, self.id))
            return 0
        return value

    # headline-sentence similarity
    VS1 = zero_if_nan(
        1 - spatial.distance.cosine(self.hl_vsv_1.toarray(), sent_counts))
    TFIDF = zero_if_nan(
        1 - spatial.distance.cosine(self.hl_tfidf.toarray(), sent_tfidf))

    # topic description-sentence similarity
    CT = zero_if_nan(
        1 - spatial.distance.cosine(self.father.desc_vsv.toarray(),
                                    sent_counts))
    Q = zero_if_nan(
        1 - spatial.distance.cosine(self.father.title_vsv.toarray(),
                                    sent_counts))

    # active features
    return np.asarray([P, F5, LEN, LM, VS1, TFIDF, VB, NN, CT, Q])
def getdict(self, content):
    """Build a word-frequency summary of a text.

    content -- raw text to analyse.

    Alphabetic tokens that are not English stopwords are lowercased and
    lemmatized (as adjective, then with the default POS, then as verb)
    before counting. The stopword test is case-insensitive.

    Returns a dict with:
      'samples'  -- number of distinct lemmas,
      'outcomes' -- total number of counted tokens,
      'words'    -- list of ``(lemma, info)`` pairs from most to least
                    frequent, where info has 'index' (1-based rank), 'word',
                    'count', 'freq' (rounded to 4 places) and 'basic' (the
                    result of ``self.getexp(lemma)``).

    Elapsed times of the two phases are printed.
    """
    wnl = nltk.WordNetLemmatizer()
    begin = clock()
    print('begin')
    tokens = nltk.word_tokenize(content)
    # a set gives constant-time membership tests for every token
    stopwords = set(nltk.corpus.stopwords.words('english'))
    fdist = FreqDist(
        wnl.lemmatize(wnl.lemmatize(wnl.lemmatize(word.lower(), 'a')), 'v')
        for word in tokens
        if word.isalpha() and word.lower() not in stopwords)
    print(clock() - begin)

    js = {'samples': fdist.B(), 'outcomes': fdist.N()}
    begin = clock()
    # most_common() already yields each lemma once in rank order, so the
    # pairs can be collected directly in their final order
    entries = []
    for index, (word, count) in enumerate(fdist.most_common(), 1):
        info = {'index': index, 'word': word, 'count': count,
                'freq': round(fdist.freq(word), 4)}
        info['basic'] = self.getexp(word)
        entries.append((word, info))
    print(clock() - begin)
    js['words'] = entries
    return js
class NumTranslationsFeatureExtractor(FeatureExtractor):
    """Features counting how many translation probabilities of each source
    token reach a set of thresholds, plain and weighted by the token's
    relative corpus frequency."""

    # .f2e file
    def __init__(self, lex_prob_file, corpus_file):
        """Load lexical probabilities and corpus word frequencies.

        lex_prob_file -- whitespace-separated file; field 1 is the word the
            probabilities are grouped by and field 2 is the probability.
        corpus_file -- input handed to ``TextCorpus`` for word frequencies.
        """
        self.lex_prob = defaultdict(list)
        # the with-block closes the file; split() already discards the line
        # ending, so a final line without a newline is parsed in full
        with open(lex_prob_file) as lex_file:
            for line in lex_file:
                chunks = line.split()
                if not chunks:
                    continue  # skip blank lines
                self.lex_prob[chunks[1]].append(float(chunks[2]))
        corpus = TextCorpus(input=corpus_file)
        self.corpus_freq = FreqDist(
            [word for line in corpus.get_texts() for word in line])
        self.thresholds = [0.01, 0.05, 0.1, 0.2, 0.5]

    def get_features(self, context_obj):
        """Return the unweighted averages for every threshold followed by
        the frequency-weighted averages.

        context_obj -- mapping; ``context_obj['source_token']`` is the list
            of source words. If it is missing or empty, all features are 0.0.
        """
        if 'source_token' not in context_obj or len(
                context_obj['source_token']) == 0:
            return [0.0] * (len(self.thresholds) * 2)

        translations, translations_weighted = [], []
        for thr in self.thresholds:
            all_words, all_words_weighted = [], []
            for word in context_obj['source_token']:
                # number of translation probabilities at or above thr
                num_trans = len(
                    [fl for fl in self.lex_prob[word] if fl >= thr])
                all_words.append(num_trans)
                all_words_weighted.append(
                    num_trans * self.corpus_freq.freq(word))
            translations.append(np.average(all_words))
            translations_weighted.append(np.average(all_words_weighted))
        return translations + translations_weighted

    def get_feature_names(self):
        """Return the feature names in the order ``get_features`` emits them."""
        return [
            'source_translations_001_freq', 'source_translations_005_freq',
            'source_translations_01_freq', 'source_translations_02_freq',
            'source_translations_05_freq',
            'source_translations_001_freq_weighted',
            'source_translations_005_freq_weighted',
            'source_translations_01_freq_weighted',
            'source_translations_02_freq_weighted',
            'source_translations_05_freq_weighted'
        ]
class TextCollection:
    """
    A collection of words that supports various Python operations.

    This is constructed by passing in an iterable of words.

    >>> tc = TextCollection(['hello', 'world'])
    >>> 'hello' in tc
    True
    >>> tc.freq('world')
    0.5
    """

    def __init__(self, words):
        # the words are normalized once, then kept as a list (order and
        # duplicates), as a set (membership) and as a frequency distribution
        normalized = list(normalize(words))
        self.words = normalized
        self.lexicon = set(normalized)
        self.fdist = FreqDist(normalized)

    def __contains__(self, word):
        """Membership test against the set of distinct words."""
        return word in self.lexicon

    def __iter__(self):
        """Iterate over the words in their stored order."""
        return iter(self.words)

    def __len__(self):
        """Total number of words, duplicates included."""
        return len(self.words)

    # NOTE(review): on instances this method is hidden by the ``words`` list
    # attribute assigned in __init__, so ``tc.words`` is the list itself.
    def words(self):
        return iter(self.words)

    def count(self, word):
        """Number of occurrences of *word*."""
        return self.fdist[word]

    def freq(self, word):
        """Relative frequency of *word* among all words."""
        return self.fdist.freq(word)

    def wordcounts(self):
        """The (word, count) items of the frequency distribution."""
        return self.fdist.items()
class BitCounter(object):
    """Per-phrase statistics comparing phrase usage between two parties,
    "D" and "R".

    Counts are read from the files in ``NGRAM_DIR`` whose names match
    ``<mode>-NNNN.txt`` and receive add-one smoothing over the joint
    vocabulary. Every ``get_*`` metric method returns a DataFrame indexed
    by "term" and, when ``save`` is truthy, pickles it under
    ``METRICS_DIR``.
    """

    def __init__(self, mode):
        self.mode = mode
        self.regex = r'%s-[0-9]{4}.txt' % self.mode
        self.get_freq_dists()

    def get_freq_dists(self):
        """Read all matching count files into ``d_freq_dist`` and
        ``r_freq_dist``, build ``vocab`` and apply add-one smoothing."""
        print('Counting word occurrences...')
        self.d_freq_dist = FreqDist()
        self.r_freq_dist = FreqDist()
        for entry in os.listdir(NGRAM_DIR):
            if re.match(self.regex, entry) is None:
                continue
            print('Processing {}...'.format(entry))
            with open(NGRAM_DIR + entry, 'r') as fh:
                # Skip header
                fh.readline()
                line = fh.readline()
                # reading stops at the first blank line
                while line.strip():
                    party, phrase, count = line.strip().split('|')
                    # lines for any other party are ignored
                    if party == 'D':
                        self.d_freq_dist[phrase] += int(count)
                    elif party == 'R':
                        self.r_freq_dist[phrase] += int(count)
                    line = fh.readline()
        self.vocab = list(
            set(self.d_freq_dist.keys()).union(set(self.r_freq_dist.keys())))
        # L1 smoothing
        for phrase in self.vocab:
            self.d_freq_dist[phrase] += 1
            self.r_freq_dist[phrase] += 1

    def _probs(self, phrase):
        """Return the (D, R) relative frequencies of *phrase*."""
        return (self.d_freq_dist.freq(phrase),
                self.r_freq_dist.freq(phrase))

    def _metric_frame(self, rows, save_name):
        """Turn (term, dmetric, rmetric) rows into a DataFrame with z-scored
        columns; pickle it as ``<save_name>-<mode>`` if save_name is truthy."""
        df = pd.DataFrame(rows, columns=["term", "dmetric",
                                         "rmetric"]).set_index("term")
        df["dmetric_std"] = stats.mstats.zscore(df["dmetric"])
        df["rmetric_std"] = stats.mstats.zscore(df["rmetric"])
        if save_name:
            df.to_pickle(METRICS_DIR + save_name + "-" + self.mode)
        return df

    def get_frequencies(self, save=True):
        """Smoothed raw counts per party."""
        print('Getting frequencies...')
        frequencies = [(phrase, int(self.d_freq_dist[phrase]),
                        int(self.r_freq_dist[phrase]))
                       for phrase in self.vocab]
        return self._metric_frame(frequencies,
                                  "frequencies" if save else None)

    def get_partial_kl(self, denom="q", pfun=None, save=True):
        """Per-phrase terms of the KL divergence between the parties.

        denom -- "q" for log2(p / q), "mixture" for log2(2p / (p + q)).
        pfun -- optional function of the phrase giving the weight of the
            log term; by default the party's own probability is used.
        save -- True saves under "partial_kls"; a string is used as the file
            name prefix; a falsy value disables saving. Saving requires
            denom == "q".
        """
        assert denom in ("mixture", "q")
        if save:
            assert denom == "q"
        save = "partial_kls" if save == True else save

        def partial_kl_mixture(p, pi, qi):
            return p * math.log(2 * pi / (pi + qi), 2)

        def partial_kl_q(p, pi, qi):
            return p * math.log(pi / qi, 2)

        partial_kl = partial_kl_q if denom == "q" else partial_kl_mixture
        print('Computing partial KLs...')
        pkls = []
        for phrase in self.vocab:
            dp, rp = self._probs(phrase)
            dscale = dp if not pfun else pfun(phrase)
            rscale = rp if not pfun else pfun(phrase)
            pkls.append((phrase, partial_kl(dscale, dp, rp),
                         partial_kl(rscale, rp, dp)))
        return self._metric_frame(pkls, save)

    def get_signal(self, denom="q", save=True):
        """Per-phrase log2 probability ratios between the parties.

        denom -- "q" for log2(p / q), "mixture" for log2(2p / (p + q)).
        Saving requires denom == "q".
        """
        assert denom in ("mixture", "q")
        if save:
            assert denom == "q"

        def signal_mixture(pi, qi):
            return math.log(2 * pi / (pi + qi), 2)

        def signal_q(pi, qi):
            return math.log(pi / qi, 2)

        signal = signal_q if denom == "q" else signal_mixture
        print('Computing signal reliability...')
        signals = []
        for phrase in self.vocab:
            dp, rp = self._probs(phrase)
            signals.append((phrase, signal(dp, rp), signal(rp, dp)))
        return self._metric_frame(signals, "signals" if save else None)

    def get_logps(self, save=True):
        """Per-phrase log2 probabilities for each party."""
        print('Computing log p\'s...')
        logps = []
        for phrase in self.vocab:
            dp, rp = self._probs(phrase)
            logps.append((phrase, math.log(dp, 2), math.log(rp, 2)))
        return self._metric_frame(logps, "logps" if save else None)

    def get_mixtures(self, save=True):
        """Per-phrase average of the two parties' probabilities (the same
        value is stored in both metric columns)."""
        print('Computing mixtures...')
        ms = []
        for phrase in self.vocab:
            dp, rp = self._probs(phrase)
            m = (dp + rp) / 2
            ms.append((phrase, m, m))
        return self._metric_frame(ms, "ms" if save else None)

    def get_probs(self, save=True):
        """Per-phrase relative frequencies for each party."""
        print('Computing raw probabilities...')
        probs = [(phrase,) + self._probs(phrase) for phrase in self.vocab]
        return self._metric_frame(probs, "probs" if save else None)

    # TODO: Get rid of this function; I think it's redundant with get_signal().
    def get_log_odds(self, save=True):
        """Per-phrase natural-log probability ratios between the parties."""
        print('Computing log odds...')
        logodds = []
        for phrase in self.vocab:
            dp, rp = self._probs(phrase)
            logodds.append((phrase, math.log(dp / rp), math.log(rp / dp)))
        return self._metric_frame(logodds, "logodds" if save else None)

    def get_conditional_probs(self, save=True):
        """Per-phrase share of the phrase's occurrences belonging to each
        party."""
        print('Computing conditional probabilities...')
        cond_probs = []
        n_d, n_r = sum(self.d_freq_dist.values()), \
            sum(self.r_freq_dist.values())
        # float so the marginal is a true ratio even under Python 2 integer
        # division
        n = float(n_d + n_r)
        for phrase in self.vocab:
            marg_prob = (self.d_freq_dist[phrase] +
                         self.r_freq_dist[phrase]) / n
            cp_d, cp_r = np.multiply(
                np.array(self._probs(phrase)),
                np.array([n_d, n_r])) / (marg_prob * n)
            #cp_d, cp_r = np.multiply(np.array([ self.d_freq_dist.freq(phrase),
            #                                    self.r_freq_dist.freq(phrase) ]),
            #                         np.array([ 1, 1 ])) / ( marg_prob * 2 )
            cond_probs.append((phrase, cp_d, cp_r))
        return self._metric_frame(cond_probs,
                                  "cond_probs" if save else None)

    def get_likelihood_ratios(self, save=True):
        """Per-phrase probability ratios P_D / P_R and P_R / P_D."""
        print('Computing likelihood ratios...')
        lrs = []
        for phrase in self.vocab:
            dp, rp = self._probs(phrase)
            lrs.append((phrase, dp / rp, rp / dp))
        return self._metric_frame(lrs,
                                  "likelihood_ratios" if save else None)

    def _get_valence(self, phrase):
        """Return (phrase, valence, arousal, dominance) using the
        module-level ``get_valence`` function."""
        v, a, d = get_valence(phrase)
        return (phrase, v, a, d)

    def get_valence(self, save=True):
        """Valence, arousal and dominance for every phrase in the vocabulary."""
        print("Getting valence...")
        vals = [self._get_valence(phrase) for phrase in self.vocab]
        df = pd.DataFrame(vals,
                          columns=["term", "valence", "arousal",
                                   "dominance"]).set_index("term")
        if save:
            df.to_pickle(METRICS_DIR + "valence-" + self.mode)
        return df

    def get_all(self, save=True):
        """Compute (and optionally save) the frequency, partial-KL, signal,
        probability, log-odds and conditional-probability metrics."""
        self.get_frequencies(save=save)
        self.get_partial_kl(save="partial_kls" if save else save)
        self.get_signal(save=save)
        self.get_probs(save=save)
        self.get_log_odds(save=save)
        self.get_conditional_probs(save=save)
class DirichletWords(object):
    """Per-topic word counts with Dirichlet-style smoothing for streaming LDA.

    Three families of counts are maintained, all as ``FreqDist`` objects:

    * ``_topics[k]`` -- how often each word was assigned to topic ``k``;
    * ``_words``     -- how often each word was seen across all topics;
    * ``_alphabet``  -- how often each character was seen (weighted by the
      count of the word it occurred in).

    Probabilities back off from topic counts to word counts to a
    character-sequence model (see ``topic_prob``, ``word_prob``,
    ``seq_prob``).  A word <-> integer index is kept alongside.
    """

    def initialize_index(self):
        """Reset the word <-> integer id mapping to empty."""
        self.word_to_int = {}
        self.int_to_word = {}

    def __init__(self, num_topics, alpha_topic=1.0, alpha_word=1.0,
                 max_tables=50000, sanity_check=False, initialize=False,
                 report_filename="topic_history.txt"):
        """Create empty count tables for ``num_topics`` topics.

        ``alpha_topic`` / ``alpha_word`` are the smoothing weights used by
        ``topic_prob`` / ``word_prob``.  ``max_tables`` is the table budget
        enforced by ``forget``.  When ``report_filename`` is truthy the file
        is opened for writing and ``as_matrix`` appends one line per call.
        ``initialize`` seeds the topics (deterministic-style seeding when
        ``sanity_check`` is also True, random otherwise).
        """
        self.max_tables = max_tables
        self._alphabet = FreqDist()
        # store all words seen in a list so they are associated with a unique ID.
        self.initialize_index()
        self._words = FreqDist()
        self.alpha_topic = alpha_topic
        self.alpha_word = alpha_word
        self._num_updates = 0
        self._report = None
        if report_filename:
            self._report = open(report_filename, 'w')
        self.num_topics = num_topics
        self._topics = [FreqDist() for x in xrange(num_topics)]
        # the sanity_check flag is for testing only.
        if initialize and sanity_check == True:
            self.deterministic_seed()
        elif initialize:
            self.initialize_topics()

    def deterministic_seed(self):
        ''' if sanity_check = True, this will seed the topics with enough variance
        to evolve but do so in the most basic and deterministic way possible, so a
        user can follow along each step of the algorithm

        NOTE(review): the three seed letters are drawn with random.choice, so
        the result is only reproducible if the caller seeds ``random``.'''
        chars = "abcdefghijklmnopqrstuvwxyz"
        for i in xrange(3):
            word = random.choice(chars)
            self.index(word)
            topic_weights = probability_vector(self.num_topics)
            for k in xrange(self.num_topics):
                self.update_count(word, k, topic_weights[k])

    def initialize_topics(self):
        ''' initializes the topics with some random seed words so that they have
        enough relative bias to evolve when new words are passed in.  '''
        # we are going to create some random string from /dev/urandom. to convert
        # them to a string, we need a translation table that is 256 characters.
        translate_table = (string.letters * 5)[:256]
        # /dev/urandom is technically not as random as /dev/random, but it doesn't
        # block.  The with-block closes the handle even if seeding fails.
        with open('/dev/urandom') as r:
            # make random 'words' and add them to the topics. they'll never
            # realistically be seen again- which is good since we just want them to
            # seed the bias in the topics.
            for i in xrange(self.num_topics):
                word_length = random.randint(9, 20)
                word = r.read(word_length).translate(translate_table)
                self.index(word)
                topic_weights = probability_vector(self.num_topics)
                for k in xrange(self.num_topics):
                    self.update_count(word, k, topic_weights[k])

    def __len__(self):
        """Return the number of entries in the global word table."""
        return len(self._words)

    def num_words(self):
        """Return how many words currently have a global count of at least 1."""
        return sum(1 for x in self._words if self._words[x] >= 1)

    def as_matrix(self):
        ''' Return a matrix of the probabilities of all words over all topics.
        note that because we are using topic_prob(), this is equivalent to he
        expectation of log beta, ie Elogbeta

        The result has shape (num_topics, num_words()) and holds
        log(topic_prob(k, word)).  Each call also bumps the update counter
        and, if a report file is open, appends one status line to it.'''

        # XXX TODO we should store this on the fly instead of recomputing it
        # all the time!

        # create a numpy array here because that's what the e_step in streamLDA
        # expects
        num_words = self.num_words()
        print("%i words" % num_words)
        lambda_matrix = n.zeros((self.num_topics, num_words))

        for word_index, word in enumerate(x for x in self._words
                                          if self._words[x] >= 1):
            topic_weights = [log(self.topic_prob(k, word))
                             for k in xrange(self.num_topics)]
            # topic weights for this word-- a column vector.
            lambda_matrix[:, word_index] = topic_weights

        self._num_updates += 1
        if self._report:
            self._report.write("%i %i %i %i\n" % (
                self._num_updates, len(self._alphabet),
                len(self._words), sum(x.B() for x in self._topics)))

        return lambda_matrix

    def forget(self, proportion):
        """Drop a random ``proportion`` of the word tables and rebuild the index.

        If the table count exceeds ``max_tables`` the excess is forgotten on
        top of the requested proportion.  A forgotten word is removed from
        every topic and from the global word table, and its characters'
        counts are reduced by the word's count.  Surviving words are
        re-indexed from scratch, so integer ids change.
        """
        num_tables = len(self._words)
        # random.sample needs an integer sample size; proportion is a float.
        number_to_forget = int(proportion * num_tables)
        if num_tables > self.max_tables:
            number_to_forget += (num_tables - self.max_tables)
        # never ask for more tables than exist (sample would raise)
        number_to_forget = min(number_to_forget, num_tables)

        # change this to weight lower probability
        # a set gives O(1) membership tests in the loop below
        tables_to_forget = set(random.sample(xrange(num_tables),
                                             number_to_forget))
        # snapshot the keys: the table is modified while we walk it
        words = list(self._words.keys())

        self.initialize_index()

        for word_id, ii in enumerate(words):
            if word_id not in tables_to_forget:
                self.index(ii)
                continue

            count = self._words[ii]
            # _topics is a list of FreqDists, so iterate the tables directly
            for topic_counts in self._topics:
                topic_counts[ii] = 0
                del topic_counts[ii]

            # undo what update_count added to the character counts
            for jj in ii:
                self._alphabet[jj] -= count
            self._words[ii] = 0
            del self._words[ii]

    def seq_prob(self, word):
        """Return the character-sequence probability of ``word``.

        The product of per-character relative frequencies (each floored at
        CHAR_SMOOTHING), divided by 2 ** len(word).
        """
        val = 1.0

        # Weighted monkeys at typewriter
        for ii in word:
            # Add in a threshold to make sure we don't have zero probability sequences
            val *= max(self._alphabet.freq(ii), CHAR_SMOOTHING)

        # Normalize
        val /= 2 ** (len(word))
        return val

    def merge(self, otherlambda, rhot):
        ''' fold the word counts in another DirichletWords object into this
        one, weighted by rhot. assumes self.num_topics is the same for both
        objects.

        Merged counts below 1.0 are dropped from the tables.  Raises
        ZeroDivisionError when either object's word table has N() == 0.'''

        all_words = self._words.keys() + otherlambda._words.keys()
        distinct_words = list(set(all_words))

        # combines the probabilities, with otherlambda weighted by rho, and
        # generates a new count by combining the number of words in the old
        # (current) lambda with the number in the new. here we essentially take
        # the same steps as update_count but do so explicitly so we can weight the
        # terms appropriately.
        total_words = float(self._words.N() + otherlambda._words.N())

        self_scale = (1.0 - rhot) * total_words / float(self._words.N())
        other_scale = rhot * total_words / float(otherlambda._words.N())

        for word in distinct_words:
            self.index(word)

            # update word counts
            new_val = (self_scale * self._words[word]
                       + other_scale * otherlambda._words[word])
            if new_val >= 1.0:
                self._words[word] = new_val
            else:
                self._words[word] = 0
                del self._words[word]

            # update topic counts
            for topic in xrange(self.num_topics):
                new_val = (self_scale * self._topics[topic][word]
                           + other_scale * otherlambda._topics[topic][word])
                if new_val >= 1.0:
                    self._topics[topic][word] = new_val
                else:
                    self._topics[topic][word] = 0
                    del self._topics[topic][word]

        # update sequence counts
        all_chars = self._alphabet.keys() + otherlambda._alphabet.keys()
        distinct_chars = list(set(all_chars))

        for ii in distinct_chars:
            self._alphabet[ii] = (self_scale * self._alphabet[ii]
                                  + other_scale * otherlambda._alphabet[ii])

    def word_prob(self, word):
        """Return the global probability of ``word``, smoothed by seq_prob."""
        return (self._words[word] + self.alpha_word * self.seq_prob(word)) / \
               (self._words.N() + self.alpha_word)

    def topic_prob(self, topic, word):
        """Return the probability of ``word`` in ``topic``, smoothed by word_prob."""
        return (self._topics[topic][word] +
                self.alpha_topic * self.word_prob(word)) / \
               (self._topics[topic].N() + self.alpha_topic)

    def update_count(self, word, topic, count):
        """Add ``count`` observations of ``word`` to ``topic`` and the global tables."""
        # create an index for the word
        self.index(word)

        # increment the frequency of the word in the specified topic
        self._topics[topic][word] += count
        # also keep a separate frequency count of the number of times this word has
        # appeared, across all documents.
        self._words[word] += count
        # finally, keep track of the appearance of each character.
        # note that this does not assume any particular character set nor limit
        # recognized characters. if words contain punctuation, etc. then they will
        # be counted here.
        for ii in word:
            self._alphabet[ii] += count

    def index(self, word):
        """Return the integer id of ``word``, assigning the next free id if new."""
        assert not isinstance(word, int)

        if word not in self.word_to_int:
            self.word_to_int[word] = len(self.word_to_int)
            self.int_to_word[self.word_to_int[word]] = word

        return self.word_to_int[word]

    def dictionary(self, word_id):
        """Return the word stored under integer ``word_id`` (KeyError if unknown)."""
        assert isinstance(word_id, int)

        return self.int_to_word[word_id]

    def print_probs(self, word):
        """Print the per-topic, global and sequence probabilities of ``word``."""
        # "%s %s" reproduces the output of the old two-argument print
        # statements while also parsing under Python 3.
        print("----------------")
        print(word)
        for ii in xrange(self.num_topics):
            print("%s %s" % (ii, self.topic_prob(ii, word)))
        print("%s %s" % ("WORD", self.word_prob(word)))
        print("%s %s" % ("SEQ", self.seq_prob(word)))
# coding: utf-8 import nltk from nltk.corpus import gutenberg # 导入 gutenberg 集 ################################################################## ## FreqDist 跟踪分布中的采样频率 (sample frequencies) from nltk import FreqDist # 导入 FreqDist 类 fd = FreqDist(gutenberg.words('austen-persuasion.txt')) # 频率分布实例化, 统计文本中的 Token print(fd) # <FreqDist with 51156 samples and 2621613 outcomes>; 可以得到 51156 个 不重复值, 2621613 个 token print(type(fd)) # <class 'nltk.probability.FreqDist'> print(fd['the']) # 3120; 查看 word 出现次数; 默认 FreqDist 是一个字典 print(fd.N()) # 98171; 是单词, 不是字母, 有重复的 print(fd.B()) # 6132; number of bins or unique samples; 唯一单词, bins 表示相同的会在一个 bin 中 print(len(fd.keys()), type(fd.keys())) # 6132 <class 'dict_keys'> print(fd.keys()) # fd.B() 只是输出个数, 这个是把所有词汇表输出 print(fd.max()) # 频率最高的一个词 print(fd.freq('the')) # 0.03178127960395636; 出现频率 3120 / 98171 print(fd.hapaxes()) # ['[', 'Persuasion', 'Jane', ...] 只出现一次的罕用词 # 出现频率最高的大多是一些"虚词", 出现频率极低的(hapaxes)又只能靠上下文来理解; 文本中出现频率最高和最低的那些词往往并不能反映这个文本的特征 for idx, word in enumerate(fd): # 可以用 enumerate 来遍历, 是按出现顺序排的 if idx == 5: break print(idx, word) # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen ################################################################## ## 统计词的长度频率 fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt')) print(fdist) # <FreqDist with 16 samples and 98171 outcomes> print(fdist.items()) # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)]) print(fdist.most_common(3)) # [(3, 20013), (1, 16274), (2, 16165)] ################################################################## ## 统计 英文字符 fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha()) # 可以不用 [] 将生成器 list 化 print(fdist.most_common(5)) # [('e', 46949), ('t', 32192), ('a', 29371), ('o', 27617), ('n', 26718)]
def test_freq_freqdist(self):
    """Model probabilities are identical to FreqDist relative frequencies."""
    reference = FreqDist(TEST_TOKENS)
    distinct_types = set(TEST_TOKENS)
    for token_type in distinct_types:
        # Query the model without any context (second argument is None).
        model_probability = self.model.prob(token_type, None)
        expected_probability = reference.freq(token_type)
        self.assertEqual(model_probability, expected_probability)
def _bucketed_distribution(observations):
    """Return RANGE relative frequencies describing ``observations``.

    The first RANGE - 1 entries are the relative frequencies of the values
    1 .. RANGE - 1; the final entry is the remaining probability mass
    (every other value, including 0 and anything >= RANGE).
    """
    freqDist = FreqDist(observations)
    buckets = [freqDist.freq(value) for value in range(1, RANGE)]
    buckets.append(1 - sum(buckets))
    return buckets


def load_book_features(filename, smartStopWords={}, pronSet={}, conjSet={}):
    '''
    Load the stylometric feature vector of one book.

    The text is extracted with extract_book_contents(), lower-cased, stripped
    of "'s", CR-LF pairs, dash runs, double quotes and underscores, and split
    into sentences.  There are 4 + RANGE*4 features per book:

    ---------------------------------------------------------------------------
    No.  Feature                                                  No. of values
    ---------------------------------------------------------------------------
    1.   hapax legomena / unique words (stop words excluded)                  1
    2.   dis legomena / unique words (stop words excluded)                    1
    3.   unique words / total words (stop words excluded)                     1
    4.   Flesch reading-ease score / 100                                      1
    5.   distribution of sentence length in words                         RANGE
    6.   distribution of word length in characters                        RANGE
    7.   distribution of pronouns (pronSet) per sentence                  RANGE
    8.   distribution of conjunctions (conjSet) per sentence              RANGE

    Each distribution holds the relative frequency of the values
    1 .. RANGE-1 followed by one bucket with the remaining mass.

    The three collection arguments are only used for membership tests and are
    never modified, so the shared default objects are harmless here.

    Returns a tuple (features, numTotalWords) where numTotalWords counts the
    words that are not in smartStopWords.

    Raises ZeroDivisionError when the book yields no words (or no words
    outside smartStopWords).
    '''
    # Close the file deterministically instead of relying on garbage collection.
    with open(filename, 'r') as bookFile:
        text = extract_book_contents(bookFile.read()).lower()

    contents = re.sub('\'s|(\r\n)|-+|["_]', ' ', text)  # remove \r\n, apostrophes, and dashes
    sentenceList = sent_tokenize(contents.strip())

    cleanWords = []
    sentenceLenDist = []
    pronDist = []
    conjDist = []
    wordLenDist = []
    totalWords = 0
    totalSyllables = 0

    for sentence in sentenceList:
        if sentence == ".":
            continue
        pronCount = 0
        conjCount = 0
        sentenceWords = re.findall(r"[\w']+", sentence)
        totalWords += len(sentenceWords)            # record all words in sentence
        sentenceLenDist.append(len(sentenceWords))  # record length of sentence in words
        for word in sentenceWords:
            totalSyllables += count(word)
            wordLenDist.append(len(word))           # record length of word in chars
            if word in pronSet:
                pronCount += 1                      # record no. of pronouns in sentence
            if word in conjSet:
                conjCount += 1                      # record no. of conjunctions in sentence
            if word not in smartStopWords:
                cleanWords.append(word)
        pronDist.append(pronCount)
        conjDist.append(conjCount)

    # List-based buckets work on both Python 2 and 3 (map() is lazy on 3).
    sentenceLengthDist = _bucketed_distribution(sentenceLenDist)
    pronounDist = _bucketed_distribution(pronDist)
    conjunctionDist = _bucketed_distribution(conjDist)
    wordLengthDist = _bucketed_distribution(wordLenDist)

    # calculate readability
    avgSentenceLength = np.mean(sentenceLenDist)
    avgSyllablesPerWord = float(totalSyllables) / totalWords
    readability = float(206.835 - (1.015 * avgSentenceLength)
                        - (84.6 * avgSyllablesPerWord)) / 100

    wordsFreqDist = MyFreqDist(FreqDist(cleanWords))

    numUniqueWords = len(wordsFreqDist.keys())
    numTotalWords = len(cleanWords)

    hapax = float(len(wordsFreqDist.hapaxes())) / numUniqueWords  # no. words occurring once / total num. UNIQUE words
    dis = float(len(wordsFreqDist.dises())) / numUniqueWords      # no. words occurring twice / total num. UNIQUE words
    richness = float(numUniqueWords) / numTotalWords              # no. unique words / total num. words

    result = [hapax, dis, richness, readability]
    result.extend(sentenceLengthDist)
    result.extend(wordLengthDist)
    result.extend(pronounDist)
    result.extend(conjunctionDist)

    return result, numTotalWords
return stem def lexical_diversity(text): return len(text) / len(set(text)) # process for author###### f = open('author.txt', encoding="latin-1") raw_author = f.read() author_list = [a for a in (re.split(r'[\t\n]+', raw_author)) if 2 < len(a) < 29] author_list = nltk.Text(author_list) fdist_author = FreqDist(author_list) fdist_author.max() fdist_author.freq('Vincent Granville') fdist_author.tabulate(10) fdist_author.plot(50, cumulative=True) fdist_author.most_common(10) popular_author = ['Vincent Granville', 'Michael Walker', 'Mirko Krivanek', 'Don Philip Faithful', 'William Vorhies', 'Bernard Marr'] total_author = len(set(author_list)) print("The total number of author in dsc is: " + str(total_author)) avg_post = 1700 / len(set(author_list)) print("Each author post " + str(int(avg_post)) + " blogs in dsc") # process for text ############# f = open('text.txt', encoding="latin-1") raw_text = f.read() # type type(raw_text)
if args.stop_punctuation: stoplist += [x.decode('UTF8') for x in set(list(punctuation))] stoplist += [u'\u201d', u'\u201c', u'\u2019', u'\u2014'] stoplist.append('--') words = [word for word in word_tokenize(text) if word not in stoplist] if args.stem: st = LancasterStemmer() words = [st.stem(word) for word in words] freq_dist = FreqDist(words) print('Total words: ' + str(orig_freq_dist.N())) print('Total after filter: ' + str(freq_dist.N())) # B() gives list of unique words print('Unique words: ' + str(freq_dist.B())) print('Unique words ratio: ' + str(float(freq_dist.B()) / float(freq_dist.N()))) print('\n') if args.words: for word in args.words: print(word + ': ' + str(freq_dist[word])) print(word + ' freq: ' + str(freq_dist.freq(word))) print('\n') # Show top 30 print('Top ' + str(args.num_words) + ' words:') freq_dist.tabulate(args.num_words)
from nltk import FreqDist
from common.books import text1

# Count how often each item length occurs among the items yielded by text1().
fdist = FreqDist(len(w) for w in text1())
print(fdist)  # summary line of the distribution
# print(fdist.keys())
# print(fdist.items())
print(fdist.most_common())  # every (length, count) pair
print(fdist.max())  # the length that occurs most often
print(fdist[3])  # how many items have length 3
print(fdist.freq(3))  # the same, as a fraction of all items
class Solution1:
    """Word-for-word dictionary translator with n-gram / POS based reordering.

    A bilingual dictionary (JSON) supplies the word translations; a
    monolingual training text supplies unigram, bigram and trigram counts over
    words, (word, tag) pairs and tags.  ``translate`` prints the outcome of
    every reordering strategy in turn.
    """

    # POS tags used by the two adjacent-swap heuristics.
    _SWAP_POS_LEFT_TAGS = ('PRP', 'PRP$', 'JJ')
    _SWAP_POS_RIGHT_TAGS = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'WP')
    _NOUN_TAGS = ('NN', 'NNS', 'NNP', 'NNPS')
    _VERB_TAGS = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    def __init__(self, dictionary_file, training_file):
        """Load the dictionary and train the language models.

        Exits the process with status 1 if either file cannot be opened.
        """
        self.dictionary = self.read_json_file(dictionary_file)
        training_data = self.read_text_file(training_file)
        self.uni_words = None
        self.bi_words = None
        self.tri_words = None
        self.uni_words_pos = None
        self.bi_words_pos = None
        self.uni_pos = None
        self.bi_pos = None
        # model name -> FreqDist of (n-1)-gram contexts, filled by train()
        self._contexts = {}
        self.train(training_data)

    @staticmethod
    def read_text_file(filename):
        """Return the lines of ``filename``, stripped and lower-cased.

        Prints a message to stderr and exits with status 1 if the file
        cannot be opened.
        """
        try:
            handle = open(filename, 'r')
        except OSError:
            print('Cannot read file ' + filename + '. Check the path',
                  file=sys.stderr)
            sys.exit(1)
        # the with-block closes the file once all lines are read
        with handle:
            return [line.strip().lower() for line in handle]

    @staticmethod
    def read_json_file(filename):
        """Return the parsed JSON content of ``filename``.

        Prints a message to stderr and exits with status 1 if the file
        cannot be opened.
        """
        try:
            handle = open(filename, 'r')
        except OSError:
            print('Cannot read file ' + filename + '. Please check the path',
                  file=sys.stderr)
            sys.exit(1)
        with handle:
            return json.load(handle)

    @staticmethod
    def words_sentence(words):
        """Join tokens into a sentence, with no space before punctuation."""
        return ''.join([
            word if word in string.punctuation else ' ' + word
            for word in words
        ]).strip()

    @staticmethod
    def print_translation(title, source, translation):
        """Print one strategy's title, the source line and its translation."""
        print('%s' % title)
        print('%s' % source)
        print('%s' % translation)
        print('\n')

    @staticmethod
    def _padded_ngrams(sequence, order):
        """Return the n-grams of ``sequence`` padded with <s> and </s>."""
        return list(
            ngrams(sequence, order, pad_left=True, pad_right=True,
                   left_pad_symbol='<s>', right_pad_symbol='</s>'))

    @staticmethod
    def _context_counts(grams):
        """Count how often each (n-1)-gram prefix starts an n-gram."""
        return FreqDist(gram[:-1] for gram in grams)

    def _smoothed_log_probability(self, grams, model, vocabulary_size):
        """Return the add-one smoothed log probability of ``grams``.

        Each n-gram contributes
        log((count(n-gram) + 1) / (count(context) + vocabulary_size)),
        where the context is the n-gram without its last element and the
        counts come from the table named ``model``.
        """
        counts = getattr(self, model)
        contexts = self._contexts[model]
        log_probability = 0
        for gram in grams:
            gram = tuple(gram)
            log_probability += (
                math.log(counts[gram] + 1)
                - math.log(contexts[gram[:-1]] + vocabulary_size))
        return log_probability

    def train(self, lines):
        """Build all count tables from the tokenised, POS-tagged ``lines``."""
        uni_words, bi_words, tri_words = [], [], []
        uni_words_pos, bi_words_pos = [], []
        uni_pos, bi_pos = [], []

        for line in lines:
            words = word_tokenize(line)
            words_pos = pos_tag(words)
            pos = [tag for _, tag in words_pos]

            # extend() keeps training linear in the corpus size
            uni_words.extend(['<s>'] + words + ['</s>'])
            uni_words_pos.extend(words_pos)
            uni_pos.extend(['<s>'] + pos + ['</s>'])

            bi_words.extend(self._padded_ngrams(words, 2))
            # the trigram table used to stay empty; collect it here
            tri_words.extend(self._padded_ngrams(words, 3))
            bi_words_pos.extend(self._padded_ngrams(words_pos, 2))
            # tag bigrams must be built from the tags, not (word, tag) pairs,
            # because bigram_pos_probability queries them with tags only
            bi_pos.extend(self._padded_ngrams(pos, 2))

        self.uni_words = FreqDist(uni_words)
        self.bi_words = FreqDist(bi_words)
        self.tri_words = FreqDist(tri_words)
        self.uni_words_pos = FreqDist(uni_words_pos)
        self.bi_words_pos = FreqDist(bi_words_pos)
        self.uni_pos = FreqDist(uni_pos)
        self.bi_pos = FreqDist(bi_pos)
        self._contexts = {
            'bi_words': self._context_counts(bi_words),
            'tri_words': self._context_counts(tri_words),
            'bi_words_pos': self._context_counts(bi_words_pos),
            'bi_pos': self._context_counts(bi_pos),
        }

    def bigram_words_probability(self, words):
        """Return the smoothed bigram log probability of the word sequence."""
        return self._smoothed_log_probability(
            self._padded_ngrams(words, 2), 'bi_words', len(self.uni_words))

    def trigram_words_probability(self, words):
        """Return the smoothed trigram log probability of the word sequence."""
        return self._smoothed_log_probability(
            self._padded_ngrams(words, 3), 'tri_words', len(self.uni_words))

    def bigram_pos_words_probability(self, words):
        """Return the smoothed bigram log probability over (word, tag) pairs."""
        words = pos_tag(words)
        return self._smoothed_log_probability(
            self._padded_ngrams(words, 2), 'bi_words_pos',
            len(self.uni_words_pos))

    def bigram_pos_probability(self, words):
        """Return the smoothed bigram log probability of a tag sequence.

        ``words`` is a sequence of POS tags; no padding is added here.
        """
        return self._smoothed_log_probability(
            list(ngrams(words, 2)), 'bi_pos', len(self.uni_pos))

    def probability_permutation(self, words, method):
        """Return the best of several random permutations of ``words``.

        ``method`` names the scoring method on this object.  len(words)!
        random permutations are drawn for fewer than 5 words, 100 otherwise;
        the draws are random, so not every ordering is necessarily tried.
        """
        max_probability = -math.inf
        selected = None
        permutation_count = math.factorial(
            len(words)) if len(words) < 5 else 100
        for _ in range(permutation_count):
            permutation = numpy.random.permutation(words)
            probability = getattr(self, method)(permutation)
            if probability > max_probability:
                max_probability = probability
                selected = permutation
        return selected

    def pos_model(self, words):
        """Reorder ``words`` by sliding a 4-token window over the tagged text.

        Inside each window the permutation with the highest tag-bigram
        probability is written back before the window moves on by one token.
        Inputs shorter than the window are handled as one smaller window.
        The returned list includes the two empty sentence-boundary tokens.
        """
        window = 4
        words_pos = [('', '<s>')] + pos_tag(words) + [('', '</s>')]
        # the last window starts at len - 4 (or at 0 for short inputs)
        for start in range(max(len(words_pos) - window, 0) + 1):
            selected = None
            max_probability = -math.inf
            for permutation in itertools.permutations(
                    words_pos[start:start + window]):
                pos = [tag for _, tag in permutation]
                probability = self.bigram_pos_probability(pos)
                if probability > max_probability:
                    max_probability = probability
                    selected = permutation
            if selected is not None:
                words_pos[start:start + len(selected)] = selected
        return [word for word, _ in words_pos]

    def _swap_adjacent(self, words, left_tags, right_tags):
        """Swap every adjacent pair tagged (left_tags, right_tags), left to right.

        The scan stops one token before the end, so a final token carrying a
        left-hand tag no longer raises IndexError.
        """
        words_pos = pos_tag(words)
        for index in range(len(words_pos) - 1):
            if (words_pos[index][1] in left_tags
                    and words_pos[index + 1][1] in right_tags):
                words_pos[index], words_pos[index + 1] = (
                    words_pos[index + 1], words_pos[index])
        return [word for word, _ in words_pos]

    def swap_pos(self, words):
        """Move a pronoun/adjective behind the verb (or WP) that follows it."""
        return self._swap_adjacent(words, self._SWAP_POS_LEFT_TAGS,
                                   self._SWAP_POS_RIGHT_TAGS)

    def swap_verb_after_noun(self, words):
        """Move a noun behind the verb that directly follows it."""
        return self._swap_adjacent(words, self._NOUN_TAGS, self._VERB_TAGS)

    def translate(self, line):
        """Translate ``line`` and print the result of every strategy.

        Raises KeyError if a non-punctuation token is missing from the
        dictionary.
        """
        words = word_tokenize(line)
        translated_words = []
        for word in words:
            if word not in string.punctuation:
                translated_words.append(self.dictionary[word])
            else:
                translated_words.append(word)
        translated_sentence = self.words_sentence(translated_words)
        self.print_translation('Translation with 0 strategy', line,
                               translated_sentence)

        # Swap pronouns/adjectives with a directly following verb
        translated_words = self.swap_pos(translated_words)
        translated_sentence = self.words_sentence(translated_words)
        self.print_translation('Translation after swapping parts of speech',
                               line, translated_sentence)

        # Swap nouns with a directly following verb
        translated_words = self.swap_verb_after_noun(translated_words)
        translated_sentence = self.words_sentence(translated_words)
        self.print_translation('Translation after swapping verb with noun',
                               line, translated_sentence)

        # Each model below starts from the swapped word order above.

        # Bigram Language Model
        selected_translation = self.probability_permutation(
            translated_words, 'bigram_words_probability')
        translated_sentence = self.words_sentence(selected_translation)
        self.print_translation('Translation after applying Bigram Model',
                               line, translated_sentence)

        # Trigram Language Model
        selected_translation = self.probability_permutation(
            translated_words, 'trigram_words_probability')
        translated_sentence = self.words_sentence(selected_translation)
        self.print_translation('Translation after applying Trigram Model',
                               line, translated_sentence)

        # Bigram POS Language Model
        selected_translation = self.probability_permutation(
            translated_words, 'bigram_pos_words_probability')
        translated_sentence = self.words_sentence(selected_translation)
        self.print_translation(
            'Translation after applying Bigram and POS Tagging', line,
            translated_sentence)

        # Rearrangement of POS
        selected_translation = self.pos_model(translated_words)
        translated_sentence = self.words_sentence(selected_translation)
        self.print_translation('Translation after POS rearrangement', line,
                               translated_sentence)

    def execute(self, input_file):
        """Translate every line of ``input_file``."""
        lines = self.read_text_file(input_file)
        for line in lines:
            self.translate(line)
# Counting the number of characters in each word in a text [len(w) for w in text1] # Collocations are frequent bigrams from words that are not so common as unigrams. # This function returns nothing, just prints the collocations to screen text1.collocations() # Computing the frequency distribution of word lengths. Returns a dictionary. fdistWordLength = FreqDist([len(w) for w in text1]) fdistWordLength.keys() # The different word lengths fdistWordLength.values() # The frequency of each word length fdistWordLength.items() # Shows both keys and values at the same time fdist1['the'] fdist1.freq('the') # Frequency of the word ‘the’ fdist1.max() #### MOVIE REVIEWS #### import nltk from nltk.corpus import movie_reviews movie_reviews.categories() movie_reviews.fileids('pos') movie_reviews.fileids('neg') movie_reviews.words('neg/cv729_10475.txt') len(movie_reviews.words('neg/cv729_10475.txt')) documents = [(list(movie_reviews.words(fileid)), category)
class DirichletWords(object):
    """Per-topic word counts with Dirichlet-style smoothing for streaming LDA.

    Three families of counts are maintained, all as ``FreqDist`` objects:

    * ``_topics[k]`` -- how often each word was assigned to topic ``k``;
    * ``_words``     -- how often each word was seen across all topics;
    * ``_alphabet``  -- how often each character was seen (weighted by the
      count of the word it occurred in).

    Probabilities back off from topic counts to word counts to a
    character-sequence model (see ``topic_prob``, ``word_prob``,
    ``seq_prob``).  A word <-> integer index is kept alongside.
    """

    def initialize_index(self):
        """Reset the word <-> integer id mapping to empty."""
        self.word_to_int = {}
        self.int_to_word = {}

    def __init__(self, num_topics, alpha_topic=1.0, alpha_word=1.0,
                 max_tables=50000, sanity_check=False, initialize=False,
                 report_filename="topic_history.txt"):
        """Create empty count tables for ``num_topics`` topics.

        ``alpha_topic`` / ``alpha_word`` are the smoothing weights used by
        ``topic_prob`` / ``word_prob``.  ``max_tables`` is the table budget
        enforced by ``forget``.  When ``report_filename`` is truthy the file
        is opened for writing and ``as_matrix`` appends one line per call.
        ``initialize`` seeds the topics (deterministic-style seeding when
        ``sanity_check`` is also True, random otherwise).
        """
        self.max_tables = max_tables
        self._alphabet = FreqDist()
        # store all words seen in a list so they are associated with a unique ID.
        self.initialize_index()
        self._words = FreqDist()
        self.alpha_topic = alpha_topic
        self.alpha_word = alpha_word
        self._num_updates = 0
        self._report = None
        if report_filename:
            self._report = open(report_filename, 'w')
        self.num_topics = num_topics
        self._topics = [FreqDist() for x in xrange(num_topics)]
        # the sanity_check flag is for testing only.
        if initialize and sanity_check == True:
            self.deterministic_seed()
        elif initialize:
            self.initialize_topics()

    def deterministic_seed(self):
        ''' if sanity_check = True, this will seed the topics with enough variance
        to evolve but do so in the most basic and deterministic way possible, so a
        user can follow along each step of the algorithm

        NOTE(review): the three seed letters are drawn with random.choice, so
        the result is only reproducible if the caller seeds ``random``.'''
        chars = "abcdefghijklmnopqrstuvwxyz"
        for i in xrange(3):
            word = random.choice(chars)
            self.index(word)
            topic_weights = probability_vector(self.num_topics)
            for k in xrange(self.num_topics):
                self.update_count(word, k, topic_weights[k])

    def initialize_topics(self):
        ''' initializes the topics with some random seed words so that they have
        enough relative bias to evolve when new words are passed in.  '''
        # we are going to create some random string from /dev/urandom. to convert
        # them to a string, we need a translation table that is 256 characters.
        translate_table = (string.letters * 5)[:256]
        # /dev/urandom is technically not as random as /dev/random, but it doesn't
        # block.  The with-block closes the handle even if seeding fails.
        with open('/dev/urandom') as r:
            # make random 'words' and add them to the topics. they'll never
            # realistically be seen again- which is good since we just want them to
            # seed the bias in the topics.
            for i in xrange(self.num_topics):
                word_length = random.randint(9, 20)
                word = r.read(word_length).translate(translate_table)
                self.index(word)
                topic_weights = probability_vector(self.num_topics)
                for k in xrange(self.num_topics):
                    self.update_count(word, k, topic_weights[k])

    def __len__(self):
        """Return the number of entries in the global word table."""
        return len(self._words)

    def num_words(self):
        """Return how many words currently have a global count of at least 1."""
        return sum(1 for x in self._words if self._words[x] >= 1)

    def as_matrix(self):
        ''' Return a matrix of the probabilities of all words over all topics.
        note that because we are using topic_prob(), this is equivalent to he
        expectation of log beta, ie Elogbeta

        The result has shape (num_topics, num_words()) and holds
        log(topic_prob(k, word)).  Each call also bumps the update counter
        and, if a report file is open, appends one status line to it.'''

        # XXX TODO we should store this on the fly instead of recomputing it
        # all the time!

        # create a numpy array here because that's what the e_step in streamLDA
        # expects
        num_words = self.num_words()
        print("%i words" % num_words)
        lambda_matrix = n.zeros((self.num_topics, num_words))

        for word_index, word in enumerate(x for x in self._words
                                          if self._words[x] >= 1):
            topic_weights = [log(self.topic_prob(k, word))
                             for k in xrange(self.num_topics)]
            # topic weights for this word-- a column vector.
            lambda_matrix[:, word_index] = topic_weights

        self._num_updates += 1
        if self._report:
            self._report.write("%i %i %i %i\n" % (
                self._num_updates, len(self._alphabet),
                len(self._words), sum(x.B() for x in self._topics)))

        return lambda_matrix

    def forget(self, proportion):
        """Drop a random ``proportion`` of the word tables and rebuild the index.

        If the table count exceeds ``max_tables`` the excess is forgotten on
        top of the requested proportion.  A forgotten word is removed from
        every topic and from the global word table, and its characters'
        counts are reduced by the word's count.  Surviving words are
        re-indexed from scratch, so integer ids change.
        """
        num_tables = len(self._words)
        # random.sample needs an integer sample size; proportion is a float.
        number_to_forget = int(proportion * num_tables)
        if num_tables > self.max_tables:
            number_to_forget += (num_tables - self.max_tables)
        # never ask for more tables than exist (sample would raise)
        number_to_forget = min(number_to_forget, num_tables)

        # change this to weight lower probability
        # a set gives O(1) membership tests in the loop below
        tables_to_forget = set(random.sample(xrange(num_tables),
                                             number_to_forget))
        # snapshot the keys: the table is modified while we walk it
        words = list(self._words.keys())

        self.initialize_index()

        for word_id, ii in enumerate(words):
            if word_id not in tables_to_forget:
                self.index(ii)
                continue

            count = self._words[ii]
            # _topics is a list of FreqDists, so iterate the tables directly
            for topic_counts in self._topics:
                topic_counts[ii] = 0
                del topic_counts[ii]

            # undo what update_count added to the character counts
            for jj in ii:
                self._alphabet[jj] -= count
            self._words[ii] = 0
            del self._words[ii]

    def seq_prob(self, word):
        """Return the character-sequence probability of ``word``.

        The product of per-character relative frequencies (each floored at
        CHAR_SMOOTHING), divided by 2 ** len(word).
        """
        val = 1.0

        # Weighted monkeys at typewriter
        for ii in word:
            # Add in a threshold to make sure we don't have zero probability sequences
            val *= max(self._alphabet.freq(ii), CHAR_SMOOTHING)

        # Normalize
        val /= 2 ** (len(word))
        return val

    def merge(self, otherlambda, rhot):
        ''' fold the word counts in another DirichletWords object into this
        one, weighted by rhot. assumes self.num_topics is the same for both
        objects.

        Merged counts below 1.0 are dropped from the tables.  Raises
        ZeroDivisionError when either object's word table has N() == 0.'''

        all_words = self._words.keys() + otherlambda._words.keys()
        distinct_words = list(set(all_words))

        # combines the probabilities, with otherlambda weighted by rho, and
        # generates a new count by combining the number of words in the old
        # (current) lambda with the number in the new. here we essentially take
        # the same steps as update_count but do so explicitly so we can weight the
        # terms appropriately.
        total_words = float(self._words.N() + otherlambda._words.N())

        self_scale = (1.0 - rhot) * total_words / float(self._words.N())
        other_scale = rhot * total_words / float(otherlambda._words.N())

        for word in distinct_words:
            self.index(word)

            # update word counts
            new_val = (self_scale * self._words[word]
                       + other_scale * otherlambda._words[word])
            if new_val >= 1.0:
                self._words[word] = new_val
            else:
                self._words[word] = 0
                del self._words[word]

            # update topic counts
            for topic in xrange(self.num_topics):
                new_val = (self_scale * self._topics[topic][word]
                           + other_scale * otherlambda._topics[topic][word])
                if new_val >= 1.0:
                    self._topics[topic][word] = new_val
                else:
                    self._topics[topic][word] = 0
                    del self._topics[topic][word]

        # update sequence counts
        all_chars = self._alphabet.keys() + otherlambda._alphabet.keys()
        distinct_chars = list(set(all_chars))

        for ii in distinct_chars:
            self._alphabet[ii] = (self_scale * self._alphabet[ii]
                                  + other_scale * otherlambda._alphabet[ii])

    def word_prob(self, word):
        """Return the global probability of ``word``, smoothed by seq_prob."""
        return (self._words[word] + self.alpha_word * self.seq_prob(word)) / \
               (self._words.N() + self.alpha_word)

    def topic_prob(self, topic, word):
        """Return the probability of ``word`` in ``topic``, smoothed by word_prob."""
        return (self._topics[topic][word] +
                self.alpha_topic * self.word_prob(word)) / \
               (self._topics[topic].N() + self.alpha_topic)

    def update_count(self, word, topic, count):
        """Add ``count`` observations of ``word`` to ``topic`` and the global tables."""
        # create an index for the word
        self.index(word)

        # increment the frequency of the word in the specified topic
        self._topics[topic][word] += count
        # also keep a separate frequency count of the number of times this word has
        # appeared, across all documents.
        self._words[word] += count
        # finally, keep track of the appearance of each character.
        # note that this does not assume any particular character set nor limit
        # recognized characters. if words contain punctuation, etc. then they will
        # be counted here.
        for ii in word:
            self._alphabet[ii] += count

    def index(self, word):
        """Return the integer id of ``word``, assigning the next free id if new."""
        assert not isinstance(word, int)

        if word not in self.word_to_int:
            self.word_to_int[word] = len(self.word_to_int)
            self.int_to_word[self.word_to_int[word]] = word

        return self.word_to_int[word]

    def dictionary(self, word_id):
        """Return the word stored under integer ``word_id`` (KeyError if unknown)."""
        assert isinstance(word_id, int)

        return self.int_to_word[word_id]

    def print_probs(self, word):
        """Print the per-topic, global and sequence probabilities of ``word``."""
        # "%s %s" reproduces the output of the old two-argument print
        # statements while also parsing under Python 3.
        print("----------------")
        print(word)
        for ii in xrange(self.num_topics):
            print("%s %s" % (ii, self.topic_prob(ii, word)))
        print("%s %s" % ("WORD", self.word_prob(word)))
        print("%s %s" % ("SEQ", self.seq_prob(word)))
# dest = '/Users/asif/Sites/pmidx/journals.csv' # f = open(dest, 'w+') # f.write(journalsCSV) # f.close() # Tokenized titles tokenized_titles = [] tokenized_titles = [word_tokenize(titles[x]) for x in xrange(0,len(titles))] tkTitlesList = [] for n in xrange(0,len(tokenized_titles)): tkTitlesList = tkTitlesList + tokenized_titles[n] stops=['a','the','had','.','(',')','and','of',':',',','in','[',']','for','by','--','?','an','\'','\'s','to','on','is','as','from','-','at','can','does','or','but','use','its','with','using','during'] tokenizedTitles = [token.lower() for token in tkTitlesList if token.lower() not in stops] fdist = FreqDist(tokenizedTitles) sortedTitleWords = fdist.keys() sortedTitleProb = [fdist.freq(token) for token in sortedTitleWords] sortedTitleN = fdist.N() sortedTitleCounts = [int(prob*sortedTitleN) for prob in sortedTitleProb] titlesCounter = {} for x in xrange(0,60): titlesCounter[sortedTitleWords[x]] = sortedTitleCounts[x] # Returns collaborators as a dictionary matrix def collaborators_matrix(authors): coll = {} for x in xrange(0,len(authors)): if authors[x]: for y in xrange(0,len(authors[x])): for z in xrange(0,len(authors[x])): if authors[x][y] != authors[x][z]: if authors[x][y] in coll.keys(): # first author
            # NOTE(review): tail of a function whose header is outside the
            # visible chunk; nesting below is reconstructed from the syntax.
            # Join the tokens found between '[' and ']' ...
            return ' '.join( tl[tl.index('[') + 1 : tl.index(']')] )
        else:
            # ... otherwise join the first five tokens.
            return ' '.join( tl[ 0 : 5 ] )
    return

# 3. Unigrams

from nltk import FreqDist

# a. Lowercase the tokens in emma and create a frequency distribution from them.
#    (Do not throw away punctuation.) Store the result in fd1.
fd1 = FreqDist( list( t.lower() for t in emma) )

# b. Set A3b to the count of the word 'town' in fd1.
A3b = fd1['town']

# c. Set A3c to the relative frequency (probability) of the word 'town' in fd1.
A3c = fd1.freq('town')

# d. Set A3d to the number of hapaxes in the distribution fd1.
#    (a hapax is counted here as any sample whose count is exactly 1)
A3d = len( list( x for x in fd1 if fd1[x] == 1 ) )

# 4. When one formats floating-point numbers, one can specify the number of
# digits after the decimal point as follows:
# >>> '{:.4}'.format(1/7)
# >>> '0.1429'
# Write a function print_uni that takes a FreqDist as input and prints a table with
# three columns: a word, its count, and its relative frequency. It should print the
# words in alphabetic order. The first column should be 10 characters wide. If a word
# is more than 10 characters long, truncate it to 10 characters. The second column
# should be five characters wide, and the relative frequency should be printed with
# print allwords for i in range(len(allwords)): try: y[i] = int((complexity[allwords[i]])) except: y[i] = 0 # print y fdist = FreqDist(brown.words()) freqComplex = [] freqSimple = [] x = [] for i in range(len(allwords)): x.append([]) for i in range(len(allwords)): x[i].append(fdist.freq(allwords[i])) if (y[i] == 1): freqComplex.append(fdist.freq(allwords[i])) else: freqSimple.append(fdist.freq(allwords[i])) x[i].append(len(allwords[i])) x[i].append(synobj.synCount(allwords[i])) x[i].append(synobj.len_of_synonyms(allwords[i])) """NO OF VOWELS""" complex_vowels = [] word_weights_complex = [] simple_vowels = [] word_weights_simple = [] """all vowels 1,consonants 1.5,x&z 4,q 5""" alpha_weights = { 'a': 1,
# robin talks about himself a lot, so keep 'i' in play for the sake of comedy
stopwords.remove('i')

# Lowercased, stopword-free word frequencies of the text under study.
robin_tokens = (token.lower() for token in read_words('robin.txt'))
robin = FreqDist(token for token in robin_tokens if token not in stopwords)

# Same treatment for the reference corpus. Lots of other prosam texts would be
# a more interesting base, but I've already procrastinated enough...
base_tokens = (token.lower() for token in nltk.corpus.brown.words())
base = FreqDist(token for token in base_tokens if token not in stopwords)

# Print, for every word of the text, the absolute difference between its
# relative frequency in the text and in the base corpus.
# I run it like `python freq_analyse.py | sort -n | tail -n 15`
for term in robin:
    gap = abs(robin.freq(term) - base.freq(term))
    print('%.10f' % gap, term)

# sample result:
# 0.0074393564 well
# 0.0078216064 know
# 0.0088326042 learned
# 0.0103504380 like
# 0.0104640871 feel
# 0.0104676596 education
# 0.0107427367 learning
# 0.0112185385 good
# 0.0121670187 really
# 0.0126582278 i’ve
# 0.0143654005 also
# # First # # Here we will determine the relative frequencies of English characters in the text # Then we will calculate the entropy of the distribution # here we use the expression list(var_name) to turn our string into a list # this basically separates each character for us to make it so that it works # directly in the freqdist function english_unigram_fdist = FreqDist(list(english_model_content)) english_unigram_entropy = 0.0 # now loop and get the entropy for english unigrams for unigram in english_unigram_fdist.samples(): english_unigram_entropy += english_unigram_fdist.freq(unigram) * math.log(english_unigram_fdist.freq(unigram), 2) english_unigram_entropy = -english_unigram_entropy print "The English Unigram Entropy is: " + str(english_unigram_entropy) # # Second # # Here we will determine the relative frequencies of English bigrams in the text # Then we will calculate the entropy of the bigram distribution # create a list to store bigrams in english_model_bigrams = []
def detect(request):
    """Detect filler words ("muletillas") in an uploaded ``.txt`` speech.

    On POST with a UTF-8 ``.txt`` upload in ``request.FILES['document']``:
    keeps the tokens of the speech that occur in ``mul.txt``, counts them,
    writes ``count<TAB>relative frequency<TAB>word`` lines to
    ``muletillasR.txt``, stores the result through ``collection.insert_one``
    (best effort) and renders ``responde.html`` with the ten most frequent
    filler words and their counts.

    Any other request method renders ``home.html``. A missing upload, a file
    whose extension is not ``txt`` or content that is not valid UTF-8 renders
    ``home.html`` with a warning message.
    """
    import os  # local import; only needed to split the file name robustly

    if request.method != 'POST':
        return render(request, 'home.html')

    def reject_upload():
        # Single place for the "wrong file" answer.
        messages.warning(request, "Verifique el tipo de archivo", extra_tags='file')
        return render(request, 'home.html')

    # Data input
    identificacion = request.POST.get('dni')
    a = request.FILES.get('document')
    if a is None:
        return reject_upload()
    documento = str(a)
    # splitext copes with names that have no dot or several dots; the old
    # split('.') raised IndexError / picked the wrong part in those cases.
    nombre_doc, extension = os.path.splitext(documento)
    tipo_doc = extension[1:]
    if tipo_doc != 'txt':
        return reject_upload()

    try:
        # Decode first, then lowercase: bytes.lower() only touches ASCII and
        # would leave accented capitals untouched.
        discurso = a.read().decode('UTF-8').lower()
    except UnicodeDecodeError:
        return reject_upload()

    # mul=set(stopwords.words("spanish"))
    with codecs.open('mul.txt', "r", encoding='UTF-8') as mul_file:
        mul = mul_file.read()

    # Separate filler words from common words.
    # NOTE(review): `w in mul` is a substring test against the whole text of
    # mul.txt, not a word-membership test -- confirm this is intended.
    text_completo = wordpunct_tokenize(discurso)
    m = [w for w in text_completo if w in mul]

    # Scratch file read back by the corpus reader below. Opening with "w"
    # replaces the old remove()+append, which failed when the file was absent;
    # writing and reading with the same explicit encoding avoids mojibake.
    with codecs.open('muletillas.txt', "w", encoding='utf-8') as muletillas:
        for i in m:
            muletillas.write(i)
            muletillas.write(" ")

    # Count the filler words
    tokenizador = RegexpTokenizer(r'\w+|[^\w\s]+')
    corpus = PlaintextCorpusReader(".", 'muletillas.txt', word_tokenizer=tokenizador, encoding='utf-8')
    frecuencia = FreqDist(corpus.words())

    palabras = []
    repeticiones = []
    # Store the extracted data in a text file for later presentation
    with codecs.open("muletillasR.txt", "w", encoding="utf-8") as salida:
        for palabra, frecuencia_absoluta in frecuencia.most_common():
            frecuencia_relativa = frecuencia.freq(palabra)
            cadena = str(frecuencia_absoluta)+"\t"+str(frecuencia_relativa)+"\t"+palabra
            palabras.append(palabra.upper())
            repeticiones.append(frecuencia_absoluta)
            salida.write(cadena+"\n")

    # Persisting is best effort: a database failure must not break the page.
    try:
        collection.insert_one({
            'identificacion': identificacion,
            'documento': documento,
            'discurso': discurso,
            'muletillas': palabras
        })
    except Exception as e:
        print("Error : ", type(e), e)

    # Send the data to the front end
    context = {
        'documento': nombre_doc,
        'muletillas': palabras[0:10],
        'repeticiones': repeticiones[0:10]
    }
    return render(request, 'responde.html', context)


# class LineChartJSONView(BaseLineChartView):
#     def get_labels():
#         """Return 7 labels for the x-axis."""
#         return ["January", "February", "March", "April", "May", "June","July", "August", "September", "October"]
#     def get_providers(self):
#         """Return names of datasets."""
#         return ["Repeticiones"]
#     def get_data(self):
#         """Return 3 datasets to plot."""
#         return [[75, 44, 92, 11, 44, 95, 35, 11, 44, 95, 35]]
# line_chart = TemplateView.as_view(template_name='responde.html')
# line_chart_json = LineChartJSONView.as_view()
class Solution1:
    """
    Class that implements Direct Machine Translation, as required by the Problem 1 of the assignment

    Words are translated one by one with a dictionary; the result is then
    post-processed with POS-based fix-ups and re-ordered with bigram language
    models trained on a target-language (English) text.
    """

    # Tag groups used by the fix-up rules.
    _PRONOUN_TAGS = ('PRP', 'PRP$')
    _VERB_TAGS = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    _NOMINAL_TAGS = ('PRP', 'PRP$', 'NN', 'NNS', 'NNP', 'NNPS')

    def __init__(self, dictionary_file, training_file):
        """
        Initialize the class instance
        :param dictionary_file: The JSON file of closed dictionary
        :param training_file: Training target language file (English file)
        """
        # Read dictionary file
        self.dictionary = self.read_json_file(dictionary_file)

        # Read training file
        training_data = self.read_text_file(training_file)

        # Declare language model attributes
        self.unigram_words = None
        self.bigram_words = None
        self.unigram_pos_words = None
        self.bigram_pos_words = None
        self.unigram_pos = None
        self.bigram_pos = None

        # Prepare the language model
        self.train(training_data)

    @staticmethod
    def read_text_file(filename):
        """
        Read the text file
        :param filename: Filename of the text file
        :return: list of stripped, lower-cased lines of the text file
        Exits the process with status 1 when the file cannot be read.
        """
        try:
            # `with` closes the handle; the previous version leaked it.
            with open(filename, 'r') as handle:
                return [line.strip().lower() for line in handle]
        except IOError as e:
            print('Cannot read file ' + filename +
                  '. Please check the path', file=sys.stderr)
            print('I/O error({0}): {1}'.format(e.errno, e.strerror),
                  file=sys.stderr)
            sys.exit(1)

    @staticmethod
    def read_json_file(filename):
        """
        Read a json file
        :param filename: filename of the json file
        :return: dictionary object of json
        Exits the process with status 1 when the file cannot be opened.
        """
        try:
            handle = open(filename, 'r')
        except IOError as e:
            print('Cannot read file ' + filename +
                  '. Please check the path', file=sys.stderr)
            print('I/O error({0}): {1}'.format(e.errno, e.strerror),
                  file=sys.stderr)
            sys.exit(1)
        with handle:
            return json.load(handle)

    """
    # Google translation, not used
    @staticmethod
    def prepare_dictionary(lines, srclang, targetlang):
        words = []
        for line in lines:
            line = line.strip().lower()
            words = words + word_tokenize(line)

        words = map(lambda word: word.lower(), words)
        words = set(words)
        output = dict()
        translate_client = translate.Client()
        for word in words:
            output[word] = translate_client.translate(word, targetlang, source_language=srclang)
            output[word] = output[word]['translatedText']

        return output
    """

    @staticmethod
    def words_to_sentence(words):
        """
        Join tokens into a sentence: punctuation is attached to the previous
        token, every other token is preceded by a space.
        :param words: list of tokens
        :return: sentence string
        """
        return ''.join([
            word if word in string.punctuation else ' ' + word
            for word in words
        ]).strip()

    @staticmethod
    def _without_indices(items, indices):
        """Return a copy of *items* without the positions listed in *indices*."""
        # Filtering in one pass avoids the index shift caused by popping
        # several positions in ascending order.
        dropped = set(indices)
        return [item for position, item in enumerate(items)
                if position not in dropped]

    @staticmethod
    def _padded_bigrams(sequence):
        """Return the bigrams of *sequence* padded with '<s>' and '</s>'."""
        return list(
            ngrams(sequence, 2, pad_left=True, pad_right=True,
                   left_pad_symbol='<s>', right_pad_symbol='</s>'))

    @staticmethod
    def _add_one_log_probability(bigrams, bigram_counts, unigram_counts):
        """
        Return the add-one smoothed log probability of a bigram sequence:
        sum of log((count(w1, w2) + 1) / (count(w1) + V)), V = vocabulary size.
        The count containers must return 0 for unseen keys.
        """
        vocabulary_size = len(unigram_counts)
        return sum(
            math.log(bigram_counts[bigram] + 1)
            - math.log(unigram_counts[bigram[0]] + vocabulary_size)
            for bigram in bigrams)

    @staticmethod
    def fix_determiners(words):
        """
        Fix "A", "An", "The" determiners: drop a determiner that precedes a
        pronoun and turn "a" into "an" before a word starting with a vowel.
        :param words: input words
        :return: fixed words
        """
        words_pos = pos_tag(words)

        # Indexes of words to remove
        indices_to_remove = []

        # Only words that have a successor are inspected, so a one-word
        # input can no longer index past the end.
        for index in range(len(words_pos) - 1):
            token, tag = words_pos[index]
            if tag != 'DT':
                continue
            next_token, next_tag = words_pos[index + 1]
            if next_tag in Solution1._PRONOUN_TAGS:
                # Determiner before pronouns
                indices_to_remove.append(index)
            elif token == 'a' and next_token.startswith(
                    ('a', 'e', 'i', 'o', 'u')):
                # Replace "A" with "An"
                words_pos[index] = ('an', tag)

        kept = Solution1._without_indices(words_pos, indices_to_remove)
        return [word[0] for word in kept]

    @staticmethod
    def remove_consecutive_prp(words):
        """
        Remove consecutive pronouns (the first of each adjacent pair is dropped)
        :param words: input words
        :return: fixed words
        """
        words_pos = pos_tag(words)
        pronouns = Solution1._PRONOUN_TAGS

        # Identify consecutive pronouns
        indices_to_remove = [
            index for index in range(len(words_pos) - 1)
            if words_pos[index][1] in pronouns
            and words_pos[index + 1][1] in pronouns
        ]

        kept = Solution1._without_indices(words_pos, indices_to_remove)
        return [word[0] for word in kept]

    @staticmethod
    def swap_verb_prp(words):
        """
        Swap reverse ordered verb and noun/pronoun
        :param words: input words
        :return: fixed words
        """
        words_pos = pos_tag(words)

        # The list is read live, so a verb that was just moved one place to
        # the right is examined again at its new position (unchanged behavior).
        for index in range(len(words_pos) - 1):
            current, following = words_pos[index], words_pos[index + 1]
            if current[1] in Solution1._VERB_TAGS \
                    and following[1] in Solution1._NOMINAL_TAGS:
                words_pos[index], words_pos[index + 1] = following, current

        return [word[0] for word in words_pos]

    @staticmethod
    def print_translation(translations, output_file):
        """
        Print and write the translation to the output file
        :param translations: Translations output list of (title, text) pairs
        :param output_file: Output file instance
        """
        separator = '------------------------------------------------------------------------------------------------------'
        print(separator)
        print(separator, file=output_file)

        for translation in translations:
            # The terminal copy gets a bold title; the file copy is plain.
            print('\033[1m%s:\033[0m\n%s\n' % translation)
            print('%s:\n%s\n' % translation, file=output_file)

        print(separator)
        print(separator, file=output_file)

    def train(self, lines):
        """
        Training unigram, bigram, unigram with pos and bigram with pos models
        :param lines: Training lines
        """
        unigram_words = []
        bigram_words = []
        unigram_pos_words = []
        bigram_pos_words = []
        unigram_pos = []
        bigram_pos = []

        for line in lines:
            # Prepare word tokens
            words = word_tokenize(line)

            # Tag the tokens with POS
            words_pos = pos_tag(words)

            # Generate POS sequences
            pos = [tag for _, tag in words_pos]

            # Prepare unigram lists with beginning and end of sentences.
            # extend() keeps this linear; `a = a + b` re-copied the lists on
            # every line. The (word, tag) list is padded like the others so
            # the '<s>' context of padded bigrams has a count.
            unigram_words.extend(['<s>'] + words + ['</s>'])
            unigram_pos_words.extend(['<s>'] + words_pos + ['</s>'])
            unigram_pos.extend(['<s>'] + pos + ['</s>'])

            # Prepare bigram lists for words, words_pos and pos
            bigram_words.extend(self._padded_bigrams(words))
            bigram_pos_words.extend(self._padded_bigrams(words_pos))
            # Fixed: this model is built from the POS tags; it used to be
            # built from (word, tag) pairs, so tag bigrams were never found.
            bigram_pos.extend(self._padded_bigrams(pos))

        # Generate frequency distribution of all lists
        self.unigram_words = FreqDist(unigram_words)
        self.bigram_words = FreqDist(bigram_words)
        self.unigram_pos_words = FreqDist(unigram_pos_words)
        self.bigram_pos_words = FreqDist(bigram_pos_words)
        self.unigram_pos = FreqDist(unigram_pos)
        self.bigram_pos = FreqDist(bigram_pos)

    def get_bigram_words_probability(self, words):
        """
        Calculate and returns bigram probability of the given arrangement of words
        :param words: Words list
        :return: Log probability with add-one smoothing
        """
        # Counts (not relative frequencies) conditioned on the first word of
        # each bigram are what add-one smoothing requires.
        return self._add_one_log_probability(
            self._padded_bigrams(words), self.bigram_words, self.unigram_words)

    def get_bigram_pos_words_probability(self, words):
        """
        Calculates and returns bigram probability of the given arrangement of words with POS
        :param words: Words list (POS tagged inside this method)
        :return: Log probability with add-one smoothing
        """
        # POS tag input words
        tagged = pos_tag(words)
        return self._add_one_log_probability(
            self._padded_bigrams(tagged), self.bigram_pos_words,
            self.unigram_pos_words)

    def get_bigram_pos_probability(self, tags):
        """
        Calculates and returns bigram probability of given arrangements of POS tags
        :param tags: Arrangement of POS tags
        :return: Log probability with add-one smoothing
        """
        return self._add_one_log_probability(
            list(ngrams(tags, 2)), self.bigram_pos, self.unigram_pos)

    def get_highest_probability_permutation(self, words, method):
        """
        Implementation of argmax. Returns the highest scoring ordering of words.
        :param words: List of words
        :param method: Name of the method used to calculate probability
        :return: Highest probability words arrangement (a list)
        """
        score = getattr(self, method)
        words = list(words)

        if len(words) < 5:
            # Few enough orderings to try every one; random sampling could
            # miss the best ordering and draw the same one repeatedly.
            candidates = (list(order)
                          for order in itertools.permutations(words))
        else:
            # If the sentence is big, limit to 100 random permutations.
            # Positions are shuffled instead of the items so the items keep
            # their original Python types.
            candidates = ([words[position] for position in
                           numpy.random.permutation(len(words))]
                          for _ in range(100))

        max_probability = -math.inf
        selected = None
        for candidate in candidates:
            probability = score(candidate)
            # Strict comparison: the first best candidate wins ties
            if probability > max_probability:
                max_probability = probability
                selected = candidate

        return selected

    def get_arrangement_with_pos_model(self, words):
        """
        Returns arrangement of words with highest probability using POS ordering
        :param words: Input list of words
        :return: Rearranged words; the '<s>' / '</s>' sentinels are included
            as empty strings
        """
        window_size = 4

        # Tag words with POS
        words_pos = [('', '<s>')] + pos_tag(words) + [('', '</s>')]

        # Slide a 4 item window over the sentence. Shorter sentences get one
        # window covering everything (they used to raise IndexError).
        window_count = max(len(words_pos) - window_size + 1, 1)
        for start in range(window_count):
            words_window = words_pos[start:start + window_size]

            max_probability = -math.inf
            selected = None

            # Generate all permutations of the window
            for permutation in itertools.permutations(words_window):
                # Get all POS tags
                pos = [word[1] for word in permutation]

                # Get the probability
                probability = self.get_bigram_pos_probability(pos)

                # Pick the arrangement with the highest probability
                if probability > max_probability:
                    max_probability = probability
                    selected = permutation

            # Apply the arrangement to the original list of words
            words_pos[start:start + len(words_window)] = selected

        # Return the list of rearranged words
        return [word[0] for word in words_pos]

    def translate(self, source_sentence, original_translation, output_file):
        """
        Perform translation of the given line
        :param source_sentence: Line of input file
        :param original_translation: Original translation from the test data
        :param output_file: File to write output to
        """
        # Get word tokens
        words = word_tokenize(source_sentence)

        # Perform direct machine translation using dictionary. Punctuation is
        # copied as is; a word missing from the dictionary is kept in the
        # source language instead of raising KeyError.
        translated_words = [
            word if word in string.punctuation
            else self.dictionary.get(word, word)
            for word in words
        ]

        output = list()
        output.append(('Source Sentence', source_sentence))
        output.append(('Original Translation', original_translation))

        # Normal translation output
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Direct Machine Translation', translated_sentence))

        # Improvement 1: Fixing determiners
        translated_words = self.fix_determiners(translated_words)
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Fixing determiners', translated_sentence))

        # Improvement 2: Removing consecutive pronouns
        translated_words = self.remove_consecutive_prp(translated_words)
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Removing consecutive pronouns', translated_sentence))

        # Improvement 3: Swapping reverse orders verbs and noun/pronouns
        translated_words = self.swap_verb_prp(translated_words)
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Swapping reverse orders verbs and noun/pronouns',
                       translated_sentence))

        # Improvement 4: Bigram Language Model
        translated_words = self.get_highest_probability_permutation(
            translated_words, 'get_bigram_words_probability')
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Bigram Language Model', translated_sentence))

        # Improvement 5: Bigram POS Language Model
        translated_words = self.get_highest_probability_permutation(
            translated_words, 'get_bigram_pos_words_probability')
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Bigram POS Language Model', translated_sentence))

        # Improvement 6: Rearrangement of POS
        translated_words = self.get_arrangement_with_pos_model(
            translated_words)
        translated_sentence = self.words_to_sentence(translated_words)
        output.append(('Rearrangement of POS', translated_sentence))

        self.print_translation(output, output_file)

    def execute(self, input_file, translation_file, output_file):
        """
        Execute the tests on given input file
        :param input_file: Input file
        :param translation_file: File containing original translations
        :param output_file: Path of the file to write output to
        Exits the process with status 1 when a file cannot be opened.
        """
        # Read both inputs before the output file is created or truncated
        input_lines = self.read_text_file(input_file)
        original_translation_lines = self.read_text_file(translation_file)

        # Open output file for writing
        try:
            handle = open(output_file, 'w')
        except IOError as e:
            print('Cannot open file ' + output_file + ' for writing',
                  file=sys.stderr)
            print('I/O error({0}): {1}'.format(e.errno, e.strerror),
                  file=sys.stderr)
            sys.exit(1)

        # `with` flushes and closes the output even if a translation fails
        with handle:
            for index, line in enumerate(input_lines):
                # A shorter translation file yields an empty reference
                # instead of an IndexError
                if index < len(original_translation_lines):
                    reference = original_translation_lines[index]
                else:
                    reference = ''
                # Translate each line
                self.translate(line, reference, handle)
#
# The following code is deprecated

# Count every token of every Gutenberg text.
for text in gutenberg.fileids():
    for word in gutenberg.words(text):
        fd[word] += 1
        # fd.inc(word) # deprecated. superseded by the line above

# Initialize two empty lists which will hold our ranks and frequencies
ranks = []
freqs = []

# Generate a (rank, frequency) point for each counted token and
# append to the respective lists. Since NLTK 3, iterating over a FreqDist no
# longer yields the samples in frequency order, so the order is made explicit:
# rank 1 is the most frequent token. The sort is stable, so ties keep their
# first-seen order.
for rank, word in enumerate(sorted(fd, key=fd.get, reverse=True)):
    ranks.append(rank+1)
    freqs.append(fd.freq(word))

# Plot rank vs frequency on a log-log plot
plt.loglog(ranks, freqs)
plt.ylabel('frequency(f)', fontsize=14, fontweight='bold')
plt.xlabel('rank(r)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
plt.close()

###############################################################################
###############################################################################
############################## PREDICTING WORDS ###############################
###############################################################################
### PREDICTING WORDS
# For each column - except 'tokenized_text' for col in (train_bigram_columns + features): # Convert all values to boolean train_X[col] = train_X[col] > 0 # Add columns and labels in one dataframe temp_df = pd.DataFrame() temp_df[col] = train_X[col] temp_df["label"] = train_y # For attribute 'col' equal to 0 # Get the respective lines df0 = temp_df[temp_df[col] == 0] # Count occurences of each class(positive/negative/neutral) freqdist0 = FreqDist(df0["label"]) # Get probability of each class probabilities0 = [freqdist0.freq(label) for label in freqdist0] # Calculate cross entropy of X=0 Hc0 = -sum(prob * math.log(prob, 2) for prob in probabilities0) # For attribute 'col' equal to 1 # Get the respective lines df1 = temp_df[temp_df[col] == 1] # Count occurences of each class(positive/negative/neutral) freqdist1 = FreqDist(df1["label"]) # Get probability of each class probabilities1 = [freqdist1.freq(label) for label in freqdist1] # Calculate cross entropy of X=1 Hc1 = -sum(prob * math.log(prob, 2) for prob in probabilities1) # Caclulate probabilities for each value of 'col' (0/1) freqdist = FreqDist(temp_df[col])
#!/usr/bin/python # coding: utf-8 # 2013/03/20 from nltk import FreqDist fdist = FreqDist(samples) # samples で指定されたデータの頻度分布を生成 fdist.inc(sample) # sampleで指定されたデータの数を1増やす fdist['データ'] # 指定されたデータの出現数 fdist.freq('データ') # 指定されたデータの頻度 fdist.N() # サンプルの総数 fdist.keys() # 頻度の順にソートされたサンプル for sample in fdist: # 頻度の順にサンプルをイテレート pass fdist.max() # 数の最も多いサンプル fdist.tabulate() # 頻度分布を表形式で表示 fdist.plot() # 頻度分布をプロット fdist.plot(cumulative=True) # 累積頻度をプロット fdist1 < fdist2 # fdist1のサンプルの頻度がfdist2 より少ないかをテスト
bigramsText1 = bigrams( text1) # bigramsText1[0] is the tuple containing the first bigram # Collocations are frequent bigrams from words that are not so common as unigrams. # This function returns nothing, just prints the collocations to screen text1.collocations() # Computing the frequency distribution of word lengths. Returns a dictionary. fdistWordLength = FreqDist([len(w) for w in text1]) fdistWordLength.keys() # The different word lengths fdistWordLength.values() # The frequency of each word length fdistWordLength.items() # Shows both keys and values at the same time fdist1['the'] fdist1.freq('the') # Frequency of the word ‘the’ fdist1.max() # String methods s = "MatTias" s.lower() s.upper() s.startswith("ma") "T" in s # Find all the words in Moby Dick that ends with -ableness. Sort then alphabetically.
y = np.zeros(len(allwords)) # print allwords for i in range(len(allwords)): try: y[i] = int((complexity[allwords[i]])) except: y[i] = 0 # print y fdist = FreqDist(brown.words()) x = [] for i in range(len(allwords)): x.append([]) for i in range(len(allwords)): x[i].append(fdist.freq(allwords[i])) x[i].append(len(allwords[i])) x[i].append(synobj.synCount(allwords[i])) x[i].append(ww.wdweight(allwords[i])) x[i].append(vc.vCount(allwords[i])) x[i].append(synobj.len_of_synonyms(allwords[i])) classifier = RandomForestClassifier() classify = classifier.fit((x[0:int(len(x) * 0.8)]), y[0:int(len(y) * .8)]) ypred = classifier.predict(XTest) # print y[0:int (len(y)*.5)] a = [] b = [] for i in range(len(ypred)): if ypred[i] == 1: