def process(f, return_tokens=True, return_freqdist=True):
    """
    Process deals data: splits the file into sentences and builds a FreqDist
    from the tokens. Uses PunktWordTokenizer, a decent regexp-based tokenizer,
    because deals often contain domain names that should not be split up.

    :param f: input file with one deal per line
    :rtype: (FreqDist, list of str, list of list of str)
    """
    fd = FreqDist()
    tokens = []
    stop_set = set(stopwords.words('english'))
    punct_set = set(string.punctuation)
    tokenizer = PunktWordTokenizer()
    fh = open(f, 'r')
    sentences = [line.strip() for line in fh.readlines()]
    for line in sentences:
        t = []
        for word in tokenizer.tokenize(line.lower()):
            if word not in stop_set and word not in punct_set:
                if return_tokens:
                    t.append(word)
                if return_freqdist:
                    fd.inc(word)
        tokens.append(t)
    fh.close()
    return fd, sentences, tokens
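# Hedged usage sketch (not part of the original): process() above relies on
# FreqDist, stopwords, string, and PunktWordTokenizer being imported in its
# module; 'deals.txt' is a hypothetical input file with one deal per line.
freqdist, sentences, tokens = process('deals.txt')
print "%d deals, %d distinct tokens" % (len(sentences), freqdist.B())
for word in freqdist.keys()[:10]:   # NLTK 2.x keys() are frequency-sorted
    print word, freqdist[word]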
class Index:
    """
    The Index class stores an index for a document.
    """

    def __init__(self):
        self._freq_dist = None
        self._document = None

    def index(self, document):
        self._document = document
        if self._freq_dist is None:
            self._freq_dist = FreqDist()
            for term in self.terms():
                self._freq_dist.inc(term)

    def reset(self):
        "Reset the index"
        self._freq_dist = None

    def freq_dist(self):
        if self._freq_dist is None:
            self.index(self._document)
        return self._freq_dist

    # return the number of times a term appears in this document
    def freq(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return self._freq_dist[term]

    def tf(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return float(self._freq_dist[term]) / float(self._freq_dist.N())
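# Hedged usage sketch: Index relies on a terms() method that is not shown in
# the original, so a tiny throwaway subclass supplies one here; the token list
# is illustrative only.
class ListIndex(Index):
    def terms(self):
        # assume the indexed document is already a list of tokens
        return self._document

idx = ListIndex()
idx.index(['the', 'cat', 'sat', 'on', 'the', 'mat'])
print idx.freq('the')   # 2
print idx.tf('the')     # 2.0 / 6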
def proto(self, num, language, authors, token_vocab, token_df,
          lemma_vocab, pos_vocab, synset_vocab, stemmer):
    d = Document()
    assert language == self.lang

    if self._id:
        d.id = self._id
    else:
        d.id = num

    d.language = language
    d.title = self.title.strip()
    num_sentences = max(self._sentences) + 1

    tf_token = FreqDist()
    for ii in self.tokens():
        tf_token.inc(ii)

    for ii in xrange(num_sentences):
        s = d.sentences.add()
        for jj in self._sentences[ii]:
            w = s.words.add()
            w.token = token_vocab[jj.word]
            w.lemma = lemma_vocab[jj.lemma]
            w.pos = pos_vocab[jj.pos]
            w.relation = pos_vocab[jj.rel]
            w.parent = jj.parent
            w.offset = jj.offset
            w.tfidf = token_df.compute_tfidf(jj.word, tf_token.freq(jj.word))
    return d
def dotranslate(sent, parser, tdop):
    # TODO: tokenize sentence by maximizing unigram probabilities in the
    # training corpus, to detect multiword units
    sent = sent.split()

    # parse sentence with bitpar, gives an n-best list
    try:
        parsetrees1 = list(parser.nbest_parse(sent))
    except Exception as e:
        parsetrees1 = []
        print "parsing failed", e
        return (), {}

    # undo binarization and auxiliary POS tags introduced to accommodate bitpar:
    parsetrees = FreqDist()
    for tree in parsetrees1:
        tree.un_chomsky_normal_form()
        parsetrees.inc(removeforcepos(tree).freeze(), count=tree.prob())

    # for each parse tree, get a list of translations
    resultfd = {}
    for m, tree in enumerate(parsetrees):
        print "parse tree", tree
        for nn, (result, prob) in enumerate(
                tdop.get_mlt_deriv_multi(tree, smoothing=True, verbose=False)):
            if not result:
                continue
            key = (undecorate_with_ids(result).freeze(),
                   sum(1 if "@" in a.node else 0 for a in result.subtrees()))
            resultfd[key] = resultfd.get(key, 0.0) + prob
    return parsetrees, resultfd
def sent_length_fdist_single(address, exclude=excludePuncts(), corpus=inaugural):
    fd = FreqDist()
    for sent in corpus.sents(address):
        nopunct_sent = [word for word in sent if word not in exclude]
        fd.inc(len(nopunct_sent))
    return fd
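# Hedged usage sketch: build a sentence-length distribution for one inaugural
# address. excludePuncts() is defined elsewhere in the original module, so a
# plain punctuation set is passed explicitly here; the fileid is just an example.
import string
from nltk.corpus import inaugural

lengths = sent_length_fdist_single('2009-Obama.txt',
                                   exclude=set(string.punctuation))
print "most common sentence length:", lengths.max()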
    # return the frequency distribution as the result
    return adist


# define a function to make a FreqDist from a list of tokens that has no tokens
# that contain non-alphabetical characters or words in the stopword list
def alphaStopFreqDist(words, stoplist):
    # make a new frequency distribution called asdist
def word_fdist_single(address, exclude=excludes(), corpus=inaugural):
    fd = FreqDist()
    for word in corpus.words(address):
        if word.lower() not in exclude:
            fd.inc(word.lower())
    return fd
def content_FreqDist_generator(articles_list):
    # get the FreqDist of all articles
    all_fdist = FreqDist()
    for article in articles_list:
        for key, value in article.content_freqDist().iteritems():
            all_fdist.inc(key, value)
    return all_fdist
def word_fdist(address_list, exclude=excludes(), corpus=inaugural):
    total_fd = FreqDist()
    for address in address_list:
        fd = word_fdist_single(address, exclude, corpus)
        for word in fd.keys():
            total_fd.inc(word, fd[word])
    return total_fd
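# Hedged usage sketch: aggregate word counts over several inaugural addresses.
# excludes() lives elsewhere in the original module, so a stopword set is
# passed explicitly; the fileids are examples from the NLTK inaugural corpus.
from nltk.corpus import inaugural, stopwords

combined = word_fdist(['2001-Bush.txt', '2009-Obama.txt'],
                      exclude=set(stopwords.words('english')),
                      corpus=inaugural)
for w in combined.keys()[:5]:   # NLTK 2.x keys() are frequency-sorted
    print w, combined[w]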
def sent_length_fdist(address_list, exclude=excludePuncts(), corpus=inaugural):
    total_fd = FreqDist()
    for address in address_list:
        fd = sent_length_fdist_single(address, exclude, corpus)
        for length in fd.keys():
            total_fd.inc(length, fd[length])
    return total_fd
def __extract_level_words(self, levels_db, level, values):
    words_number_per_value = self.__configuration_map["most_frequent_words_number_per_value"]
    most_freq_words = {}
    for value in values:
        fdist = FreqDist()
        for word_dist in levels_db[level][value]:
            fdist.inc(word_dist[0], count=word_dist[1])
        most_freq_words[value] = fdist.items()[:words_number_per_value]
    return most_freq_words
def kneser_ney(self, context, word):
    """
    Return the log probability of a word given a context, using
    Kneser-Ney backoff.
    """
    bgram = (context, word)
    unigram_freq = FreqDist()
    theta = self._kn_concentration
    vocabulary = 1 / len(self._vocab_freq.keys())
    discount_delta = self._kn_discount
    unigram_T = len(self._context_freq.keys())
    bigram_T = self._context_freq[context]

    for i in self._gram_freq:
        unigram_freq.inc(i[1])

    # Unigram Restaurant
    count_unirest_wordTable = unigram_freq[word]          # C_0,x
    count_unirest_allTable = unigram_freq.N()             # C_0,.

    # u_Bigram Restaurant
    count_birest_wordTable = self._gram_freq[bgram]       # C_u,x
    count_birest_allTable = self._context_freq[context]   # C_u,.

    existingTable_numer = count_birest_wordTable - discount_delta
    existingTable_denom = theta + count_birest_allTable
    existingTable = existingTable_numer / existingTable_denom
    if existingTable < 0:
        existingTable = 0

    newTable_numer = theta + (bigram_T * discount_delta)
    newTable_denom = theta + count_birest_allTable
    newTable = newTable_numer / newTable_denom

    back_a_numer = count_unirest_wordTable - discount_delta
    back_a_denom = count_unirest_allTable + theta
    back_a = back_a_numer / back_a_denom
    if back_a < 0:
        back_a = 0

    back_b_numer = theta + (unigram_T * discount_delta)
    back_b_denom = count_unirest_allTable + theta
    back_b = back_b_numer / back_b_denom
    back_b = back_b * vocabulary

    result = existingTable + (newTable * (back_a + back_b))
    return lg(result)
def __getTimelineFeatures(self, timeline):
    logger.info(u"Get timeline features")
    tweets = []
    self.__changePhase(PHASE["GET_TIMELINE_URLS"])
    for t in timeline:
        try:
            tweet = TweetText(t, self.__urlBuilder, self.__userBuilder)
        except:
            logger.exception(u"Error: \"" + unicode(t) + u"\"")
            raise ValueError(t)
        logger.debug(u"Tweet:" + unicode(tweet))
        tweets.append(tweet)

    urls = []
    ti = 0
    for tweet in tweets:
        for url in tweet.urls():
            self.__breakIfStopped()
            self.__urlResolver.addUrlToQueue(url)
            urls.append(url)
        logger.info(u"Tweet:" + unicode(tweet))
        ti += 1
        self.__proc = 100 * float(ti) / float(len(tweets))

    # Categories
    self.__changePhase(PHASE["GET_TIMELINE_FEATURES"])
    url2labels = {}
    ui = 0
    for url in urls:
        self.__breakIfStopped()
        if not url.isError():
            logger.debug(u"Classify " + unicode(url.getUrl()))
            url2labels[url.getExpandedUrl()] = self._classifier().classify(url.getText())
        ui += 1
        self.__proc = 100 * float(ui) / float(len(urls))

    labelsFreq = FreqDist()
    for labels in url2labels.values():
        for label in labels:
            labelsFreq.inc(label)
    self.__catFreq = labelsFreq.items()
    logger.info(u"Categories: " + unicode(labelsFreq.items()))
    labelsFreqValues = [(item[0], item[1]) for item in labelsFreq.items()
                        if item[0] not in ['short', 'medium', 'long']]

    # Normalization
    labelsFreqValues = {label: float(freq) / float(max([f for l, f in labelsFreqValues]))
                        for label, freq in labelsFreqValues}
    logger.info(u"Category factors: " + unicode(labelsFreqValues))

    # Languages
    langFreq = FreqDist()
    for u in urls:
        langFreq.inc(u.lang())
    self.__langFreq = langFreq.items()
    logger.info(u"Languages: " + unicode(langFreq.items()))

    return labelsFreqValues
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training, maximising the joint probability of the symbol and
    state sequences. This is done by collecting frequencies of transitions
    between states, of symbol observations while within each state, and of
    which states start a sentence. These frequency distributions are then
    normalised into probability estimates, which can be smoothed if desired.

    @return: the trained model
    @rtype: HiddenMarkovModelTagger
    @param labelled_sequences: the training data, a set of labelled sequences
        of observations
    @type labelled_sequences: list
    @param kwargs: may include an 'estimator' parameter, a function taking a
        C{FreqDist} and a number of bins and returning a C{ProbDistI};
        otherwise an MLE estimate is used
    """
    # default to the MLE estimate
    estimator = kwargs.get('estimator')
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts is None:
                starting.inc(state)
            else:
                transitions[lasts].inc(state)
            outputs[state].inc(symbol)
            lasts = state

            # update the state and symbol lists
            if state not in self._states:
                self._states.append(state)
            if symbol not in self._symbols:
                self._symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, False, N)
    B = ConditionalProbDist(outputs, estimator, False, len(self._symbols))

    return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
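# Hedged usage sketch: this method follows the NLTK 2.x
# HiddenMarkovModelTrainer.train_supervised interface, so the standard pattern
# of training a tagger on treebank sentences with a smoothed estimator should
# apply; the corpus slice and the Lidstone gamma below are illustrative only.
from nltk.corpus import treebank
from nltk.probability import LidstoneProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer

train_sents = treebank.tagged_sents()[:3000]
trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(
    train_sents,
    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
print tagger.tag("The market reacted quickly .".split())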
def handle(self, *args, **options):
    fdist = FreqDist()

    print "Analyzing raw data"
    limit = 10
    if args:
        raw_datas = RawData.objects.filter(pk__in=args)
    else:
        raw_datas = RawData.objects.all()[:limit]

    tagged_data = []
    for raw_data in raw_datas:
        words = nltk.word_tokenize(raw_data.data)
        tagged_data.extend(nltk.pos_tag(words))
        for word in words:
            word = word.strip()
            if word:
                fdist.inc(word)

    print "Analyzed %s items" % len(raw_datas)
    print
    print "Top word: %s" % fdist.max()
    print
    print "Top 10 words"
    for word in fdist.keys()[:10]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Bottom 10 words"
    for word in fdist.keys()[-10:]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Words occurring between 50-100 times"
    words = [word for word in fdist.keys()
             if fdist[word] >= 50 and fdist[word] <= 100]
    print ", ".join(words)

    cfdist = ConditionalFreqDist()
    for (word, tag) in tagged_data:
        cfdist[tag].inc(word)

    print "Most popular noun: %s" % cfdist["NN"].max()
    print
    print "Top 50 nouns"
    for word in cfdist["NN"].keys()[:50]:
        times = cfdist["NN"][word]
        print " -- %s occurred %s times" % (word, times)
    print
def recompute_cluster_dists(text, cluster_descr):
    c_freqs = FreqDist()
    for c in text.clusters(cluster_descr):
        c_freqs.inc(c)
    c_dist = MLEProbDist(c_freqs)

    c_bi_freqs = FreqDist()
    for bi_c in bigrams(text.clusters(cluster_descr)):
        c_bi_freqs.inc(bi_c)
    c_bi_dist = MLEProbDist(c_bi_freqs)

    return c_dist, c_bi_dist
    # and if it is not on the stop word list,
    # add it to the frequency distribution
    for word in words:
        if not pattern.match(word):
            if word not in stoplist:
                asdist.inc(word)
    # return the frequency distribution as the result
    return asdist


# Bigram frequency distribution function.
# This version also makes sure that each word in the bigram occurs in a word
# frequency distribution without non-alphabetical characters and stopwords.
# This will also work with an empty stopword list if you don't want stopwords.
def mostprobableparse(self, sent, sample=None):
    """warning: this problem is NP-complete. using an unsorted
    chart parser avoids unnecessary sorting (since we need all
    derivations anyway).

    @param sent: a sequence of terminals
    @param sample: None or int; if int, sample that many parses"""
    p = FreqDist()
    for a in self.parser.nbest_parse(sent, sample):
        p.inc(removeids(a).freeze(), a.prob())
    if p.max():
        return ProbabilisticTree(p.max().node, p.max(), prob=p[p.max()])
    else:
        raise ValueError("no parse")
def classify(self, sentence, tokenizer_lang, ngram_length=3):
    features = []
    for ii in self.tokenizers[tokenizer_lang].tokenize(sentence):
        d = {}
        for jj in ingrams(ii, ngram_length):
            d[jj] = d.get(jj, 0) + 1
        features.append(d)
    data = SparseDataSet(features)

    f = FreqDist()
    for ii in [self._labels[self._classifier.classify(data, x)[0]]
               for x in xrange(len(features))]:
        f.inc(ii)
    return f
def mapper(key, value):
    sentence = value.split()
    for (index, tagtuple) in enumerate(sentence):
        token, tag = get_token_tag(tagtuple)
        if we_like(token, tag):
            fd = FreqDist()
            token = token.lower()
            window = sentence[index + 1:index + 5]
            for windowtuple in window:
                wtoken, wtag = get_token_tag(windowtuple)
                if we_like(wtoken, wtag):
                    wtoken = wtoken.lower()
                    fd.inc(wtoken)
            yield token, tuple(fd.items())
def parse(self, doc, mode='list'):
    stream = self.makeTokenStream(doc)
    if mode == 'list':
        tokens = []
        while stream.incrementToken():
            tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
    elif mode == 'set':
        tokens = set()
        while stream.incrementToken():
            tokens.add(stream.getAttribute(CharTermAttribute.class_).toString())
    elif mode == 'FreqDist':
        tokens = FD()
        while stream.incrementToken():
            tokens.inc(stream.getAttribute(CharTermAttribute.class_).toString())
    else:
        raise TypeError("Invalid mode: expected 'list', 'set', or 'FreqDist'.")
    stream.close()
    return tokens
def __call__(self, key, value):
    sent = value.split()
    for idx, tagged in enumerate(sent):
        token, tag = self.split_tagged(tagged)
        if self.valid(token, tag):
            dist = FreqDist()
            window = sent[idx + 1:idx + 5]
            for wtagged in window:
                wtoken, wtag = self.split_tagged(wtagged)
                if self.valid(wtoken, wtag):
                    dist.inc(wtoken)
            yield token, tuple(dist.items())
def main():
    """
    Print the X most common stems from the dataset, where X = VOC_NUMBER.
    """
    fdist = FreqDist()
    for line in fileinput.input():
        try:
            for stem in get_stems(line):
                if stem not in ENGLISH_STOPWORDS:
                    fdist.inc(stem)
        except UnicodeDecodeError:
            pass
    keys = fdist.keys()[:VOC_NUMBER]
    for s in keys:
        print s
class Text(object):
    def __init__(self, source, gen_func=lambda x: x):
        self.dictionary = Dictionary([gen_func(source)])
        self.gen_func = gen_func
        self.source = source
        self.word_freqs = FreqDist()
        for word in self.words():
            self.word_freqs.inc(word)
        self.word_dist = MLEProbDist(self.word_freqs)

    def words(self):
        return (self.dictionary.token2id[token]
                for token in self.gen_func(self.source))

    def clusters(self, cluster_descr):
        return (cluster_descr.index[word] for word in self.words())
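# Hedged usage sketch: the Dictionary above is assumed to be
# gensim.corpora.Dictionary (its token2id attribute suggests so), with FreqDist
# and MLEProbDist coming from nltk.probability; the sample sentence is
# illustrative only.
t = Text("the cat sat on the mat the end".split())
the_id = t.dictionary.token2id["the"]
print t.word_freqs[the_id]                 # raw count of "the"
print "%.3f" % t.word_dist.prob(the_id)    # MLE probability of "the"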
def gen_word_freqs(self, train_sents):
    """
    Generates word frequencies from the training sentences for the feature
    classifier.

    @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
    @param train_sents: A list of tagged sentences.
    @rtype: C{FreqDist}
    @return: a L{frequency distribution<nltk.FreqDist()>}, counting how
        often each word occurs in the training sentences.
    """
    word_freqdist = FreqDist()
    for tagged_sent in train_sents:
        for (word, _tag) in tagged_sent:
            word_freqdist.inc(word)
    return word_freqdist
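# Hedged usage sketch: gen_word_freqs() only reads the tagged sentences, so it
# can be exercised directly on an NLTK tagged corpus; the enclosing class is
# not shown in the original, so None stands in for self here.
from nltk.corpus import brown

word_freqs = gen_word_freqs(None, brown.tagged_sents(categories='news')[:500])
print word_freqs.max(), word_freqs[word_freqs.max()]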
class Model():
    def __init__(self, data, minimum_vocab_fraction=.02, include_ngrams=True):
        self.doc_freq = FreqDist()
        for count, (label, text) in enumerate(data, start=1):
            for word in set(utils.tokenize(text, include_ngrams,
                                           limit_ngrams=True)):
                self.doc_freq.inc(word)
        self.doc_count = count

        self.min_vocab_freq = 1
        self.max_vocab_freq = .95 * self.doc_count
        print 'Min/max vocabulary frequency:', self.min_vocab_freq, self.max_vocab_freq
        self.features = sorted(filter(self._is_valid_feature, self.doc_freq))

    def _is_valid_feature(self, feature):
        doc_freq = self.doc_freq[feature]
        return doc_freq > self.min_vocab_freq and doc_freq < self.max_vocab_freq
def build_vocabulary(save_state_file='state.pkl.gz'):
    counter = FreqDist()
    total_line_count = 0
    for url_suffix in urls_1grams:
        print url_suffix
        current_line_num = 0
        for line in buffered_download(base_url_1grams % url_suffix):
            current_line_num += 1
            total_line_count += 1
            try:
                tokens, year, total_count, _ = parse_line(line)
                counter.inc(tokens[0], total_count)
            except:
                print "error parsing line"
                print line
        if save_state_file:
            print 'saving state'
            save_state(save_state_file, counter, current_line_num, url_suffix)
    return counter
def parse(doc, mode='list'):
    '''
    Run the string doc through Solr's JapaneseAnalyzer (morphological
    analysis) and return the tokens in the type specified by mode.
    mode : 'list' or 'set' or 'FreqDist'
    '''
    stream = _makeTokenStream(doc)
    if mode == 'list':
        tokens = []
        while stream.incrementToken():
            tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
    elif mode == 'set':
        tokens = set()
        while stream.incrementToken():
            tokens.add(stream.getAttribute(CharTermAttribute.class_).toString())
    elif mode == 'FreqDist':
        tokens = FD()
        while stream.incrementToken():
            tokens.inc(stream.getAttribute(CharTermAttribute.class_).toString())
    else:
        raise TypeError("Invalid mode: expected 'list', 'set', or 'FreqDist'.")
    stream.close()
    return tokens
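# Hedged usage sketch: parse() relies on PyLucene and the Japanese analyzer
# wired up elsewhere in the module (_makeTokenStream, FD, CharTermAttribute,
# JVM initialisation), so only the call pattern is shown; the sample sentence
# is illustrative.
freqs = parse(u"私は東京で寿司を食べました。", mode='FreqDist')
for token in freqs.keys()[:5]:      # NLTK 2.x keys() are frequency-sorted
    print token, freqs[token]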
def run_EM(no_of_iter, ten_de):
    # Run EM for the specified number of iterations
    print("Running EM")
    ## pseudocode from http://www.statmt.org/mtm2/data/day2-1x2.pdf
    ## do until convergence
    ##   set count(e|f) to 0 for all e,f
    ##   set total(f) to 0 for all f
    ##   for all sentence pairs (e_s,f_s)
    ##     for all words e in e_s
    ##       total_s(e) = 0
    ##       for all words f in f_s
    ##         total_s(e) += t(e|f)
    ##     for all words e in e_s
    ##       for all words f in f_s
    ##         count(e|f) += t(e|f) / total_s(e)
    ##         total(f) += t(e|f) / total_s(e)
    ##   for all f
    ##     for all e
    ##       t(e|f) = count(e|f) / total(f)

    #print(ten_de)
    N = len(de_inp)
    for i in range(no_of_iter):
        print("Doing iteration " + str(i))
        totalde = FDist()
        counten_de = CondFDist()
        for sent in range(N):
            total_s = FDist()
            for en_word in en_inp[sent].split():
                for de_word in de_inp[sent].split():
                    total_s.inc(en_word, ten_de[de_word][en_word])
##                    print(en_word)
##                    print(de_word)
##                    print(total_s[en_word])
            for en_word in en_inp[sent].split():
                for de_word in de_inp[sent].split():
                    counten_de[de_word].inc(en_word,
                                            ten_de[de_word][en_word] / total_s[en_word])
                    totalde.inc(de_word,
                                ten_de[de_word][en_word] / total_s[en_word])
        for de_word in ten_de.conditions():
            for en_word in ten_de[de_word].keys():
                ten_de[de_word][en_word] = counten_de[de_word][en_word] / totalde[de_word]
    return ten_de
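# Hedged sketch of how ten_de might be initialised before calling run_EM:
# IBM Model 1 training typically starts from a uniform translation table
# t(e|f). en_inp / de_inp are the parallel sentence lists the function reads
# from the enclosing module, and CondFDist is the ConditionalFreqDist alias
# used above; all of these names are assumptions about that module.
def init_uniform_table(en_inp, de_inp):
    en_vocab = set(w for s in en_inp for w in s.split())
    uniform = 1.0 / len(en_vocab)
    ten_de = CondFDist()
    for en_sent, de_sent in zip(en_inp, de_inp):
        for de_word in de_sent.split():
            for en_word in en_sent.split():
                ten_de[de_word][en_word] = uniform
    return ten_de

# ten_de = run_EM(5, init_uniform_table(en_inp, de_inp))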
# import the Gutenberg collection
from nltk.corpus import gutenberg

# which texts are in this collection?
print(gutenberg.fileids())
# ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt',
#  'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt',
#  'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt',
#  'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt',
#  'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt',
#  'shakespeare-macbeth.txt', 'whitman-leaves.txt']

# import the FreqDist class
from nltk import FreqDist

# instantiate a frequency distribution
fd = FreqDist()

# count the word tokens in the text
for word in gutenberg.words('austen-persuasion.txt'):
    fd.inc(word)

print(fd.N())  # total number of samples
# 98171
print(fd.B())  # number of bins or unique samples
# 6132

# get the first 10 words sorted by frequency
for word in fd.keys()[:10]:
    print(word, fd[word])

# ================ run-time measurement ================
run_time = time.time() - start_time
if run_time < 60:
    # seconds with two decimals
    print("Elapsed time: {:.2f}s".format(run_time))
elif run_time < 3600:
    # whole minutes and seconds
    print("Elapsed time: {:.0f}m {:.0f}s".format(run_time // 60, run_time % 60))
def __call__(self, key, values):
    dist = FreqDist()
    for fd in values:
        for k, v in fd:
            dist.inc(k, v)
    yield key, tuple(dist.items())
#
# Second
#
# Here we will determine the relative frequencies of English bigrams in the text.
# Then we will calculate the entropy of the bigram distribution.

# create a list to store bigrams in
english_model_bigrams = []
index = 0
english_bigram_fdist = FreqDist()

# we'll be iterating through the string but stop one item short of normal;
# this allows us to create bigram windows
while index < (len(english_model_content) - 1):
    english_bigram_fdist.inc(english_model_content[index:index + 2])
    index += 1

english_bigram_entropy = 0.0

# now loop and get the entropy for English bigrams
for bigram in english_bigram_fdist.samples():
    english_bigram_entropy += english_bigram_fdist.freq(bigram) * math.log(
        english_bigram_fdist.freq(bigram), 2)
english_bigram_entropy = -english_bigram_entropy

print "The English Bigram Entropy is: " + str(english_bigram_entropy)

#
# Third
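# Hedged sketch (not in the original): the entropy loop above generalises to
# any FreqDist, so a small helper can compute H = -sum p(x) * log2 p(x) for
# both the unigram and bigram distributions.
import math
from nltk.probability import FreqDist

def fdist_entropy(fdist):
    # relative frequencies from the FreqDist, skipping zero-probability bins
    return -sum(fdist.freq(s) * math.log(fdist.freq(s), 2)
                for s in fdist.samples() if fdist.freq(s) > 0)

# english_bigram_entropy == fdist_entropy(english_bigram_fdist)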
if __name__ == "__main__":
    import nltk
    from nltk import FreqDist
    from nltk.corpus import gutenberg, brown, treebank
    import re

    # Find all examples of "thou *th"
    thou_regexp = re.compile(r"[Tt]hou\s[\w]*t\s")
    thou_count = FreqDist()
    for ii in thou_regexp.findall(gutenberg.raw('bible-kjv.txt')):
        thou_count.inc(ii)
    print("\n".join("%s:%i" % (x, thou_count[x]) for x in thou_count.keys()[:10]))

    # Find everything that looks like a street
    street_regexp = re.compile(r"[A-Z]\w*\s[S]treet")
    for fileid in gutenberg.fileids():
        print(fileid, street_regexp.findall(gutenberg.raw(fileid)))
    print("-----------------------------------------")

    # Find repeated words
    repeat_regexp = re.compile(r'\b(\w+)\s(\1\b)+')
    for fileid in gutenberg.fileids():
        matches = list(repeat_regexp.finditer(gutenberg.raw(fileid)))
        print(fileid, [x.group(0) for x in matches])
    print("-----------------------------------------")

    # Find repeated words separated by some other word
    repeat_regexp = re.compile(r"\b(\w+)\s\w+\s(\1\b)+")
class Cosine(): def __init__(self, stem=True, lemm=True): self.raw_inputs = [] self.inputs = [] self.vectors = [] self.words = [] self.fd = FreqDist() self.cos_values = [] self.stemmer = nltk.porter.PorterStemmer() self.lemmatizer = nltk.wordnet.WordNetLemmatizer() self.lemm = lemm self.stem = stem return def set_input(self, txt): self.raw_inputs.append(txt) temp = [] new_text = txt if self.stem: for word in nltk.word_tokenize(txt): if word.lower() in stopwords.words(): continue temp.append(self.stemmer.stem(word)) new_text = ' '.join(temp) if self.lemm: for word in nltk.word_tokenize(new_text): if word.lower() in stopwords.words(): continue temp.append(self.lemmatizer.lemmatize(word)) new_text = ' '.join(temp) self.inputs.append(new_text) return def setup_tftable(self): for txt in self.inputs: sents = nltk.sent_tokenize(txt) for sent in sents: # for each sentence in the given text words = nltk.word_tokenize(sent) for word in words: self.fd.inc(word) self.tftable = [[k, 0] for k in self.fd.keys()] return self.tftable def vectorize(self): tft = self.setup_tftable() vecs = [] for txt in self.inputs: vecs.append(self.vectorize_one(txt)) self.vectors = [] for v in vecs: self.vectors.append(tuple(i[1] for i in v)) return self.vectors def vectorize_one(self, txt): #we will take bag of words with word count myvector = copy.deepcopy(self.tftable) sents = nltk.sent_tokenize(txt) for sent in sents: # for each sentence in the given text words = nltk.word_tokenize(sent) for word in words: for item in myvector: if item[0] == word: item[1] += 1 return myvector #initialize a matrix that would contain cosine similarity value for each vector in the LVS against every other vector def init_cos_matrix(self, dim): values = [] for m in range(dim): row = [] for n in range(dim): row.append(None) values.append(row) return values def cosine( self, vecs=None ): #returns the cosine similarity of the input vectors taken from self.vectors self.cos_values = self.init_cos_matrix(len(self.vectors)) if vecs == None: vecs = self.vectors for u in range(len(vecs)): #self.cos_values.append([]) for v in range(u, len(vecs)): angle = nltk.cluster.cosine_distance(vecs[u], vecs[v]) value = math.cos(angle) self.cos_values[v][u] = self.cos_values[u][v] = ( angle, value, ) return self.cos_values def compute_similarity(self, messages, stem=True, lemm=True, threshold=0.75): #given a list of messages computes the similarity and returns the matrix #messages is of the form: [message1, message2, ..., messagen] self.stem = stem self.lemm = lemm for m in messages: self.set_input(m) self.vectorize() values = self.cosine() return values
def train(db_name, samples=200000, classifier_type='naivebayes', extractor_type='words', best_features=10000, processes=8, purge=False): """ Train with samples from sqlite database and stores the resulting classifier in Redis. Arguments: db_name (str) -- Name of the training database to use stored in ~/.synt Keyword arguments: samples (int) -- Amount of samples to train on. classifier_type (str) -- Type of classifier to use. Available classifiers are 'naivebayes'. extractor_type (str) -- Type of extractor to use. Available extractors are 'words', 'stopwords', 'bestwords'. best_features (int) -- Amount of highly informative features to store. processes (int) -- The amount of processes to be used for counting features in parallel. purge (bool) -- If true will flush the redis database. """ m = RedisManager(purge=purge) extractor = get_extractor(extractor_type) if not db_exists(db_name): raise ValueError("Database '%s' does not exist." % db_name) if classifier_type in m.r.keys(): print("Classifier exists in Redis. Purge to re-train.") return classifier = config.CLASSIFIERS.get(classifier_type) if not classifier: #classifier not supported raise ValueError("Classifier '%s' not supported." % classifier_type) #retrieve training samples from database train_samples = get_samples(db_name, samples) m.store_feature_counts(train_samples, processes=processes) m.store_feature_scores() if best_features and best_features > 1: m.store_best_features(best_features) label_freqdist = FreqDist() feature_freqdist = defaultdict(FreqDist) #retreieve the actual samples processed for label neg_processed, pos_processed = m.r.get('negative_processed'), m.r.get( 'positive_processed') label_freqdist.inc('negative', int(neg_processed)) label_freqdist.inc('positive', int(pos_processed)) labeled_feature_freqs = m.pickle_load('labeled_feature_freqs') labels = labeled_feature_freqs.keys() #feature extraction feat_ex = extractor() extracted_set = set([ feat_ex.extract(labeled_feature_freqs[label].keys(), as_list=True) for label in labels ][0]) #increment the amount of times a given feature for label occured and fill in the missing occurences with Falses for label in labels: samples = label_freqdist[label] for fname in extracted_set: trues = labeled_feature_freqs[label].get(fname, 0) falses = samples - trues feature_freqdist[label, fname].inc(True, trues) feature_freqdist[label, fname].inc(False, falses) #create the P(label) distribution estimator = ELEProbDist label_probdist = estimator(label_freqdist) #create the P(fval|label, fname) distribution feature_probdist = {} for ((label, fname), freqdist) in feature_freqdist.items(): probdist = estimator(freqdist, bins=2) feature_probdist[label, fname] = probdist #TODO: naivebayes supports this prototype, future classifiers will most likely not trained_classifier = classifier(label_probdist, feature_probdist) m.pickle_store(classifier_type, trained_classifier) m.r.set('trained_to', samples) m.r.set('trained_db', db_name) m.r.set('trained_classifier', classifier_type) m.r.set('trained_extractor', extractor_type)
class CorpusReader: """ A collection of documents """ def __init__(self, base, doc_limit=-1, bigram_limit=-1): self._file_base = base self._files = defaultdict(set) self._total_docs = 0 self._bigram_finder = {} self._bigram_limit = bigram_limit self._author_freq = FreqDist() self._word_df = defaultdict(DfCalculator) self._word_freq = defaultdict(FreqDist) self._lemma_freq = defaultdict(FreqDist) self._bigram_freq = defaultdict(FreqDist) self._tag_freq = defaultdict(FreqDist) self._synset_freq = FreqDist() self._author_lookup = {} self._word_lookup = defaultdict(dict) self._lemma_lookup = defaultdict(dict) self._stop_words = defaultdict(set) self._pos_tag_lookup = defaultdict(dict) self._bigram_lookup = defaultdict(dict) self._synset_lookup = {} self._doc_limit = doc_limit self._stemmer = Snowball() def lang_iter(self, lang): print "DOC LIMIT %i" % self._doc_limit file_list = list(self._files[lang]) doc_num = 0 file_list.sort() random.seed(0) random.shuffle(file_list) if len(file_list) > 100: for ff in file_list: for dd in self.doc_factory(lang, ff): if self._doc_limit > 0 and doc_num >= self._doc_limit: return doc_num += 1 yield dd else: file_list = list(self.doc_factory(lang, x) for x in file_list) for dd in poll_iterator(file_list): if self._doc_limit > 0 and doc_num >= self._doc_limit: return doc_num += 1 yield dd def __iter__(self): """ Return documents. """ # We have two different types of behavior depending on the number of # files. If we have lots of files, then just go through them for ll in self._files: for ii in self.lang_iter(ll): yield ii def sample(self, num_docs=-1, rand_seed=0): """ Iterate over a subset of the documents. Given the same random seed, the results should be consistent. """ raise NotImplementedError def build_vocab(self): """ Create counts for all of the tokens. Does care about lemmatization and will create separate vocab for that. Also ignores tags. 
""" self._author_freq = FreqDist() print "Building vocab:" doc = 0 for ii in self: doc += 1 if doc % 100 == 0: print("Doc %i / %i (total estimated)" % \ (doc, self._total_docs)) self._total_docs = max(self._total_docs, doc) for jj in ii.authors(): self._author_freq.inc(jj) for jj in ii.lemmas(self._stemmer): try: jj.encode("utf-8", "replace") self._lemma_freq[ii.lang].inc(jj) except ValueError: None for jj in ii.tokens(): try: jj.encode("utf-8", "replace") self._word_freq[ii.lang].inc(jj) self._word_df[ii.lang].word_seen(doc, jj) except ValueError: None for jj in ii.synsets(): self._synset_freq.inc(jj) for jj in ii.pos_tags(): self._tag_freq[ii.lang].inc(jj) for jj in ii.relations(): self._tag_freq[ii.lang].inc(jj) self._total_docs = doc self.init_stop() if self._bigram_limit > 0: for ii in self._word_freq: bf = BigramFinder(language=LANGUAGE_ID[ii]) self._bigram_finder[ii] = bf bf.set_counts(self._word_freq[ii]) print("Finding bigrams in language %i" % ii) doc = 0 for ii in self: lang = ii.language() doc += 1 if doc % 100 == 0: print("Doc %i / %i" % (doc, self._total_docs)) self._bigram_finder[lang].add_ngram_counts(ii.tokens()) print("Scoring bigrams") bigrams = {} for lang in self._word_freq: bf = self._bigram_finder[lang] bf.find_ngrams([]) bigrams[lang] = bf.real_ngrams(self._bigram_limit) print("First 10 bigrams") for ii in bigrams[lang].keys()[:10]: print("%s_%s" % ii) print("Creating new counts after subtracting bigrams") doc = 0 for ii in self: doc += 1 lang = ii.language() bf = self._bigram_finder[lang] if doc % 100 == 0: print("Doc %i / %i" % (doc, self._total_docs)) # for jj in ii.tokens(): # self._bigram_freq[lang].inc(jj) for jj in ii.sentences(): for kk in iterable_to_bigram(jj, bigrams[lang], bf.normalize_word): self._bigram_freq[lang].inc(kk) def init_stop(self): """ Requires vocabulary to be built first to know which languages appear. """ s = StopWords() self._stop_words = defaultdict(set) for ll in self._word_freq: language_name = LANGUAGE_ID[ll] print "Loading stopwords for", ll, " from", language_name try: self._stop_words[ll] = s[language_name] except IOError: print "Could not load stop words for", language_name print "Loaded", len(self._stop_words[ll]), "words." 
# Make sure lemmatized versions are also in temp_stop = list(self._stop_words[ll]) for ii in temp_stop: self._stop_words[ll].add(self._stemmer(ll, ii)) def doc_factory(self, lang, filename): raise NotImplementedError def fill_proto_vocab(self, frequency_count, vocab_generator, lookup, name): for ll in frequency_count: voc = vocab_generator() voc.language = ll word_id = 0 for tt in frequency_count[ll]: word = voc.terms.add() word.id = word_id word.original = tt word.ascii = tt.encode("ascii", "replace") word.frequency = frequency_count[ll][tt] word.stop_word = tt in self._stop_words[ll] if tt in lookup[ll]: assert lookup[ll][tt] == word_id else: lookup[ll][tt] = word_id if word_id < 50 or (word_id < 1000 and "_" in word.ascii): print("%s\t%i\t%s\t%s\t%i" % (name, word_id, word.ascii, str( word.stop_word), word.frequency)) word_id += 1 def fill_proto_language_independent_vocab(self, frequency_count, vocab_generator, lookup, name): word_id = 0 for tt in frequency_count: if not tt: continue word = vocab_generator() word.id = word_id word.original = tt word.ascii = tt.encode("ascii", "replace") word.frequency = frequency_count[tt] lookup[tt] = word_id word_id += 1 def new_section(self): """ Create a new corpus section and return it """ c = Corpus() self.fill_proto_vocab(self._word_freq, c.tokens.add, self._word_lookup, "TOKEN") self.fill_proto_vocab(self._bigram_freq, c.bigrams.add, self._bigram_lookup, "BIGRAM") self.fill_proto_vocab(self._lemma_freq, c.lemmas.add, self._lemma_lookup, "LEMMA") self.fill_proto_vocab(self._tag_freq, c.pos.add, self._pos_tag_lookup, "TAG") self.fill_proto_language_independent_vocab(self._author_freq, c.authors.terms.add, self._author_lookup, "AUTHOR") self.fill_proto_language_independent_vocab(self._synset_freq, c.synsets.terms.add, self._synset_lookup, "SYNSET") return c def add_language(self, pattern, language=ENGLISH): search = self._file_base + pattern print "SEARCH:", search for ii in glob(search): self._files[language].add(ii) self._total_docs += 1 def write_proto(self, path, name, docs_in_sec=10000): self.build_vocab() section = self.new_section() doc_id = 0 bigram_list = {} if self._bigram_limit > 0: for lang in self._bigram_finder: bf = self._bigram_finder[lang] bigram_list[lang] = bf.real_ngrams(self._bigram_limit) for lang in self._files: doc_num = 0 section_num = 0 filename = "%s/%s_%s_%i" % (path, \ name, LANGUAGE_ID[lang], section_num) print path for doc in self.lang_iter(lang): if doc_num >= docs_in_sec: print "Done with section ", \ section_num, " we've written ", doc_id # Write the file write_proto(filename + ".index", section) section = self.new_section() section_num += 1 doc_num = 0 filename = "%s/%s_%s_%i" % (path, name, LANGUAGE_ID[lang], section_num) if not os.path.exists(filename): os.mkdir(filename) assert lang in self._word_lookup, "%i not in vocab, %s" % \ (lang, str(self._word_lookup.keys())) if doc_id % 100 == 0: print "Writing out ", lang, filename, doc_id, "/", \ len(self._files[lang]) if self._bigram_limit > 0: bf = self._bigram_finder[lang] doc_proto = doc.proto(doc_id, lang, self._author_lookup, self._word_lookup[lang], self._word_df[lang], self._lemma_lookup[lang], self._pos_tag_lookup[lang], self._synset_lookup, self._stemmer, self._bigram_lookup[lang], bigram_list[lang], bf.normalize_word) else: doc_proto = doc.proto(doc_id, lang, self._author_lookup, self._word_lookup[lang], self._word_df[lang], self._lemma_lookup[lang], self._pos_tag_lookup[lang], self._synset_lookup, self._stemmer) write_proto("%s/%i" % (filename, doc_id), 
doc_proto) section.doc_filenames.append("%s_%s_%i/%i" % \ (name, LANGUAGE_ID[lang], section_num, doc_id)) doc_id += 1 doc_num += 1 # We don't want to mix languages, so we close out each section when # done with a language if doc_num > 0: write_proto(filename + ".index", section) section = self.new_section() doc_num = 0 section_num += 1 filename = "%s/%s_%s_%i" % (path, name, LANGUAGE_ID[lang], section_num) if not os.path.exists(filename): os.mkdir(filename) print doc_id, " files written"
class BigramLanguageModel: def __init__(self, unk_cutoff, jm_lambda=0.6, dirichlet_alpha=0.1, katz_cutoff=5, kn_discount=0.1, kn_concentration=1.0, tokenize_function=TreebankWordTokenizer().tokenize, normalize_function=lower): self._unk_cutoff = unk_cutoff self._jm_lambda = jm_lambda self._dirichlet_alpha = dirichlet_alpha self._katz_cutoff = katz_cutoff self._kn_concentration = kn_concentration self._kn_discount = kn_discount self._vocab_final = False self._tokenizer = tokenize_function self._normalizer = normalize_function # Add your code here! self._vocab_freq = FreqDist() self._gram_freq = FreqDist() self._context_freq = FreqDist() self._vocab_freq[kSTART] += kUNK_CUTOFF + 1 self._vocab_freq[kEND] += kUNK_CUTOFF + 1 def train_seen(self, word, count=1): """ Tells the language model that a word has been seen @count times. This will be used to build the final vocabulary. """ assert not self._vocab_final, \ "Trying to add new words to finalized vocab" self._vocab_freq.inc(word, count) return self._vocab_freq[word] def tokenize(self, sent): """ Returns a generator over tokens in the sentence. No modify """ for ii in self._tokenizer(sent): yield ii def vocab_lookup(self, word): """ Given a word, provides a vocabulary representation. Words under the cutoff threshold shold have the same value. All words with counts greater than or equal to the cutoff should be unique and consistent. """ assert self._vocab_final, \ "Vocab must be finalized before looking up words" freqCount = self._vocab_freq[word] if freqCount > self._unk_cutoff: return word else: return "<UNK>" def finalize(self): """ Fixes the vocabulary as static, prevents keeping additional vocab from being added No modify """ self._vocab_final = True def tokenize_and_censor(self, sentence): """ Given a sentence, yields a sentence suitable for training or testing. Prefix the sentence with <s>, replace words not in the vocabulary with <UNK>, and end the sentence with </s>. No modify """ yield self.vocab_lookup(kSTART) for ii in self._tokenizer(sentence): yield self.vocab_lookup(self._normalizer(ii)) yield self.vocab_lookup(kEND) def normalize(self, word): """ Normalize a word No modify """ return self._normalizer(word) def mle(self, context, word): """ Return the log MLE estimate of a word given a context. If the MLE would be negative infinity, use kNEG_INF """ prob = 0.0 bgram = (context, word) numer = self._gram_freq[bgram] denom = self._context_freq[context] if denom == 0: return kNEG_INF if self._gram_freq[bgram] != 0: prob = numer / denom if prob == 0.0: return kNEG_INF else: return lg(prob) def laplace(self, context, word): """ Return the log MLE estimate of a word given a context. """ bgram = (context, word) numer = self._gram_freq[bgram] + 1 denom = len(self._vocab_freq.keys()) + self._context_freq[context] prob = numer / denom return lg(prob) def good_turing(self, context, word): """ Return the Good Turing probability of a word given a context """ return 0.0 def jelinek_mercer(self, context, word): """ Return the Jelinek-Mercer log probability estimate of a word given a context; interpolates context probability with the overall corpus probability. 
""" bigram = (context, word) bigram_prob = 0 unigram_prob = (1 - self._jm_lambda) * (1 / len(self._vocab_freq)) for i in self._gram_freq: if i == bigram: bigram_count = 1 bigram_prob = (self._jm_lambda + unigram_prob) * bigram_count result = unigram_prob + bigram_prob return lg(result) def kneser_ney(self, context, word): """ Return the log probability of a word given a context given Kneser Ney backoff """ bgram = (context, word) unigram_freq = FreqDist() theta = self._kn_concentration vocabulary = 1 / len(self._vocab_freq.keys()) discount_delta = self._kn_discount unigram_T = len(self._context_freq.keys()) bigram_T = self._context_freq[context] for i in self._gram_freq: unigram_freq.inc(i[1]) # Unigram Restaurant # C_0,x count_unirest_wordTable = unigram_freq[word] # C_0,. count_unirest_allTable = unigram_freq.N() # u_Bigram Restaurant # C_u,x count_birest_wordTable = self._gram_freq[bgram] # C_u,. count_birest_allTable = self._context_freq[context] existingTable_numer = count_birest_wordTable - discount_delta existingTable_denom = theta + count_birest_allTable existingTable = existingTable_numer / existingTable_denom if existingTable < 0: existingTable = 0 newTable_numer = theta + (bigram_T * discount_delta) newTable_denom = theta + count_birest_allTable newTable = newTable_numer / newTable_denom back_a_numer = count_unirest_wordTable - discount_delta back_a_denom = count_unirest_allTable + theta back_a = back_a_numer / back_a_denom if back_a < 0: back_a = 0 back_b_numer = theta + (unigram_T * discount_delta) back_b_denom = count_unirest_allTable + theta back_b = back_b_numer / back_b_denom back_b = back_b * vocabulary result = existingTable + (newTable * (back_a + back_b)) return lg(result) def dirichlet(self, context, word): """ Additive smoothing, assuming independent Dirichlets with fixed hyperparameter. """ prob = 0.0 bgram = (context, word) numer = self._gram_freq[bgram] + self._dirichlet_alpha denom = self._context_freq[context] + (self._dirichlet_alpha * len(self._vocab_freq.keys())) prob = numer / denom return lg(prob) def add_train(self, sentence): """ Add the counts associated with a sentence. """ # You'll need to complete this function, but here's a line of code that # will hopefully get you started. # Add new vocab counts nopunc_tokenize = RegexpTokenizer(r'\w+') nopunc_list = nopunc_tokenize.tokenize(sentence) for i in nopunc_list: self._vocab_freq[i] += 1 # Count occurances of bigrams for context, word in bigrams(self.tokenize_and_censor(sentence)): x = (context, word) self._gram_freq.inc(x) self._context_freq.inc(context) def perplexity(self, sentence, method): """ Compute the perplexity of a sentence given a estimation method No modify """ return 2.0 ** (-1.0 * mean([method(context, word) for context, word in \ bigrams(self.tokenize_and_censor(sentence))])) def sample(self, samples=25): """ Sample words from the language model. @arg samples The number of samples to return. """ yield "" return
def pmi(a, b):
    return (log(pairs[a, b]) - log(pairs.N())
            - log(unigrams[a]) - log(unigrams[b]) + 2 * log(unigrams.N()))


h = FrameHierarchy.load()

# training data contains a bad frame
valid_names = {f.name for f in h._frames.values()}

with codecs.open("../../../training/data/naacl2012/cv.train.sentences.json",
                 encoding="utf8") as train_file:
    train = [json.loads(line) for line in train_file]

unsorted_frames = ([(f['target']['spans'][0]['start'], f['target']['name'])
                    for f in s['frames']] for s in train)
frames = [[name for start, name in sorted(s) if name in valid_names]
          for s in unsorted_frames]
del unsorted_frames

unigrams = FreqDist(chain(*frames))
pairs = FreqDist(chain(*[[tuple(sorted(b)) for b in combinations(f, 2)]
                         for f in frames]))
pmis = FreqDist({(a, b): pmi(a, b) for a, b in pairs.keys()
                 if unigrams[a] >= THRESHOLD and unigrams[b] >= THRESHOLD})

unigrams_with_ancestors = FreqDist(unigrams)
for u in unigrams:
    for a in h.ancestors(h._frames[u]):
        unigrams_with_ancestors.inc(a.name)
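# Hedged sanity-check sketch for pmi(): with toy counts built directly from
# FreqDist (no frame data needed), the formula reduces to
# log( P(a,b) / (P(a) * P(b)) ), which is positive for a pair that usually
# co-occurs. The toy frames are made up, and this rebinds the module-level
# unigrams/pairs names purely for the check.
from math import log
from itertools import chain, combinations
from nltk.probability import FreqDist

toy_frames = [['A', 'B'], ['A', 'B'], ['A', 'C']]
unigrams = FreqDist(chain(*toy_frames))
pairs = FreqDist(chain(*[[tuple(sorted(p)) for p in combinations(f, 2)]
                         for f in toy_frames]))
print pmi('A', 'B')   # > 0: 'A' and 'B' co-occur in most toy frames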
def reducer(key, values):
    finalfd = FreqDist()
    for fd in values:
        for k, v in fd:
            finalfd.inc(k, v)
    yield key, tuple(finalfd.items())
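# Hedged sketch of the map/reduce contract used by the mapper and reducer
# snippets above: each mapper emits (token, ((co-word, count), ...)) and the
# reducer merges those partial counts into one FreqDist per token. The sample
# values below are made up.
from nltk.probability import FreqDist

emitted = [(u'cat', ((u'sat', 1), (u'mat', 1))),
           (u'cat', ((u'sat', 2),))]
for key, merged in reducer(u'cat', [counts for _, counts in emitted]):
    print key, merged   # cat ((u'sat', 3), (u'mat', 1))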
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
    # print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]

    # classifier = DecisionTreeClassifier.train(train_set)
    # print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBayes %f' % classify.accuracy(classifier, test_set)