def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd[word] += 1
        doc_freqs.update(fd.keys())
        tf_dists.append(fd)

    num_docs = len(docs)

    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0:
            print(' dict', str(i) + '/' + str(len(tf_dists)))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd:
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd:
                d[word] = fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % metric)
        dicts.append(d)
    return dicts
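# A minimal standalone sketch of the same TF-IDF pattern, assuming only NLTK and a
# trivial whitespace tokenizer in place of preprocess.preprocess_text (which is not
# shown in this snippet). Illustrative only, not the original module's API.
import math
from nltk import FreqDist

def tfidf_dicts_sketch(docs):
    doc_freqs = FreqDist()   # number of documents each word appears in
    tf_dists = []
    for doc in docs:
        fd = FreqDist(doc.lower().split())
        doc_freqs.update(fd.keys())
        tf_dists.append(fd)
    num_docs = len(docs)
    return [
        {w: fd.freq(w) * math.log(num_docs / doc_freqs[w]) for w in fd}
        for fd in tf_dists
    ]

# Example: a word shared by both documents gets an IDF weight of log(2/2) = 0.
print(tfidf_dicts_sketch(["the cat sat", "the dog ran"]))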
def wrd_ngram_stats(texts, corpus, order, include_lower=False):
    all_wd_ngrams = FreqDist()
    text_wrd_ngrams = []
    for text in texts:
        if not text.endswith(".txt"):
            continue
        wrd_tokens = corpus.words(text)
        empty = len(corpus.raw(text)) == 0
        # One freq. dist per n
        text_ngrams = []
        for _ in range(order):
            text_ngrams.append(FreqDist())
        if not empty:
            lower_wrds = [w.lower() for w in wrd_tokens if w.isalnum()]
            if include_lower:
                for n in range(1, order + 1):
                    wd_ng = ngrams(lower_wrds, n)
                    text_ngrams[n - 1].update(wd_ng)
                    if n == order:
                        all_wd_ngrams.update(wd_ng)
            else:
                wd_ng = ngrams(lower_wrds, order)
                text_ngrams[order - 1].update(wd_ng)
                all_wd_ngrams.update(wd_ng)
        text_wrd_ngrams.append(text_ngrams)
    return all_wd_ngrams, text_wrd_ngrams
def evaluate_html(content, html_conf):
    fdist = FreqDist()
    if html_conf['usehtml'] == False:
        logging.info('Discarding HTML tags')
        return fdist

    logging.info("\tEvaluating HTML")
    # try with TITLE tag
    titles = re.findall("<title>[A-Za-z0-9 ]+</title>", content)
    for title in titles:
        root = etree.fromstring(title)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)
        for i in range(html_conf['title']):
            fdist.update(stems)

    # try with H1 tag
    headers = re.findall("<h1>[A-Za-z0-9 ]+</h1>", content)
    for header in headers:
        root = etree.fromstring(header)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)
        for i in range(html_conf['h1']):
            fdist.update(stems)

    return fdist
def buildCategoryDictionary(category):
    tweetList = twitter_fetch.get_tweets_text(classn=category)
    freq = FreqDist()
    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))
    saveDictionaryToFile(freq, category + categoryDictFilePath)
    return freq
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    for review in posids:
        pos += 1
        if (pos != cutoff):
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))

    for review in negids:
        neg += 1
        if (neg != cutoff):
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
def tokenize_data(sentences):
    # tokenize the dataset
    fdist = FreqDist()
    tokenized_sents = []
    for sentence in sentences:
        tokenized_sent = [w.lower() for w in word_tokenize(sentence)]
        tokenized_sents.append(tokenized_sent)
        fdist.update(tokenized_sent)
    # print("Number of word types in the tokenized data: ", len(fdist))
    return tokenized_sents
def generate_freq_dist(samples):
    fdist = FreqDist()
    lemmatizer = WordNetLemmatizer()
    for sample in samples:
        temp = FreqDist([
            lemmatizer.lemmatize(word, "v")
            for sent in sent_tokenize(sample.text)
            for word in word_tokenize(sent)
        ])
        fdist.update(temp)
    return fdist
def reduce_text(t1, t2):
    words = FreqDist(t1[0])
    words.update(t2[0])
    try:
        bigrams = FreqDist(t1[1])
        bigrams.update(t2[1])
    except:
        logger.error('problem in reducing..')
        logger.error('t1: %s' % str(t1))
        logger.error('t2: %s' % str(t2))
    return words, bigrams
def sentence_ngrams(sentence):
    print(str('pid:{} ||'.format(os.getpid())), time.strftime("%y-%m-%d_%H:%M:%S"))
    sentence = sentence.strip()
    if not sentence:
        raise ValueError('Empty sentence!')
    words = sentence.split(' ')
    ngrams_bag = FreqDist()
    for i in range(4):
        ngrams_bag.update(ngrams(words, i + 1))
    return ngrams_bag
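# A minimal usage sketch for sentence_ngrams() above, assuming the imports the snippet
# relies on (os, time, nltk.FreqDist, nltk.util.ngrams) are already in scope.
bag = sentence_ngrams("the cat sat on the mat")
print(bag[("the",)])          # unigram count: 2
print(bag[("the", "cat")])    # bigram count: 1
print(bag.most_common(3))     # most frequent 1- to 4-grams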
async def gobaby(urls):
    """
    Prepare and launch asynchronous requests for the URLs and build the final word dictionary.
    :param urls:
    :return:
    """
    # Build the list of futures (coroutines that will run asynchronously)
    futures = [get_content(url) for url in urls]
    # Run the futures
    done, _ = await asyncio.wait(futures)
    # Dictionary in which the results are collected
    result_dict = FreqDist()
    # For every completed future
    for future in done:
        try:
            result_dict.update(give_me_my_dict(future.result()))
        except Exception as e:
            print('Got this error:', e)
    # Build the word cloud
    create_cloud(result_dict)
def analyze(data, out_dir):
    summary = {}
    freq = FreqDist()
    sentence_length = defaultdict(list)
    year_freq_dist = defaultdict(FreqDist)
    year_dist = defaultdict(int)
    year_month_dist = defaultdict(int)
    year_quarter_dist = defaultdict(int)
    has_date = no_date = sentences = words = 0

    for year, date_str, title, text in data:
        date = parsedate(date_str)
        logger.debug('%s -> %s' % (date_str, str(date)))
        freq.update(ngram_phrases(text, 3))
        if date:
            # Since we can't use strftime for years before 1900, we need to use isoformat
            year_str = date.isoformat()[:4]
            year_mo_str = date.isoformat()[:7]
            has_date += 1
        else:
            no_date += 1
            year_str = ''
            year_mo_str = ''
        if year_str:
            year_range = get_year_range(year_str)
            sentence_length[year_range].extend(sentence_lengths(text))
            year_freq_dist[year_range].update(ngram_phrases(text, 3))
        year_dist[year] += 1
        if year_mo_str:
            year_month_dist[year_mo_str] += 1
            year_quarter_dist[year_quarter(year_mo_str)] += 1
        sentences += count_sentences(text)
        words += count_words(text)

    logger.debug('Documents with a valid date: %d Documents without a valid date: %d' % (has_date, no_date))
    logger.debug('Total # Sentences: %d' % sentences)
    logger.debug('Total # Words: %d' % words)

    generate_dict_csv(['year', 'cnt'], year_dist, os.path.join(out_dir, 'year-data.csv'))
    generate_dict_csv(['yearmo', 'cnt'], year_month_dist, os.path.join(out_dir, 'year-mo-data.csv'))
    generate_dict_csv(['yearq', 'cnt'], year_quarter_dist, os.path.join(out_dir, 'year-quarter-data.csv'))
    generate_stream_js(year_freq_dist, os.path.join(out_dir, 'stream-data.json'))
    generate_cloud_csv(year_freq_dist, os.path.join(out_dir, 'year-phrase-data.csv'))
    generate_sentence_length_csv(sentence_length, os.path.join(out_dir, 'data-sentence-lengths.csv'))
def cnc(phrase_lists, c_value_threshold=0, include_unigrams=False, weight_by_length=True):
    """given a list of phrases, run the cnc algorithm and return a dictionary
    of word, c-value (ranking) pairs"""

    frequency_dists_by_length = {}
    for phrase in phrase_lists:
        l = len(phrase)
        if l not in frequency_dists_by_length:
            frequency_dists_by_length[l] = FreqDist()
        frequency_dists_by_length[l].inc(tuple(phrase))

    # word -> C-value(word)
    phrase_scores = {}
    # word -> num occurrences(word)
    phrase_frequencies = FreqDist()
    # word -> (t(word), c(word))
    sub_phrase_scores = {}

    # traverse from longest phrases to shortest
    for length, frequency_dist in sorted(frequency_dists_by_length.items(),
                                         key=lambda pair: pair[0], reverse=True):
        # update global frequency counts with all counts of this length
        phrase_frequencies.update(frequency_dist)
        # within each phrase length, traverse from most common phrases to least
        for phrase, frequency in frequency_dist.iteritems():
            if phrase in sub_phrase_scores:
                t, c = sub_phrase_scores[phrase]
                subtractive = 1.0 / c * t
            else:
                subtractive = 0
            if weight_by_length:
                if include_unigrams:
                    weight = log(length + 1, 2)
                else:
                    weight = log(length, 2)
            else:
                weight = 1
            c_value = weight * (frequency - subtractive)
            if c_value >= c_value_threshold:
                phrase_scores[phrase] = c_value
                for sub_phrase in utils.sub_lists(phrase):
                    if sub_phrase in sub_phrase_scores:
                        t, c = sub_phrase_scores[sub_phrase]
                    else:
                        t, c = 0, 0
                    sub_phrase_scores[sub_phrase] = t + frequency, c + 1

    return phrase_scores, phrase_frequencies
def updateCategoryDictionary(category):
    tweetList = twitter_fetch.get_new_tweets(classn=category)
    freq = FreqDist()
    tmpDict = FreqDist()
    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))
    try:
        oldDict = readDictionaryFromFile(category + categoryDictFilePath)
    except:
        newDict = buildCategoryDictionary(category)
        return newDict
    oldDict.update(freq)
    saveDictionaryToFile(oldDict, category + categoryDictFilePath)
    return oldDict
def generate_lookup(ngrams: List[List[str]]):
    fdist = FreqDist()
    for entry in ngrams:
        fdist.update(list(entry))

    lookup = {}
    for ngram in fdist:
        key = ngram[:-1]
        word = ngram[-1]
        count = fdist[ngram]
        if key not in lookup:
            lookup[key] = {}
        lookup[key][word] = count

    return lookup
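# A minimal usage sketch for generate_lookup() above, assuming the function and its
# imports (typing.List, nltk.FreqDist) are in scope. The [:-1]/[-1] slicing in the
# lookup suggests each entry is meant to be the list of n-gram tuples for one
# sentence (e.g. produced by nltk.util.ngrams), which is what this sketch passes in.
from nltk.util import ngrams as make_ngrams

sentences = [["the", "cat", "sat"], ["the", "cat", "ran"]]
bigrams_per_sentence = [list(make_ngrams(tokens, 2)) for tokens in sentences]
lookup = generate_lookup(bigrams_per_sentence)
print(lookup[("the",)])   # {'cat': 2}
print(lookup[("cat",)])   # {'sat': 1, 'ran': 1}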
def standard_log_key(log_key_sequence_str):
    # Split the log key sequence into windows with a sliding window;
    # here every 4 consecutive log keys form one sequence.
    tokens = log_key_sequence_str.split(' ')
    # Convert the log keys to ints
    tokens = [int(i) for i in tokens]
    K = max(tokens) + 1  # number of distinct log key types
    # print("the tokens are:", tokens)

    bigramfdist_4 = FreqDist()
    bigrams_4 = ngrams(tokens, 4)
    # Reminder of how nltk.util.ngrams works:
    #   ngrams(['1', '2', '3', '4', '5'], 2)
    #   -> ('1', '2'), ('2', '3'), ('3', '4'), ('4', '5')
    bigramfdist_4.update(bigrams_4)
    print("the bigramfdist_4 is:", list(bigramfdist_4.keys()))

    # we set the length of history logs as 3
    seq = np.array(list(bigramfdist_4.keys()))
    # print("the seq is:", seq)
    X, Y = seq[:, :3], seq[:, 3:4]
    # e.g. seq.shape == (253, 4), X.shape == (253, 3), Y.shape == (253, 1)
    X = np.reshape(X, (-1, 3, 1))

    # Scale the values down proportionally into the [0, 1] range
    X = X / K
    # Convert the integer labels to one-hot vectors
    num_classes = len(list(set(Y.T.tolist()[0]))) + 1  # num_classes is the number of distinct values in Y
    Y = keras.utils.to_categorical(Y)  # num_classes=num_classes
    return X, Y
def char_ngram_stats(texts, corpus, order, include_lower=False):
    '''
    Find character n-grams in some texts.

    @param texts: List of texts
    @param corpus: The corpus that holds the texts
    @param order: The order of the n-grams to consider.
    @param include_lower: Whether to include list of lower-order n-grams in output
    @return: A tuple: First element is a list of all n-grams (only of given order)
        across all texts. Second element is a matrix with a list of lists of
        1-grams, 2-grams, ..., n-grams per text.
    '''
    all_char_ngrams = FreqDist()
    text_char_ngrams = []  # Char n-grams found in each text
    for text in texts:
        if not text.endswith(".txt"):
            continue
        empty = len(corpus.raw(text)) == 0
        # One freq. dist per n
        text_ngrams = []
        for _ in range(order):
            text_ngrams.append(FreqDist())
        if not empty:
            text_str = corpus.raw(text).replace('\r', '').replace('\n', ' ')
            if include_lower:
                for n in range(1, order + 1):
                    char_ng = ngrams(text_str, n)
                    text_ngrams[n - 1].update(char_ng)
                    if n == order:
                        all_char_ngrams.update(char_ng)
            else:
                char_ng = ngrams(text_str, order)
                text_ngrams[order - 1].update(char_ng)
                all_char_ngrams.update(char_ng)
        text_char_ngrams.append(text_ngrams)
    return all_char_ngrams, text_char_ngrams
def get_train(log_key_sequence_str):
    # # we have the sequence of log keys
    # seq = np.array(log_key_sequence)

    # divide the log sequence into windows of 4 keys each
    tokens = log_key_sequence_str.split(' ')
    for i in range(len(tokens)):
        tokens[i] = tokens[i].replace('E', '')
        tokens[i] = int(tokens[i])
    # print("the tokens are:", tokens)

    bigramfdist_4 = FreqDist()
    bigrams_4 = ngrams(tokens, 4)
    bigramfdist_4.update(bigrams_4)
    # print("the bigramfdist_4 is:", bigramfdist_4.keys())

    # we set the length of history logs as 3
    seq = np.array(list(bigramfdist_4.keys()))
    # print("the seq is:", seq)
    X, Y = seq[:, :3], seq[:, 3:4]
    return X, Y
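# A minimal usage sketch for get_train() above, assuming the imports the snippet relies
# on (numpy as np, nltk.FreqDist, nltk.util.ngrams) are in scope. Each row of X holds 3
# consecutive log keys and Y holds the key that followed them; note that collecting the
# 4-grams in a FreqDist keeps only the distinct windows.
X, Y = get_train("E5 E5 E11 E9 E11 E9 E26")
print(X)  # e.g. [[ 5  5 11], [ 5 11  9], [11  9 11], [ 9 11  9]]
print(Y)  # e.g. [[ 9], [11], [ 9], [26]]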
def reduce_tweets(t1, t2):
    tags = FreqDist(t1[0])
    tags.update(t2[0])
    words = FreqDist(t1[1])
    words.update(t2[1])
    places = FreqDist(t1[2])
    places.update(t2[2])
    bigrams = FreqDist(t1[3])
    bigrams.update(t2[3])
    return tags, words, places, bigrams
def word_count(drug=None, limit=None, pos_filter=False, lemma=True):
    """Scans comment texts (from drug_mentions.texts) for selected drug,
    calculates most common words.

    KWARGS:
        drug: string or None. Drug selector. Allows three cases:
            * None: scrape all comments in database, regardless of drug.
            * 'antidepressant': select comments speaking generically about
              drugs, not referencing a specific drug.
            * [drug name]: comments referencing specific drug.
            Default None. Passed to drug_mentions.texts.
        limit: int or None. Optional limit on SQL queries retrieved by
            drug_mentions.texts. Defaults to None (returns all hits).
        pos_filter: boolean. Passed to tokenize(), set True to use
            part-of-speech filtering.
        lemma: boolean. Passed to tokenize(), set True to use lemmatization.
    RETURNS:
        freq: nltk.probability.FreqDist object. Frequency distribution of
            words from comments.
    RAISES:
        ValueError: for invalid drug name.
    """
    try:
        texts = dm.texts(drug=drug, limit=limit)
    except ValueError:
        raise ValueError('Invalid drug name.')

    freq = FreqDist()
    for text in texts:
        freq.update(tokenize(text, drug, pos_filter=pos_filter, lemma=lemma))
    return freq
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd[word] += 1
        doc_freqs.update(fd.keys())
        tf_dists.append(fd)

    all_tokens = list(doc_freqs.keys())
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
                 for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:, i] = v
    return matrix
def buildGoogleUnigram():
    DirPrefix = "/home/jcavalie/googleNgrams_unigrams/"
    unigramFiles = os.listdir(DirPrefix)
    unigramFiles = list(map(lambda _fileName: DirPrefix + _fileName, unigramFiles))
    masterUnigram = FreqDist()

    with multiprocessing.Pool(8, initializer=initProcess) as ProcessPool:
        resAsync = ProcessPool.map_async(_buildUnigram, unigramFiles)
        results = resAsync.get()
        ProcessPool.close()
        ProcessPool.join()

    print("all jobs finished, building master unigram")
    for freqdist in results:
        masterUnigram.update(freqdist)

    with open("PickledData/GoogleUnigram.pickle", 'wb') as pklFile:
        pickle.dump(masterUnigram, pklFile, pickle.HIGHEST_PROTOCOL)
    return
class AddAlphaBigramModel():
    def __init__(self, alpha=0.1):
        self.vocabulary = set()
        self.V = 0
        self.bigrams = ConditionalFreqDist([])
        self.unigrams = FreqDist([])
        self.alpha = alpha

    def train(self):
        self.vocabulary = set()
        this_bigrams = []
        self.unigrams = FreqDist([])

        for fileid in gutenberg.fileids():
            for sentence in gutenberg.sents(fileid):
                words = ["<s>"] + [x.lower() for x in sentence if wordRE.search(x)] + ["</s>"]
                this_bigrams += bigrams(words)
                self.vocabulary.update(words)
                self.unigrams.update(words)

        self.bigrams = ConditionalFreqDist(this_bigrams)
        self.V = len(self.vocabulary)

    def bigram_prob(self, w1, w2):
        numerator = self.bigrams[w1][w2] + self.alpha
        denominator = self.bigrams[w1].N() + (self.alpha * self.V)
        return math.log(numerator / denominator)

    def unigram_prob(self, w):
        numerator = self.unigrams[w] + self.alpha
        denominator = self.unigrams.N() + (self.alpha * self.V)
        return math.log(numerator / denominator)

    def __contains__(self, w):
        return w in self.vocabulary
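# A minimal, self-contained sketch of the same add-alpha estimate on a toy corpus, so
# the smoothing formula can be checked without downloading the Gutenberg corpus:
#   P(w2 | w1) = (count(w1, w2) + alpha) / (count(w1) + alpha * |V|)
import math
from nltk import ConditionalFreqDist, bigrams

sents = [["<s>", "the", "cat", "sat", "</s>"],
         ["<s>", "the", "dog", "sat", "</s>"]]
vocab = {w for s in sents for w in s}
cfd = ConditionalFreqDist(bg for s in sents for bg in bigrams(s))

alpha, V = 0.1, len(vocab)
prob = (cfd["the"]["cat"] + alpha) / (cfd["the"].N() + alpha * V)
print(math.log(prob))  # log P(cat | the) = log(1.1 / 2.6)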
def _create_vocabulary(self):
    """Analyze all the text sentences in the data set and create a vocabulary:
    1. The dataset vocabulary
    2. Number of words in the vocabulary
    3. Length of the longest sentence
    """
    frequencies = FreqDist()
    max_sentence_length = 0

    for idx in range(self.__len__()):
        txt_path = os.path.join(self.text_dir_path,
                                self.images_df.iloc[idx].path + ".txt")
        with open(txt_path, "r") as f:
            for line in f:
                tokens = [token.lower() for token in self.tokenizer.tokenize(line)]
                if len(tokens) > max_sentence_length:
                    max_sentence_length = len(tokens)
                frequencies.update(tokens)

    # Finally, create the vocabulary object from the torchtext library.
    vocabulary = Vocab(frequencies, min_freq=2, specials=["<unk>", "<eos>"])
    return vocabulary, len(vocabulary.itos), max_sentence_length
def process_documents(path, html_conf):
    logging.info("Using documents from \"" + path + "\" directory ")
    if path[-1] != "/":
        path = path + "/"

    documents = {}
    allterms = {}
    listing = os.listdir(path)
    allfreq = FreqDist()

    # retrieving document content - discarding structure
    logging.info("Processing files...")
    for infile in listing:
        logging.info("\tReading document " + infile)
        raw_doc = open(path + infile, 'r').read()
        nonhtml_doc = nltk.clean_html(raw_doc)
        word_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', raw_doc))
        terms_list = [x.lower() for x in word_list if x.lower() not in stopwords.words('english')]
        stemmes = steming(terms_list)
        for stem in stemmes:
            allterms[stem] = 0
        fdist = FreqDist(word.lower() for word in stemmes)
        allfreq.update(word.lower() for word in stemmes)
        htmldist = evaluate_html(raw_doc.lower(), html_conf)
        fdist.update(htmldist)
        allfreq.update(htmldist)
        documents[infile] = {'docname': infile, 'terms': stemmes, 'tf': fdist, 'tfidf': None}

    for key, doc in documents.iteritems():
        doctfidf = compute_tfidf(doc, documents)
        documents[key]['tfidf'] = dict(allterms.items() + doctfidf.items())

    return documents, allfreq
f.close()

banset = set(stoplist)
count = 0
for hotel in wordlists.fileids():
    print hotel
    list1 = wordlists.words(hotel)
    list2 = []
    for w in list1:
        list2.append(w)
    list3 = [w.strip() for w in list2]
    if (count == 0):
        fdict = FreqDist(list3)
    else:
        fdict.update(list3)
    count += 1

print len(fdict)
fdict2 = fdict.copy()
for w in fdict.keys()[:]:
    if w.strip() in banset or len(w.strip()) < 3 or len(w.strip()) > 25:
        del fdict2[w]
    elif isinstance(w, unicode):
        del fdict2[w]

for w in fdict2.keys():
    if len(w) < 3:
        print w, len(w)
                        testsets['neutral'])
    classifier.show_most_informative_features()

def word_feats(words):
    return dict([(word, True) for word in words])

print 'evaluating single word features'
evaluate_classifier(word_feats)

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for word in tweets.words(categories=['pos']):
    word_fd.update([word.lower()])
    label_word_fd['pos'].update([word.lower()])

for word in tweets.words(categories=['neg']):
    word_fd.update([word.lower()])
    label_word_fd['neg'].update([word.lower()])

for word in tweets.words(categories=['neutral']):
    word_fd.update([word.lower()])
    label_word_fd['neutral'].update([word.lower()])

# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()
    # featureBigramNeg.append(helperFuntions.bigramReturner(xNew))
    featureUnigramNeg.append(helperFuntions.getFeatureVector(xNew))
    # break

for x in dataPosTrain:
    # print x
    xNew = helperFuntions.removePunctuation(x)
    xNew = helperFuntions.toLower(xNew)
    xNew = helperFuntions.removeNumbers(xNew)
    xNew = helperFuntions.removeStopWords(xNew)
    # featureBigramPos.append(helperFuntions.bigramReturner(xNew))
    featureUnigramPos.append(helperFuntions.getFeatureVector(xNew))
    # break

for word in featureUnigramPos:
    word_fd.update(word)
    label_word_fd['pos'].update(word)

for word in featureUnigramNeg:
    word_fd.update(word)
    label_word_fd['neg'].update(word)

# print featureBigramPos
# print featureUnigramPos

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.iteritems():
fin.close()

count = 0
for hotel in wordlists.fileids():
    list4 = []
    print hotel
    taglist = tagger.tag(wordlists.words(hotel))
    list1 = find_chunk('CHUNK: {<JJ.*> <RB>* <NN.*>+}')
    list2 = find_chunk2('CHUNK: {<NN.*>+ <VB.*> <RB>* <JJ.*>}')
    list3 = find_chunk3('CHUNK: {<VB.*> <RB>* <JJ.*> <NN.*>}')
    list4 = list1 + list2 + list3
    if (count == 0):
        fdict = FreqDist(list4)
    else:
        fdict.update(list4)
    count += 1

print 'Size of dictionary:', len(fdict)
print ''

f = open('stoplist.txt', 'r')
stoplist = []
ban = 'IV'
while (ban != ''):
    ban = f.readline()
    stoplist.append(ban.strip())
f.close()

banset = set(stoplist)
fdict2 = fdict.copy()
def __init__(self, n, train, pad_left=False, pad_right=False, estimator=None, *estimator_args, **estimator_kwargs): """ Creates an ngram language model to capture patterns in n consecutive words of training text. An estimator smooths the probabilities derived from the text and may allow generation of ngrams not seen during training. :param n: the order of the language model (ngram size) :type n: C{int} :param train: the training text :type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string} :param estimator: a function for generating a probability distribution---defaults to MLEProbDist :type estimator: a function that takes a C{ConditionalFreqDist} and returns a C{ConditionalProbDist} :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of <s> :type pad_left: bool :param pad_right: whether to pad the right of each sentence with </s> :type pad_right: bool :param estimator_args: Extra arguments for estimator. These arguments are usually used to specify extra properties for the probability distributions of individual conditions, such as the number of bins they contain. Note: For backward-compatibility, if no arguments are specified, the number of bins in the underlying ConditionalFreqDist are passed to the estimator as an argument. :type estimator_args: (any) :param estimator_kwargs: Extra keyword arguments for the estimator :type estimator_kwargs: (any) """ # protection from cryptic behavior for calling programs # that use the pre-2.0.2 interface assert(isinstance(pad_left, bool)) assert(isinstance(pad_right, bool)) # make sure n is greater than zero, otherwise print it assert (n > 0), n # For explicitness save the check whether this is a unigram model self.is_unigram_model = (n == 1) # save the ngram order number self._n = n # save left and right padding self._lpad = ('<s>',) * (n - 1) if pad_left else () # Need _rpad even for unigrams or padded entropy will give # wrong answer because '</s>' will be treated as unseen... self._rpad = ('</s>',) if pad_right else () self._padLen = len(self._lpad)+len(self._rpad) self._N=0 delta = 1+self._padLen-n # len(sent)+delta == ngrams in sent if estimator is None: assert (estimator_args is ()) and (estimator_kwargs=={}),\ "estimator_args (%s) or _kwargs supplied (%s), but no estimator"%(estimator_args,estimator_kwargs) estimator = lambda fdist, bins: MLEProbDist(fdist) # Given backoff, a generator isn't acceptable if not isinstance(train,collections.abc.Sequence): train=list(train) self._W = len(train) # Coerce to list of list -- note that this means to train charGrams, # requires exploding the words ahead of time if train is not None: if isinstance(train[0], compat.string_types): train = [train] self._W=1 elif not isinstance(train[0],collections.abc.Sequence): # if you mix strings and generators, you have only yourself # to blame! 
for i in range(len(train)): train[i]=list(train[i]) if n == 1: if pad_right: sents=(chain(s,self._rpad) for s in train) else: sents=train fd=FreqDist() for s in sents: fd.update(s) if not estimator_args and not estimator_kwargs: self._model = estimator(fd,fd.B()) else: self._model = estimator(fd,fd.B(), *estimator_args, **estimator_kwargs) self._N=fd.N() else: cfd = ConditionalFreqDist() self._ngrams = set() for sent in train: self._N+=len(sent)+delta for ngram in ingrams(chain(self._lpad, sent, self._rpad), n): self._ngrams.add(ngram) context = tuple(ngram[:-1]) token = ngram[-1] cfd[context][token]+=1 if not estimator_args and not estimator_kwargs: self._model = ConditionalProbDist(cfd, estimator, len(cfd)) else: self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs) # recursively construct the lower-order models if not self.is_unigram_model: self._backoff = NgramModel(n-1, train, pad_left, pad_right, estimator, *estimator_args, **estimator_kwargs) # Code below here in this method, and the _words_following and _alpha method, are from # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015" self._backoff_alphas = dict() # For each condition (or context) for ctxt in cfd.conditions(): backoff_ctxt = ctxt[1:] backoff_total_pr = 0.0 total_observed_pr = 0.0 # this is the subset of words that we OBSERVED following # this context. # i.e. Count(word | context) > 0 for word in self._words_following(ctxt, cfd): total_observed_pr += self.prob(word, ctxt) # we also need the total (n-1)-gram probability of # words observed in this n-gram context backoff_total_pr += self._backoff.prob(word, backoff_ctxt) if isclose(total_observed_pr,1.0): total_observed_pr=1.0 else: assert 0.0 <= total_observed_pr <= 1.0,\ "sum of probs for %s out of bounds: %.10g"%(ctxt,total_observed_pr) # beta is the remaining probability weight after we factor out # the probability of observed words. # As a sanity check, both total_observed_pr and backoff_total_pr # must be GE 0, since probabilities are never negative beta = 1.0 - total_observed_pr if beta!=0.0: assert (0.0 <= backoff_total_pr < 1.0), \ "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr) alpha_ctxt = beta / (1.0 - backoff_total_pr) else: assert ((0.0 <= backoff_total_pr < 1.0) or isclose(1.0,backoff_total_pr)), \ "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr) alpha_ctxt = 0.0 self._backoff_alphas[ctxt] = alpha_ctxt
class TwitterCorpus(object): def __init__(self, args): self.dictionary = Dictionary() self.dictionary.add_word("<<<padding>>>") self.padding_value = self.dictionary.word2idx["<<<padding>>>"] self.max_vocab_size = args.max_vocab_size self.fdist = FreqDist() self.file_prepared = False self.username_re = re.compile("\@[\w]+") self.url_re = re.compile("http[s]?://[\w|\.|\?|\/]+") self.www_re = re.compile("www.[^ ]+") self.emoticon_re = re.compile( "(;D)|(:D)|(:/)|(=\))|(:-D)|(;-D)|(:\()|(=\()|(:\s{1}\()") self.run_on_re = re.compile(r"(\w)\1{2,}", re.DOTALL) self.negations_dic = { "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not", "haven't": "have not", "hasn't": "has not", "hadn't": "had not", "won't": "will not", "wouldn't": "would not", "don't": "do not", "doesn't": "does not", "didn't": "did not", "can't": "can not", "couldn't": "could not", "shouldn't": "should not", "mightn't": "might not", "mustn't": "must not" } self.neg_pattern = re.compile(r'\b(' + '|'.join(self.negations_dic.keys()) + r')\b') self.datafile = os.path.join(args.data, "tweet_data.h5") self.data_handle = h5py.File(os.path.join(args.data, "tweet_data.h5"), 'w') self.prepare_dataset(args.training, 'training') self.prepare_dataset(args.testing, 'testing') self.data_handle.close() self.file_prepared = True def __getstate__(self): ''' Do not pickle the handle to the h5 file ''' state = self.__dict__.copy() del state['data_handle'] return state def __setstate__(self, state): self.__dict__.update(state) if os.path.exists(self.datafile): self.file_prepared = True else: self.file_prepared = False def get_padding_idx(self): return self.padding_value def get_data_file(self): if self.file_prepared: return self.datafile else: print( 'File is not prepared. Re-build TwitterCorpus object properly.', file=sys.stderr) def prepare_dataset(self, path, data_split): """ Preprocess the dataset in `path` data_split \in ['training','testing'] """ outpath = path.replace(".csv", ".prepared.csv") self._make_freqdist(path) tokens, max_len, num_tweets = self._preprocess_and_build_dictionary( path, outpath) self._pack_to_h5(outpath, data_split, tokens, max_len, num_tweets) def _process_tweet(self, tweet): """ Apply feature transformations to each tweet in the dataset """ # unique tokens with this, no depunct: 755992 # removing single char tokens, expanding contractions: 277990 # target vocab should be 76643, size reported in Kalchbrenner & Gref & Blunsom tweet = tweet.strip() tweet = BeautifulSoup(tweet, 'lxml').get_text() tweet = tweet.replace(u"\ufffd", "?") # @usernames -> USERNAME tweet = re.sub(self.username_re, lambda x: "USERNAME", tweet) # URLS -> URL tweet = re.sub(self.url_re, lambda x: "URL", tweet) # www. 
URLs -> URL tweet = re.sub(self.www_re, lambda x: "URL", tweet) # expand negation contractions tweet = re.sub(self.neg_pattern, lambda x: self.negations_dic[x.group()], tweet) # standardize emoticons tweet = re.sub(self.emoticon_re, lambda x: "", tweet) # shrink extended runs of any char tweet = re.sub(self.run_on_re, r"\1\1", tweet) # result = re.sub("(\d+) (\w+)", r"\2 \1") return tweet def _make_freqdist(self, path): """ Read all the tweets, calculate the frequencies of the words appearing in processed tweets """ translator = str.maketrans('', '', string.punctuation) print("Counting words in training...") with open(path, 'r', encoding='utf-8', errors='replace') as f: tweet_reader = csv.reader(f, delimiter=',', quotechar='"') for i, parts in enumerate(tweet_reader): tweet = parts[-1] clean_tweet = self._process_tweet(tweet) lc_clean_tweet = clean_tweet.lower() words = [ w for w in lc_clean_tweet.translate(translator).split() if len(w) > 1 ] self.fdist.update(words) if i % 10000 == 0: print("Processed first ", i, "tweets") def _preprocess_and_build_dictionary(self, inpath, outpath, depunct=True): """ Preprocess the Twitter Sentiment data set in `inpath`, build the dictionary, and write the sanitized output to `outpath`. In addition, return how many unique tokens we see in the corpus. return the number of unique tokens seen, the largest number of words seen in a single tweet, and the number of tweets in this file. """ assert os.path.exists(inpath) if depunct: translator = str.maketrans('', '', string.punctuation) with open(inpath, 'r', encoding='utf-8-sig', errors='replace') as in_f, open(outpath, 'w', encoding='utf-8') as out_f: tweet_reader = csv.reader(in_f, delimiter=',', quotechar='"') tweet_writer = csv.writer(out_f, delimiter=',', quotechar='"') max_len = 0 vocab_words = frozenset( [w for w, c in self.fdist.most_common(self.max_vocab_size)]) for i, parts in enumerate(tweet_reader): if (i % 10000) == 0: print("Finished tweet ", i) tweet = parts[-1] clean_tweet = self._process_tweet(tweet) lc_clean_tweet = clean_tweet.lower() words = [ w for w in lc_clean_tweet.translate(translator).split() if len(w) > 1 and w in vocab_words ] max_len = len(words) if len(words) > max_len else max_len for word in words: self.dictionary.add_word(word) clean_line = parts[:-1] + [" ".join(words)] tweet_writer.writerow(clean_line) unique_tokens = len(self.dictionary) return unique_tokens, max_len, i + 1 def _tweet_to_list(self, parts, max_len): label, tweet = parts[0], parts[-1] try: label = int(label) except ValueError: print('Cannot coerce ', label, ' to int ') label = -1 words = tweet.split() encoded_words = [self.dictionary.word2idx[word] for word in words] encoded_words = encoded_words + [ self.padding_value for i in range(max_len - len(words)) ] assert (len(encoded_words) == max_len) return encoded_words, label def _calculate_amount_to_write(self, chunk, chunk_size, num_examples): amount_to_write = num_examples - (chunk * chunk_size) if amount_to_write < 0: amount_to_write = num_examples if amount_to_write < chunk_size: return amount_to_write else: return chunk_size def _pack_to_h5(self, path, group, tokens, max_len, num_examples): """ Build the word2idx data structure for the Twitter Sentiment data set in `path` I'll use an hdf5 file to store the embedded seqs, labels. 
path := path to cleaned up tweet file tokens := number of tokens to encode group := 'training' or 'testing', which group in the h5 file do we encode the data from `path` max_len := most number of words observed in a tweet num_examples := number of tweets in this file in `path` """ assert os.path.exists(path) # create groups for data, labels group_name = '/' + group this_group = self.data_handle.create_group(group_name) data_name = group + "_data" label_name = group + "_labels" chunk = 0 chunk_size = 10000 buffer_size = self._calculate_amount_to_write(chunk, chunk_size, num_examples) data = this_group.create_dataset(data_name, shape=(num_examples, max_len), chunks=(buffer_size, max_len), dtype=np.int32) labels = this_group.create_dataset(label_name, shape=(num_examples, 1), dtype=np.int32) # parse, encode words in each tweet, write to h5file temp_array = np.empty((chunk_size, max_len), dtype=np.int32) temp_labels = np.empty((chunk_size, 1), dtype=np.int32) with open(path, 'r', encoding='utf-8-sig', errors='replace') as f: ids = torch.LongTensor(tokens) token = 0 tweet_reader = csv.reader(f, delimiter=',', quotechar='"') for i, parts in enumerate(tweet_reader): embedded_list, label = self._tweet_to_list(parts, max_len) temp_array[i % chunk_size, :] = np.array(embedded_list) temp_labels[i % chunk_size, 0] = label if (i + 1) % buffer_size == 0: # write the buffer to the h5file data[chunk * chunk_size:chunk * chunk_size + buffer_size, :] = temp_array[0:buffer_size, :] labels[chunk * chunk_size:chunk * chunk_size + buffer_size, 0] = temp_labels[0:buffer_size, 0] chunk += 1 buffer_size = self._calculate_amount_to_write( chunk, chunk_size, num_examples)
        threestars.append(review)
    if stars[i] == 2:
        twostars.append(review)
    if stars[i] == 1:
        onestars.append(review)
    i = i + 1

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

print 'Getting words...'
for review in fivestars:
    if type(review) is str:
        for word in review.split():
            if word not in stop:
                word_fd.update(stemmer.stem(word.decode('utf-8')).lower())
                label_word_fd['5'].update(stemmer.stem(word.decode('utf-8')).lower())

for review in fourstars:
    if type(review) is str:
        for word in review.split():
            word_fd.update(stemmer.stem(word.decode('utf-8')).lower())
            label_word_fd['4'].update(stemmer.stem(word.decode('utf-8')).lower())

for review in threestars:
    if type(review) is str:
        for word in review.split():
            word_fd.update(stemmer.stem(word.decode('utf-8')).lower())
            label_word_fd['3'].update(
class EditDistanceFinder(): def __init__(self): self.char_probs = ConditionalProbDist([], MLEProbDist) self.bichar_freqs = ConditionalFreqDist([]) self.transp_freqs = FreqDist() self.DOWN, self.LEFT, self.DIAG, self.DOUBLE_DIAG = range(4) self.INSERT, self.DELETE, self.SUBST, self.TRANSP = range(4) def train(self, fname): misspellings = [] for line in open(fname): line = line.strip() if not (line): continue w1, w2 = line.split(",") misspellings.append((w1.strip(), w2.strip())) last_alignments = None done = False while not done: print("Iteration") alignments, bigrams = self.train_alignments(misspellings) self.train_costs(alignments, bigrams) done = (alignments == last_alignments) last_alignments = alignments def train_alignments(self, misspellings): alignments = [] self.bichar_freqs = FreqDist() for error, corrected in misspellings: distance, this_alignments = self.align(corrected, error) alignments += this_alignments bigrams = [corrected[i:i + 2] for i in range(len(corrected) - 1)] self.bichar_freqs.update(bigrams) return alignments, bigrams def train_costs(self, alignments, bigrams): add_one_aligns = [(a, b) for a in string.ascii_lowercase for b in string.ascii_lowercase] single_aligns = [(a, b) for a, b in alignments if len(a) < 2] char_aligns = ConditionalFreqDist(single_aligns + add_one_aligns) self.char_probs = ConditionalProbDist(char_aligns, MLEProbDist) double_aligns = [a for a, b in alignments if len(a) >= 2] self.transp_freqs = FreqDist(double_aligns) def align(self, w1, w2, verbose=False): M = len(w1) + 1 N = len(w2) + 1 table = numpy.zeros((M, N)) backtrace = numpy.zeros((M, N)) for i in range(1, M): w1_char = w1[i - 1] table[i, 0] = table[i - 1, 0] + self.del_cost(w1_char) backtrace[i, 0] = self.DOWN for j in range(1, N): w2_char = w2[j - 1] backtrace[0, j] = self.LEFT table[0, j] = table[0, j - 1] + self.ins_cost(w2_char) for i in range(1, M): w1_char = w1[i - 1] for j in range(1, N): w2_char = w2[j - 1] this_del = table[i - 1, j] + self.del_cost(w1_char) this_ins = table[i, j - 1] + self.ins_cost(w2_char) this_sub = table[i - 1, j - 1] + self.sub_cost( w1_char, w2_char) if j > 1 and i > 1 and w1[i - 1] == w2[j - 2] and w1[ i - 2] == w2[j - 1] and w1[i - 1] != w1[i - 2]: this_transp = table[i - 2, j - 2] + self.transp_cost( w1_char, w2_char) else: this_transp = 999999 min_cost = min(this_del, this_ins, this_sub, this_transp) table[i, j] = min_cost if this_sub == min_cost: backtrace[i, j] = self.DIAG elif this_transp == min_cost: backtrace[i, j] = self.DOUBLE_DIAG elif this_ins == min_cost: backtrace[i, j] = self.LEFT else: # insert backtrace[i, j] = self.DOWN alignments = [] i = M - 1 j = N - 1 while (j or i): this_backtrace = backtrace[i, j] if this_backtrace == self.DIAG: # sub alignments.append((w1[i - 1], w2[j - 1])) i -= 1 j -= 1 elif this_backtrace == self.DOUBLE_DIAG: alignments.append((w1[i - 2:i], w2[j - 2:j])) i -= 2 j -= 2 elif this_backtrace == self.DOWN: # delete alignments.append((w1[i - 1], "%")) i -= 1 elif this_backtrace == self.LEFT: # insert alignments.append(("%", w2[j - 1])) j -= 1 alignments.reverse() if verbose: print(table) return table[M - 1, N - 1], alignments def transp_cost(self, char1, char2): ## how often do char1 and char2 get transposed? 
return 1 - self.transp_prob(char1, char2) def del_cost(self, char): return 1 - self.char_probs[char].prob('%') def ins_cost(self, char): return 1 - self.char_probs['%'].prob(char) def sub_cost(self, char1, char2): return 1 - self.char_probs[char1].prob(char2) def transp_prob(self, char1, char2): numerator = self.transp_freqs[char1] + .1 denominator = self.bichar_freqs[char1] + .1 * 26 * 26 return numerator / denominator def prob(self, w1, w2): score, alignment = self.align(w1, w2) total_prob = 0 for a, b in alignment: if len(a) > 1: total_prob += log(self.transp_prob(a[0], a[1])) else: total_prob += self.char_probs[a].logprob(b) return total_prob def show_alignment(self, alignments): print("String1:", " ".join([x[0] for x in alignments])) print("String2:", " ".join([x[1] for x in alignments]))
                  'THEREFORE', 'THEY', 'THEY\'D', 'THEY\'LL', 'THEY\'RE', 'THIRD',
                  'THIRTEEN', 'THIRTEENTH', 'THIRTIETH', 'THIRTY', 'THIS', 'THITHER',
                  'THOSE', 'THOUGH', 'THOUSAND', 'THOUSANDTH', 'THREE', 'THRICE',
                  'THROUGH', 'THUS', 'TILL', 'TO', 'TOWARDS', 'TODAY', 'TOMORROW',
                  'TOO', 'TWELFTH', 'TWELVE', 'TWENTIETH', 'TWENTY', 'TWICE', 'TWO',
                  'UNDER', 'UNDERNEATH', 'UNLESS', 'UNTIL', 'UP', 'US', 'VERY', 'WHEN',
                  'WAS', 'WASN\'T', 'WE', 'WE\'D', 'WE\'LL', 'WERE', 'WE\'RE',
                  'WEREN\'T', 'WE\'VE', 'WHAT', 'WHENCE', 'WHERE', 'WHEREAS', 'WHICH',
                  'WHILE', 'WHITHER', 'WHO', 'WHOM', 'WHOSE', 'WHY', 'WILL', 'WITH',
                  'WITHIN', 'WITHOUT', 'WON\'T', 'WOULD', 'WOULDN\'T', 'YES',
                  'YESTERDAY', 'YET', 'YOU', 'YOUR', 'YOU\'D', 'YOU\'LL', 'YOU\'RE',
                  'YOURS', 'YOURSELF', 'YOURSELVES', 'YOU\'VE']

files = ['IN', 'IP', 'LY', 'NA', 'OP', 'SP']
stop_words = set([word.lower() for word in function_words])

for file in files:
    with open(file + '.txt', 'w') as my_file:
        for each in glob('Mini-CORE/1+' + file + '*.txt'):
            # for each in glob('Mini-CORE/1+', files, '*.txt'):
            with open(each, 'r') as read_file:
                fd = FreqDist()
                text = read_file.read().lower()
                cleaned_text = clean(text)
                tokens = nltk.word_tokenize(cleaned_text)
                words = [token for token in tokens if token not in stop_words]
                tokens_fd = FreqDist(words)
                fd.update(tokens_fd)
                print(fd.most_common(), file=my_file)