def get_word_bigram_scores(pos_words, neg_words):
    pos_words_plain = list(itertools.chain(*pos_words))
    neg_words_plain = list(itertools.chain(*neg_words))

    bigram_finder = BigramCollocationFinder.from_words(pos_words_plain)
    pos_bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    bigram_finder = BigramCollocationFinder.from_words(neg_words_plain)
    neg_bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = pos_words_plain + pos_bigrams  # single words plus bigram collocations
    neg = neg_words_plain + neg_bigrams
    all_words = pos + neg

    pos_word_fd = FreqDist(pos)
    neg_word_fd = FreqDist(neg)
    word_fd = FreqDist(all_words)

    pos_word_count = pos_word_fd.N()  # number of positive tokens
    neg_word_count = neg_word_fd.N()  # number of negative tokens
    #total_word_count = pos_word_count + neg_word_count
    total_word_count = word_fd.N()

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(pos_word_fd[word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(neg_word_fd[word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def work_1():
    file_string = ""
    txt_file = open("trabalho1.txt", "r+")
    csv_file = open("trabalho1.csv", "w+")
    csv_manage = csv.writer(csv_file, delimiter=";", quoting=csv.QUOTE_MINIMAL)

    base_text = txt_file.read()
    sentences = word_tokenize(base_text)
    frequency = FreqDist(sentences)

    print("Text: {0}".format(base_text))
    print("Total words: {0}".format(frequency.N()))
    print("Total terms: {0}".format(len(frequency.keys())))
    print("")
    print("Term frequency table")
    print("")

    for key in frequency.keys():
        csv_manage.writerow([key, str(frequency.get(key))])
        print("Term: {0} Total: {1}".format(key, str(frequency.get(key))))

    pdfOutput = PdfOutput(frequency, frequency.N(), len(frequency.keys()), base_text)
    servicePdfManager = ServiceManagerPdf()
    servicePdfManager.writePdf(pdfOutput)

    txt_file.close()
    csv_file.close()
def compute_scores(self, collocate_data, role=None, category=None):
    data = collocate_data
    if role:
        data = [x for x in data if x['role'] == role]
    if category:
        # filter the already role-filtered data so both conditions apply
        data = [x for x in data if x['category'] == category]

    dialogue_fd = FreqDist([w for d in data for w in d['dialogue_tokens']])
    context_fd = FreqDist([w for d in data for w in d['context_tokens']])
    collocate_fd = FreqDist([(w_c, w_d)
                             for d in data
                             for w_c in d['context_tokens']
                             for w_d in d['dialogue_tokens']])

    N_d = dialogue_fd.N()
    N_c = context_fd.N()
    N_cd = collocate_fd.N()

    scores = defaultdict(dict)
    const = np.log(N_c) + np.log(N_d) - np.log(N_cd)
    for pair in collocate_fd:
        w_c, w_d = pair
        s = np.log(collocate_fd[pair]) - np.log(dialogue_fd[w_d]) - np.log(context_fd[w_c]) + const
        scores[w_d][w_c] = s
    return dict(scores)
class FrequenceVocabulary:
    """
    Vocabulary holding word frequencies estimated from word counts in the
    specified files.
    """

    def __init__(self, miss_f):
        """
        Construct a new vocabulary with a function that computes the word
        probability for words absent from the vocabulary.

        Example usage:

        >>> miss_f = lambda key, N: 10. / (N * 10 ** len(key))

        :param miss_f: function for estimating the probability of missing words.
        """
        self.vocab = FreqDist()
        self._miss_f = miss_f

    def load_vocab(self, root='.', files='.*'):
        """
        Load a new vocabulary.

        :param root: the root directory for the corpus.
        :param files: a list or regexp specifying the files in this corpus.
        """
        voc = PlaintextCorpusReader(root, files)
        for word in voc.words():
            self.vocab[word.lower()] += 1

    def p(self, key):
        """
        :param key: word whose probability should be computed.
        :return: the probability estimated for key.
        """
        return (1. * self.vocab[key] / self.vocab.N()
                if key in self.vocab
                else self._miss_f(key, self.vocab.N()))
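# A minimal usage sketch for FrequenceVocabulary (illustrative only): the
# corpus directory "corpus" and the file pattern are assumptions, and miss_f
# follows the docstring example above.
miss_f = lambda key, N: 10. / (N * 10 ** len(key))
vocab = FrequenceVocabulary(miss_f)
vocab.load_vocab(root='corpus', files=r'.*\.txt')
print(vocab.p('the'))    # relative frequency of a word seen in the corpus
print(vocab.p('zzyzx'))  # falls back to miss_f for an unseen word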
def _prepare(self):
    if self._is_prepared:
        return

    freq_dist_a = FreqDist()
    for a in self._pair.chunks_a:
        freq_dist_a.update(self._tokenize(a))

    freq_dist_b = FreqDist()
    for b in self._pair.chunks_b:
        freq_dist_b.update(self._tokenize(b))

    self._avg_freq_dist = FreqDist()
    n_a = freq_dist_a.N()
    n_b = freq_dist_b.N()
    for a in freq_dist_a:
        self._avg_freq_dist[a] = (freq_dist_a[a] / n_a + freq_dist_b[a] / n_b) / 2.0
    for b in freq_dist_b:
        if self._avg_freq_dist[b] != 0.0:
            continue
        self._avg_freq_dist[b] = (freq_dist_a[b] / n_a + freq_dist_b[b] / n_b) / 2.0

    self._chunks = self._sampler.generate_chunk_pairs(self._pair)
    self.__freq_a = None
    self.__freq_b = None
    self._is_prepared = True
def term_ratio(tf1: FreqDist, tf2: FreqDist, c=None, normalize=False):
    if normalize:
        if c is None:
            c = 1e-4
        return {
            word: (tf1[word] / tf1.N()) / (tf2[word] / tf2.N() + c)
            for word in tf1.keys()
        }
    else:
        if c is None:
            c = 1
        return {word: tf1[word] / (tf2[word] + c) for word in tf1.keys()}
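# A hedged usage sketch for term_ratio (illustrative only): the two token
# lists are made-up data; it shows the raw-count ratio with +1 smoothing and
# the length-normalized variant.
from nltk import FreqDist

tf_pos = FreqDist("great great fun plot".split())
tf_neg = FreqDist("dull plot plot".split())
print(term_ratio(tf_pos, tf_neg))                  # e.g. {'great': 2.0, 'fun': 1.0, 'plot': 0.33...}
print(term_ratio(tf_pos, tf_neg, normalize=True))  # relative-frequency ratio with c=1e-4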
def freq(inp, outp):
    """Input: a text file
    Output: a table of word frequency with three columns for Word, Count
    and Percent frequency
    """
    text = open(inp, 'r').read()
    sents = nltk.sent_tokenize(text)
    all_words = []
    for sent in sents:
        words = nltk.word_tokenize(sent)
        all_words += words
    all_words = [x.lower() for x in all_words]
    freq = FreqDist(all_words)
    tot = float(freq.N())

    # output
    o = open(outp, 'w')
    o.write("Word\tCount\tPercent\n")
    for pair in freq.most_common():
        o.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair,
                                                          pc=100 * pair[1] / tot))
    o.close()
def count_pos(input, language):
    if language == 'english-nltk':
        words = word_tokenize(input)
        pos = pos_tag(words)
    elif language == 'english':
        s = pattern.en.parsetree(input, relations=True, lemmata=True)
        words = []
        pos = []
        for sentence in s:
            for w in sentence.words:
                words.append(w.string)
                pos.append((w.string, clean_text.clean_pos(w.type)))
    elif language == 'spanish':
        s = pattern.es.parsetree(input, relations=True, lemmata=True)
        words = []
        pos = []
        for sentence in s:
            for w in sentence.words:
                words.append(w.string)
                pos.append((w.string, clean_text.clean_pos(w.type)))
    elif language == 'dutch':
        words = word_tokenize(input, 'dutch')
        tagger = nltk.data.load('taggers/alpino_aubt.pickle')
        pos = tagger.tag(words)
    tags = FreqDist(tag for (word, tag) in pos)
    relative_frequency = []
    for item in tags.items():
        relative_frequency.append((item[0], float(item[1]) / tags.N()))
    return relative_frequency
def extract_doc_feats_counts(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import pdb
    import numpy

    doc_num = len(refactorized_documents)
    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)
    # FreqDist.samples() was removed in NLTK 3; use the keys instead
    tokens = list(glob_freqs.keys())

    glob_features = [None] * doc_num
    for i in range(0, doc_num):
        doc_features = [0] * len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        for (tok, freq) in doc_freqs.items():
            indx = tokens.index(tok)
            doc_features[indx] = freq * doc_freqs.N()
        f_tmp = numpy.asarray(doc_features)
        glob_features[i] = f_tmp.tolist()

    return (glob_features, tokens)
class Vocab:
    def __init__(self, tokens: List[Tokens], special_symbols: List[str] = None):
        special_symbols = [] if special_symbols is None else special_symbols
        special_symbols = special_symbols + [
            "<eot>", "<response>", "<eos>", "<unk>", "<pad>", "<bos>"
        ]
        self.vocab = FreqDist()
        self.cdf = 0.
        for sample in tokens:
            for token in sample:
                if token not in special_symbols:
                    self.vocab[token] += 1
        # N() is the total token count, B() the number of distinct tokens
        print(
            f"total tokens in vocab: {self.vocab.N()}, unique tokens in vocab: {self.vocab.B()}"
        )
        self.itos = []
        self.stoi = {}

    def fit(self, num_tokens=15000):
        cdf = 0.
        for cdf in self.vocab._cumulative_frequencies(
                [i[0] for i in self.vocab.most_common(num_tokens)]):
            pass
        self.cdf = cdf / self.vocab.N()
        print(f"cdf of the {num_tokens} most common tokens in vocab {self.cdf}")
        self.itos = ["<unk>", "<pad>", "<eos>", "<bos>"] + [
            tup[0] for tup in self.vocab.most_common(num_tokens)
        ]
        self.stoi = Counter(
            {key: index for index, key in enumerate(self.itos)})
def extract_doc_feats(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import pdb
    import numpy

    doc_num = len(refactorized_documents)

    occurences = defaultdict(lambda: 0)
    for doc in refactorized_documents:
        for x in set(doc):
            occurences[x] += 1

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)
    # FreqDist.samples() was removed in NLTK 3; use the keys instead
    tokens = list(glob_freqs.keys())

    glob_features = [{}] * doc_num
    for i in range(0, doc_num):
        doc_features = [0] * len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        doc_len = len(refactorized_documents[i])
        for (tok, num) in doc_freqs.items():
            max_doc_freq = doc_freqs.freq(doc_freqs.max()) * float(doc_len)
            # augmented
            #tf = 0.5 + (0.5*float(num)) / float(max_doc_freq)
            tf = 1 + math.log(num, 10)
            idf = math.log(float(doc_num) / float(occurences[tok]), 10)
            tfidf = tf * idf
            indx = tokens.index(tok)
            doc_features[indx] = tfidf
        f_tmp = numpy.asarray(doc_features)
        f_tmp = f_tmp / (numpy.linalg.norm(f_tmp) + numpy.finfo(float).eps)
        glob_features[i] = f_tmp.tolist()

    glob_features = numpy.asarray(glob_features) * glob_freqs.N()
    print("Glob Freqs:", glob_freqs.N())
    return (glob_features, tokens)
def paper_title_NLP(title_corpus):
    # title_corpus is a list of tuples
    # keys like (19, 1) mean 2019/01
    # value is a list of paper titles after tokenization
    # reference: https://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
    title_dict = {}
    pattern = r'''(?x)                # set flag to allow verbose regexps
            (?:[A-Z]\.)+              # abbreviations, e.g. U.S.A.
            | \w+(?:-\w+)*            # words with optional internal hyphens
            | \$?\d+(?:\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
            | \.\.\.                  # ellipsis
            | [][.,;"'?():_`-]        # these are separate tokens; includes ], [
            '''
    tokenizer = RegexpTokenizer(pattern)
    for t in title_corpus:
        key = (t[3], t[4])
        if key in title_dict:
            filterdText = tokenizer.tokenize(t[1])
            title_dict[key].append(filterdText)
        else:
            title_dict[key] = []
            filterdText = tokenizer.tokenize(t[1])
            title_dict[key].append(filterdText)

    # extract keywords by year span
    title_years = {}
    for k, v in title_dict.items():
        key = (k[0], )  # year index
        if key in title_years.keys():
            title_years[key].append(v)
        else:
            title_years[key] = []
            title_years[key].append(v)

    deep_freq = []
    for k, v in title_years.items():
        fd = FreqDist()
        vs = [item for sublist in v for item in sublist]
        for v_ in vs:
            for word in v_:
                fd[word] += 1
        print('The keywords for year:20{}'.format(str(k[0])))
        print("Total number of words:{}".format(str(fd.N())))         # total number of samples
        print("Total number of unique words:{}".format(str(fd.B())))  # number of bins or unique samples
        fd.pprint(50)  # the maximum number of items to display; the default is 10
        deep_freq.append(fd.freq('Deep') + fd.freq('deep'))

    print(deep_freq)
    plt.plot([2012, 2013, 2014, 2015, 2016, 2017, 2018], deep_freq)
    plt.ylabel('frequency of deep word')
    plt.xlabel('years')
    plt.show()
def kneser_ney(self, context, word):
    """
    Return the log probability of a word given a context under Kneser-Ney backoff.
    """
    bgram = (context, word)
    unigram_freq = FreqDist()

    theta = self._kn_concentration
    vocabulary = 1 / len(self._vocab_freq.keys())
    discount_delta = self._kn_discount

    unigram_T = len(self._context_freq.keys())
    bigram_T = self._context_freq[context]

    for i in self._gram_freq:
        # FreqDist.inc() was removed in NLTK 3; increment the count directly
        unigram_freq[i[1]] += 1

    # Unigram Restaurant
    # C_0,x
    count_unirest_wordTable = unigram_freq[word]
    # C_0,.
    count_unirest_allTable = unigram_freq.N()

    # u_Bigram Restaurant
    # C_u,x
    count_birest_wordTable = self._gram_freq[bgram]
    # C_u,.
    count_birest_allTable = self._context_freq[context]

    existingTable_numer = count_birest_wordTable - discount_delta
    existingTable_denom = theta + count_birest_allTable
    existingTable = existingTable_numer / existingTable_denom
    if existingTable < 0:
        existingTable = 0

    newTable_numer = theta + (bigram_T * discount_delta)
    newTable_denom = theta + count_birest_allTable
    newTable = newTable_numer / newTable_denom

    back_a_numer = count_unirest_wordTable - discount_delta
    back_a_denom = count_unirest_allTable + theta
    back_a = back_a_numer / back_a_denom
    if back_a < 0:
        back_a = 0

    back_b_numer = theta + (unigram_T * discount_delta)
    back_b_denom = count_unirest_allTable + theta
    back_b = back_b_numer / back_b_denom
    back_b = back_b * vocabulary

    result = existingTable + (newTable * (back_a + back_b))
    return lg(result)
def extract_ngrams(text,
                   low=1,
                   high=2,
                   lowercase=False,
                   filter_punctuation=True,
                   binary=False,
                   least_common=None,
                   most_common=None,
                   normalize=False,
                   sample=False):
    #text = ' '.join(review.paragraphs)
    tokens = None

    # Make lowercase
    if lowercase:
        tokens = word_tokenize(text.lower())
    else:
        tokens = word_tokenize(text)

    # Remove punctuation
    if filter_punctuation:
        words = [t for t in tokens if t not in PUNCTUATION]
    else:
        words = [t for t in tokens]

    # Do the n-gram thing
    ngram_counts = {}
    assert not (sample and binary), \
        "Please don't make sample and binary True. One or the other or neither pls"
    for n in range(low, high + 1):
        ngram_freqdist = FreqDist(ngrams(words, n))
        grams_to_consider = ngram_freqdist
        if least_common:
            assert least_common > 0.0 and least_common <= 1.0, \
                'Least common must be a proportion, not %.3f' % least_common
            num_least_common = int(least_common * ngram_freqdist.N())
            grams_to_consider = []
            for bleh in ngram_freqdist.most_common()[-1 * num_least_common:]:
                gram, count = bleh
                grams_to_consider.append(gram)
        for gram in grams_to_consider:
            if sample:
                ngram_counts[gram] = ngram_freqdist.freq(gram)
            elif binary:
                ngram_counts[gram] = True
            else:
                ngram_counts[gram] = ngram_freqdist[gram]

    if normalize:
        total_counts = sum(count for ngram, count in ngram_counts.items())
        for gram, count in ngram_counts.items():
            ngram_counts[gram] = count / total_counts

    return ngram_counts
def show():
    print(gutenberg.fileids())
    # instantiate a frequency distribution
    fd = FreqDist()
    for word in gutenberg.words('austen-persuasion.txt'):
        fd[word] += 1
    print(fd.N())
    print(fd.B())
    # print the top 10 words sorted by frequency
    for word, value in sorted(fd.items(), key=lambda item: -item[1])[:10]:
        print(word, value)
def media(entrada):
    '''
    Compute the average number of characters per word of the text given as input.
    '''
    fdist = FreqDist(len(w) for w in entrada)
    somaTotal = 0
    for tam in fdist.most_common():
        somaTotal += tam[0] * tam[1]
    resultadoMedia = somaTotal / fdist.N()
    return resultadoMedia
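# A small hedged usage sketch for media (illustrative only): the sample word
# list is made-up data; the function takes an iterable of words and returns
# the mean word length.
words = "the quick brown fox jumps over the lazy dog".split()
print(media(words))  # average characters per word, here about 3.9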
def get_word_scores(pos_words, neg_words):
    pos_words_plain = list(itertools.chain(*pos_words))
    neg_words_plain = list(itertools.chain(*neg_words))

    word_fd = FreqDist(pos_words_plain + neg_words_plain)  # frequency of every word
    pos_word_fd = FreqDist(pos_words_plain)
    neg_word_fd = FreqDist(neg_words_plain)

    pos_word_count = pos_word_fd.N()  # number of positive tokens
    neg_word_count = neg_word_fd.N()  # number of negative tokens
    #total_word_count = pos_word_count + neg_word_count
    total_word_count = word_fd.N()

    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square statistic for the positive class; other measures such as
        # mutual information could be used here as well
        pos_score = BigramAssocMeasures.chi_sq(pos_word_fd[word],
                                               (freq, pos_word_count),
                                               total_word_count)
        # likewise for the negative class
        neg_score = BigramAssocMeasures.chi_sq(neg_word_fd[word],
                                               (freq, neg_word_count),
                                               total_word_count)
        # a word's information value is the positive plus the negative chi-square
        word_scores[word] = pos_score + neg_score
    return word_scores  # maps each word to its information value
def recordFrequencyData(corpusname, csvwritter, useLogFreq=False):
    totalFQ = FreqDist()
    processed_corpus_texts = getTextFileNames(corpusname)
    for file in processed_corpus_texts:
        print("recording the file: " + file)
        if path.exists(file):
            freqs = collectFreqData(file)
            totalFQ = freqs + totalFQ
    towrite = dict()
    towrite["Subreddit"] = corpusname
    for word in getRegionalisms():
        if totalFQ[word] == 0:
            towrite[word] = 0
        else:
            if useLogFreq:
                towrite[word] = math.log(totalFQ[word] / totalFQ.N())
            else:
                towrite[word] = totalFQ[word] / totalFQ.N()
    csvwritter.writerow(towrite)
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url https://www.google.com/search?q=personal+nutrition
    @scrapes pages to depth<=3, using priority-score based BFS
    """
    doc = clean_html(response.body_as_unicode())
    words = word_tokenize(doc)
    words = [word.lower() for word in words]
    words = [word for word in words if word not in self.stops]
    fdist = FreqDist(words)
    for word in set(words):
        # fdist.freq(word) * fdist.N() is simply the raw count of the word
        if (fdist.freq(word) * fdist.N()) > 1:
            item = WordCount()
            item['word'] = word
            item['count'] = int(fdist.freq(word) * fdist.N())
            yield item
    #for href in response.css("a::attr('href')"):
    #    url = response.urljoin(href.extract())
    #    yield scrapy.Request(url, callback=self.parse)
def train(self, instances):
    """Remember the labels associated with the features of instances."""
    label_counts = FreqDist()
    feature_counts = defaultdict(FreqDist)
    all_features = set()

    # collect counts: C(feature, label) and C(label)
    for instance in instances:
        if instance.label != '':  # throw out one blog in the corpus that has no label
            label_counts[instance.label] += 1
            features = instance.features()
            for feature in features:
                all_features.add(feature)
                feature_counts[instance.label][feature] += 1

    # smoothing, and also making sure that all features are counted for each label
    for label in feature_counts.keys():
        for feature in all_features:
            feature_counts[label][feature] += 1

    # P(label)
    total = label_counts.N()
    label_probs = {
        label: float(label_counts[label]) / total
        for label in label_counts
    }

    # P(feature|label) as a dictionary of dictionaries:
    # C(feature, label) / SUM(C(feature, label) for all features)
    feature_probs = {}
    for label in feature_counts:
        total = feature_counts[label].N()
        feature_probs[label] = {
            feature: float(feature_counts[label][feature]) / total
            for feature in feature_counts[label]
        }

    # set the model
    self.set_model({
        "label_probs": label_probs,
        "feature_probs": feature_probs,
        "all_features": all_features
    })
def frequency(textfiles, out_file):
    """Input: a list of text files
    Output: a table of word frequency with three columns for Word, Count
    and Percent frequency
    """
    words = []
    for textfile in textfiles:
        with open(textfile, 'r') as fd:
            text = fd.read()
            words.extend(nltk.word_tokenize(text))
    fdist = FreqDist(words)
    total = float(fdist.N())
    with open(out_file, 'w') as output:
        output.write("Word\tCount\tPercent\n")
        for pair in sorted(fdist.items()):
            output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(
                pair=pair, pc=100 * pair[1] / total))
def save_ngrams(lyrics_file):
    """Creates a file of N-grams and their occurrences for the file with lyrics"""
    all_ngrams = []
    with open(lyrics_file, "r") as f:
        for line in f.readlines():
            if not line.isupper() and line != "\n":
                all_ngrams.extend(
                    list(
                        nltk.bigrams(word_tokenize(line),
                                     pad_left=True,
                                     pad_right=True,
                                     right_pad_symbol='</s>',
                                     left_pad_symbol='<s>')))
    fd = FreqDist(all_ngrams)
    # fd.N() >= fd.B(), so this returns every distinct bigram with its count
    ngram_stats = fd.most_common(fd.N())
    with open('ngrams.json', mode='w') as fp:
        json.dump(ngram_stats, fp)
def unigramFreqFile(subreddit):
    # get filtered files
    filenames = getTextFileNames(subreddit)
    countFileName = getCountFileName(subreddit)
    with open(countFileName, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countFileName)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for word in line.split():
                        word = word.strip()
                        if word.startswith("http") or word.isnumeric():
                            continue
                        if 0 < len(word) < 23:
                            frequencies[word] = frequencies.get(word, 0) + 1
        frequencies["<end_comment>"] = 0
        # write the total number of words on its own line so the first word
        # entry does not run into the count
        countVectorFile.write(str(frequencies.N()) + "\n")
        for word in frequencies:
            countVectorFile.write(word + " " + str(frequencies[word]) + "\n")
def textChanged_inputTextEdit(self):
    inputText = self.inputTextEdit.toPlainText().strip()
    # delete all punctuation (Arabic included)
    inputText = ''.join(c for c in inputText if not ud.category(c).startswith('P'))
    inputTokens = functions.tokenization(inputText)
    freqDist = FreqDist(inputTokens)
    self.numWordEdit.setText(str(freqDist.N()))
    self.mostFreqWordEdit.setText(freqDist.max())
    numSentences = len(functions.tok_stem(self.inputTextEdit.toPlainText(), False))
    self.numSentenceEdit.setText(str(numSentences))
    self.inStatsGroup.setEnabled(
        True if self.inputTextEdit.toPlainText().strip() else False)
    self.searchWordGroup.setEnabled(
        True if self.inputTextEdit.toPlainText().strip() else False)
    self.startPosTagButton.setEnabled(
        True if self.inputTextEdit.toPlainText().strip() else False)
def bigramFreqFile(subreddit):
    # get filtered files
    filenames = getTextFileNames(subreddit)
    countfilename = getCountFileName(subreddit, unigram=False)
    with open(countfilename, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()
        # good candidate for multithreading: one thread per file, each with its
        # own freq dist, combined after all finish
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countfilename)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for bigram in list(bigrams(line.split())):
                        okayrange = 0 < len(bigram[0]) < 23 and 0 < len(bigram[1]) < 23
                        if okayrange and bigram[1] != "<end_comment>":
                            frequencies[bigram] = frequencies.get(bigram, 0) + 1
        # write the total number of words on its own line
        countVectorFile.write(str(frequencies.N()) + "\n")
        # note: another good improvement would be to organize this for faster searching
        for bigram in frequencies:
            countVectorFile.write(" ".join(bigram) + " " + str(frequencies[bigram]) + "\n")
stoplist += stopwords.words('english')
#stoplist.append("'t")
if args.stop_punctuation:
    # punctuation is already a (unicode) str in Python 3, so no decoding is needed
    stoplist += list(set(punctuation))
    stoplist += [u'\u201d', u'\u201c', u'\u2019', u'\u2014']
    stoplist.append('--')

words = [word for word in word_tokenize(text) if word not in stoplist]

if args.stem:
    st = LancasterStemmer()
    words = [st.stem(word) for word in words]

freq_dist = FreqDist(words)

print('Total words: ' + str(orig_freq_dist.N()))
print('Total after filter: ' + str(freq_dist.N()))
# B() gives the number of unique words
print('Unique words: ' + str(freq_dist.B()))
print('Unique words ratio: ' + str(float(freq_dist.B()) / float(freq_dist.N())))
print('\n')

if args.words:
    for word in args.words:
        print(word + ': ' + str(freq_dist[word]))
        print(word + ' freq: ' + str(freq_dist.freq(word)))
    print('\n')

# Show top words
print('Top ' + str(args.num_words) + ' words:')
        # print x
        # get the number of docs that contain word 'w'
        if x > 0:
            return 1
        return 0

    return math.log(N / (reduce(add, map(map0, xx), 1)), 2)


for col in dataset:
    vectorizer = DictVectorizer()
    document_collections = col['sentences']
    pre_matrix = []
    for d in document_collections:
        dc = FreqDist(my_tokenize(d))
        nn = float(dc.N())
        # for ix in dc:
        #     dc[ix] = dc[ix] / nn
        pre_matrix.append(dc)
    tf_matrix = vectorizer.fit_transform(pre_matrix)
    # N_doc = tf_matrix.shape[0]
    # for i in range(tf_matrix.shape[1]):
    #     idfx = idf(tf_matrix[:, i], N_doc)
    #     vv = tf_matrix[:, i]
    #     tf_matrix[:, i].multiply(idfx)
    #     ccc = 0
    col['model'] = MostRelevantSentence(vectorizer=vectorizer,
                                        collection_matrix=tf_matrix)
class TrigramHMM(HMM):
    def __init__(self, stemmer=BasicStemmer(), backoff: Model = None):
        super(TrigramHMM, self).__init__(stemmer, backoff)
        self.EMISSION_MATRIX = None
        self.TRANSITION_MATRIX = None
        self.transMatrix_file_save_name = "trigram_transitionTable"
        self.emissMatrix_file_save_name = "trigram_emissionTable"

    def loadTables(self):
        if not bool(self.EMISSION_MATRIX):
            if not os.path.exists('obj/hmm/' + self.emissMatrix_file_save_name + '.json'):
                print("Emission table not found on disk, reconstructing and saving ...")
                import glob
                os.chdir(position.replace("\\", "/") + "/../corpus/sources/emission")
                emissionSources = [os.path.abspath(el) for el in list(glob.glob("*.txt"))]
                os.chdir(position)
                self.EMISSION_MATRIX = self.constructEmissionMatrix(emissionSources)
                saveIndex(self.EMISSION_MATRIX,
                          "obj\\hmm\\" + self.emissMatrix_file_save_name + '.pkl')
                saveIndexjson(self.EMISSION_MATRIX,
                              "obj\\hmm\\" + self.emissMatrix_file_save_name + '.json')
            else:
                self.EMISSION_MATRIX = loadIndexJson(
                    "obj/hmm/" + self.emissMatrix_file_save_name + '.json')
                #self.EMISSION_MATRIX = loadIndex("obj/hmm/" + self.emissMatrix_file_save_name + '.pkl')
                print("Emission table loaded from disk ...")
        if not bool(self.TRANSITION_MATRIX):
            if not os.path.exists(position + '/obj/hmm/' + self.transMatrix_file_save_name + '.pkl'):
                print("Transition table not found on disk, reconstructing and saving ...")
                import glob
                os.chdir("../corpus/sources/transition")
                transitionSources = [os.path.abspath(el) for el in list(glob.glob("*.txt"))]
                os.chdir(position)
                self.TRANSITION_MATRIX = self.constructTransitionMatrix(transitionSources)
                #saveIndex(self.TRANSITION_MATRIX, "obj\\hmm\\" + self.transMatrix_file_save_name + '.pkl')
                save_obj(self.TRANSITION_MATRIX,
                         "obj\\hmm\\" + self.transMatrix_file_save_name + '.pkl')
            else:
                self.TRANSITION_MATRIX = load_obj(
                    "obj/hmm/" + self.transMatrix_file_save_name + '.pkl')
                try:
                    convertedCond = [tuple_parser(cond)
                                     for cond in self.TRANSITION_MATRIX.conditions()]
                    cfd = ConditionalFreqDist([
                        (tuple_parser(cond), tag)
                        for cond in self.TRANSITION_MATRIX.conditions()
                        for tag in self.TRANSITION_MATRIX[cond]
                    ])
                    self.TRANSITION_MATRIX = cfd
                except Exception as e:
                    print(e)
                print("Transition table loaded from disk ...", end=" ")

    def constructEmissionMatrix(self, sourceFilesList: list):
        # construction of the emission matrix
        emission = defaultdict(dict)
        for tag in NE_TAG_lABELS:
            emission[tag] = defaultdict(float)
        for fileName in sourceFilesList:
            file = open(fileName, 'r', encoding='windows-1256')
            for line in file:
                words = re.split("\s+", line)
                entite = ''
                for word in words:
                    word = self.stemmer.stem(word)
                    if (re.findall('[A-Z]+', word) == []):
                        entite = word
                        continue
                    if not word in emission:
                        emission[word] = defaultdict(float)
                    emission[word][entite] += 1
            file.close()
        for tag in emission.keys():
            somme = 0.0
            for value in emission[tag].values():
                somme += value
            for word in emission[tag].keys():
                emission[tag][word] = round(
                    float("{0:.6f}".format(emission[tag][word] / somme)), 6)
        self.EMISSION_MATRIX = emission
        return emission

    def constructTransitionMatrix(self, sourceFilesList: list):
        # construction of the transition matrix
        for fileName in sourceFilesList:
            file = open(fileName, 'r', encoding="windows-1256")
            fileFinal = ""
            for line in file:
                line = line.upper()
                if (len(line) > 1):
                    if not line.startswith("<S>"):
                        fileFinal += '<S> ' + line[:-1] + ' <E>\n'
                    else:
                        fileFinal += line[:-1] + '\n'
            file.close()
        tokens = [el for el in re.split("[\s\n]+", fileFinal) if el != '']
        self.initialProbabilities = FreqDist([
            tokens[i] for i in range(1, len(tokens)) if tokens[i - 1] == '<S>'
        ])
        self.tags = list(set(tokens))
        self.bigramDist = FreqDist(list(bigrams(tokens)))
        Trigrams = list(trigrams(tokens))
        cfd = ConditionalFreqDist(((el[2], (el[0], el[1])) for el in Trigrams))
        for word in cfd.conditions():
            for bigram in cfd[word]:
                cfd[word][bigram] = round(
                    float("{0:.6f}".format(cfd[word].freq(bigram))), 6)
        self.TRANSITION_MATRIX = cfd
        return cfd

    def __viterbi(self, observations: list, emissionTable: dict,
                  transitionTable: ConditionalFreqDist):
        if not hasattr(self, 'bigramDist'):
            listcouples = []
            for tag in self.TRANSITION_MATRIX.conditions():
                for bigram in self.TRANSITION_MATRIX[tag]:
                    listcouples.append(bigram)
                    if not hasattr(self, 'tags'):
                        self.tags = []
                    if not bigram[0] in self.tags:
                        self.tags.append(bigram[0])
                    if not bigram[1] in self.tags:
                        self.tags.append(bigram[1])
            self.bigramDist = FreqDist(listcouples)
            for key in self.bigramDist:
                self.bigramDist[key] = self.bigramDist[key] / self.bigramDist.N()  # or simply self.bigramDist.freq(key)
            print("no bigramDist.... Creating bigramDist")
        if not hasattr(self, 'initialProbabilities'):
            print("no initial distribution.... Creating initDist")
            self.initialProbabilities = FreqDist(el[1] for el in self.bigramDist
                                                 if el[0] == '<S>')
            for tag in self.initialProbabilities:
                self.initialProbabilities[tag] = self.initialProbabilities[tag] / self.initialProbabilities.N()

        N = len(self.tags)
        T = len(observations)
        viterbi = numpy.zeros((N + 2, T))
        # we remove <S> from the tags because it is just a sentence-start marker
        if "<S>" in self.tags:
            self.tags.remove('<S>')
            N -= 1
        backTrack = []
        for i in range(N):
            if self.tags[i] not in emissionTable:
                emissionTable[self.tags[i]] = defaultdict(float)
            viterbi[i, 0] = round(
                float("{0:.6f}".format(
                    (emissionTable[self.tags[i]][observations[0]]
                     if observations[0] in emissionTable[self.tags[i]] else 0.0) *
                    (self.initialProbabilities[self.tags[i]]
                     if self.tags[i] in self.initialProbabilities else 0.0))), 6)
        for oIndex in range(1, T):
            bestTagIndex = numpy.argmax([viterbi[i, oIndex - 1] for i in range(N)])
            bestTag = self.tags[bestTagIndex]
            bestTag2 = self.tags[numpy.argmax(
                [viterbi[i, oIndex - 2] for i in range(N)])] if oIndex != 1 else "<S>"
            if viterbi[bestTagIndex, oIndex - 1] == 0:
                print("Zero resulting probability. Couldn't tag ",
                      observations[oIndex - 1],
                      "Previous best tag was:", bestTag2, end=" ")
                if self.bacckoff is not None:
                    self.bacckoff.loadTables()
                    best = None
                    max = 0
                    for tag in self.bacckoff.TRANSITION_MATRIX[bestTag2]:
                        if self.bacckoff.TRANSITION_MATRIX[bestTag2][tag] > max:
                            max = self.bacckoff.TRANSITION_MATRIX[bestTag2][tag]
                            best = tag
                    print("Best tag so far ", best, end=" ")
                    if best is not None:
                        viterbi[bestTagIndex, oIndex - 1] = max * self.EMISSION_MATRIX[best][
                            observations[oIndex - 1]] if observations[
                                oIndex - 1] in self.EMISSION_MATRIX[best] else 0.0
                        bestTag = self.tags[bestTagIndex]  #viterbi[bestTagIndex, oIndex - 1]
                        if viterbi[bestTagIndex, oIndex - 1] == 0:
                            if self.bacckoff.bacckoff is not None:
                                self.bacckoff.bacckoff.tagword(observations[0])
                            else:
                                viterbi[bestTagIndex, oIndex - 1] = 1
                                bestTag = "OTHER"
                    else:
                        viterbi[bestTagIndex, oIndex - 1] = 1
                        bestTag = "OTHER"
                print(" Backoff tag : ", bestTag, "viterbi : ",
                      viterbi[bestTagIndex, oIndex - 1])
                #print([viterbi[i, oIndex-1] for i in range(N)])
                backTrack.append((observations[oIndex - 1], bestTag))
            for tIndex in range(N):
                if self.tags[tIndex] == '<E>':
                    continue
                viterbi[tIndex, oIndex] = viterbi[bestTagIndex, oIndex - 1] * \
                    (emissionTable[self.tags[tIndex]][observations[oIndex]]
                     if observations[oIndex] in emissionTable[self.tags[tIndex]] else 0.0) * \
                    (transitionTable[self.tags[tIndex]][(bestTag, bestTag2)]
                     if (bestTag, bestTag2) in transitionTable[self.tags[tIndex]] else 0.0)
                # if the observation belongs to another tag than OTHER we eliminate OTHER
                # ps: the index of the tag OTHER in the TAGS array is 0
                '''if (tIndex > 0 and viterbi[tIndex, oIndex] > 0.0):
                    viterbi[0, oIndex] = 0.0;'''
        # we save the backtrack of the last observation
        bestTagIndex = numpy.argmax([viterbi[i, T - 1] for i in range(N)])
        bestTag = self.tags[bestTagIndex]
        backTrack.append((observations[T - 1], bestTag))
        for (word, tag) in backTrack:
            if tag == 'UNKNWN':
                if self.bacckoff is None:
                    tag = "OTHER"
                else:
                    self.bacckoff.backOffPrent(backTrack)
        return backTrack

    def tagText(self, text, algorithm="Viterbi"):
        self.loadTables()
        Tokenizer = BasicTokenize()
        tokens = Tokenizer.tokenize(text)
        tokens = [self.stemmer.stem(token) for token in tokens]
        return self.__viterbi(tokens, self.EMISSION_MATRIX, self.TRANSITION_MATRIX)

    def tagTokens(self, tokens: list, algorithm="Viterbi"):
        self.loadTables()
        tokens = [self.stemmer.stem(token) for token in tokens]
        return self.__viterbi(tokens, self.EMISSION_MATRIX, self.TRANSITION_MATRIX)
def main(download_settings_filename, parse_settings_filename, similarity_settings_filename):
    with open(download_settings_filename, 'r') as f:
        download_config = json.load(f)
    with open(parse_settings_filename, 'r') as f:
        parse_config = json.load(f)
    with open(similarity_settings_filename, 'r') as f:
        similarity_config = json.load(f)

    topic = download_config.get('topic', 'Medicine')
    data_dir = os.path.join(
        download_config.get('save_dir', os.path.join('data', 'wiki')), topic)
    n_pages = download_config.get('min_pages', 500)
    vocab_dir = os.path.join(
        parse_config.get('save_dir', os.path.join('artifacts', 'wiki')), topic, 'vocab')
    save_dir = os.path.join(
        similarity_config.get('save_dir', os.path.join('artifacts', 'wiki')), topic, 'graph')
    vocab_top_k = similarity_config.get('vocab_top_k', 100)
    graph_top_k = similarity_config.get('graph_top_k', 10)
    metric = similarity_config.get('metric', 'euclidean')

    json_files = glob(os.path.join(vocab_dir, '*.json'))
    total_vocab_filename = os.path.join(vocab_dir, 'total_count.json')
    with open(total_vocab_filename, 'r') as f:
        total_vocab = json.load(f)
    total_freq = FreqDist(total_vocab)
    total_number_words = total_freq.N()
    most_freq_words = total_freq.most_common(vocab_top_k)
    percentage_used = 100 * sum([x[1] for x in most_freq_words]) / total_number_words
    total_vocab_list = [x[0] for x in most_freq_words]

    all_vocabs = []
    good_json_indices = []
    print('reading in preprocessed vocabulary using {:.2f}% of the total count of words'
          .format(percentage_used))
    i = -1
    for json_file in tqdm(json_files):
        i += 1
        if json_file == total_vocab_filename:
            continue
        with open(json_file, 'r') as f:
            doc_vocab = json.load(f)
        vec = create_count_vector(doc_vocab, total_vocab_list)
        if sum(vec) > 0:
            all_vocabs.append(vec)
            good_json_indices.append(i)
        if len(good_json_indices) >= n_pages:
            print('found at least {} suitable pages - breaking out of loop'.format(n_pages))
            break
    good_json_indices = np.array(good_json_indices)

    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(all_vocabs)
    print('computing similarity matrix')
    similarity_matrix = pairwise_distances(tfidf, metric=metric)
    # NOTE: ignore shortest as this is always "self"
    shortest_indices = np.argsort(similarity_matrix, axis=-1)[:, 1:graph_top_k + 1]

    print('finding top-{} closest pages'.format(graph_top_k))
    pbar = tqdm(total=graph_top_k * min(shortest_indices.shape[0], n_pages))
    analysis_results = {}
    for i in range(graph_top_k):
        ith_shortest = similarity_matrix[np.arange(shortest_indices.shape[0]),
                                         shortest_indices[:, i]]
        ith_shortest_indices = shortest_indices[:, i]
        for doc_index in range(min(shortest_indices.shape[0], n_pages)):
            doc_name = os.path.basename(json_files[good_json_indices[doc_index]])
            doc_name = urllib.parse.unquote(doc_name[:doc_name.rfind('.')])
            if doc_name not in analysis_results:
                analysis_results[doc_name] = {"names": [], "similarities": []}
            ith_shortest_doc_name = os.path.basename(
                json_files[good_json_indices[ith_shortest_indices[doc_index]]])
            ith_shortest_doc_name = urllib.parse.unquote(
                ith_shortest_doc_name[:ith_shortest_doc_name.rfind('.')])
            analysis_results[doc_name]["names"].append(ith_shortest_doc_name)
            analysis_results[doc_name]["similarities"].append(ith_shortest[doc_index])
            pbar.update(1)
    pbar.close()

    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, 'raw_graph_info.json'), 'w') as f:
        json.dump(analysis_results, f, indent=4)
class DirichletWords(object):

    def initialize_index(self):
        self.word_to_int = {}
        self.int_to_word = {}

    def __init__(self, num_topics, alpha_topic=1.0, alpha_word=1.0,
                 max_tables=50000, sanity_check=False, initialize=False,
                 report_filename="topic_history.txt"):
        self.max_tables = max_tables
        self._alphabet = FreqDist()
        # store all words seen in a list so they are associated with a unique ID.
        self.initialize_index()

        self._words = FreqDist()
        self.alpha_topic = alpha_topic
        self.alpha_word = alpha_word

        self._num_updates = 0
        self._report = None
        if report_filename:
            self._report = open(report_filename, 'w')

        self.num_topics = num_topics
        self._topics = [FreqDist() for x in xrange(num_topics)]

        # the sanity_check flag is for testing only.
        if initialize and sanity_check == True:
            self.deterministic_seed()
        elif initialize:
            self.initialize_topics()

    def deterministic_seed(self):
        ''' if sanity_check = True, this will seed the topics with enough
        variance to evolve but do so in the most basic and deterministic way
        possible, so a user can follow along each step of the algorithm '''

        chars = "abcdefghijklmnopqrstuvwxyz"
        for i in xrange(3):
            word = random.choice(chars)
            self.index(word)
            topic_weights = probability_vector(self.num_topics)
            for k in xrange(self.num_topics):
                self.update_count(word, k, topic_weights[k])

    def initialize_topics(self):
        ''' initializes the topics with some random seed words so that they
        have enough relative bias to evolve when new words are passed in. '''

        # we are going to create some random strings from /dev/urandom. to convert
        # them to a string, we need a translation table that is 256 characters.
        translate_table = (string.letters * 5)[:256]
        # /dev/urandom is technically not as random as /dev/random, but it doesn't
        # block.
        r = open('/dev/urandom')
        # make random 'words' and add them to the topics. they'll never
        # realistically be seen again- which is good since we just want them to
        # seed the bias in the topics.
        for i in xrange(self.num_topics):
            word_length = random.randint(9, 20)
            word = r.read(word_length).translate(translate_table)
            self.index(word)
            topic_weights = probability_vector(self.num_topics)
            for k in xrange(self.num_topics):
                self.update_count(word, k, topic_weights[k])
        r.close()

    def __len__(self):
        return len(self._words)

    def num_words(self):
        return sum(1 for x in self._words if self._words[x] >= 1)

    def as_matrix(self):
        ''' Return a matrix of the probabilities of all words over all topics.
        note that because we are using topic_prob(), this is equivalent to the
        expectation of log beta, i.e. Elogbeta '''

        # XXX TODO we should store this on the fly instead of recomputing it
        # all the time!

        # create a numpy array here because that's what the e_step in streamLDA
        # expects
        num_words = self.num_words()
        print("%i words" % num_words)
        lambda_matrix = n.zeros((self.num_topics, num_words))

        for word_index, word in enumerate(x for x in self._words
                                          if self._words[x] >= 1):
            topic_weights = [log(self.topic_prob(k, word))
                             for k in xrange(self.num_topics)]
            # topic weights for this word -- a column vector.
            lambda_matrix[:, word_index] = topic_weights

        self._num_updates += 1
        if self._report:
            self._report.write("%i %i %i %i\n" % (self._num_updates,
                                                  len(self._alphabet),
                                                  len(self._words),
                                                  sum(x.B() for x in self._topics)))

        return lambda_matrix

    def forget(self, proportion):
        num_tables = len(self._words)
        number_to_forget = proportion * num_tables
        if num_tables > self.max_tables:
            number_to_forget += (num_tables - self.max_tables)

        # change this to weight lower probability
        # random.sample needs an integer sample size
        tables_to_forget = random.sample(xrange(num_tables), int(number_to_forget))
        words = self._words.keys()

        self.initialize_index()

        word_id = -1
        for ii in words:
            word_id += 1

            if not word_id in tables_to_forget:
                self.index(ii)
                continue

            count = self._words[ii]
            for jj in xrange(self.num_topics):  # iterate over topic indices
                self._topics[jj][ii] = 0
                del self._topics[jj][ii]

            for jj in ii:
                # character counts are kept in self._alphabet (see update_count)
                self._alphabet[jj] -= count

            self._words[ii] = 0
            del self._words[ii]

    def seq_prob(self, word):
        val = 1.0

        # Weighted monkeys at typewriter
        for ii in word:
            # Add in a threshold to make sure we don't have zero probability sequences
            val *= max(self._alphabet.freq(ii), CHAR_SMOOTHING)

        # Normalize
        val /= 2**(len(word))
        return val

    def merge(self, otherlambda, rhot):
        ''' fold the word counts in another DirichletWords object into this
        one, weighted by rhot. assumes self.num_topics is the same for both
        objects. '''

        all_words = self._words.keys() + otherlambda._words.keys()
        distinct_words = list(set(all_words))

        # combines the probabilities, with otherlambda weighted by rho, and
        # generates a new count by combining the number of words in the old
        # (current) lambda with the number in the new. here we essentially take
        # the same steps as update_count but do so explicitly so we can weight the
        # terms appropriately.
        total_words = float(self._words.N() + otherlambda._words.N())

        self_scale = (1.0 - rhot) * total_words / float(self._words.N())
        other_scale = rhot * total_words / float(otherlambda._words.N())

        for word in distinct_words:
            self.index(word)

            # update word counts
            new_val = (self_scale * self._words[word]
                       + other_scale * otherlambda._words[word])
            if new_val >= 1.0:
                self._words[word] = new_val
            else:
                self._words[word] = 0
                del self._words[word]

            # update topic counts
            for topic in xrange(self.num_topics):
                new_val = (self_scale * self._topics[topic][word]
                           + other_scale * otherlambda._topics[topic][word])
                if new_val >= 1.0:
                    self._topics[topic][word] = new_val
                else:
                    self._topics[topic][word] = 0
                    del self._topics[topic][word]

        # update sequence counts
        all_chars = self._alphabet.keys() + otherlambda._alphabet.keys()
        distinct_chars = list(set(all_chars))

        for ii in distinct_chars:
            self._alphabet[ii] = (self_scale * self._alphabet[ii]
                                  + other_scale * otherlambda._alphabet[ii])

    def word_prob(self, word):
        return (self._words[word] + self.alpha_word * self.seq_prob(word)) / \
               (self._words.N() + self.alpha_word)

    def topic_prob(self, topic, word):
        return (self._topics[topic][word] +
                self.alpha_topic * self.word_prob(word)) / \
               (self._topics[topic].N() + self.alpha_topic)

    def update_count(self, word, topic, count):
        # create an index for the word
        self.index(word)

        # increment the frequency of the word in the specified topic
        self._topics[topic][word] += count
        # also keep a separate frequency count of the number of times this word has
        # appeared, across all documents.
        self._words[word] += count
        # finally, keep track of the appearance of each character.
        # note that this does not assume any particular character set nor limit
        # recognized characters. if words contain punctuation, etc. then they will
        # be counted here.
        for ii in word:
            self._alphabet[ii] += count

    def index(self, word):
        assert not isinstance(word, int)

        if not word in self.word_to_int:
            self.word_to_int[word] = len(self.word_to_int)
            self.int_to_word[self.word_to_int[word]] = word

        return self.word_to_int[word]

    def dictionary(self, word_id):
        assert isinstance(word_id, int)

        return self.int_to_word[word_id]

    def print_probs(self, word):
        print "----------------"
        print word
        for ii in xrange(self.num_topics):
            print ii, self.topic_prob(ii, word)
        print "WORD", self.word_prob(word)
        print "SEQ", self.seq_prob(word)