class BigramInterpolation(LanguageModel):

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()

    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        return 0

    def getVocabulary(self, context):
        return []

    def generateWord(self, context):
        return 'bunny'

    def generateSentence(self):
        result = []
        # limit sentence length to 20
        for i in range(20):
            word = LanguageModel.UNK
            while word == LanguageModel.UNK:  # make sure word != UNK
                word = self.generateWord(result)
            result.append(word)
            if word == LanguageModel.STOP:
                break
        return result
class BigramInterpolation(LanguageModel):

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()
        # just needed for languageModel.py to work
        self.word_dict = self.bigram.word_dict
        self.lambda_1 = 0.5
        self.lambda_2 = 0.5

    '''
    Trains a bigram-interpolation language model on a training set.
    '''
    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    '''
    Returns the probability of the word at index, according to the model,
    within the specified sentence.
    '''
    def getWordProbability(self, sentence, index):
        return (self.lambda_1 * self.bigram.getWordProbability(sentence, index)
                + self.lambda_2 * self.unigram.getWordProbability(sentence, index))

    '''
    Returns, for a given context, a random word, according to the
    probabilities in the model.
    '''
    def generateWord(self, context):
        return 'bunny'
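A quick worked check of the linear interpolation in getWordProbability above (a minimal sketch; the 0.4 and 0.1 component probabilities are made up for illustration, not taken from any trained model). As long as lambda_1 + lambda_2 = 1 and both component models return proper distributions, the interpolated scores also form a valid probability distribution.

lambda_1, lambda_2 = 0.5, 0.5
p_bigram, p_unigram = 0.4, 0.1                      # illustrative component probabilities
p_interp = lambda_1 * p_bigram + lambda_2 * p_unigram
assert abs(p_interp - 0.25) < 1e-12                 # 0.5 * 0.4 + 0.5 * 0.1 = 0.25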
def __init__(self):
    self.docs = dict()
    self.index = dict()
    self.vecs = None
    self.consts = None
    self.modified = False
    self.bigram_index = Bigram()
def __init__(self):
    self.unigram_model = Unigram()
    self.bigram_model = Bigram()
    self.trigram_model = Trigram()
    self.unigram_lambda = .25
    self.bigram_lambda = .25
    self.trigram_lambda = .5
def __init__(self, lambda_1=0.67):
    self.unigram = Unigram()
    self.bigram = Bigram()
    # just needed for languageModel.py to work
    self.word_dict = self.bigram.word_dict
    self.lambda_1 = lambda_1
    self.lambda_2 = 1 - lambda_1
def main():
    raw_data = get_data()

    # Unigram
    uni = Unigram(raw_data)
    uni.main()

    # Bigram
    bi = Bigram(raw_data)
    bi.main()
import numpy as np


class BigramInterpolation(LanguageModel):

    def __init__(self, lambda_1=0.67):
        self.unigram = Unigram()
        self.bigram = Bigram()
        # just needed for languageModel.py to work
        self.word_dict = self.bigram.word_dict
        self.lambda_1 = lambda_1
        self.lambda_2 = 1 - lambda_1

    '''
    Trains a bigram-interpolation language model on a training set.
    '''
    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    '''
    Returns the probability of the word at index, according to the model,
    within the specified sentence.
    '''
    def getWordProbability(self, sentence, index):
        return (self.lambda_1 * self.bigram.getWordProbability(sentence, index)
                + self.lambda_2 * self.unigram.getWordProbability(sentence, index))

    '''
    Returns, for a given context, a random word, according to the
    probabilities in the model.
    '''
    def generateWord(self, context):
        if context:
            previous_word = context[-1]
        else:
            previous_word = LanguageModel.START

        if (previous_word not in self.word_dict) and (previous_word != LanguageModel.START):
            previous_word = LanguageModel.UNK

        if previous_word == LanguageModel.START:
            previous_word_index = 0
        else:
            previous_word_index = self.word_dict[previous_word]

        probs_bigram = self.bigram.prob_counter[previous_word_index].toarray().ravel()
        probs_unigram = self.unigram.prob_counter[0].toarray().ravel()

        # The unigram model and the bigram model use different word indices for STOP,
        # so move the STOP probability to the first element of probs_unigram and
        # leave the others unchanged.
        stop_index = self.unigram.word_dict[LanguageModel.STOP]
        stop_prob = probs_unigram[stop_index]
        probs_unigram = np.append(stop_prob, np.delete(probs_unigram, stop_index))

        # Interpolated distribution over the vocabulary
        probs = self.lambda_1 * probs_bigram + self.lambda_2 * probs_unigram

        word_list = sorted(self.word_dict.items(), key=lambda item: item[1])
        word_list = [k[0] for k in word_list]
        return np.random.choice(word_list, p=probs)
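A minimal usage sketch of the class above, assuming the surrounding project's LanguageModel, Unigram, and Bigram classes are importable and that trainingSentences is a list of tokenized sentences; the toy corpus here is purely illustrative.

model = BigramInterpolation(lambda_1=0.67)
model.train([['the', 'cat', 'sat'], ['the', 'dog', 'ran']])   # toy corpus, illustrative only
print(model.getWordProbability(['the', 'cat', 'sat'], 1))     # interpolated P('cat' | 'the')
print(model.generateWord(['the']))                            # sample a next word after 'the'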
import sys


def main():
    bg = Bigram()
    bg.train()
    print(sys.argv[1])

    p, q, r = bg.test(sys.argv[1])

    print("------Unsmooth Probability---------")
    print('{:.60f}'.format(p))
    print("------Laplace Smooth Prob---------")
    print('{:.60f}'.format(q))
    print("------Good Turing Prob---------")
    print('{:.60f}'.format(r))
from collections import defaultdict


def make_chains(in_string):
    """Make markov chains from text in in_string."""
    chains = defaultdict(list)
    words = in_string.strip().split()

    for i in range(len(words) - 2):
        chains[Bigram(words[i], words[i + 1])].append(words[i + 2])

    # Add marker for end-of-text
    chains[Bigram(words[i + 1], words[i + 2])].append(None)

    return chains
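The markov-chain snippets here (make_chains above and make_text further down) assume a small Bigram record type with word1/word2 fields, since make_text reads bigram.word2 and uses bigrams as dict keys. A minimal sketch of that assumption as a namedtuple, which is hashable and therefore usable as a key in chains; the original project may define Bigram differently.

from collections import namedtuple

# Hypothetical definition for illustration only.
Bigram = namedtuple('Bigram', ['word1', 'word2'])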
def main_bigramTrain(options, input):
    bigramModel = Bigram(0.000000000000001)

    for sen, _ in sentenceIterator(input):
        tags = [tok[options.tagField] for tok in sen]
        bigramModel.obsSequence(tags)

    bigramModel.count()
    bigramModel.writeToFile(options.bigramModelFile)
import random


class Interpolation(LanguageModel):

    def __init__(self):
        self.unigram_model = Unigram()
        self.bigram_model = Bigram()
        self.trigram_model = Trigram()
        self.unigram_lambda = .25
        self.bigram_lambda = .25
        self.trigram_lambda = .5

    def train(self, trainingSentences):
        self.unigram_model.train(trainingSentences)
        self.bigram_model.train(trainingSentences)
        self.trigram_model.train(trainingSentences)

    # Arbitrary lambdas.
    def getWordProbability(self, sentence, index):
        return (self.trigram_lambda * self.trigram_model.getWordProbability(sentence, index)) \
            + (self.bigram_lambda * self.bigram_model.getWordProbability(sentence, index)) \
            + (self.unigram_lambda * self.unigram_model.getWordProbability(sentence, index))

    # Doesn't matter which model we use here - the vocabulary is the same
    def getVocabulary(self, context):
        return self.trigram_model.getVocabulary(context)

    # What does generating a sentence in an interpolation model look like?
    # I don't know, so what I've done is generate a word using the trigram, bigram,
    # or unigram model some of the time, using the same values as in getWordProbability.
    def generateSentence(self):
        sentence = []
        prev_previous = LanguageModel.START
        previous = random.choice(list(self.trigram_model.word_count.keys()))

        for i in range(20):
            model_choice = random.random()
            if model_choice <= self.trigram_lambda:
                word = self.trigram_model.generateWord(prev_previous, previous)
            elif model_choice <= self.trigram_lambda + self.bigram_lambda:
                word = self.bigram_model.generate_word(previous)
            else:
                word = self.unigram_model.generateWord()
            sentence.append(word)
            prev_previous = previous
            previous = word
            if word == LanguageModel.STOP:
                break

        return sentence
def __init__(self, featureSet, options):
    self.featureSet = featureSet
    self.params = '-b 1'
    self.lmw = options['lmw']
    modelName = options['modelName']

    sys.stderr.write('loading transition model...')
    self.transProbs = Bigram.getModelFromFile(options['bigramModelFile'])
    sys.stderr.write('done\nloading observation model...')
    self.model = load_model('{0}.model'.format(modelName))
    self.labelCounter = options['labelCounter']
    self.featCounter = options['featCounter']
    sys.stderr.write('done\n')
from random import choice


def make_text(chains):
    """Generate markov-chain-generated text from chains."""
    bigram = choice(list(chains))
    print(bigram, end=' ')

    while True:
        follows = choice(chains[bigram])
        if follows is None:
            break
        print(follows, end=' ')
        bigram = Bigram(bigram.word2, follows)

    print()
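A minimal end-to-end sketch combining make_chains and make_text, reusing the hypothetical Bigram namedtuple sketched after make_chains above; the input string is throwaway illustration, and any whitespace-separated text of at least three words works.

text = "the quick brown fox jumps over the lazy dog"
chains = make_chains(text)
make_text(chains)   # prints a random walk through the chains, stopping at the end-of-text marker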
def get_bigrams(self):
    bigram_list = []
    for bigram in nltk.bigrams(self.low_case_words_list):
        bigram_list.append(Bigram(self.doc_id, bigram))
    return bigram_list
from hmm import HMM
from bigram import Bigram
from config import TESTFILE  # TESTFILE is defined in config.py
import time
import re
import codecs

if __name__ == "__main__":
    print('Segmenting TESTFILE=' + TESTFILE + ' specified in config.py ...')
    start = time.time()
    with codecs.open(TESTFILE, 'r', 'gbk') as f:
        text = f.read()
    lines = text.split('\r\n')

    bigram = Bigram()
    lines_segs = []
    # Segment line by line
    for i, line in enumerate(lines):
        line_segs = []
        if line != '':
            ###### pass each line to the segmentation model #####
            line_segs = bigram.cut(line)
            ######################################################
        else:
            line_segs = line
        lines_segs.append(line_segs)
        # Print elapsed time every thousand lines
        if i % 1000 == 0:
            print(str(i) + '/' + str(len(lines)), time.time() - start)
ratios = np.arange(0.05, 1.05, 0.05)
unigram_accuracies = []
tfidf_accuracies = []
bigram_accuracies = []

for r in ratios:
    unigram_perceptron = Unigram(train_ratio=r)
    unigram_accuracy = unigram_perceptron.accuracy
    unigram_accuracies.append(unigram_accuracy)
    print(r, "unigram_perceptron", unigram_accuracy)

    tfidf_perceptron = Tfidf(train_ratio=r)
    tfidf_accuracy = tfidf_perceptron.accuracy
    tfidf_accuracies.append(tfidf_accuracy)
    print(r, "tfidf_perceptron", tfidf_accuracy)

    bigram_perceptron = Bigram(train_ratio=r)
    bigram_accuracy = bigram_perceptron.accuracy
    bigram_accuracies.append(bigram_accuracy)
    print(r, "bigram_perceptron", bigram_accuracy)

pickle.dump(unigram_accuracies, open("unigram_accuracies.pkl", "wb"))
pickle.dump(tfidf_accuracies, open("tfidf_accuracies.pkl", "wb"))
pickle.dump(bigram_accuracies, open("bigram_accuracies.pkl", "wb"))

# unigram_accuracies = pickle.load(open("unigram_accuracies.pkl", "rb"))
# tfidf_accuracies = pickle.load(open("tfidf_accuracies.pkl", "rb"))
# bigram_accuracies = pickle.load(open("bigram_accuracies.pkl", "rb"))

fig = plt.figure()
ax1 = fig.add_subplot(111)
num_samples = ratios * 1000000
ax1.scatter(num_samples, unigram_accuracies, c='b', label='Unigrams')
from unigram import Unigram
from bigram import Bigram
from trigram import Trigram

inputs = read('input.txt')[0].strip().split(" ")
V, N, S_FACTOR, TRAINING_FILE, TEST_FILE = (int(inputs[0]), int(inputs[1]),
                                            float(inputs[2]), inputs[3], inputs[4])
OUTPUT_FILE_NAME = f"./results/trace_{V}_{N}_{S_FACTOR}.txt"

t1 = time()
if V == 3:
    print(f"BYOM: V = {V} n = 3 d = {S_FACTOR}")
    BYOM = BYOM(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    BYOM.execute()
elif N == 1:
    print(f"unigram: V = {V} d = {S_FACTOR}")
    UNIGRAM = Unigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    UNIGRAM.execute()
elif N == 2:
    print(f"bigram: V = {V} d = {S_FACTOR}")
    BIGRAM = Bigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    BIGRAM.execute()
elif N == 3:
    print(f"trigram: V = {V} d = {S_FACTOR}")
    TRIGRAM = Trigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    TRIGRAM.execute()
t2 = time()
print(f"execution time: {t2 - t1}s")
from bigram import Bigram
import os
import re

if __name__ == '__main__':
    bg = Bigram()
    bg.train(os.path.abspath('../darksouls_training.txt'))
    print 'model trained'
    # for key, item in bg.get_model().iteritems():
    #     print key, item
    bg.test('../darksouls_test.txt')
    print 'The entropy for the test set is: {:.2f}.'.format(bg.entropy)
    print 'The perplexity for the test set is: {:.2f}.'.format(bg.perplexity)
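For the entropy and perplexity reported above, the usual relationship (assuming entropy is the average negative log2-probability per token, as most course-style bigram models define it) is perplexity = 2 ** entropy. A minimal check with an illustrative value, not taken from the model above:

import math

entropy = 7.5                      # illustrative value only
perplexity = 2 ** entropy          # PP = 2^H when H is measured in bits per token
assert math.isclose(math.log2(perplexity), entropy)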
def __init__(self):
    self.unigram = Unigram()
    self.bigram = Bigram()
    self.coef = 0.5
    print("W(bigram):W(unigram) coefficient is 1 :", self.coef)
def __init__(self):
    self.unigram = Unigram()
    self.bigram = Bigram()
class RetrievalIndex:

    def __init__(self):
        self.docs = dict()
        self.index = dict()
        self.vecs = None
        self.consts = None
        self.modified = False
        self.bigram_index = Bigram()

    def save(self, file_path):
        with open(file_path, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, file_path):
        with open(file_path, 'rb') as f:
            index = pickle.load(f)
        return index

    def __getstate__(self):
        return self.__dict__

    def __setstate__(self, d):
        self.__dict__ = d

    @classmethod
    def from_xml(cls, xml, max_num=None, method='file'):
        index = cls()
        for doc in Doc.create_list_from_xml(xml, max_num=max_num, method=method):
            index.add_doc(doc)
        return index

    def add_doc(self, doc, raise_on_exists=True):
        self.set_modified()
        doc_id = doc.doc_id
        if doc_id in self.docs:
            if raise_on_exists:
                raise ValueError("Doc already in list, change id")
            else:
                return
        self.docs[doc_id] = doc
        for word, position, doc_part in doc.info_iterator:
            self.word_index_add_doc(word, position, doc_id, doc_part)
        # bigram
        for word in doc.bigram_words:
            self.bigram_index.add_word(word)

    def remove_doc(self, doc_id, raise_on_not_exists=True):
        self.set_modified()
        self.modified = True
        if doc_id not in self.docs:
            if raise_on_not_exists:
                raise ValueError("doc_id not found")
            else:
                return
        doc = self.docs[doc_id]
        for word, position, doc_part in doc.info_iterator:
            self.word_index_remove_doc(word, doc_id)
        del self.docs[doc_id]
        # bigram
        for word in doc.bigram_words:
            self.bigram_index.remove_word(word)

    def word_index_add_doc(self, word, position, doc_id, doc_part):
        self.index.setdefault(word, {}).setdefault(doc_id, {}).setdefault(
            doc_part, []).append(position)

    # assumes no Attack
    def word_index_remove_doc(self, word, doc_id, raise_on_not_exists=False):
        posting_list = self.get_posting_list(
            word, raise_on_not_exists=raise_on_not_exists)
        if doc_id not in posting_list:
            if raise_on_not_exists:
                raise ValueError("Doc %s not in posting list for word %s" % (doc_id, word))
            return
        del posting_list[doc_id]
        if not posting_list:
            del self.index[word]

    def get_posting_list(self, word, raise_on_not_exists=True):
        if raise_on_not_exists and word not in self.index:
            raise ValueError('term not in index')
        return self.index.get(word, {})

    def tf(self, term, doc_id, part):
        posting_list = self.get_posting_list(term)
        tf = len(posting_list.get(doc_id, {}).get(part, {}))
        return Tf_calc.transform_tf(tf)

    def idf(self, term, part):
        df = len(self.get_posting_list(term))
        return Tf_calc.idf_transform(df, self.N)

    def tf_idf(self, term, doc_id, part):
        return self.tf(term, doc_id, part) * self.idf(term, part)

    def get_exact_docs(self, li_title, li_text, method="standard"):
        if method == "standard":
            li = li_text

            def is_fine(doc_id):
                return all(self.docs[doc_id].has_exact(phrase) for phrase in li)
        else:
            raise ValueError("method must be standard")
        ans = list(filter(is_fine, self.docs.keys()))
        return ans

    def query(self, query_title, query_text, should_divide=False, k=15,
              title_ratio=2, flatten=True, exact_method="standard"):
        query_title, li_title = Text_cleaner.query_cleaner(query_title)
        query_text, li_text = Text_cleaner.query_cleaner(query_text)
        query = Doc.from_query(query_title, query_text)
        good_doc_ids = self.get_exact_docs(li_title, li_text, exact_method)
        self.make_vectors()
        v, const = query.tf_idf()
        scores = []
        for doc_id, doc_v in self.vecs.items():
            if doc_id not in good_doc_ids:
                continue
            part_score = dict()
            for part in doc_v:
                part_score[part] = 0
                for term, w_q in v[part].items():
                    part_score[part] += doc_v[part].get(term, 0) * w_q
                if should_divide:
                    modified_vector = {
                        term: doc_v['text'].get(term, 0)
                        for term in v[part].keys()
                    }
                    new_constant = Tf_calc.const(modified_vector)
                    normalization_factor = new_constant * const[part] + EPSILON
                else:
                    normalization_factor = 1
                part_score[part] /= normalization_factor
            final_score = part_score['title'] * title_ratio + part_score['text']
            scores.append((doc_id, final_score))
        scores.sort(key=lambda x: x[1], reverse=True)
        top_k = [scores[i][0] for i in range(min(k, len(scores)))]
        if k == 1 and flatten:
            return top_k[0]
        else:
            return top_k

    def make_vectors(self):
        if not self.modified:
            return
        self.vecs = {}
        self.consts = {}
        for doc_id, doc in self.docs.items():
            self.vecs[doc_id] = {}
            self.consts[doc_id] = {}
            for part in Doc.PARTS:
                v = dict()
                for term in doc.distinct_terms(part):
                    v[term] = self.tf_idf(term, doc_id, part)
                self.vecs[doc_id][part] = v
                self.consts[doc_id][part] = Tf_calc.const(v)
        self.modified = False

    def set_modified(self):
        self.modified = True

    @property
    def N(self):
        return len(self.docs)

    def __str__(self):
        ans = ""
        ans += "Doc_ids: %s\n" % str(list(self.docs))
        ans += '+++++++++++++++++++++\n'
        ans += "Index: %s\n" % '\n------------\n'.join(
            "%s: %s" % (word, self.index[word]) for word in self.index)
        return ans
def __init__(self):
    self.load_param()
    self.bigram = Bigram()
def train(self):
    bi_diff_word_dict = {}
    u_count_dict = {}
    count_word_dict = {}

    for word_line in self.word:
        count = 1
        while len(word_line) > count:
            word_dict = Bigram(self.word_dict, word_line[count])
            self.word_dict = word_dict.dict_bigram()

            count_word_dict = Bigram(count_word_dict, word_line[count - 1])
            count_word_dict = count_word_dict.dict_bigram()

            context_count_dict = Bigram(self.context_count_dict, '')
            self.context_count_dict = context_count_dict.dict_bigram()

            bi_word = word_line[count - 1] + ' ' + word_line[count]
            word_dict = Bigram(self.word_dict, bi_word)
            self.word_dict = word_dict.dict_bigram()
            bi_diff_word_dict.update({bi_word: word_line[count - 1]})

            context_count_dict = Bigram(self.context_count_dict, word_line[count - 1])
            self.context_count_dict = context_count_dict.dict_bigram()

            count = count + 1

    # Witten-Bell smoothing
    for k, v in bi_diff_word_dict.items():
        if v in u_count_dict:
            bi_value = u_count_dict[v]
            u_count_dict.update({v: bi_value + 1})
        else:
            u_count_dict[v] = 1

    for k in count_word_dict.keys():
        lambda_w = 1 - (1.0 * u_count_dict[k] / (u_count_dict[k] + count_word_dict[k]))
        self.lambda_word_dict.update({k: lambda_w})

    for ngram, count in self.word_dict.items():
        n_split_word = ngram.split(' ')
        n_split_word.pop()
        context = ''.join(n_split_word)
        prob = 1.0 * self.word_dict[ngram] / self.context_count_dict[context]
        self.word_dict.update({ngram: prob})
import numpy as np
import csv
from bigram import Bigram

ham_validation_list = Bigram.processed_file("../ham/validation.txt")
spam_validation_list = Bigram.processed_file("../spam/validation.txt")

with open('eval_k_0.1_1.0.csv', mode='w') as report_file:
    report_writer = csv.writer(report_file, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
    report_writer.writerow([
        'k', 'ham accuracy', 'spam accuracy', 'precision', 'recall', 'F1_score'
    ])

    for k in np.linspace(0.1, 1.0, 10):
        ham_pro_dict = Bigram.bigram_probability_dict("../ham/train.txt", k)
        spam_pro_dict = Bigram.bigram_probability_dict("../spam/train.txt", k)

        ham_count_ham = ham_count_spam = 0
        for message in ham_validation_list:
            spam_pp = Bigram.perplexity_single_message(spam_pro_dict, message)
            ham_pp = Bigram.perplexity_single_message(ham_pro_dict, message)
            result = None
            if spam_pp > ham_pp:
                ham_count_ham += 1
            else:
                ham_count_spam += 1

        spam_count_ham = spam_count_spam = 0
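The fragment above stops before the metrics are computed. A minimal sketch of how precision, recall, and F1 are typically derived from the four counters it maintains, treating spam as the positive class; the variable names continue the snippet's convention and the formulas are the standard definitions, not code from the original project.

# Assumed completion of the loops above: counts of how validation messages were classified.
tp = spam_count_spam              # spam messages classified as spam
fp = ham_count_spam               # ham messages classified as spam
fn = spam_count_ham               # spam messages classified as ham

precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = (2 * precision * recall / (precision + recall)
            if (precision + recall) else 0.0)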
import json


class HMM:

    def __init__(self):
        self.load_param()
        self.bigram = Bigram()

    def load_param(self):
        self.init_prob = self.read('init_prob')
        self.emiss_prob = self.read('emiss_prob')
        self.trans_prob = self.read('trans_prob')
        self.pinyin_states = self.read('pinyin_states')

    def read(self, filename):
        with open('model_params/' + filename + '.json', 'r') as f:
            return json.load(f)

    # Viterbi process
    def trans(self, strs):
        # split the pinyin string into syllables
        seq = self.bigram.dp_search(strs)

        # smoothing floor for missing probabilities
        self.min_f = -3.14e+100

        length = len(seq)
        viterbi = {}
        for i in range(length):
            viterbi[i] = {}

        # initialize
        for s in self.pinyin_states.get(seq[0]):
            viterbi[0][s] = (
                self.init_prob.get(s, self.min_f)
                + self.emiss_prob.get(s, {}).get(seq[0], self.min_f)
                + self.trans_prob.get(s, {}).get('BOS', self.min_f), -1)

        # DP
        # note: trans_prob = {post1: {pre1: p1, pre2: p2}, post2: {pre1: p1, pre2: p2}}
        for i in range(length - 1):
            for s in self.pinyin_states.get(seq[i + 1]):
                viterbi[i + 1][s] = max([
                    (viterbi[i][pre][0]
                     + self.emiss_prob.get(s, {}).get(seq[i + 1], self.min_f)
                     + self.trans_prob.get(s, {}).get(pre, self.min_f), pre)
                    for pre in self.pinyin_states.get(seq[i])
                ])

        for s in self.pinyin_states.get(seq[-1]):
            viterbi[length - 1][s] = (
                viterbi[length - 1][s][0]
                + self.trans_prob.get('EOS', {}).get(s, self.min_f),
                viterbi[length - 1][s][1])

        words = [None] * length
        words[-1] = max(viterbi[length - 1], key=viterbi[length - 1].get)
        for n in range(length - 2, -1, -1):
            words[n] = viterbi[n + 1][words[n + 1]][1]

        return ''.join(w for w in words)
class BigramInterpolation(LanguageModel):

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()
        self.coef = 0.5
        print("W(bigram):W(unigram) coefficient is 1 :", self.coef)

    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        coef = self.coef
        x = 1 / (1 + coef)
        if index == len(sentence):
            word = LanguageModel.STOP
            prev_word = sentence[-1]
        elif index == 0:
            word = sentence[0]
            prev_word = LanguageModel.START
        else:
            word = sentence[index]
            prev_word = sentence[index - 1]

        if prev_word not in self.bigram.probCounter:
            prev_word = LanguageModel.UNK

        if self.bigram.probCounter[prev_word][word] == 0:
            return x * coef * self.unigram.getWordProbability(sentence, index)
        else:
            return (x * self.bigram.getWordProbability(sentence, index)
                    + x * coef * self.unigram.getWordProbability(sentence, index))

    def getVocabulary(self, context):
        next_posb_word = []
        # append all possible words except START in self.total
        for next_word in self.bigram.total:
            if next_word != LanguageModel.START:
                next_posb_word.append(next_word)
        # append STOP manually since there is no STOP in self.total
        next_posb_word.append(LanguageModel.STOP)
        return next_posb_word

    def generateWord(self, context):
        return self.bigram.generateWord(context)

    def generateSentence(self):
        result = []
        # limit sentence length to 20
        for i in range(20):
            word = LanguageModel.UNK
            while word == LanguageModel.UNK:  # make sure word != UNK
                word = self.generateWord(result)
            result.append(word)
            if word == LanguageModel.STOP:
                break
        return result