def learn_trigram(datas, names, verbose=True):
    """Fit one trigram model on the training splits of two corpora.

    Args:
        datas: Indexable collection of corpus objects, each exposing a
            ``train`` attribute.
        names: Whitespace-separated string whose first two tokens are the
            integer indices (into ``datas``) of the corpora to train on,
            e.g. ``"0 2"``.
        verbose: Currently unused. The evaluation/sampling report was
            disabled in this variant (it referred to a ``data`` object that
            does not exist in this function), so nothing is printed.

    Returns:
        The ``lm.Trigram`` instance after ``fit_corpus2`` has been called.
    """
    from lm import Trigram

    trigram = Trigram(3)
    # split() instead of split(' '): tolerate repeated / leading / trailing
    # whitespace, which would otherwise yield empty tokens and break int().
    indices = names.split()
    # Single-corpus alternative:
    # trigram.fit_corpus(datas[int(indices[0])].train)
    trigram.fit_corpus2(datas[int(indices[0])].train,
                        datas[int(indices[1])].train)
    return trigram
def learn_trigram(data, verbose=True):
    """Fit a trigram model on ``data.train`` and return it.

    When ``verbose`` is true, also print the vocabulary size, the model's
    perplexity on ``data.train`` / ``data.dev`` / ``data.test``, and two
    sentences sampled from the prefixes ``["my", "dog"]`` and
    ``["good", "morning"]``.
    """
    from lm import Trigram

    trigram = Trigram()
    trigram.fit_corpus(data.train)

    # Reuters only: uncomment to get the most frequent trigrams from the
    # validation set.
    # trigram.frequent_trigrams(data.dev)

    if not verbose:
        return trigram

    print("vocab:", len(trigram.vocab()))
    # Perplexity on each split, in train / dev / test order.
    print("train:", trigram.perplexity(data.train))
    print("dev :", trigram.perplexity(data.dev))
    print("test :", trigram.perplexity(data.test))

    from generator import Sampler

    sentence_sampler = Sampler(trigram)
    first = sentence_sampler.sample_sentence(["my", "dog"])
    print("sample 1: ", " ".join(str(tok) for tok in first))
    second = sentence_sampler.sample_sentence(["good", "morning"])
    print("sample 2: ", " ".join(str(tok) for tok in second))
    return trigram
def learn_trigram(data, ratio, delta=1 / 2**(15), smoothing=True, verbose=True):
    """Fit a trigram model on a leading fraction of ``data.train``.

    Args:
        data: Corpus object exposing ``train``, ``dev`` and ``test``.
        ratio: Fraction of ``data.train`` to keep; the first
            ``int(ratio * len(data.train))`` items are used for fitting.
        delta: Forwarded to ``Trigram`` as ``delta``.
        smoothing: Forwarded to ``Trigram`` as ``smoothing``.
        verbose: When true, print the vocabulary size and the perplexity on
            the training subset, ``data.dev`` and ``data.test``.

    Returns:
        The fitted ``Trigram`` instance.
    """
    from lm import Trigram

    cutoff = int(ratio * len(data.train))
    subset = data.train[:cutoff]

    model = Trigram(backoff=1e-6, delta=delta, smoothing=smoothing)
    model.fit_corpus(subset)

    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Perplexity is reported on the subset actually trained on, not on the
    # full training split.
    print("train:", model.perplexity(subset))
    print("dev :", model.perplexity(data.dev))
    print("test :", model.perplexity(data.test))
    # Sentence sampling (generator.Sampler with an empty prefix) is
    # intentionally left disabled in this variant.
    return model
def learn_trigram(data, delta=1/2**(15), smoothing=True, verbose=True):
    """Fit a trigram model on ``data.train`` and return it.

    ``delta`` and ``smoothing`` are forwarded to the ``Trigram`` constructor.
    When ``verbose`` is true, the vocabulary size, the perplexity on the
    train / dev / test splits, and ten sentences sampled from fixed prefixes
    are printed.
    """
    from lm import Trigram

    model = Trigram(backoff=1e-6, delta=delta, smoothing=smoothing)
    model.fit_corpus(data.train)

    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    print("train:", model.perplexity(data.train))
    print("dev :", model.perplexity(data.dev))
    print("test :", model.perplexity(data.test))

    from generator import Sampler

    sentence_sampler = Sampler(model)
    # Prefixes handed to the sampler, in the order they are reported.
    prefixes = [
        ['The', 'president'],
        ['This', 'university'],
        ['The', 'United', 'States'],
        ['An', 'explosion'],
        ['To', 'be', 'or', 'to'],
        ['This', 'is', 'awesome'],
        ['I', 'am', 'sorry'],
        ['Today', 'the', 'chair', 'of'],
        ['Hello', 'I', 'came', 'from'],
        ['I', 'major', 'in', 'Computer', 'Science'],
    ]
    for number, prefix in enumerate(prefixes, 1):
        sentence = sentence_sampler.sample_sentence(prefix)
        print("sample {}: ".format(number),
              " ".join(str(tok) for tok in sentence))
    return model
def learn_trigram(data):
    """Fit a trigram model on ``data.train``, print a report, and return it.

    The report is always printed: vocabulary size, perplexity on
    ``data.train`` / ``data.dev`` / ``data.test``, and two sentences sampled
    from an empty prefix.

    Args:
        data: Corpus object exposing ``train``, ``dev`` and ``test``.

    Returns:
        The fitted ``Trigram`` instance.
    """
    from lm import Trigram

    trigram = Trigram()
    trigram.fit_corpus(data.train)
    print("vocab:", len(trigram.vocab()))
    # evaluate on train, test, and dev
    print("train:", trigram.perplexity(data.train))
    print("dev :", trigram.perplexity(data.dev))
    print("test :", trigram.perplexity(data.test))

    from generator import Sampler

    sampler = Sampler(trigram)
    print("sample 1: ", " ".join(str(x) for x in sampler.sample_sentence([])))
    print("sample 2: ", " ".join(str(x) for x in sampler.sample_sentence([])))
    # Previously returned the undefined name ``unigram`` (NameError).
    return trigram
def learn_trigram(data, delta=1/2**(15), smoothing=True, verbose=True):
    """Fit a trigram model on ``data.train`` and return it.

    ``delta`` and ``smoothing`` are passed straight to ``Trigram``. With
    ``verbose`` set, the vocabulary size and the perplexity on the train,
    dev and test splits are printed; no sentences are sampled.
    """
    from lm import Trigram

    model = Trigram(backoff=1e-6, delta=delta, smoothing=smoothing)
    model.fit_corpus(data.train)

    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Splits are looked up lazily so output order matches evaluation order.
    for label, split in (("train:", "train"), ("dev :", "dev"), ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))
    return model
def learn_trigram(data):
    """Fit a trigram model on ``data.train`` and report test perplexity.

    After fitting, the attributes ``l1``, ``l2`` and ``l3`` of the model are
    set to 0.35, 0.5 and 0.15, ``pre_processes`` is applied to ``data.test``,
    and the perplexity on ``data.test`` is printed. Train/dev evaluation and
    sentence sampling are commented out in this variant.

    NOTE(review): the source line had lost its line breaks; the split between
    live and commented-out statements below is a best-effort reconstruction
    and should be confirmed against the original file.

    Returns the fitted ``Trigram`` instance.
    """
    from lm import Trigram
    trigram = Trigram()
    # Candidate values tried during tuning (kept for reference):
    #k = [0.0001,0.001,0.01,0.1,1]
    # k = [[0.35, 0.5, 0.15], [0.25, 0.5, 0.25], [0.15, 0.5, 0.35], [0.5, 0.35, 0.15], [0.5, 0.25, 0.25], [0.5, 0.15, 0.35],
    #      [0.35, 0.15, 0.5], [0.25, 0.25, 0.5],
    #      [0.15, 0.35, 0.5], [0.2, 0.4, 0.4], [0.3, 0.3, 0.4]]
    trigram.fit_corpus(data.train)
    # for i,j,l in k:
    # Fixed weights used in place of the sweep above
    # (presumably interpolation weights -- TODO confirm in lm.Trigram).
    trigram.l1 = 0.35
    trigram.l2 = 0.5
    trigram.l3 = 0.15
    # print(i,j,l)
    #trigram.save_model()
    #trigram.load_model()
    #print("vocab:", len(trigram.vocab()))
    # evaluate on train, test, and dev
    #print("train:", trigram.perplexity(data.train))
    # add <sos>, <sos>, and <eos> to validation and test data
    # trigram.pre_processes(data.dev)
    trigram.pre_processes(data.test)
    # print("dev :", trigram.perplexity(data.dev))
    print("test :", trigram.perplexity(data.test))
    from generator import Sampler
    # The sampler is constructed but not used: all sampling calls are disabled.
    sampler = Sampler(trigram)
    #print("sample: ", " ".join(str(x) for x in sampler.sample_sentence(['SOS','SOS'])))
    # print("sample: ", " ".join(str(x) for x in sampler.sample_sentence(['SOS','SOS'])))
    # print("sample: ", " ".join(str(x) for x in sampler.sample_sentence(['SOS','SOS'])))
    return trigram
def learn_trigram(data):
    """Fit a trigram model on ``data.train``, print a report, and return it.

    The report (always printed) contains the vocabulary size, the perplexity
    on ``data.train``, ``data.dev`` and ``data.test``, and three sentences
    sampled from an empty prefix.
    """
    from lm import Trigram

    model = Trigram()
    model.fit_corpus(data.train)

    print("vocab:", len(model.vocab()))
    print("train:", model.perplexity(data.train))
    print("dev :", model.perplexity(data.dev))
    print("test :", model.perplexity(data.test))

    from generator import Sampler

    sentence_sampler = Sampler(model)
    # Three independent draws, each starting from an empty prefix.
    for _ in range(3):
        sentence = sentence_sampler.sample_sentence([])
        print("sample: ", " ".join(str(tok) for tok in sentence))
    return model
def learn_trigram(data1, data2, verbose=True): """Learns a unigram model from data.train. It also evaluates the model on data.dev and data.test, along with generating some sample sentences from the model. """ from lm import Trigram trigram = Trigram() end = int(len(data2.train)) data_train = data1.train + data2.train[0:end] trigram.fit_corpus(data_train) if verbose: print("vocab:", len(trigram.vocab())) # evaluate on train, test, and dev print("train:", trigram.perplexity(data_train)) #print("dev :", trigram.perplexity(data.dev)) print("test :", trigram.perplexity(data2.test)) '''
def learn_trigram(data, verbose=True):
    """Fit a trigram model on ``data.train`` and return it.

    With ``verbose`` set, print the vocabulary size, the perplexity on the
    train / dev / test splits, and two sentences sampled from the prefixes
    ``['The']`` and ``["They"]``.
    """
    from lm import Trigram

    trigram = Trigram()
    trigram.fit_corpus(data.train)
    # Disabled: tuning on the dev split.
    # if trigram.normMeth == "interpol":
    #     trigram.findLamdas(data.dev)

    if not verbose:
        return trigram

    print("vocab:", len(trigram.vocab()))
    print("train:", trigram.perplexity(data.train))
    print("dev :", trigram.perplexity(data.dev))
    print("test :", trigram.perplexity(data.test))

    from generator import Sampler

    sentence_sampler = Sampler(trigram)
    for number, prefix in enumerate((['The'], ["They"]), 1):
        sentence = sentence_sampler.sample_sentence(prefix)
        print("sample {}: ".format(number),
              " ".join(str(tok) for tok in sentence))
    return trigram
def learn_trigram(data, thres=4, verbose=True):
    """Fit a trigram model on ``data.train`` and return it.

    ``thres`` is passed as the sole positional argument to ``Trigram``.
    With ``verbose`` set, print the vocabulary size, the perplexity on the
    train / dev / test splits, and two sentences drawn by a ``Sampler``
    created with ``temp=0.25`` from the prefixes ``['With']`` and
    ``['Next']``.
    """
    from lm import Trigram

    trigram = Trigram(thres)
    trigram.fit_corpus(data.train)

    if not verbose:
        return trigram

    print("vocab:", len(trigram.vocab()))
    # Debug aid, disabled: trigram.print_keys()
    print("train:", trigram.perplexity(data.train))
    print("dev :", trigram.perplexity(data.dev))
    print("test :", trigram.perplexity(data.test))

    from generator import Sampler

    sentence_sampler = Sampler(trigram, temp=0.25)
    opening = sentence_sampler.sample_sentence(['With'])
    print("sample 1: ", " ".join(str(tok) for tok in opening))
    follow_up = sentence_sampler.sample_sentence(['Next'])
    print("sample 2: ", " ".join(str(tok) for tok in follow_up))
    return trigram
def learn_trigram(data, alpha, sampler=0, verbose=True):
    """Fit a trigram model on ``data.train`` and optionally report on it.

    Args:
        data: Corpus object exposing ``train``, ``dev`` and ``test``.
        alpha: Passed as the second argument to every ``perplexity`` call.
        sampler: Flag; two sentences are sampled and printed only when it
            equals 1.
        verbose: When true, print ``trigram.num_words`` and the perplexity
            on the train / dev / test splits.

    Returns:
        The fitted ``Trigram`` instance.
    """
    from lm import Trigram
    trigram = Trigram()
    trigram.fit_corpus(data.train)
    if verbose:
        print("vocab:", trigram.num_words)
        # evaluate on train, test, and dev
        print("train:", trigram.perplexity(data.train, alpha))
        print("dev :", trigram.perplexity(data.dev, alpha))
        print("test :", trigram.perplexity(data.test, alpha))
        # NOTE(review): indentation was lost in the source; sampling is
        # assumed to be nested under ``verbose`` -- confirm against the
        # original file.
        if sampler==1:
            from generator import Sampler
            # Rebinds the ``sampler`` flag parameter to the Sampler object.
            sampler = Sampler(trigram)
            print("sample 1: ", " ".join(str(x) for x in sampler.sample_sentence([])))
            print("sample 2: ", " ".join(str(x) for x in sampler.sample_sentence([])))
    return trigram
if __name__ == "__main__":
    # Compare the n-gram inventories of the three corpora: for every pair of
    # domains, count how many unigrams / bigrams / trigrams they share.
    from lm import Trigram  # hoisted: loop-invariant import

    dnames = ["brown", "reuters", "gutenberg"]
    datas = []
    unigrams = []
    bigrams = []
    trigrams = []

    # Fit one trigram model per domain and collect its n-gram key sets.
    for dname in dnames:
        print("-----------------------")
        print(dname)
        data = read_texts("data/corpora.tar.gz", dname)
        datas.append(data)

        trigram = Trigram()
        trigram.fit_corpus(data.train)
        unigrams.append(set(trigram.vocab()))
        bigrams.append(set(trigram.bigram))
        trigrams.append(set(trigram.trigram))

    n = len(dnames)
    # overlap_*[i][j] = number of n-grams present in both domain i and j
    # (the diagonal therefore holds each domain's own n-gram count).
    overlap_unigram = np.zeros((n, n))
    overlap_bigram = np.zeros((n, n))
    overlap_trigram = np.zeros((n, n))
    # range() instead of xrange(): xrange does not exist on Python 3, while
    # range behaves identically here on both major versions.
    for i in range(n):
        for j in range(n):
            overlap_unigram[i][j] = len(unigrams[i] & unigrams[j])
            overlap_bigram[i][j] = len(bigrams[i] & bigrams[j])
            overlap_trigram[i][j] = len(trigrams[i] & trigrams[j])
word = self.rnd.choice(wps)[0] #predict some random nuber find the coresponding interval. s = -np.inf # running mass / accumulated (log) probability for w, lp in wps: s = np.logaddexp2(s, lp) if p < pow(2, s - tot): word = w break return word if __name__ == "__main__": from lm import Unigram from lm import Trigram #unigram = Unigram() trigram = Trigram() trigram.l = 0.1 corpus = [["I", "am", "Sam"]] #unigram.fit_corpus(corpus) trigram.fit_corpus(corpus) #print(unigram.model) test1 = [['I', 'am', 'Sam']] test2 = [['green', 'eggs', 'and', 'ham']] test3 = ['I', 'am', 'Sam', 'EOS'] trigram.pre_processes(test1) trigram.pre_processes(test2) print(trigram.perplexity(test1)) print(trigram.perplexity(test2))
incl_eos determines whether the space of words should include EOS or not. """ wps = [] tot = -np.inf # this is the log (total mass) for w in self.lm.vocab(): if not incl_eos and w == "END_OF_SENTENCE": continue lp = self.lm.cond_logprob(w, prev) wps.append([w, lp / self.temp]) tot = np.logaddexp2(lp / self.temp, tot) p = self.rnd.random() word = self.rnd.choice(wps)[0] s = -np.inf # running mass for w, lp in wps: s = np.logaddexp2(s, lp) if p < pow(2, s - tot): word = w break return word if __name__ == "__main__": # from lm import Unigram from lm import Trigram trigram = Trigram() corpus = [["i", "am", "sam"]] trigram.fit_corpus(corpus) sampler = Sampler(trigram) for i in xrange(10): print(i, ":", " ".join(str(x) for x in sampler.sample_sentence([])))