def learn_trigram(data, verbose=True):
    """Learns a trigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.
    """
    from lm import Trigram
    trigram = Trigram()
    trigram.fit_corpus(data.train)
    # Uncomment for reuters only: get most frequent trigrams from the
    # validation set.  (The original marked this with a stray triple-quoted
    # string, which is a no-op expression statement, not a comment.)
    # trigram.frequent_trigrams(data.dev)
    if verbose:
        print("vocab:", len(trigram.vocab()))
        # evaluate on train, test, and dev
        print("train:", trigram.perplexity(data.train))
        print("dev :", trigram.perplexity(data.dev))
        print("test :", trigram.perplexity(data.test))
        from generator import Sampler
        sampler = Sampler(trigram)
        print("sample 1: ",
              " ".join(str(x) for x in sampler.sample_sentence(["my", "dog"])))
        print("sample 2: ",
              " ".join(str(x)
                       for x in sampler.sample_sentence(["good", "morning"])))
    return trigram
def learn_bigram(data):
    """Learns a bigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.

    (Docstring previously said "unigram" — this trains a Bigram.)
    """
    from lm import Bigram
    bigram = Bigram()
    bigram.fit_corpus(data.train)
    print("vocab:", len(bigram.vocab()))
    # evaluate on train, test, and dev
    print("train:", bigram.perplexity(data.train))
    print("dev :", bigram.perplexity(data.dev))
    print("test :", bigram.perplexity(data.test))
    from generator import Sampler
    sampler = Sampler(bigram)
    # Draw a few samples, each seeded with the start-of-sentence token.
    for _ in range(3):
        print(
            "sample: ",
            " ".join(
                str(x)
                for x in sampler.sample_sentence(['START_OF_SENTENCE'])))
    return bigram
def search_ngram(data, unk_prob, smooth, n=3, backoff=False):
    """Grid-search Ngram hyperparameters on the dev set.

    Tries every (unk_prob, smooth) pair, keeps the model with the lowest
    dev-set perplexity, reports its perplexities, and prints a few samples.
    """
    from lm import Ngram
    from numpy import inf

    best_perplexity = inf
    best_model = Ngram()
    for candidate_unk in unk_prob:
        print("Trying unk_prob {}".format(candidate_unk))
        for candidate_smooth in smooth:
            model = Ngram(candidate_unk, n, candidate_smooth, backoff)
            model.fit_corpus(data.train)
            dev_perplexity = model.perplexity(data.dev)
            if dev_perplexity < best_perplexity:
                best_perplexity, best_model = dev_perplexity, model
    print("vocab:", len(best_model.vocab()))
    # evaluate on train, test, and dev
    print("train:", best_model.perplexity(data.train))
    print("dev :", best_model.perplexity(data.dev))
    print("test :", best_model.perplexity(data.test))
    from generator import Sampler
    sampler = Sampler(best_model)
    for _ in range(3):
        print("sample: ",
              " ".join(str(x) for x in sampler.sample_sentence([])))
    return best_model
def learn_bigram(data, print_info=False, sample_sentence=False, gamma=0,
                 smooth=1):
    """Learns a bigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.
    """
    # `lm` is expected to be imported at module level.
    model = lm.Bigram(gamma=gamma, smooth=smooth)
    model.fit_corpus(data.train)
    if print_info:
        print("vocab:", len(model.vocab()))
        # evaluate on train, test, and dev
        print("train:", model.perplexity(data.train))
        print("dev :", model.perplexity(data.dev))
        print("test :", model.perplexity(data.test))
    if sample_sentence:
        from generator import Sampler
        sampler = Sampler(model)
        for _ in range(3):
            print("sample: ",
                  " ".join(str(x) for x in sampler.sample_sentence([])))
    return model
def learn_ngram(data, hyperp_set=None):
    """Learns an n-gram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.

    Args:
        data: corpus object exposing .train / .dev / .test splits.
        hyperp_set: optional dict selecting one hyperparameter by its first
            key ('lamb' or 'gamma') and giving its value; None, an empty
            dict, or any other key falls back to the default Ngram(comb=3).
            (Was a mutable default `{}`; replaced with None to avoid the
            shared-mutable-default pitfall — behavior is unchanged.)
    """
    from lm import Ngram
    # from lm import Ngram_baseline
    if not hyperp_set:
        ngram = Ngram(comb=3)
    else:
        # Only the first key of hyperp_set is inspected.
        key = next(iter(hyperp_set))
        if key == 'lamb':
            print(hyperp_set[key])
            ngram = Ngram(comb=3, lamb=hyperp_set[key])
        elif key == 'gamma':
            ngram = Ngram(comb=3, gamma=hyperp_set[key])
        else:
            ngram = Ngram(comb=3)
    ngram.fit_corpus(data.train)
    print("vocab:", len(ngram.vocab()))
    # evaluate on train, test, and dev
    print("train:", ngram.perplexity(data.train))
    print("dev :", ngram.perplexity(data.dev))
    print("test :", ngram.perplexity(data.test))
    from generator import Sampler
    sampler = Sampler(ngram)
    for _ in range(3):
        print("sample: ",
              " ".join(str(x) for x in sampler.sample_sentence(['The'])))
    return ngram
def learn_trigram(data, delta=1 / 2**(15), smoothing=True, verbose=True):
    """Learns a trigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.
    """
    from lm import Trigram
    trigram = Trigram(backoff=0.000001, delta=delta, smoothing=smoothing)
    trigram.fit_corpus(data.train)
    if verbose:
        print("vocab:", len(trigram.vocab()))
        # evaluate on train, test, and dev
        print("train:", trigram.perplexity(data.train))
        print("dev :", trigram.perplexity(data.dev))
        print("test :", trigram.perplexity(data.test))
        from generator import Sampler
        sampler = Sampler(trigram)
        # One sample per seed prefix, numbered from 1.
        seed_prefixes = [
            ['The', 'president'],
            ['This', 'university'],
            ['The', 'United', 'States'],
            ['An', 'explosion'],
            ['To', 'be', 'or', 'to'],
            ['This', 'is', 'awesome'],
            ['I', 'am', 'sorry'],
            ['Today', 'the', 'chair', 'of'],
            ['Hello', 'I', 'came', 'from'],
            ['I', 'major', 'in', 'Computer', 'Science'],
        ]
        for idx, prefix in enumerate(seed_prefixes, start=1):
            sentence = " ".join(
                str(x) for x in sampler.sample_sentence(prefix))
            print("sample {}: ".format(idx), sentence)
    return trigram
def learn_backoff_trigram(data, unk_prob=.0001, smooth=1e-7):
    """Learns a backoff trigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.
    """
    from lm import BackoffTrigram
    model = BackoffTrigram(unk_prob, smooth)
    model.fit_corpus(data.train)
    print("vocab:", len(model.vocab()))
    # evaluate on train, test, and dev
    print("train:", model.perplexity(data.train))
    print("dev :", model.perplexity(data.dev))
    print("test :", model.perplexity(data.test))
    from generator import Sampler
    sampler = Sampler(model)
    prefix = []
    for _ in range(3):
        print("sample: ",
              " ".join(str(x) for x in sampler.sample_sentence(prefix)))
    return model
def learn_ngram(data, n=1, lower=True):
    """Learns an n-gram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.
    """
    from lm import Ngram
    model = Ngram(n=n, lower=lower)
    model.fit_corpus(data.train)
    print("vocab:", len(model.vocab()))
    # evaluate on train, test, and dev
    print("train:", model.perplexity(data.train))
    print("dev :", model.perplexity(data.dev))
    print("test :", model.perplexity(data.test))
    from generator import Sampler
    sampler = Sampler(model)
    for _ in range(3):
        print("sample: ",
              " ".join(str(x) for x in sampler.sample_sentence([])))
    return model
def learn_unigram(data, verbose=True):
    """Learns a unigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.
    """
    from lm import Unigram
    unigram = Unigram()
    unigram.fit_corpus(data.train)
    if verbose:
        print("vocab:", len(unigram.vocab()))
        # evaluate on train, test, and dev
        print("train:", unigram.perplexity(data.train))
        print("dev :", unigram.perplexity(data.dev))
        print("test :", unigram.perplexity(data.test))
        from generator import Sampler
        sampler = Sampler(unigram)
        for idx, prefix in enumerate((['The'], ['They']), start=1):
            sentence = " ".join(
                str(x) for x in sampler.sample_sentence(prefix))
            print("sample {}: ".format(idx), sentence)
    return unigram
def generate(lm):
    """Render a Streamlit panel that samples sentences from `lm`.

    Collects a prefix, temperature, max length, and sample count from the
    user, then prints each generated sentence when "Generate" is clicked.
    """
    st.subheader("Language Model Generation")
    prefix_text = st.text_input("Enter a prefix (default: no prefix)")
    temperature = st.number_input("Enter a temperature", value=1.0)
    max_length = st.slider("Maximum length of generated sentences", 10, 50)
    num_samples = st.slider("Number of sentences to generate", 1, 10)
    sampler = Sampler(lm, temperature)
    if st.button("Generate"):
        for i in range(num_samples):
            tokens = sampler.sample_sentence(prefix_text.split(), max_length)
            generated_text = " ".join(tokens)
            st.write(f"{i}) {generated_text}\n")
def learn_trigram(data, delta=1 / 2**(15), smoothing=True, verbose=True):
    """Learns a trigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.
    """
    from lm import Trigram
    model = Trigram(backoff=0.000001, delta=delta, smoothing=smoothing)
    model.fit_corpus(data.train)
    if verbose:
        print("vocab:", len(model.vocab()))
        # evaluate on train, test, and dev
        print("train:", model.perplexity(data.train))
        print("dev :", model.perplexity(data.dev))
        print("test :", model.perplexity(data.test))
        from generator import Sampler
        sampler = Sampler(model)
        for idx in (1, 2):
            sentence = " ".join(str(x) for x in sampler.sample_sentence([]))
            print("sample {}: ".format(idx), sentence)
    return model
def learn_trigram(data, verbose=True):
    """Learns a trigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.

    (Docstring previously said "unigram" — this trains a Trigram.)
    """
    from lm import Trigram
    trigram = Trigram()
    trigram.fit_corpus(data.train)
    #if trigram.normMeth == "interpol":
    #    trigram.findLamdas(data.dev)
    if verbose:
        print("vocab:", len(trigram.vocab()))
        # evaluate on train, test, and dev
        print("train:", trigram.perplexity(data.train))
        print("dev :", trigram.perplexity(data.dev))
        print("test :", trigram.perplexity(data.test))
        from generator import Sampler
        sampler = Sampler(trigram)
        print("sample 1: ",
              " ".join(str(x) for x in sampler.sample_sentence(['The'])))
        print("sample 2: ",
              " ".join(str(x) for x in sampler.sample_sentence(["They"])))
    return trigram
def learn_trigram(data):
    """Learns a trigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.
    """
    from lm import Trigram
    trigram = Trigram()
    trigram.fit_corpus(data.train)
    print("vocab:", len(trigram.vocab()))
    # evaluate on train, test, and dev
    print("train:", trigram.perplexity(data.train))
    print("dev :", trigram.perplexity(data.dev))
    print("test :", trigram.perplexity(data.test))
    from generator import Sampler
    sampler = Sampler(trigram)
    print("sample 1: ",
          " ".join(str(x) for x in sampler.sample_sentence([])))
    print("sample 2: ",
          " ".join(str(x) for x in sampler.sample_sentence([])))
    # BUG FIX: the original returned the undefined name `unigram`,
    # which raised NameError at runtime.
    return trigram
def learn_trigram(data, alpha, sampler=0, verbose=True):
    """Learns a trigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.

    Args:
        data: corpus object exposing .train / .dev / .test splits.
        alpha: smoothing parameter forwarded to Trigram.perplexity.
        sampler: set to 1 to also print two sampled sentences.
        verbose: if True, print vocab size and perplexities.

    (Docstring previously said "unigram" — this trains a Trigram.)
    """
    from lm import Trigram
    trigram = Trigram()
    trigram.fit_corpus(data.train)
    if verbose:
        print("vocab:", trigram.num_words)
        # evaluate on train, test, and dev
        print("train:", trigram.perplexity(data.train, alpha, 1))
        print("dev :", trigram.perplexity(data.dev, alpha, 1))
        print("test :", trigram.perplexity(data.test, alpha, 1))
    if sampler == 1:
        from generator import Sampler
        # Use a distinct local name instead of rebinding the `sampler`
        # parameter, which doubled as a flag in the original.
        sentence_sampler = Sampler(trigram)
        print("sample 1: ",
              " ".join(str(x)
                       for x in sentence_sampler.sample_sentence([])))
        print("sample 2: ",
              " ".join(str(x)
                       for x in sentence_sampler.sample_sentence([])))
    return trigram
def learn_trigram(data, thres=4, verbose=True):
    """Learns a trigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.
    """
    from lm import Trigram
    model = Trigram(thres)
    model.fit_corpus(data.train)
    if verbose:
        print("vocab:", len(model.vocab()))
        #model.print_keys()
        # evaluate on train, test, and dev
        print("train:", model.perplexity(data.train))
        print("dev :", model.perplexity(data.dev))
        print("test :", model.perplexity(data.test))
        from generator import Sampler
        # Low temperature makes sampling more deterministic.
        sampler = Sampler(model, temp=0.25)
        for idx, prefix in enumerate((['With'], ['Next']), start=1):
            sentence = " ".join(
                str(x) for x in sampler.sample_sentence(prefix))
            print("sample {}: ".format(idx), sentence)
    return model
def learn_unigram(data):
    """Learns a unigram model from data.train.

    It also evaluates the model on data.dev and data.test, along with
    generating some sample sentences from the model.
    """
    from lm import Unigram
    model = Unigram()
    model.fit_corpus(data.train)
    print("vocab:", len(model.vocab()))
    # evaluate on train, test, and dev
    print("train:", model.perplexity(data.train))
    print("dev :", model.perplexity(data.dev))
    print("test :", model.perplexity(data.test))
    from generator import Sampler
    sampler = Sampler(model)
    for _ in range(2):
        tokens = sampler.sample_sentence([], max_length=20)
        print("sample: ", " ".join(str(x) for x in tokens))
    return model
def learn(model, data, run_sampler=True):
    """Fits the given language model on data.train.

    It also evaluates the model on data.dev and data.test, and optionally
    generates some sample sentences from the model.

    Args:
        model: any language model exposing fit_corpus/vocab/perplexity.
        data: corpus object exposing .train / .dev / .test splits.
        run_sampler: if True, print two sampled sentences.

    Returns:
        The fitted model (same object as `model`).

    (Docstring previously said "unigram model"; this works for any model.)
    """
    model.fit_corpus(data.train)
    print("vocab:", len(model.vocab()))
    # evaluate on train, test, and dev
    print("train:", model.perplexity(data.train))
    print("dev :", model.perplexity(data.dev))
    print("test :", model.perplexity(data.test))
    if run_sampler:
        from generator import Sampler
        sampler = Sampler(model)
        for _ in range(2):
            print(
                "sample: ",
                " ".join(
                    str(x)
                    for x in sampler.sample_sentence([], max_length=20)))
    return model