def learn_unigram(data, verbose=True):
    """Fit a unigram language model on ``data.train`` and return it.

    When ``verbose`` is true, a report is printed: the vocabulary size, the
    model's perplexity on ``data.train``, ``data.dev`` and ``data.test``,
    and one sampled sentence for each of ten fixed seed token lists.

    Args:
        data: Object providing ``train``, ``dev`` and ``test`` attributes
            that are passed to ``Unigram.fit_corpus`` / ``Unigram.perplexity``.
        verbose: If true (the default), print the report described above.

    Returns:
        The fitted ``Unigram`` instance.
    """
    # NOTE(review): the source reached review with its whitespace collapsed;
    # the whole report (perplexities and samples) is assumed to be gated by
    # ``verbose`` - confirm against the original layout.
    from lm import Unigram

    unigram = Unigram()
    unigram.fit_corpus(data.train)
    if not verbose:
        return unigram

    print("vocab:", len(unigram.vocab()))
    # evaluate on train, test, and dev
    print("train:", unigram.perplexity(data.train))
    print("dev :", unigram.perplexity(data.dev))
    print("test :", unigram.perplexity(data.test))

    from generator import Sampler

    sampler = Sampler(unigram)
    # Token lists handed to sampler.sample_sentence, one per printed sample.
    # They are built fresh on every call and kept as lists so the sampler
    # receives exactly the kind of argument the unrolled version passed.
    seeds = [
        ['The', 'president'],
        ['This', 'university'],
        ['The', 'United', 'States'],
        ['An', 'explosion'],
        ['To', 'be', 'or', 'to'],
        ['This', 'is', 'awesome'],
        ['I', 'am', 'sorry'],
        ['Today', 'the', 'chair', 'of'],
        ['Hello', 'I', 'came', 'from'],
        ['I', 'major', 'in', 'Computer', 'Science'],
    ]
    for number, seed in enumerate(seeds, start=1):
        sentence = " ".join(str(x) for x in sampler.sample_sentence(seed))
        print(f"sample {number}: ", sentence)
    return unigram
def learn_unigram(data):
    """Fit a unigram language model on ``data.train`` and return it.

    The size of the learned vocabulary is printed as a side effect.

    Args:
        data: Object providing a ``train`` attribute that is passed to
            ``Unigram.fit_corpus``.

    Returns:
        The fitted ``Unigram`` instance.
    """
    from lm import Unigram

    model = Unigram()
    model.fit_corpus(data.train)

    vocab_size = len(model.vocab())
    print("vocab:", vocab_size)

    # NOTE: perplexity evaluation on the train/dev/test splits and sentence
    # sampling via generator.Sampler are disabled in this variant.
    return model
def learn_unigram(data, verbose=True):
    """Fit a unigram language model on ``data.train`` and return it.

    When ``verbose`` is true, the vocabulary size and the model's perplexity
    on ``data.train``, ``data.dev`` and ``data.test`` are printed.

    Args:
        data: Object providing ``train``, ``dev`` and ``test`` attributes
            that are passed to ``Unigram.fit_corpus`` / ``Unigram.perplexity``.
        verbose: If true (the default), print the report described above.

    Returns:
        The fitted ``Unigram`` instance.
    """
    # NOTE(review): the source reached review with its whitespace collapsed;
    # the perplexity prints are assumed to be gated by ``verbose`` together
    # with the vocab print - confirm against the original layout.
    from lm import Unigram

    model = Unigram()
    model.fit_corpus(data.train)

    # Quiet mode: training only, no report.
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Perplexity on each split, in train / dev / test order.
    print("train:", model.perplexity(data.train))
    print("dev :", model.perplexity(data.dev))
    print("test :", model.perplexity(data.test))
    return model
incl_eos determines whether the space of words should include EOS or not. """ wps = [] tot = -np.inf # this is the log (total mass) for w in self.lm.vocab(): if not incl_eos and w == "END_OF_SENTENCE": continue lp = self.lm.cond_logprob(w, prev, 0) wps.append([w, lp / self.temp]) tot = np.logaddexp2(lp / self.temp, tot) p = random.random() word = random.choice(wps)[0] s = -np.inf # running mass for w, lp in wps: s = np.logaddexp2(s, lp) if p < pow(2, s - tot): word = w break return word if __name__ == "__main__": from lm import Unigram unigram = Unigram() corpus = [["sam", "i", "am"]] unigram.fit_corpus(corpus) print(unigram.model) sampler = Sampler(unigram) for i in range(10): print(i, ":", " ".join(str(x) for x in sampler.sample_sentence([])))
import sys

from data import run_model
from lm import Unigram, NgramNoUnk, NgramUnk


def check_params(params):
    """Validate the command-line arguments, exiting with usage on error.

    Accepted forms (``params[0]`` is the program name):

    * ``<prog> unigram``
    * ``<prog> ngram n λ``
    * ``<prog> ngram n λ voc_ratio``

    For ``ngram``, ``n`` must parse as an int and ``λ`` (and ``voc_ratio``
    when supplied) as floats, so that a malformed number is reported through
    the usage message instead of a ``ValueError`` traceback later on.

    Args:
        params: Argument list in ``sys.argv`` layout.

    Returns:
        None when the arguments are acceptable. Otherwise the usage line is
        printed and the process exits via ``sys.exit(-1)``.
    """
    well_formed = (
        len(params) > 1
        and params[1] in ('unigram', 'ngram')
        and not (params[1] == 'ngram' and len(params) < 4)
    )

    if well_formed and params[1] == 'ngram':
        # Dry-run the same conversions the driver below performs.
        try:
            int(params[2])
            float(params[3])
            if len(params) >= 5:
                float(params[4])
        except ValueError:
            well_formed = False

    if not well_formed:
        print(f'Usage: {params[0]} <unigram|ngram> [n λ [voc_ratio]]')
        sys.exit(-1)


if __name__ == '__main__':
    params = sys.argv
    check_params(params)

    model = params[1]
    if model == 'unigram':
        run_model(lambda: Unigram(), 'results/unigram')
    else:
        n = int(params[2])
        λ = float(params[3])
        # ">= 5" rather than "== 5": a supplied voc_ratio must not be
        # silently dropped just because extra arguments follow it.
        if len(params) >= 5:
            voc_ratio = float(params[4])
            run_model(lambda: NgramUnk(n, λ, voc_ratio), f'results/ngram_n={n}_l={λ}_voc={voc_ratio}')
        else:
            run_model(lambda: NgramNoUnk(n, λ), f'results/ngram_n={n}_l={λ}')