def learn_unigram(data, verbose=True):
    """Fit a Unigram language model on ``data.train`` and return it.

    When ``verbose`` is true, the function additionally prints the vocabulary
    size, the perplexity on ``data.train``, ``data.dev`` and ``data.test``,
    and ten sentences sampled from the model, each seeded with a fixed prefix.
    When ``verbose`` is false nothing is printed.
    """
    from lm import Unigram

    model = Unigram()
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Perplexity on each split, in the order train, dev, test.
    print("train:", model.perplexity(data.train))
    print("dev  :", model.perplexity(data.dev))
    print("test :", model.perplexity(data.test))

    # Imported only on the verbose path, after the perplexities are reported.
    from generator import Sampler

    sampler = Sampler(model)
    # (label, prefix) pairs; each prefix list is handed to the sampler once.
    prompts = (
        ("sample 1: ", ['The', 'president']),
        ("sample 2: ", ['This', 'university']),
        ("sample 3: ", ['The', 'United', 'States']),
        ("sample 4: ", ['An', 'explosion']),
        ("sample 5: ", ['To', 'be', 'or', 'to']),
        ("sample 6: ", ['This', 'is', 'awesome']),
        ("sample 7: ", ['I', 'am', 'sorry']),
        ("sample 8: ", ['Today', 'the', 'chair', 'of']),
        ("sample 9: ", ['Hello', 'I', 'came', 'from']),
        ("sample 10: ", ['I', 'major', 'in', 'Computer', 'Science']),
    )
    for label, prefix in prompts:
        sentence = sampler.sample_sentence(prefix)
        print(label, " ".join(str(tok) for tok in sentence))
    return model
Beispiel #2
0
def learn_unigram(data):
    """Fit a Unigram language model on ``data.train`` and return it.

    The only output is the size of the learned vocabulary, printed to stdout.
    """
    from lm import Unigram

    model = Unigram()
    model.fit_corpus(data.train)

    vocab_size = len(model.vocab())
    print("vocab:", vocab_size)
    # NOTE: perplexity evaluation on train/dev/test and sentence sampling via
    # generator.Sampler are currently disabled in this variant.
    return model
Beispiel #3
0
def learn_unigram(data, verbose=True):
    """Fit a Unigram language model on ``data.train`` and return it.

    When ``verbose`` is true, the vocabulary size and the perplexity on
    ``data.train``, ``data.dev`` and ``data.test`` are printed, in that order.
    When ``verbose`` is false nothing is printed.
    """
    from lm import Unigram

    model = Unigram()
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Each split is looked up right before it is scored, one at a time.
    for label, split_name in (("train:", "train"), ("dev  :", "dev"), ("test :", "test")):
        print(label, model.perplexity(getattr(data, split_name)))
    return model
Beispiel #4
0
    incl_eos determines whether the space of words should include EOS or not.
    """
        wps = []
        tot = -np.inf  # this is the log (total mass)
        for w in self.lm.vocab():
            if not incl_eos and w == "END_OF_SENTENCE":
                continue
            lp = self.lm.cond_logprob(w, prev, 0)
            wps.append([w, lp / self.temp])
            tot = np.logaddexp2(lp / self.temp, tot)
        p = random.random()
        word = random.choice(wps)[0]
        s = -np.inf  # running mass
        for w, lp in wps:
            s = np.logaddexp2(s, lp)
            if p < pow(2, s - tot):
                word = w
                break
        return word


if __name__ == "__main__":
    # Smoke test: fit a unigram model on a one-sentence corpus, show the
    # fitted model, then print ten sentences sampled with an empty prefix.
    from lm import Unigram

    toy_corpus = [["sam", "i", "am"]]
    toy_lm = Unigram()
    toy_lm.fit_corpus(toy_corpus)
    print(toy_lm.model)

    demo_sampler = Sampler(toy_lm)
    for idx in range(10):
        sentence = demo_sampler.sample_sentence([])
        print(idx, ":", " ".join(str(tok) for tok in sentence))
Beispiel #5
0
from data import run_model
from lm import Unigram, NgramNoUnk, NgramUnk
import sys


def check_params(params):
  """Validate the command-line argument list, exiting on bad usage.

  ``params`` is an argv-style list: program name first, then the model kind.
  The arguments are accepted when the model kind is ``'unigram'``, or when it
  is ``'ngram'`` and at least two further arguments follow it. Otherwise a
  usage line is printed and the process exits via ``sys.exit(-1)``.
  """
  kind = params[1] if len(params) > 1 else None
  if kind == 'unigram':
    return
  # 'ngram' needs n and λ, i.e. at least four entries including the program name.
  if kind == 'ngram' and len(params) >= 4:
    return

  print(f'Usage: {params[0]} <unigram|ngram> [n λ [voc_ratio]]')
  sys.exit(-1)


if __name__ == '__main__':
  # Command-line driver: validate argv, then run the requested model.
  argv = sys.argv
  check_params(argv)

  kind = argv[1]
  if kind != 'unigram':
    # n-gram variants: argv[2] is n, argv[3] is λ, optional argv[4] is voc_ratio.
    n = int(argv[2])
    λ = float(argv[3])
    if len(argv) != 5:
      run_model(lambda: NgramNoUnk(n, λ), f'results/ngram_n={n}_l={λ}')
    else:
      # Exactly one extra argument selects the NgramUnk variant.
      voc_ratio = float(argv[4])
      run_model(lambda: NgramUnk(n, λ, voc_ratio), f'results/ngram_n={n}_l={λ}_voc={voc_ratio}')
  else:
    run_model(lambda: Unigram(), 'results/unigram')