Beispiel #1
0
def learn_trigram(data, verbose=True):
    """Fit a ``Trigram`` model on ``data.train`` and optionally report on it.

    When ``verbose`` is true, prints the vocabulary size, the perplexity on
    ``data.train``, ``data.dev`` and ``data.test``, and two sentences sampled
    from the fitted model with fixed two-word prefixes.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        verbose: Whether to print the evaluation report and the samples.

    Returns:
        The fitted ``Trigram`` instance.
    """
    from lm import Trigram

    model = Trigram()
    model.fit_corpus(data.train)

    # Reuters only: uncomment to collect the most frequent trigrams from the
    # validation set.
    # model.frequent_trigrams(data.dev)

    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model)
    seed_prefixes = (["my", "dog"], ["good", "morning"])
    for index, seed in enumerate(seed_prefixes, start=1):
        words = sentence_sampler.sample_sentence(seed)
        print(f"sample {index}: ", " ".join(str(w) for w in words))
    return model
Beispiel #2
0
def learn_bigram(data):
    """Fit a ``Bigram`` model on ``data.train`` and print a short report.

    Prints the vocabulary size, the perplexity on ``data.train``,
    ``data.dev`` and ``data.test``, and three sentences sampled with the
    prefix ``['START_OF_SENTENCE']``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.

    Returns:
        The fitted ``Bigram`` instance.
    """
    from lm import Bigram

    model = Bigram()
    model.fit_corpus(data.train)

    print("vocab:", len(model.vocab()))
    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model)
    for _ in range(3):
        # A fresh prefix list is built for every draw.
        words = sentence_sampler.sample_sentence(['START_OF_SENTENCE'])
        print("sample: ", " ".join(str(w) for w in words))
    return model
Beispiel #3
0
def search_ngram(data, unk_prob, smooth, n=3, backoff=False):
    """Grid-search ``Ngram`` settings and keep the best model by dev perplexity.

    For every combination of ``unk_prob`` and ``smooth`` values an
    ``Ngram(up, n, s, backoff)`` is fitted on ``data.train`` and scored with
    ``perplexity(data.dev)``; the model with the strictly lowest score wins.
    The winner's vocabulary size, its perplexity on train/dev/test and three
    sampled sentences are then printed.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        unk_prob: Iterable of candidate values for the first ``Ngram`` argument.
        smooth: Iterable of candidate values for the third ``Ngram`` argument.
            It is materialised once so that a one-shot iterable (e.g. a
            generator) is reused for every ``unk_prob`` value.
        n: Second positional argument forwarded to ``Ngram``.
        backoff: Fourth positional argument forwarded to ``Ngram``.

    Returns:
        The best model found. If no candidate scores below infinity (for
        example, an empty grid), a default, never-fitted ``Ngram()`` is used,
        as before.
    """
    from lm import Ngram

    # Materialise the inner grid: iterating a generator inside the outer loop
    # would otherwise leave nothing to try after the first unk_prob value.
    smooth_values = list(smooth)

    best_perplexity = float("inf")
    best_model = None
    for up in unk_prob:
        print("Trying unk_prob {}".format(up))
        for s in smooth_values:
            candidate = Ngram(up, n, s, backoff)
            candidate.fit_corpus(data.train)
            candidate_perplexity = candidate.perplexity(data.dev)
            if candidate_perplexity < best_perplexity:
                best_perplexity = candidate_perplexity
                best_model = candidate

    if best_model is None:
        # Keep the historical fallback, but only build it when it is needed.
        best_model = Ngram()

    print("vocab:", len(best_model.vocab()))
    # evaluate on train, test, and dev
    print("train:", best_model.perplexity(data.train))
    print("dev  :", best_model.perplexity(data.dev))
    print("test :", best_model.perplexity(data.test))

    from generator import Sampler
    sampler = Sampler(best_model)
    for _ in range(3):
        print("sample: ",
              " ".join(str(x) for x in sampler.sample_sentence([])))
    return best_model
def learn_bigram(data,
                 print_info=False,
                 sample_sentence=False,
                 gamma=0,
                 smooth=1):
    """Fit an ``lm.Bigram`` on ``data.train`` with optional reporting.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        print_info: When true, print the vocabulary size and the perplexity
            on train, dev and test.
        sample_sentence: When true, print three sentences sampled from the
            fitted model with an empty prefix.
        gamma: Forwarded to ``lm.Bigram`` as ``gamma``.
        smooth: Forwarded to ``lm.Bigram`` as ``smooth``.

    Returns:
        The fitted ``lm.Bigram`` instance.
    """
    model = lm.Bigram(gamma=gamma, smooth=smooth)
    model.fit_corpus(data.train)

    if print_info:
        print("vocab:", len(model.vocab()))
        # Evaluate on train, dev and test, in that order.
        for label, split in (("train:", "train"), ("dev  :", "dev"),
                             ("test :", "test")):
            print(label, model.perplexity(getattr(data, split)))

    if sample_sentence:
        from generator import Sampler
        sentence_sampler = Sampler(model)
        for _ in range(3):
            words = sentence_sampler.sample_sentence([])
            print("sample: ", " ".join(str(w) for w in words))

    return model
Beispiel #5
0
def learn_ngram(data, hyperp_set=None):
    """Fit an ``Ngram(comb=3, ...)`` model on ``data.train`` and report on it.

    Prints the vocabulary size, the perplexity on ``data.train``,
    ``data.dev`` and ``data.test``, and three sentences sampled with the
    prefix ``['The']``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        hyperp_set: Optional mapping of hyperparameters. Only its FIRST key
            is inspected: ``'lamb'`` is printed and forwarded as ``lamb``,
            ``'gamma'`` is forwarded as ``gamma``, and any other key is
            ignored. ``None`` or an empty mapping selects the defaults.

    Returns:
        The fitted ``Ngram`` instance.
    """
    from lm import Ngram
    # from lm import Ngram_baseline

    ngram_kwargs = {"comb": 3}
    if hyperp_set:
        # Historical contract: only the first key of the mapping is honoured.
        first_key = next(iter(hyperp_set))
        if first_key == 'lamb':
            print(hyperp_set[first_key])
            ngram_kwargs["lamb"] = hyperp_set[first_key]
        elif first_key == 'gamma':
            ngram_kwargs["gamma"] = hyperp_set[first_key]
    ngram = Ngram(**ngram_kwargs)

    ngram.fit_corpus(data.train)
    print("vocab:", len(ngram.vocab()))
    # evaluate on train, test, and dev
    print("train:", ngram.perplexity(data.train))
    print("dev  :", ngram.perplexity(data.dev))
    print("test :", ngram.perplexity(data.test))

    from generator import Sampler
    sampler = Sampler(ngram)
    for _ in range(3):
        print("sample: ",
              " ".join(str(x) for x in sampler.sample_sentence(['The'])))
    return ngram
def learn_trigram(data, delta=1/2**(15), smoothing=True, verbose=True):
    """Fit a ``Trigram`` model on ``data.train`` and optionally report on it.

    The model is built as ``Trigram(backoff=0.000001, delta=delta,
    smoothing=smoothing)``. When ``verbose`` is true, prints the vocabulary
    size, the perplexity on train, dev and test, and ten sentences sampled
    from a fixed list of prefixes.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        delta: Forwarded to ``Trigram`` as ``delta``.
        smoothing: Forwarded to ``Trigram`` as ``smoothing``.
        verbose: Whether to print the evaluation report and the samples.

    Returns:
        The fitted ``Trigram`` instance.
    """
    from lm import Trigram

    model = Trigram(backoff=0.000001, delta=delta, smoothing=smoothing)
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model)
    seed_prefixes = (
        ['The', 'president'],
        ['This', 'university'],
        ['The', 'United', 'States'],
        ['An', 'explosion'],
        ['To', 'be', 'or', 'to'],
        ['This', 'is', 'awesome'],
        ['I', 'am', 'sorry'],
        ['Today', 'the', 'chair', 'of'],
        ['Hello', 'I', 'came', 'from'],
        ['I', 'major', 'in', 'Computer', 'Science'],
    )
    for index, seed in enumerate(seed_prefixes, start=1):
        words = sentence_sampler.sample_sentence(seed)
        print(f"sample {index}: ", " ".join(str(w) for w in words))
    return model
Beispiel #7
0
def learn_backoff_trigram(data, unk_prob=.0001, smooth=1e-7):
    """Fit a ``BackoffTrigram`` on ``data.train`` and print a short report.

    Prints the vocabulary size, the perplexity on train, dev and test, and
    three sentences sampled from the fitted model.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        unk_prob: First positional argument forwarded to ``BackoffTrigram``.
        smooth: Second positional argument forwarded to ``BackoffTrigram``.

    Returns:
        The fitted ``BackoffTrigram`` instance.
    """
    from lm import BackoffTrigram

    model = BackoffTrigram(unk_prob, smooth)
    model.fit_corpus(data.train)

    print("vocab:", len(model.vocab()))
    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model)
    # The very same list object is handed to every draw.
    shared_prefix = []
    for _ in range(3):
        words = sentence_sampler.sample_sentence(shared_prefix)
        print("sample: ", " ".join(str(w) for w in words))
    return model
Beispiel #8
0
def learn_ngram(data, n=1, lower=True):
    """Fit an ``Ngram(n=n, lower=lower)`` on ``data.train`` and report on it.

    Prints the vocabulary size, the perplexity on train, dev and test, and
    three sentences sampled with an empty prefix.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        n: Forwarded to ``Ngram`` as ``n``.
        lower: Forwarded to ``Ngram`` as ``lower``.

    Returns:
        The fitted ``Ngram`` instance.
    """
    from lm import Ngram

    model = Ngram(n=n, lower=lower)
    model.fit_corpus(data.train)

    print("vocab:", len(model.vocab()))
    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model)
    for _ in range(3):
        words = sentence_sampler.sample_sentence([])
        print("sample: ", " ".join(str(w) for w in words))
    return model
def learn_unigram(data, verbose=True):
    """Fit a ``Unigram`` model on ``data.train`` and optionally report on it.

    When ``verbose`` is true, prints the vocabulary size, the perplexity on
    train, dev and test, and two sentences sampled with the prefixes
    ``['The']`` and ``['They']``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        verbose: Whether to print the evaluation report and the samples.

    Returns:
        The fitted ``Unigram`` instance.
    """
    from lm import Unigram

    model = Unigram()
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model)
    for index, seed in enumerate((['The'], ['They']), start=1):
        words = sentence_sampler.sample_sentence(seed)
        print(f"sample {index}: ", " ".join(str(w) for w in words))
    return model
Beispiel #10
0
def generate(lm):
    """Render the sentence-generation controls and write sampled sentences.

    Collects a prefix, a temperature, a maximum length and a sample count
    through ``st`` widgets, builds ``Sampler(lm, temp)``, and, once the
    "Generate" button reports a click, writes one sampled sentence per
    requested sample (numbered from 0).

    NOTE(review): ``st`` is presumably the Streamlit module and ``Sampler``
    the project's sampler class; both come from the enclosing module and
    are not visible here - confirm.

    Args:
        lm: Language model handed to ``Sampler``.
    """
    st.subheader("Language Model Generation")
    prefix = st.text_input("Enter a prefix (default: no prefix)")
    temp = st.number_input("Enter a temperature", value=1.0)
    max_length = st.slider("Maximum length of generated sentences", 10, 50)
    num_samples = st.slider("Number of sentences to generate", 1, 10)
    sampler = Sampler(lm, temp)

    if not st.button("Generate"):
        return

    for i in range(num_samples):
        # The prefix text is re-split into tokens for each sentence.
        tokens = sampler.sample_sentence(prefix.split(), max_length)
        generated_text = " ".join(tokens)
        st.write(f"{i}) {generated_text}\n")
Beispiel #11
0
def learn_trigram(data, delta=1 / 2**(15), smoothing=True, verbose=True):
    """Fit a ``Trigram`` model on ``data.train`` and optionally report on it.

    The model is built as ``Trigram(backoff=0.000001, delta=delta,
    smoothing=smoothing)``. When ``verbose`` is true, prints the vocabulary
    size, the perplexity on train, dev and test, and two sentences sampled
    with an empty prefix.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        delta: Forwarded to ``Trigram`` as ``delta``.
        smoothing: Forwarded to ``Trigram`` as ``smoothing``.
        verbose: Whether to print the evaluation report and the samples.

    Returns:
        The fitted ``Trigram`` instance.
    """
    from lm import Trigram

    model = Trigram(backoff=0.000001, delta=delta, smoothing=smoothing)
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model)
    for index in (1, 2):
        words = sentence_sampler.sample_sentence([])
        print(f"sample {index}: ", " ".join(str(w) for w in words))
    return model
def learn_trigram(data, verbose=True):
    """Fit a ``Trigram`` model on ``data.train`` and optionally report on it.

    When ``verbose`` is true, prints the vocabulary size, the perplexity on
    train, dev and test, and two sentences sampled with the prefixes
    ``['The']`` and ``["They"]``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        verbose: Whether to print the evaluation report and the samples.

    Returns:
        The fitted ``Trigram`` instance.
    """
    from lm import Trigram

    model = Trigram()
    model.fit_corpus(data.train)
    # Disabled: tuning via model.findLamdas(data.dev) when
    # model.normMeth == "interpol".
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model)
    for index, seed in enumerate((['The'], ["They"]), start=1):
        words = sentence_sampler.sample_sentence(seed)
        print(f"sample {index}: ", " ".join(str(w) for w in words))
    return model
Beispiel #13
0
def learn_trigram(data):
    """Fit a ``Trigram`` model on ``data.train`` and print a short report.

    Prints the vocabulary size, the perplexity on ``data.train``,
    ``data.dev`` and ``data.test``, and two sentences sampled with an empty
    prefix.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.

    Returns:
        The fitted ``Trigram`` instance.
    """
    from lm import Trigram
    trigram = Trigram()
    trigram.fit_corpus(data.train)
    print("vocab:", len(trigram.vocab()))
    # evaluate on train, test, and dev
    print("train:", trigram.perplexity(data.train))
    print("dev  :", trigram.perplexity(data.dev))
    print("test :", trigram.perplexity(data.test))
    from generator import Sampler
    sampler = Sampler(trigram)
    print("sample 1: ", " ".join(str(x) for x in sampler.sample_sentence([])))
    print("sample 2: ", " ".join(str(x) for x in sampler.sample_sentence([])))
    # Return the model that was actually fitted; the previous
    # `return unigram` referenced a name that is never defined here.
    return trigram
def learn_trigram(data, alpha, sampler=0, verbose=True):
    """Fit a ``Trigram`` model on ``data.train`` and optionally report on it.

    When ``verbose`` is true, prints ``num_words`` of the model and the
    result of ``perplexity(corpus, alpha, 1)`` for train, dev and test. If,
    additionally, ``sampler == 1``, two sentences sampled with an empty
    prefix are printed.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        alpha: Second argument forwarded to every ``perplexity`` call.
        sampler: Flag; sampling happens only when it equals ``1`` and
            ``verbose`` is true.
        verbose: Whether to print anything at all.

    Returns:
        The fitted ``Trigram`` instance.
    """
    from lm import Trigram

    model = Trigram()
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", model.num_words)

    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split), alpha, 1))

    if sampler == 1:
        from generator import Sampler
        sentence_sampler = Sampler(model)
        for index in (1, 2):
            words = sentence_sampler.sample_sentence([])
            print(f"sample {index}: ", " ".join(str(w) for w in words))
    return model
def learn_trigram(data, thres=4, verbose=True):
    """Fit a ``Trigram(thres)`` model on ``data.train`` and optionally report.

    When ``verbose`` is true, prints the vocabulary size, the perplexity on
    train, dev and test, and two sentences drawn from a
    ``Sampler(model, temp=0.25)`` with the prefixes ``['With']`` and
    ``['Next']``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        thres: Sole positional argument forwarded to ``Trigram``.
        verbose: Whether to print the evaluation report and the samples.

    Returns:
        The fitted ``Trigram`` instance.
    """
    from lm import Trigram

    model = Trigram(thres)
    model.fit_corpus(data.train)
    if not verbose:
        return model

    print("vocab:", len(model.vocab()))
    # Debug aid: model.print_keys()

    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model, temp=0.25)
    for index, seed in enumerate((['With'], ['Next']), start=1):
        words = sentence_sampler.sample_sentence(seed)
        print(f"sample {index}: ", " ".join(str(w) for w in words))
    return model
Beispiel #16
0
def learn_unigram(data):
    """Fit a ``Unigram`` model on ``data.train`` and print a short report.

    Prints the vocabulary size, the perplexity on train, dev and test, and
    two sentences sampled with an empty prefix and ``max_length=20``.

    Args:
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.

    Returns:
        The fitted ``Unigram`` instance.
    """
    from lm import Unigram

    model = Unigram()
    model.fit_corpus(data.train)

    print("vocab:", len(model.vocab()))
    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    from generator import Sampler
    sentence_sampler = Sampler(model)
    draws_left = 2
    while draws_left > 0:
        words = sentence_sampler.sample_sentence([], max_length=20)
        print("sample: ", " ".join(str(w) for w in words))
        draws_left -= 1
    return model
Beispiel #17
0
def learn(model, data, run_sampler=True):
    """Fit ``model`` on ``data.train`` and print a short report.

    Prints the vocabulary size and the perplexity on ``data.train``,
    ``data.dev`` and ``data.test``. When ``run_sampler`` is true, also prints
    two sentences sampled with an empty prefix and ``max_length=20``.

    Args:
        model: Object providing ``fit_corpus``, ``vocab`` and ``perplexity``.
        data: Object exposing ``train``, ``dev`` and ``test`` corpora.
        run_sampler: Whether to sample and print sentences from the model.

    Returns:
        The same ``model`` object, after fitting.
    """
    model.fit_corpus(data.train)
    print("vocab:", len(model.vocab()))

    # Evaluate on train, dev and test, in that order.
    for label, split in (("train:", "train"), ("dev  :", "dev"),
                         ("test :", "test")):
        print(label, model.perplexity(getattr(data, split)))

    if not run_sampler:
        return model

    from generator import Sampler
    sentence_sampler = Sampler(model)
    for _ in range(2):
        words = sentence_sampler.sample_sentence([], max_length=20)
        print("sample: ", " ".join(str(w) for w in words))
    return model