Example #1
def train(train_file):
    """Return the required language models trained from a file."""
    unigram = NgramModel(1)
    bigram_left = NgramModel(2)   # left-to-right bigram model
    bigram_right = NgramModel(2)  # right-to-left bigram model
    for line in train_file:
        tokens = line.rstrip().split()
        unigram.update(tokens)
        bigram_left.update(tokens)
        bigram_right.update(reversed(tokens))  # reversed order trains the right-to-left model

    return (unigram, bigram_left, bigram_right)
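
A minimal usage sketch, assuming the NgramModel class above provides the update() interface used by train(); the corpus file name is hypothetical:

with open("corpus.txt", encoding="utf-8") as f:
    unigram, bigram_left, bigram_right = train(f)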
Example #2
def gen_disassociated_press(file=KJBIBLE, order=3, length=100):
    """Generate some autocorrelated text."""
    tokens = [k for k in tokenize(file) if k.isalpha()]
    model = NgramModel(order, tokens, MLEProbDist)
    ret = [''] * (order - 1)  # empty context to seed generation
    for i in range(length):
        tail = ret[-(order - 1):]  # last (order - 1) generated tokens as context
        ret.append(model.generate_one(tail))
    return ' '.join(ret[(order - 1):])
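
A hedged usage sketch; KJBIBLE, tokenize, NgramModel, and MLEProbDist are assumed to be defined or imported in the original module, and the word count is arbitrary:

print(gen_disassociated_press(order=3, length=60))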
Example #3
    def _make_models(self, tuples):
        self._word_ids = WordIdDictionary()

        # Extract sequences of words, lemmas, and tags
        words, lemmas, tags = [
            list(self._word_ids.add_words_transform(tokens))
            for tokens in zip(*tuples)
        ]
        self._tags = tags

        # Create models for words, lemmas, and tags
        self._words_ngram = NgramModel(words, self._n)
        self._lemmas_ngram = NgramModel(lemmas, self._n)
        self._tags_ngram = NgramModel(
            tags, 2 * self._n)  # Can afford to use 2 * n-gram size for grammar

        # Map each tag to its valid lemmas, and each (tag, lemma) pair to its vocabulary
        # (faster than filtering unigrams with a predicate during the backoff search)
        self._tag_lemmas = ConditionalFreqDist(zip(tags, lemmas))
        self._tag_lemma_words = ConditionalFreqDist(
            zip(zip(tags, lemmas), words))
Example #4
    def train_ngram(self, target_data_path, days, kakao_data_path):
        results = []
        ngram_model = NgramModel(self.dataLoader)
        self.dataLoader.ngram_data_loader(days)
        self.dataLoader.kakao_data_loader(kakao_data_path)
        target_datas = self.dataLoader.target_data_loader(target_data_path)
        target_data_len = len(target_datas)
        progress_step = max(target_data_len // 100, 1)  # integer step; avoids float modulo and division by zero
        for i, target_data in enumerate(target_datas):
            if i % progress_step == 0:
                print("{}% complete".format((i / target_data_len) * 100))
            recommended = ngram_model.detect_rule_recommend(target_data)
            results.append(target_data + " " + " ".join(recommended))
        self.dataLoader.write_result(self.write_file_path, results)
Example #5
def initialize_bot(chars, nicks):
    n = 3
#    intros = ["So", "Hi", "In fact", "For what it's worth", "Think about it",
#              "Conversely", "On the other hand", "Debatably", "Especially", "Not to mention", "Although", "Moreover", "Equally",
#              "But", "Yes",
#              "See here", "Ultimately", "Rather", "Nevertheless", "As you said", "Mind you", "Even so"]
    char_corps = load_corpora(chars)
    est = lambda fdist, bins: MLEProbDist(fdist)
#    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
#    est = lambda fdist, bins: WittenBellProbDist(fdist)
#    est = lambda fdist, bins: KneserNeyProbDist(fdist)
    models = {character: NgramModel(n, corp, estimator=est)
              for character, corp in char_corps.items()}
#    return ChatBot(chars, nicks, intros, models, ngram=n, debug=False)
    return ChatBot(chars, nicks, models, ngram=n, debug=False)
Example #6
        with open(file=path, mode="wb") as fp:
            fp.write(pickle.dumps(obj=params_dict))

    @classmethod
    def load(cls, path):
        """
        加载模型
        :param path: 保存路径
        :return:
        """
        params_dict = pickle.load(open(file=path, mode="rb"))
        lookup_table = params_dict['_lookup_table']
        ngram_model = pickle.loads(params_dict['_ngram_model_pickle'],
                                   fix_imports=True)
        return cls(ngram_model=ngram_model, lookup_table=lookup_table)
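
    # A minimal sketch of the save() counterpart whose tail appears at the top of
    # this excerpt, inferred from the keys that load() reads; the attribute names
    # self._lookup_table and self._ngram_model are assumptions, not the original code.
    def save(self, path):
        params_dict = {
            '_lookup_table': self._lookup_table,
            '_ngram_model_pickle': pickle.dumps(self._ngram_model),
        }
        with open(file=path, mode="wb") as fp:
            fp.write(pickle.dumps(obj=params_dict))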


if __name__ == '__main__':
    from nltk.text import Text
    from nltk.corpus import gutenberg

    text1 = Text(gutenberg.words('melville-moby_dick.txt'))
    #
    ngramCounter = NgramCounter(order=2, train=text1)
    ngramModel = NgramModel(ngram_counter=ngramCounter)

    corrector = NgramCorrector(ngram_model=ngramModel)
    print(corrector.correct(['I', 'dooo', 'think', 'you', 'rre', 'goooood']))
    corrector2 = NgramCorrector.load("123")
    print(corrector2.correct(['I', 'don', 'think', 'you', 'rre', 'goooood']))
Example #7
            test_text.extend(sentences)
        else:
            test_text.append(txt)

    #print(test_files)
    print(len(test_files))

    total_train_files = []
    TOTAL = INCREMENT
    UPPER_LIMIT = 500

    while len(total_train_files) < UPPER_LIMIT:
        total_train_files = train_files[:TOTAL]
        data_set_corpus = PlaintextCorpusReader(sys.argv[1], total_train_files)
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        lm = NgramModel(3, data_set_corpus.words(), estimator)
        #lm = NgramModel(2, data_set_corpus.words(), estimator)

        P = []
        for s in test_text:
            s_tokens = nltk.word_tokenize(s)
            if SENTENCE:
                #if len(s_tokens) > 3:
                if len(s_tokens) > 10:
                    p = lm.perplexity(s_tokens)
                    P.append(p)
            else:
                p = lm.perplexity(s_tokens)
                P.append(p)
        TOTAL += INCREMENT
Example #8
if __name__ == "__main__":
    if len(sys.argv) < 3:  # both <corpus-root> and <tweets-file> are required
        print("Usage: %s <corpus-root> <tweets-file>" % sys.argv[0])
        sys.exit(1)

    corpus_root = sys.argv[1]
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    ignored_words = nltk.corpus.stopwords.words('english')

    pos_movie_reviews = PlaintextCorpusReader(corpus_root + "/pos", ".*\.txt")
    neg_movie_reviews = PlaintextCorpusReader(corpus_root + "/neg", ".*\.txt")

    print "Corpora built."

    pos_unigram_lm = NgramModel(1, pos_movie_reviews.words(), estimator)
    print("Positive unigram model complete.")
    pos_bigram_lm = NgramModel(2, pos_movie_reviews.words(), estimator)
    print("Positive bigram model complete.")
    #pos_trigram_lm = NgramModel(3, pos_movie_reviews.words(), estimator)

    neg_unigram_lm = NgramModel(1, neg_movie_reviews.words(), estimator)
    print("Negative unigram model complete.")
    neg_bigram_lm = NgramModel(2, neg_movie_reviews.words(), estimator)
    print("Negative bigram model complete.")
    #neg_trigram_lm = NgramModel(3, neg_movie_reviews.words(), estimator)

    #read in the tweets
    tweets = []
    tokenizer = utils.Tokenizer()
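    # The excerpt ends before the tweets are scored. Purely as an illustrative
    # sketch (tokenizer.tokenize() and the decision rule below are assumptions,
    # not the original code), two such models can be compared by perplexity,
    # as in Example #7: the lower-perplexity model fits the text better.
    example_tokens = tokenizer.tokenize("a surprisingly good film")
    if pos_bigram_lm.perplexity(example_tokens) < neg_bigram_lm.perplexity(example_tokens):
        print("closer to the positive reviews")
    else:
        print("closer to the negative reviews")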
Example #9
def main():
    """
    Trains and evaluates neural
    language models on the Microsoft Sentence Completion
    Challenge dataset.

    Allowed cmd-line flags:
        -s TS_FILES : Uses the reduced trainset (TS_FILES trainset files)
        -o MIN_OCCUR : Only uses terms that occur MIN_OCCUR or more times
            in the trainset. Other terms are replaced with a special token.
        -f MIN_FILES : Only uses terms that occur in MIN_FILES or more files
            in the trainset. Other terms are replaced with a special token.
        -n : n-gram length (default 4)
        -t : Use tree-grams (default does not use tree-grams)
        -u FTRS : Features to use. FTRS must be a string composed of zeros
            and ones, of length 5. Ones indicate usage of following features:
            (word, lemma, google_pos, penn_pos, dependency_type), respectively.

    Neural-net specific cmd-line flags:
        -ep EPOCHS : Number of training epochs, defaults to 20.
        -eps EPS : Learning rate, defaults to 0.002.
        -mnb MNB_SIZE : Size of the minibatch, defaults to 2000.

    """
    logging.basicConfig(level=logging.INFO)
    log.info("Evaluating model")

    #   get the data handling parameters
    ts_reduction = util.argv('-s', None, int)
    min_occ = util.argv('-o', 5, int)
    min_files = util.argv('-f', 2, int)
    n = util.argv('-n', 4, int)
    use_tree = '-t' in sys.argv
    bool_format = lambda s: s.lower() in ["1", "true", "yes", "t", "y"]
    ft_format = lambda s: [bool_format(c) for c in s]
    ftr_use = np.array(util.argv('-u', ft_format("001000"), ft_format))
    val_per_epoch = util.argv('-v', 10, int)

    #   nnets only support one-feature ngrams
    assert ftr_use.sum() == 1

    #   get nnet training parameters
    use_lbl = '-l' in sys.argv
    epochs = util.argv('-ep', 20, int)
    eps = util.argv('-eps', 0.002, float)
    mnb_size = util.argv('-mnb', 2000, int)
    n_hid = util.argv('-h', 1000, int)
    d = util.argv('-d', 100, int)

    #   load data
    ngrams, q_groups, answers, feature_sizes = data.load_ngrams(
        n, ftr_use, use_tree, subset=ts_reduction,
        min_occ=min_occ, min_files=min_files)
    used_ftr_sizes = feature_sizes[ftr_use]
    #   remember, we only use one feature
    vocab_size = used_ftr_sizes[0]
    log.info("Data loaded, %d ngrams", ngrams.shape[0])

    #   split data into sets
    x_train, x_valid, x_test = util.dataset_split(ngrams, 0.05, 0.05, rng=456)

    #   generate a version of the validation set that has
    #   the first term (the conditioned one) randomized
    #   w.r.t. unigram distribution
    #   so first create the unigram distribution, no smoothing
    unigrams_data = data.load_ngrams(1, ftr_use, False, subset=ts_reduction,
                                     min_occ=min_occ, min_files=min_files)[0]
    unigrams_data = NgramModel(1, False, ftr_use, feature_sizes, ts_reduction,
                               min_occ, min_files, 0.0, 0.0, unigrams_data)
    unigrams_dist = unigrams_data.probability_additive(
        np.arange(vocab_size).reshape(vocab_size, 1))
    unigrams_dist /= unigrams_dist.sum()
    #   finally, generate validation sets with randomized term
    x_valid_r = random_ngrams(x_valid, vocab_size, False, unigrams_dist)

    #   the directory for this model
    dir = "%s_%s_%d-gram_features-%s_data-subset_%r-min_occ_%r-min_files_%r"\
        % ("llbl" if use_lbl else "lmlp",
            "tree" if use_tree else "linear", n,
            "".join([str(int(b)) for b in ftr_use]),
            ts_reduction, min_occ, min_files)
    dir = os.path.join(_DIR, dir)
    if not os.path.exists(dir):
        os.makedirs(dir)

    #   filename base for this model
    file = "nhid-%d_d-%d_train_mnb-%d_epochs-%d_eps-%.5f" % (
        n_hid, d, mnb_size, epochs, eps)

    #   store the logs
    if False:
        log_file_handler = logging.FileHandler(
            os.path.join(dir, file + ".log"))
        log_file_handler.setLevel(logging.INFO)
        logging.root.addHandler(log_file_handler)

    #   we will plot log-lik ratios for every _VALIDATE_MNB minibatches
    #   we will also plot true mean log-lik
    valid_on = {"x_valid": x_valid[:_LL_SIZE], "x_valid_r": x_valid_r[
        :_LL_SIZE], "x_train": x_train[:_LL_SIZE]}
    valid_ll = {k: [] for k in valid_on.keys()}
    valid_p_mean = {k: [] for k in valid_on.keys()}

    #   how often we validate
    mnb_count = (x_train.shape[0] - 1) // mnb_size + 1
    _VALIDATE_MNB = mnb_count // val_per_epoch

    def mnb_callback(net, epoch, mnb):
        """
        Callback function called after every minibatch.
        """
        if (mnb + 1) % _VALIDATE_MNB:
            return

        #   calculate log likelihood using the exact probability
        probability_f = theano.function([net.input], net.probability)
        for name, valid_set in valid_on.items():
            p = probability_f(valid_set)
            valid_ll[name].append(np.log(p).mean())
            valid_p_mean[name].append(p.mean())

        log.info('Epoch %d, mnb: %d, x_valid mean-log-lik: %.5f'
                 ' , x_valid p-mean: %.5f'
                 ' , mean ln(p(x_valid) / p(x_valid_r)): %.5f',
                 epoch, mnb, valid_ll["x_valid"][-1],
                 valid_p_mean["x_valid"][-1],
                 valid_ll["x_valid"][-1] - valid_ll["x_valid_r"][-1])

    #   track if the model progresses on the sentence completion challenge
    # sent_challenge = []

    def epoch_callback(net, epoch):

        #   log some info about the parameters, just so we know
        param_mean_std = [(k, v.mean(), v.std())
                          for k, v in net.params().items()]
        log.info("Epoch %d: %s", epoch, "".join(
            ["\n\t%s: %.5f +- %.5f" % pms for pms in param_mean_std]))

        #   evaluate model on the sentence completion challenge
        # probability_f = theano.function([net.input], net.probability)
        # qg_log_lik = [[np.log(probability_f(q)).sum() for q in q_g]
        #               for q_g in q_groups]
        # predictions = map(lambda q_g: np.argmax(q_g), qg_log_lik)
        # sent_challenge.append((np.array(predictions) == answers).mean())
        # log.info('Epoch %d sentence completion eval score: %.4f',
        #          epoch, sent_challenge[-1])

    log.info("Creating model")
    if use_lbl:
        net = LLBL(n, vocab_size, d, 12345)
    else:
        net = LMLP(n, vocab_size, d, 12345)
    net.mnb_callback = mnb_callback
    net.epoch_callback = epoch_callback
    train_cost, valid_cost, _ = net.train(
        x_train, x_valid, mnb_size, epochs, eps)

    #   plot training progress info
    #   first we need values for the x-axis (minibatch count)
    mnb_count = (x_train.shape[0] - 1) // mnb_size + 1
    mnb_valid_ep = mnb_count // _VALIDATE_MNB
    x_axis_mnb = np.tile((np.arange(mnb_valid_ep) + 1) * _VALIDATE_MNB, epochs)
    x_axis_mnb += np.repeat(np.arange(epochs) * mnb_count, mnb_valid_ep)
    x_axis_mnb = np.hstack(([0], x_axis_mnb))

    plt.figure(figsize=(16, 12))
    plt.subplot(221)
    plt.plot(mnb_count * (np.arange(epochs) + 1), train_cost, 'b-',
             label='train')
    plt.plot(mnb_count * (np.arange(epochs) + 1), valid_cost, 'g-',
             label='valid')
    plt.axhline(min(valid_cost), linestyle='--', color='g')
    plt.yticks(list(plt.yticks()[0]) + [min(valid_cost)])
    plt.title('cost')
    plt.grid()
    plt.legend(loc=1)

    plt.subplot(222)
    for name, valid_set in valid_ll.items():
        plt.plot(x_axis_mnb, valid_set, label=name)
    plt.ylim((np.log(0.5 / vocab_size),
              max([max(v) for v in valid_ll.values()]) + 0.5))
    plt.axhline(max(valid_ll["x_valid"]), linestyle='--', color='g')
    plt.yticks(list(plt.yticks()[0]) + [max(valid_ll["x_valid"])])
    plt.title('log-likelihood(x)')
    plt.grid()
    plt.legend(loc=4)

    plt.subplot(224)
    for name, valid_set in valid_p_mean.items():
        plt.plot(x_axis_mnb, valid_set, label=name)
    plt.title('p(x).mean()')
    plt.grid()
    plt.legend(loc=4)

    # plt.subplot(224)
    # plt.plot(mnb_count * np.arange(epochs + 1), sent_challenge, 'g-')
    # plt.title('sent_challenge')
    # plt.grid()

    plt.savefig(os.path.join(dir, file + ".pdf"))