Example #1
0
def sanity_check_run():
    """Smoke-test a fresh StreamPredictor on a short fixed sentence.

    Passes the same sample text to ``train_characters`` four times, prints
    whatever ``generate_stream(5)`` returns, and finally prints a
    confirmation message.
    """
    predictor = stream_predictor.StreamPredictor()
    sample = 'hahaha this is a sanity check, just checking some text'
    # Same text, four passes -- identical to four back-to-back calls.
    for _ in range(4):
        predictor.train_characters(sample)
    generated = predictor.generate_stream(5)
    print(generated)
    print('Everything OK')
Example #2
0
def load_sp(storage_file):
    """Return a StreamPredictor, loaded from ``storage_file`` when it exists.

    Args:
        storage_file: Path checked with ``os.path.isfile``; when it is an
            existing file it is handed to ``file_manager.load_tsv``.

    Returns:
        The StreamPredictor instance (freshly constructed, and loaded via
        ``load_tsv`` only if the file was found).
    """
    predictor = stream_predictor.StreamPredictor()

    # Guard clause: nothing on disk, so hand back the fresh instance.
    if not os.path.isfile(storage_file):
        print(' Created new PopManager.PopManager. Didnt find anything at ',
              storage_file)
        return predictor

    predictor.file_manager.load_tsv(storage_file)
    print('Loaded PopManager.PopManager from ', storage_file)
    return predictor
Example #3
0
def perplexity_experiment_load(string):
    """Plot perplexity of a stored model on the tail of ``string``.

    The text is tokenized with ``nltk.word_tokenize``; the tokens from
    index ``int(0.99 * token_count)`` onward are passed to
    ``pop_manager.calculate_perplexity`` of a StreamPredictor loaded from
    '../PatternStore/pride_token.txt', and the result is plotted.

    Args:
        string: Raw text to tokenize.
    """
    tokens = nltk.word_tokenize(string)
    # Everything from the 99% mark onward is used as the evaluation slice.
    split_at = int(0.99 * len(tokens))
    held_out = tokens[split_at:]

    predictor = stream_predictor.StreamPredictor()
    predictor.file_manager.load_pb_plain('../PatternStore/pride_token.txt')

    plt.plot(predictor.pop_manager.calculate_perplexity(held_out))
    plt.show()
Example #4
0
def train_and_perplexity(input_text_file):
    """Train on the words of a text file and plot perplexity over training.

    Words are fetched with ``data_fetcher.get_clean_words_from_file`` (with
    a limit argument of 10**9), passed to ``Trainer.train`` together with a
    new StreamPredictor, and then to
    ``pop_manager.train_token_and_perplexity``, whose two returned
    sequences are plotted against each other.

    Args:
        input_text_file: Path of the text file to read words from.
    """
    tokens = data_fetcher.get_clean_words_from_file(input_text_file, 10**9)
    predictor = stream_predictor.StreamPredictor()
    Trainer.train(words=tokens, streampredictor=predictor)

    results = predictor.pop_manager.train_token_and_perplexity(tokens)
    perplexities, steps = results

    plt.plot(steps, perplexities)
    plt.title('Perplexity during training')
    plt.xlabel('Time')
    plt.ylabel('Perplexity')
    plt.show()
Example #5
0
def online_token_perplexity_trainer():
    """Run ten rounds of online token training, plotting and saving each.

    Uses the module-level ``storage_file`` path: if it exists the model is
    loaded from it with ``load_pb``, and after every round the model is
    written back with ``save_pb``. Each round fetches words via
    ``data_fetcher.get_online_words(10**10)``, trains with
    ``train_token_and_perplexity``, plots the returned series, and prints
    the elapsed minutes and a thousands-of-words-per-minute rate.
    """
    print('Starting online training with tokens and perplexity calculation')
    predictor = stream_predictor.StreamPredictor()
    found_store = os.path.isfile(storage_file)
    if found_store:
        predictor.file_manager.load_pb(storage_file)
        print('Loaded PopManager.PopManager from ', storage_file)
    else:
        print(' Created new PopManager.PopManager. Didnt find anything at ',
              storage_file)

    round_no = 0
    while round_no < 10:
        began = time.time()
        print('Iteration number ' + str(round_no))
        tokens = data_fetcher.get_online_words(10**10)
        outcome = predictor.pop_manager.train_token_and_perplexity(tokens)
        perplexities, stamps = outcome
        plt.plot(stamps, perplexities, 'd-')
        plt.show()
        predictor.file_manager.save_pb(storage_file)

        # Timing covers fetch, training, plotting and saving for this round.
        elapsed_mins = (time.time() - began) / 60
        kwords_per_min = round(len(tokens) / elapsed_mins / 1000)
        print('Total time taken to run this is ',
              round(elapsed_mins, ndigits=2),
              ' mins. Rate = ',
              kwords_per_min,
              ' K words/min')
        round_no += 1
Example #6
0
def generalize_token():
    """Train on '../Data/pride.txt', generalize, and save the result.

    Fetches words with ``data_fetcher.get_clean_words_from_file`` (limit
    argument 10**9), calls ``pop_manager.train_token`` on them, runs
    ``generalizer.generalize()``, and writes the model with
    ``save_pb_plain`` to '../PatternStore/pride_generalized.txt'.
    """
    source_path = '../Data/pride.txt'
    target_path = '../PatternStore/pride_generalized.txt'

    predictor = stream_predictor.StreamPredictor()
    tokens = data_fetcher.get_clean_words_from_file(source_path, 10**9)
    predictor.pop_manager.train_token(tokens)
    predictor.generalizer.generalize()
    predictor.file_manager.save_pb_plain(target_path)