def sanity_check_run(repetitions=4):
    """Quick smoke test: train a StreamPredictor on a short sample string and
    generate a small stream to confirm the pipeline runs end to end.

    Args:
        repetitions: how many times the sample text is re-trained. Defaults
            to 4, matching the original hard-coded repeat count.
    """
    sp = stream_predictor.StreamPredictor()
    text = 'hahaha this is a sanity check, just checking some text'
    # Repeated passes over the same text reinforce the learned patterns;
    # the original code spelled out four identical calls.
    for _ in range(repetitions):
        sp.train_characters(text)
    print(sp.generate_stream(5))
    print('Everything OK')
def load_sp(storage_file):
    """Return a StreamPredictor, restoring saved state from ``storage_file``
    when that file exists; otherwise return a freshly constructed one."""
    sp = stream_predictor.StreamPredictor()
    # Guard clause: nothing to restore, hand back the new predictor.
    if not os.path.isfile(storage_file):
        print(' Created new PopManager.PopManager. Didnt find anything at ', storage_file)
        return sp
    sp.file_manager.load_tsv(storage_file)
    print('Loaded PopManager.PopManager from ', storage_file)
    return sp
def perplexity_experiment_load(string):
    """Load a pre-trained pattern store and plot per-word perplexity over the
    final 1% of tokens of ``string`` (the held-out evaluation slice)."""
    tokens = nltk.word_tokenize(string)
    # Everything past the 99% mark is the evaluation set.
    split_index = int(0.99 * len(tokens))
    held_out = tokens[split_index:]
    sp = stream_predictor.StreamPredictor()
    sp.file_manager.load_pb_plain('../PatternStore/pride_token.txt')
    perplexities = sp.pop_manager.calculate_perplexity(held_out)
    plt.plot(perplexities)
    plt.show()
def train_and_perplexity(input_text_file):
    """Train a StreamPredictor on the words of ``input_text_file`` and plot
    how perplexity evolves during token training."""
    # Effectively unbounded read limit — take the whole file.
    input_limit = 10 ** 9
    words = data_fetcher.get_clean_words_from_file(input_text_file, input_limit)
    sp = stream_predictor.StreamPredictor()
    Trainer.train(words=words, streampredictor=sp)
    perplexities, iterations = sp.pop_manager.train_token_and_perplexity(words)
    plt.plot(iterations, perplexities)
    plt.xlabel('Time')
    plt.ylabel('Perplexity')
    plt.title('Perplexity during training')
    plt.show()
def online_token_perplexity_trainer():
    """Run ten rounds of online token training, plotting perplexity and
    persisting the model after each round.

    NOTE(review): ``storage_file`` is read from module scope here (unlike
    ``load_sp``, which takes it as a parameter) — confirm a module-level
    default is defined.
    """
    print('Starting online training with tokens and perplexity calculation')
    sp = stream_predictor.StreamPredictor()
    if os.path.isfile(storage_file):
        sp.file_manager.load_pb(storage_file)
        print('Loaded PopManager.PopManager from ', storage_file)
    else:
        print(' Created new PopManager.PopManager. Didnt find anything at ', storage_file)
    for round_number in range(10):
        round_start = time.time()
        print('Iteration number ' + str(round_number))
        words = data_fetcher.get_online_words(10 ** 10)
        perplexity_over_training, training_time = sp.pop_manager.train_token_and_perplexity(words)
        plt.plot(training_time, perplexity_over_training, 'd-')
        plt.show()
        # Persist progress after every round so a crash loses at most one round.
        sp.file_manager.save_pb(storage_file)
        elapsed_minutes = (time.time() - round_start) / 60
        throughput_k_words = round(len(words) / elapsed_minutes / 1000)
        print('Total time taken to run this is ', round(elapsed_minutes, ndigits=2),
              ' mins. Rate = ', throughput_k_words, ' K words/min')
def generalize_token():
    """Train token patterns on the Pride & Prejudice corpus, run the
    generalizer over them, and save the result as a plain pattern store."""
    sp = stream_predictor.StreamPredictor()
    corpus_words = data_fetcher.get_clean_words_from_file('../Data/pride.txt', 10 ** 9)
    sp.pop_manager.train_token(corpus_words)
    sp.generalizer.generalize()
    sp.file_manager.save_pb_plain('../PatternStore/pride_generalized.txt')