import argparse
import codecs

import lasagne
import numpy as np

import utils

# translate_romanized() and test() are helper functions defined elsewhere in this module.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--hdim', default=512, type=int)
    parser.add_argument('--seq_len', default=40, type=int)
    parser.add_argument('--model', default=None)
    parser.add_argument('--depth', default=1, type=int)
    parser.add_argument('--translit_path', default=None)
    parser.add_argument('--language', default=None)
    args = parser.parse_args()

    print("Loading Files")
    (char_to_index, index_to_char, vocab_size,
     trans_to_index, index_to_trans, trans_vocab_size) = utils.load_vocabulary(language=args.language)
    (test_text, trans, long_letter_reverse_mapping) = utils.load_language_data(
        language=args.language, is_train=False)

    print("Building network ...")
    (output_layer, predict) = utils.define_model(args.hdim, args.depth,
                                                 trans_vocab_size=trans_vocab_size,
                                                 vocab_size=vocab_size, is_train=False)
    if args.model:
        # Restore saved parameters into the Lasagne network.
        f = np.load(args.model)
        param_values = [np.float32(f[i]) for i in range(len(f))]
        lasagne.layers.set_all_param_values(output_layer, param_values)

    print("Testing ...")
    if args.translit_path:
        data = codecs.open(args.translit_path, 'r', encoding='utf-8').read()
        translate_romanized(predict, data, args.seq_len, trans, trans_vocab_size,
                            trans_to_index, index_to_char, long_letter_reverse_mapping)
    else:
        test(predict, test_text, args.language, args.model, args.seq_len,
             long_letter_reverse_mapping, trans, trans_to_index,
             char_to_index, index_to_trans, index_to_char)
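# The script above delegates the actual decoding to translate_romanized(), which is defined
# elsewhere. As a rough, hypothetical illustration only (not the project's implementation),
# a character-level decoder over a trained predict() function could look like the sketch
# below: one-hot encode a romanized chunk with trans_to_index, take the argmax of the
# network output, and map it back through index_to_char. The output shape of predict() and
# the fallback index 0 are assumptions.
import numpy as np


def decode_chunk_sketch(predict, chunk, trans_to_index, index_to_char, trans_vocab_size):
    # chunk: a romanized string, at most seq_len characters long (assumed)
    x = np.zeros((1, len(chunk), trans_vocab_size), dtype=np.float32)
    for t, ch in enumerate(chunk):
        x[0, t, trans_to_index.get(ch, 0)] = 1.0   # assumes index 0 is a safe fallback
    probs = predict(x)                             # assumed shape: (len(chunk), vocab_size)
    best = np.argmax(probs, axis=-1)
    return u''.join(index_to_char[i] for i in best)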
import argparse
from datetime import datetime

import lasagne
import numpy as np

import utils

# Assumed module-level constant: number of batches between progress reports / checkpoints.
PRINT_FREQ = 1000


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--hdim', default=512, type=int)
    parser.add_argument('--grad_clip', default=100, type=int)
    parser.add_argument('--lr', default=0.01, type=float)
    parser.add_argument('--batch_size', default=50, type=int)
    parser.add_argument('--num_epochs', default=10, type=int)
    parser.add_argument('--seq_len', default=60, type=int)
    parser.add_argument('--depth', default=1, type=int)
    parser.add_argument('--model', default=None)
    parser.add_argument('--model_name_prefix', default='model')
    parser.add_argument('--language', default='hy-AM')
    parser.add_argument('--start_from', default=0, type=float)
    args = parser.parse_args()

    print("Loading Files")
    (char_to_index, index_to_char, vocab_size,
     trans_to_index, index_to_trans, trans_vocab_size) = utils.load_vocabulary(language=args.language)
    (train_text, val_text, trans) = utils.load_language_data(language=args.language)
    data_size = len(train_text)

    print("Building Network ...")
    (output_layer, train, cost) = utils.define_model(args.hdim, args.depth, args.lr,
                                                     args.grad_clip, trans_vocab_size,
                                                     vocab_size, is_train=True)
    if args.model:
        f = np.load('languages/' + args.language + '/models/' + args.model)
        param_values = [np.float32(f[i]) for i in range(len(f))]
        lasagne.layers.set_all_param_values(output_layer, param_values)

    print("Training ...")
    p = int(len(train_text) * args.start_from) + 1
    step_cnt = 0
    it = 0
    while it < args.num_epochs:
        avg_cost = 0
        date_at_beginning = datetime.now()
        non_native_skipped = 0
        for _ in range(PRINT_FREQ):
            x, y, p, turned, non_native_sequences = utils.gen_data(
                p, args.seq_len, args.batch_size, train_text, trans,
                trans_to_index, char_to_index)
            if turned:
                it += 1
            avg_cost += train(x, np.reshape(y, (-1, vocab_size)))
            non_native_skipped += non_native_sequences
        date_after = datetime.now()
        print("Epoch {} average loss = {} Time {} sec. Nonnatives skipped {}".format(
            1.0 * it + 1.0 * p / data_size, avg_cost / PRINT_FREQ,
            (date_after - date_at_beginning).total_seconds(), non_native_skipped))
        step_cnt += 1

        if True:  # step_cnt * args.batch_size > 100000:
            print('computing validation loss...')
            val_turned = False
            val_p = 0
            val_steps = 0.
            val_cost = 0.
            while not val_turned:
                x, y, val_p, val_turned, non_native = utils.gen_data(
                    val_p, args.seq_len, args.batch_size, val_text, trans,
                    trans_to_index, char_to_index)
                val_steps += 1
                val_cost += cost(x, np.reshape(y, (-1, vocab_size)))
            print('validation loss is ' + str(val_cost / val_steps))

            file_name = ('languages/' + args.language + '/models/' + args.model_name_prefix
                         + '.hdim' + str(args.hdim) + '.depth' + str(args.depth)
                         + '.seq_len' + str(args.seq_len) + '.bs' + str(args.batch_size)
                         + '.epoch' + str(1.0 * it + 1.0 * p / data_size)
                         + '.loss' + str(avg_cost / PRINT_FREQ) + '.npz')
            print("saving to -> " + file_name)
            # Note: np.save appends a '.npy' suffix when the name does not already end in '.npy'.
            np.save(file_name, lasagne.layers.get_all_param_values(output_layer))
            step_cnt = 0
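# utils.gen_data() is defined elsewhere. Judging from the call sites above, x is a one-hot
# batch of romanized input with shape (batch_size, seq_len, trans_vocab_size) and y a
# one-hot target batch with shape (batch_size, seq_len, vocab_size); the train()/cost()
# calls flatten y so that every character position becomes one row of the softmax target.
# A toy illustration of that reshape (the shapes themselves are an assumption):
import numpy as np

b, t, v = 2, 3, 5                            # toy batch size, sequence length, vocab size
y_toy = np.zeros((b, t, v), dtype=np.float32)
y_flat = np.reshape(y_toy, (-1, v))
assert y_flat.shape == (b * t, v)            # one row per character position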
import argparse
import random
from datetime import datetime

import lasagne
import numpy as np

import utils


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--hdim', default=512, type=int)
    parser.add_argument('--grad_clip', default=100, type=int)
    parser.add_argument('--lr', default=0.01, type=float)
    parser.add_argument('--batch_size', default=50, type=int)
    parser.add_argument('--num_epochs', default=50, type=int)
    parser.add_argument('--seq_len', default=60, type=int)
    parser.add_argument('--depth', default=1, type=int)
    parser.add_argument('--model', default=None)
    parser.add_argument('--model_name_prefix', default='model')
    parser.add_argument('--language', default='hy-AM')
    parser.add_argument('--start_from', default=0, type=float)
    args = parser.parse_args()

    print("Loading Files")
    (char_to_index, index_to_char, vocab_size,
     trans_to_index, index_to_trans, trans_vocab_size) = utils.load_vocabulary(language=args.language)
    (train_text, val_text, trans) = utils.load_language_data(language=args.language)
    data_size = len(train_text)

    print("Building Network ...")
    (output_layer, train, cost) = utils.define_model(args.hdim, args.depth, args.lr,
                                                     args.grad_clip, trans_vocab_size,
                                                     vocab_size, is_train=True)
    if args.model:
        f = np.load('languages/' + args.language + '/models/' + args.model)
        param_values = [np.float32(f[i]) for i in range(len(f))]
        lasagne.layers.set_all_param_values(output_layer, param_values)

    print("Training ...")
    step_cnt = 0
    date_at_beginning = datetime.now()
    last_time = date_at_beginning
    for epoch in range(args.num_epochs):
        # Shuffle the training corpus at sentence level (the Armenian full stop '։' ends a sentence).
        train_text = train_text.split(u'։')
        random.shuffle(train_text)
        train_text = u'։'.join(train_text)

        avg_cost = 0.0
        count = 0
        num_of_samples = 0
        num_of_chars = 0
        for (x, y) in utils.data_generator(train_text, args.seq_len, args.batch_size,
                                           trans, trans_to_index, char_to_index, is_train=True):
            sample_cost = train(x, np.reshape(y, (-1, vocab_size)))
            sample_cost = float(sample_cost)
            count += 1
            num_of_samples += x.shape[0]
            num_of_chars += x.shape[0] * x.shape[1]

            time_now = datetime.now()
            if (time_now - last_time).total_seconds() > 60 * 1:  # every minute
                print('Computing validation loss...')
                val_cost = 0.0
                val_count = 0.0
                for ((x_val, y_val, indices, delimiters), non_valids_list) in utils.data_generator(
                        val_text, args.seq_len, args.batch_size, trans,
                        trans_to_index, char_to_index, is_train=False):
                    # Weight each batch by its size so the validation loss is a true per-sample mean.
                    val_cost += x_val.shape[0] * cost(x_val, np.reshape(y_val, (-1, vocab_size)))
                    val_count += x_val.shape[0]
                print('Validation loss is {}'.format(val_cost / val_count))

                file_name = ('languages/{}/models/{}.hdim{}.depth{}.seq_len{}.bs{}'
                             '.time{:4f}.epoch{}.loss{:.4f}').format(
                    args.language, args.model_name_prefix, args.hdim, args.depth,
                    args.seq_len, args.batch_size,
                    (time_now - date_at_beginning).total_seconds() / 60,
                    epoch, val_cost / val_count)
                print("saving to -> " + file_name)
                np.save(file_name, lasagne.layers.get_all_param_values(output_layer))
                last_time = datetime.now()

            print("On step #{} loss is {:.4f}, samples passed {}, chars passed {}, "
                  "{:.4f}% of epoch {}, time passed {:4f}".format(
                      count, sample_cost, num_of_samples, num_of_chars,
                      100.0 * num_of_chars / len(train_text), epoch,
                      (time_now - date_at_beginning).total_seconds() / 60.0))
            avg_cost += sample_cost
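# The per-epoch shuffle above operates on whole sentences: the corpus is split on the
# Armenian full stop '։', the sentence order is permuted, and the pieces are re-joined with
# the same delimiter. A tiny self-contained illustration with placeholder text:
import random

toy_text = u'sentence one։sentence two։sentence three'
sentences = toy_text.split(u'։')
random.shuffle(sentences)
toy_text = u'։'.join(sentences)   # same sentences, new order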
from keras.callbacks import ModelCheckpoint

# load_set(), load_clean_descriptions(), load_photo_features(), create_sequences() and
# define_model() are helpers defined earlier in the script, as are tokenizer, max_length,
# vocab_size and the training arrays X1train, X2train, ytrain.

# dev dataset
# load test set
filename = 'dataset/Flickr_8k.devImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))
# prepare sequences
X1test, X2test, ytest = create_sequences(tokenizer, max_length, test_descriptions, test_features)

# fit model
model = define_model(vocab_size, max_length)
# checkpoint callback: save the model after every epoch, tagging the file with the losses
filepath = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=False, mode='min')
model.fit([X1train, X2train], ytrain, epochs=50, verbose=1, callbacks=[checkpoint],
          validation_data=([X1test, X2test], ytest))
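# define_model() is not shown in this excerpt. Below is a minimal sketch of a merge-style
# captioning model that would fit the fit() call above (two inputs: a photo feature vector
# and a padded word-index sequence; one softmax over the vocabulary). The 4096-d feature
# size and the 256-unit layers are assumptions, not taken from the excerpt.
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, add


def define_model_sketch(vocab_size, max_length):
    # photo feature extractor branch
    inputs1 = Input(shape=(4096,))               # assumed VGG16-style feature vector
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # caption sequence branch
    inputs2 = Input(shape=(max_length,))         # padded caption prefix as word indices
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # decoder: merge the two branches and predict the next word
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model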
import json
import os
import random
import time

import torch

# The corpus and feature helpers used below (get_problem_truth, tagging_problem,
# make_authors_list, set_sizes, create_char_ngrams_stat, filter_problem_corpus,
# create_ngrams_and_splitgrams, stats_for_ngrams_and_skipgrams, vectorise_problem_corpus,
# compute_mean_and_std, define_model, define_optimiser, training, testing, as_minutes)
# and the use_cuda flag are defined elsewhere in this module.


def problem_solving(nb, problem, problem_languages, args, time_start):
    # Polish problems receive no special handling here.
    if problem_languages[nb] == "pl":
        pass

    print(problem)
    local_path = get_problem_truth(args.c, problem)
    print(local_path)
    problem_collection, number_of_texts = tagging_problem(local_path, problem_languages[nb])
    print('tagged')
    authors = make_authors_list(problem_collection)
    print('authors defined')

    freq1 = args.freq1
    freq2 = args.freq2
    training_set_size, test_set_size = set_sizes(problem_collection)
    random.seed(time.time())

    trunc_words1, trunc_words2 = create_char_ngrams_stat(problem_collection, freq2,
                                                         problem_languages[nb])
    problem_collection = filter_problem_corpus(problem_collection, trunc_words1,
                                               trunc_words2, problem_languages[nb])
    problem_collection, nb_categories = create_ngrams_and_splitgrams(problem_collection)
    words_encoder, words_num = stats_for_ngrams_and_skipgrams(problem_collection,
                                                              nb_categories, freq1)
    # NOTE: 'frequency' is not defined in this function; it is presumably a module-level
    # constant or a missing argument (e.g. an args.frequency option).
    freq_feature, words_num = vectorise_problem_corpus(problem_collection, words_encoder,
                                                       words_num, frequency, number_of_texts)
    freq_feature_form_norm, network_sizes = compute_mean_and_std(freq_feature,
                                                                 problem_collection, words_num)

    model_test = define_model(network_sizes, len(authors), len(words_encoder))
    optimiser_test = define_optimiser(model_test)
    bceloss = torch.nn.NLLLoss()
    if use_cuda:
        bceloss = bceloss.cuda()
    mseloss = torch.nn.MSELoss()  # created but not passed to training() in this variant
    if use_cuda:
        mseloss = mseloss.cuda()

    model = training(model_test, training_set_size, problem_collection, authors,
                     bceloss, optimiser_test, freq_feature_form_norm)
    print('after training')
    result = testing(problem_collection, model, authors, freq_feature_form_norm)
    print('after testing')

    with open(os.path.join(args.o, 'answers-{}.json'.format(problem)), 'w') as outfile:
        json.dump(result, outfile)

    timing = time.time() - time_start
    print(as_minutes(timing))
    return
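# torch.nn.NLLLoss expects log-probabilities, so the network returned by define_model()
# presumably ends in a LogSoftmax layer. A minimal, hypothetical classifier head along
# those lines (the layer sizes are placeholders, not the project's network_sizes):
import torch.nn as nn


def author_classifier_sketch(input_dim, hidden_dim, num_authors):
    return nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, num_authors),
        nn.LogSoftmax(dim=1),   # NLLLoss consumes log-probabilities
    )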
# A second variant of problem_solving() that uses fixed n-gram frequency thresholds and a
# compute_mean_and_std() that also returns a PCA object; it relies on the same module-level
# helpers as the version above.
import gc
import json
import os
import random
import time

import torch


def problem_solving(nb, problem, problem_languages, args, time_start):
    # Polish problems receive no special handling here.
    if problem_languages[nb] == "pl":
        pass

    print(problem)
    local_path = get_problem_truth(args.c, problem)
    print(local_path)
    problem_collection, number_of_texts = tagging_problem(local_path, problem_languages[nb])
    print('tagged')
    authors = make_authors_list(problem_collection)
    print('authors defined')

    # Feature-extraction thresholds, fixed after earlier experiments with random choices
    # (e.g. frequency in {500, 600, 800}, freq1/freq2 in {100, 150, 200, 250, 300, 350}).
    frequency = 8000
    freq1 = 400
    freq2 = 1000
    training_set_size, test_set_size = set_sizes(problem_collection)
    random.seed(time.time())
    print(frequency, freq1, freq2)

    trunc_words1, trunc_words2 = create_char_ngrams_stat(problem_collection, freq1, freq2,
                                                         problem_languages[nb])
    problem_collection = filter_problem_corpus(problem_collection, trunc_words1,
                                               trunc_words2, problem_languages[nb])
    problem_collection, nb_categories = create_ngrams_and_splitgrams(problem_collection)
    words_encoder, words_num = stats_for_ngrams_and_skipgrams(problem_collection,
                                                              nb_categories, frequency)
    freq_feature, words_num = vectorise_problem_corpus(problem_collection, words_encoder,
                                                       words_num, frequency, number_of_texts)
    freq_feature_form_norm, pca, network_sizes = compute_mean_and_std(freq_feature,
                                                                      problem_collection,
                                                                      words_num)

    model_test = define_model(network_sizes, len(authors), freq_feature_form_norm,
                              len(words_encoder))
    optimiser_test = define_optimiser(model_test)
    bceloss = torch.nn.NLLLoss()
    if use_cuda:
        bceloss = bceloss.cuda()
    mseloss = torch.nn.MSELoss()
    if use_cuda:
        mseloss = mseloss.cuda()

    model = training([None, model_test], training_set_size, problem_collection, authors,
                     bceloss, mseloss, (None, optimiser_test), freq_feature_form_norm, None)
    print('after training')
    result = testing(problem_collection, model, authors, freq_feature_form_norm, None)
    print('after testing')

    with open(os.path.join(args.o, 'answers-{}.json'.format(problem)), 'w') as outfile:
        json.dump(result, outfile)

    # Release per-problem objects so the next problem starts with a clean slate.
    del model_test, optimiser_test, bceloss, mseloss, outfile
    del freq_feature_form_norm, pca, network_sizes, result, freq_feature, words_num
    del trunc_words1, trunc_words2, nb_categories, words_encoder, training_set_size, test_set_size
    del problem_collection, model

    timing = time.time() - time_start
    print(as_minutes(timing))

    del number_of_texts, authors
    gc.collect()
    return
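# The use_cuda flag referenced above is not defined in this excerpt; it is presumably set
# once at module level, along the lines of this sketch, which is the usual way of deciding
# whether models and losses should be moved to the GPU:
import torch

use_cuda = torch.cuda.is_available()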