def transform_data_from_df_to_dataset(data, stack_size):
    vocabulary, keyword_num = load_keyword_identifier_split_vocabulary(
        get_token_vocabulary, [BEGIN], [END], UNK)
    print("vocab_size:{}".format(vocabulary.vocabulary_size))
    print("The max token id:{}".format(max(vocabulary.word_to_id_dict.values())))
    slk_constants = C99SLKConstants()
    # terminal_token_index = set(range(slk_constants.START_SYMBOL-2)) - {63, 64}
    label_vocabulary = C99LabelVocabulary(slk_constants)
    production_vocabulary = SLKProductionVocabulary(slk_constants)
    transforms_fn = transforms.Compose([
        # IsNone("original"),
        CopyMap(),
        key_transform(RangeMaskMap(stack_size), "max_scope_list"),
        key_transform(IndexMaskMap(stack_size), "identifier_scope_index"),
        # IsNone("after type input"),
        FlatMap(),
        # IsNone("Flat Map"),
        PadMap(keyword_num, stack_size),
        # IsNone("Pad Map"),
    ])
    generate_dataset = lambda df: CCodeDataSet(df, vocabulary, stack_size, transforms_fn)
    res = generate_dataset(data[0])
    del data[0]
    return res, keyword_num, vocabulary

def get_transform(stack_size):
    vocabulary, keyword_num = load_keyword_identifier_split_vocabulary(
        get_token_vocabulary, [BEGIN], [END], UNK)
    print("vocab_size:{}".format(vocabulary.vocabulary_size))
    print("The max token id:{}".format(max(vocabulary.word_to_id_dict.values())))
    slk_constants = C99SLKConstants()
    # terminal_token_index = set(range(slk_constants.START_SYMBOL-2)) - {63, 64}
    label_vocabulary = C99LabelVocabulary(slk_constants)
    production_vocabulary = SLKProductionVocabulary(slk_constants)
    transforms_fn = transforms.Compose([
        # IsNone("original"),
        CopyMap(),
        key_transform(RangeMaskMap(stack_size), "max_scope_list"),
        key_transform(IndexMaskMap(stack_size), "identifier_scope_index"),
        key_transform(
            GrammarLanguageModelTypeInputMap(production_vocabulary, vocabulary,
                                             label_vocabulary, keyword_num),
            "tree", "target"),
        # IsNone("after type input"),
        FlatMap(),
        # IsNone("Flat Map"),
        PadMap(keyword_num, stack_size),
        # IsNone("Pad Map"),
    ])
    return keyword_num, vocabulary, transforms_fn

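
# Hedged usage sketch for get_transform: build a dataset for an externally supplied
# dataframe while reusing the shared transform pipeline. The stack size of 500 and the
# four-argument CCodeDataSet constructor are assumptions mirroring the
# transform_data_from_df_to_dataset(data, stack_size) variant above, not values fixed
# by this file.
def example_dataset_from_transform(df, stack_size=500):
    keyword_num, vocabulary, transforms_fn = get_transform(stack_size)
    dataset = CCodeDataSet(df, vocabulary, stack_size, transforms_fn)
    return dataset, keyword_num, vocabulary
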
def transform_data_from_df_to_dataset(data):
    for d, n in zip(data, ["train", "val", "test"]):
        print("There are {} raw data in the {} dataset".format(len(d), n))
    vocabulary, keyword_num = load_keyword_identifier_split_vocabulary(
        get_token_vocabulary, [BEGIN], [END], UNK)
    print("vocab_size:{}".format(vocabulary.vocabulary_size))
    print("The max token id:{}".format(max(vocabulary.word_to_id_dict.values())))
    slk_constants = C99SLKConstants()
    # terminal_token_index = set(range(slk_constants.START_SYMBOL-2)) - {63, 64}
    label_vocabulary = C99LabelVocabulary(slk_constants)
    production_vocabulary = SLKProductionVocabulary(slk_constants)
    transforms_fn = transforms.Compose([
        # IsNone("original"),
        CopyMap(),
        key_transform(
            GrammarLanguageModelTypeInputMap(production_vocabulary, vocabulary,
                                             label_vocabulary, keyword_num),
            "tree", "target"),
        # IsNone("after type input"),
        FlatMap(),
        # IsNone("Flat Map"),
        PadMap(keyword_num),
        # IsNone("Pad Map"),
    ])
    generate_dataset = lambda df: CCodeDataSet(df, vocabulary, transforms_fn)
    res = generate_dataset(data[0])
    del data[0]
    return res, keyword_num, vocabulary

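
# Hedged usage sketch for the variant above: the caller passes the raw train/val/test
# dataframes and keeps keyword_num and the vocabulary for building the model later.
# The argument name dataframes is illustrative; the call itself follows the signature
# defined directly above.
def example_build_train_dataset(dataframes):
    train_dataset, keyword_num, vocabulary = transform_data_from_df_to_dataset(dataframes)
    print("parsed train dataset size:{}".format(len(train_dataset)))
    return train_dataset, keyword_num, vocabulary
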
def train_and_evaluate(data, batch_size, embedding_dim, hidden_state_size, rnn_num_layer,
                       learning_rate, epoches, saved_name, load_previous_model=False):
    save_path = os.path.join(config.save_model_root, saved_name)
    for d, n in zip(data, ["train", "val", "test"]):
        print("There are {} raw data in the {} dataset".format(len(d), n))

    # Build the token vocabulary and the C99 grammar vocabularies used by the transform pipeline.
    vocabulary = load_vocabulary(get_token_vocabulary, get_vocabulary_id_map_with_keyword,
                                 [BEGIN], [END], UNK)
    print("vocab_size:{}".format(vocabulary.vocabulary_size))
    print("The max token id:{}".format(max(vocabulary.word_to_id_dict.values())))
    slk_constants = C99SLKConstants()
    terminal_token_index = set(range(slk_constants.START_SYMBOL - 2)) - {63, 64}
    label_vocabulary = C99LabelVocabulary(slk_constants)
    production_vocabulary = SLKProductionVocabulary(slk_constants)

    # Per-sample pipeline: attach the grammar type input to each parse tree, flatten, then pad.
    transforms_fn = transforms.Compose([
        IsNone("original"),
        key_transform(GrammarLanguageModelTypeInputMap(production_vocabulary), "tree"),
        IsNone("after type input"),
        FlatMap(),
        IsNone("Flat Map"),
        PadMap(production_vocabulary.token_num()),
        IsNone("Pad Map"),
    ])

    generate_dataset = lambda df: CCodeDataSet(df, vocabulary, transforms_fn)
    data = [generate_dataset(d) for d in data]
    for d, n in zip(data, ["train", "val", "test"]):
        print("There are {} parsed data in the {} dataset".format(len(d), n))
    train_dataset, valid_dataset, test_dataset = data

    keyword_index = [vocabulary.word_to_id(t)
                     for t in pre_defined_c_tokens | {"CONSTANT", "STRING_LITERAL"}]
    identifier_index = label_vocabulary.get_label_id("ID") - 1  # zero

    # Sum (rather than average) the cross-entropy and ignore padding positions.
    loss_function = nn.CrossEntropyLoss(size_average=False, ignore_index=PAD_TOKEN)
    model = GrammarLanguageModel(
        vocabulary.vocabulary_size,
        production_vocabulary.token_num(),
        embedding_dim,
        hidden_state_size,
        rnn_num_layer,
        identifier_index,
        keyword_index,
        terminal_token_index,
        batch_size,
    )
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    if load_previous_model:
        # Resume from a saved checkpoint and use its perplexities as the baseline to beat.
        torch_util.load_model(model, save_path)
        valid_loss = evaluate(model, valid_dataset, batch_size, loss_function)
        test_loss = evaluate(model, test_dataset, batch_size, loss_function)
        best_valid_perplexity = torch.exp(valid_loss)[0]
        best_test_perplexity = torch.exp(test_loss)[0]
        print("load the previous model, validation perplexity is {}, test perplexity is {}".format(
            best_valid_perplexity, best_test_perplexity))
        scheduler.step(best_valid_perplexity)
    else:
        best_valid_perplexity = None
        best_test_perplexity = None

    for epoch in range(epoches):
        train_loss = train(model, train_dataset, batch_size, loss_function, optimizer)
        valid_loss = evaluate(model, valid_dataset, batch_size, loss_function)
        test_loss = evaluate(model, test_dataset, batch_size, loss_function)
        train_perplexity = torch.exp(train_loss)[0]
        valid_perplexity = torch.exp(valid_loss)[0]
        test_perplexity = torch.exp(test_loss)[0]
        scheduler.step(valid_perplexity)

        # Keep the checkpoint with the best validation perplexity seen so far.
        if best_valid_perplexity is None or valid_perplexity < best_valid_perplexity:
            best_valid_perplexity = valid_perplexity
            best_test_perplexity = test_perplexity
            torch_util.save_model(model, save_path)

        print("epoch {}: train perplexity of {}, valid perplexity of {}, test perplexity of {}".format(
            epoch, train_perplexity, valid_perplexity, test_perplexity))

    print("The model {} best valid perplexity is {} and test perplexity is {}".format(
        saved_name, best_valid_perplexity, best_test_perplexity))

def train_and_evaluate(data, batch_size, embedding_dim, hidden_state_size, rnn_num_layer,
                       learning_rate, epoches, saved_name, load_previous_model=False):
    save_path = os.path.join(config.save_model_root, saved_name)
    for d, n in zip(data, ["train", "val", "test"]):
        print("There are {} raw data in the {} dataset".format(len(d), n))

    vocabulary = load_vocabulary(get_token_vocabulary, get_vocabulary_id_map,
                                 [BEGIN], [END], UNK)
    production_vocabulary = get_all_c99_production_vocabulary()
    print("terminal num:{}".format(len(production_vocabulary._terminal_id_set)))

    transforms_fn = transforms.Compose([
        key_transform(GrammarLanguageModelTypeInputMap(production_vocabulary), "tree"),
        FlatMap(),
        PadMap(production_vocabulary.token_num()),
    ])

    generate_dataset = lambda df: CCodeDataSet(df, vocabulary, transforms_fn)
    data = [generate_dataset(d) for d in data]
    for d, n in zip(data, ["train", "val", "test"]):
        print("There are {} parsed data in the {} dataset".format(len(d), n))
    train_dataset, valid_dataset, test_dataset = data

    loss_function = nn.CrossEntropyLoss(size_average=False, ignore_index=PAD_TOKEN)
    model = GrammarLanguageModel(vocabulary.vocabulary_size, production_vocabulary.token_num(),
                                 embedding_dim, hidden_state_size, rnn_num_layer, batch_size)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    if load_previous_model:
        torch_util.load_model(model, save_path)
        valid_loss = evaluate(model, valid_dataset, batch_size, loss_function)
        test_loss = evaluate(model, test_dataset, batch_size, loss_function)
        best_valid_perplexity = torch.exp(valid_loss)[0]
        best_test_perplexity = torch.exp(test_loss)[0]
        print("load the previous model, validation perplexity is {}, test perplexity is {}".format(
            best_valid_perplexity, best_test_perplexity))
        scheduler.step(best_valid_perplexity)
    else:
        best_valid_perplexity = None
        best_test_perplexity = None

    for epoch in range(epoches):
        train_loss = train(model, train_dataset, batch_size, loss_function, optimizer)
        valid_loss = evaluate(model, valid_dataset, batch_size, loss_function)
        test_loss = evaluate(model, test_dataset, batch_size, loss_function)
        train_perplexity = torch.exp(train_loss)[0]
        valid_perplexity = torch.exp(valid_loss)[0]
        test_perplexity = torch.exp(test_loss)[0]
        scheduler.step(valid_perplexity)

        # Save only when the validation perplexity improves.
        if best_valid_perplexity is None or valid_perplexity < best_valid_perplexity:
            best_valid_perplexity = valid_perplexity
            best_test_perplexity = test_perplexity
            torch_util.save_model(model, save_path)

        print("epoch {}: train perplexity of {}, valid perplexity of {}, test perplexity of {}".format(
            epoch, train_perplexity, valid_perplexity, test_perplexity))

    print("The model {} best valid perplexity is {} and test perplexity is {}".format(
        saved_name, best_valid_perplexity, best_test_perplexity))
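
# Hedged entry-point sketch: one way train_and_evaluate might be invoked. Every
# hyperparameter value below is an illustrative assumption; only the keyword names
# match the signature defined above, and dfs is expected to be the parsed
# [train_df, valid_df, test_df] list produced elsewhere in the project.
def example_train(dfs):
    train_and_evaluate(dfs,
                       batch_size=16,
                       embedding_dim=100,
                       hidden_state_size=100,
                       rnn_num_layer=2,
                       learning_rate=0.01,
                       epoches=10,
                       saved_name="c99_grammar_language_model.pkl",
                       load_previous_model=False)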