def padding_train_test_commits(train, test, params):
    """Map train and test commits using their joined message + code text.

    For every commit the message text and the code text are concatenated
    with a single space in between. The dictionary is built from the joined
    training text only and is reused when mapping the test split.

    Args:
        train: Commits of the training split.
        test: Commits of the testing split.
        params: Object exposing ``msg_length``, forwarded as ``max_length``
            to ``mapping_commit_msg``.

    Returns:
        A ``(train_data, test_data, dictionary)`` tuple, where each
        ``*_data`` entry is a ``(labels, mapped_text)`` pair.
    """

    def _joined_text(commits):
        # Message first, then code, separated by exactly one space.
        messages = extract_msg(commits=commits)
        code_text = keras_extract_code(commits=commits)
        return [msg + ' ' + code for msg, code in zip(messages, code_text)]

    def _labelled_mapping(commits, texts, vocab):
        mapped = mapping_commit_msg(msgs=texts,
                                    max_length=params.msg_length,
                                    dict_msg=vocab)
        return load_label_commits(commits=commits), mapped

    # training data
    train_texts = _joined_text(train)
    vocab = dictionary(data=train_texts)
    train_data = _labelled_mapping(train, train_texts, vocab)

    # testing data (mapped with the dictionary built from the training text)
    test_texts = _joined_text(test)
    test_data = _labelled_mapping(test, test_texts, vocab)

    return train_data, test_data, vocab
def padding_train_test_commits(train, test, params):
    """Map train and test commits into message / added-code / removed-code inputs.

    The message dictionary and the code dictionary are both built from the
    training split only; the same dictionaries are then used to map the
    test split.

    Args:
        train: Commits of the training split.
        test: Commits of the testing split.
        params: Object exposing ``msg_length``, ``code_hunk``, ``code_line``
            and ``code_length``, which are forwarded to the mapping helpers.

    Returns:
        A ``(train_data, test_data, dict_commit)`` tuple. Each ``*_data``
        entry is ``(labels, pad_msg, pad_added_code, pad_removed_code)`` and
        ``dict_commit`` is ``(dict_msg, dict_code)``.
    """

    def _pad_code(kind, commits, dict_code):
        # `kind` is forwarded as the `type` keyword ("added" / "removed").
        return mapping_commit_code(type=kind,
                                   commits=commits,
                                   max_hunk=params.code_hunk,
                                   max_code_line=params.code_line,
                                   max_code_length=params.code_length,
                                   dict_code=dict_code)

    def _pad_split(commits, msgs, dict_msg, dict_code):
        pad_msg = mapping_commit_msg(msgs=msgs,
                                     max_length=params.msg_length,
                                     dict_msg=dict_msg)
        pad_added = _pad_code("added", commits, dict_code)
        pad_removed = _pad_code("removed", commits, dict_code)
        labels = load_label_commits(commits=commits)
        return labels, pad_msg, pad_added, pad_removed

    # training data
    train_msgs = extract_msg(commits=train)
    # The code dictionary must be built from the training split. Extracting
    # the code text from `test` here would derive it from the wrong commits.
    train_codes = extract_code(commits=train)
    train_dict_msg = dictionary(data=train_msgs)
    train_dict_code = dictionary(data=train_codes)
    train_data = _pad_split(train, train_msgs, train_dict_msg, train_dict_code)
    dict_commit = (train_dict_msg, train_dict_code)

    # testing data (mapped with the dictionaries built from the training split)
    test_msgs = extract_msg(commits=test)
    test_data = _pad_split(test, test_msgs, train_dict_msg, train_dict_code)

    return train_data, test_data, dict_commit
def padding_testing_commit(commits, dictionary, params):
    """Map commits for testing with pre-built message and code dictionaries.

    Args:
        commits: Commits to map.
        dictionary: A ``(dict_msg, dict_code)`` pair.
        params: Object exposing ``msg_length``, ``code_file``, ``code_hunk``,
            ``code_line`` and ``code_length``, which are forwarded to the
            mapping helpers.

    Returns:
        ``(pad_msg, pad_added_code, pad_removed_code, labels, ids)``.
    """

    def _pad_code(kind, vocab):
        # `kind` is forwarded as the `type_` keyword ("added" / "removed").
        return mapping_commit_code(type_=kind,
                                   commits=commits,
                                   max_file=params.code_file,
                                   max_hunk=params.code_hunk,
                                   max_code_line=params.code_line,
                                   max_code_length=params.code_length,
                                   dict_code=vocab)

    messages = extract_msg(commits=commits)
    # The extracted code text is not used below; the call is retained so the
    # sequence of helper calls is unchanged.
    extract_code(commits=commits)
    patch_ids = extract_id(commits=commits)  # patch id
    msg_vocab, code_vocab = dictionary

    # commit message
    padded_msg = mapping_commit_msg(msgs=messages,
                                    max_length=params.msg_length,
                                    dict_msg=msg_vocab)

    # commit code
    padded_added = _pad_code("added", code_vocab)
    padded_removed = _pad_code("removed", code_vocab)

    commit_labels = load_label_commits(commits=commits)
    return padded_msg, padded_added, padded_removed, commit_labels, patch_ids
def padding_commit(commits, params):
    """Map commits using their joined message + code text.

    The message text and the code text of each commit are concatenated with
    a single space, a dictionary is built from the joined text, and the
    joined text is mapped with that dictionary.

    Args:
        commits: Commits to map.
        params: Object exposing ``msg_length``, forwarded as ``max_length``
            to ``mapping_commit_msg``.

    Returns:
        ``(labels, pad_msg_code, dict_msg_code)``.
    """
    messages = extract_msg(commits=commits)
    code_text = keras_extract_code(commits=commits)

    # Message first, then code, separated by exactly one space.
    joined = []
    for msg, code in zip(messages, code_text):
        joined.append(msg + ' ' + code)

    vocab = dictionary(data=joined)

    # commit message (joined with code)
    padded = mapping_commit_msg(msgs=joined,
                                max_length=params.msg_length,
                                dict_msg=vocab)
    commit_labels = load_label_commits(commits=commits)
    return commit_labels, padded, vocab
def padding_pred_commit(commits, params, dict_msg, dict_code):
    """Map commits for prediction with pre-built dictionaries.

    Args:
        commits: Commits to map.
        params: Object exposing ``msg_length``, ``code_hunk``, ``code_line``
            and ``code_length``, which are forwarded to the mapping helpers.
        dict_msg: Dictionary passed to ``mapping_commit_msg``.
        dict_code: Dictionary passed to ``mapping_commit_code``.

    Returns:
        ``(pad_msg, pad_added_code, pad_removed_code, labels)``.
    """

    def _pad_code(kind):
        # `kind` is forwarded as the `type` keyword ("added" / "removed").
        return mapping_commit_code(type=kind,
                                   commits=commits,
                                   max_hunk=params.code_hunk,
                                   max_code_line=params.code_line,
                                   max_code_length=params.code_length,
                                   dict_code=dict_code)

    messages = extract_msg(commits=commits)
    # The extracted code text is not used below; the call is retained so the
    # sequence of helper calls is unchanged.
    extract_code(commits=commits)

    # commit message
    padded_msg = mapping_commit_msg(msgs=messages,
                                    max_length=params.msg_length,
                                    dict_msg=dict_msg)

    # commit code
    padded_added = _pad_code("added")
    padded_removed = _pad_code("removed")

    commit_labels = load_label_commits(commits=commits)
    return padded_msg, padded_added, padded_removed, commit_labels
#path_input_data1 = "test_data.out" #path_input_data2 = "training_data.out" #path_output_data = "./try_data/dict_try.pkl" #show_dict = True params = read_args().parse_args() path_input_data1 = params.text_path1 path_input_data2 = params.text_path2 path_output_data = params.dict_path show_dict = params.print test_data = extract_commit(path_file=path_input_data1) train_data = extract_commit(path_file=path_input_data2) whole_data = train_data + test_data # add train data and test data together #whole_data = test_data msgs, codes = extract_msg(whole_data), extract_code(whole_data) dict_msg, dict_code = dictionary(data=msgs), dictionary(data=codes) #print(len(msgs)) #print (msgs[1]) #print(len(codes)) #print(codes[1]) print("the number of different tokens in message part is : {n}".format(n=len(dict_msg))) print("the number of different tokens in code part is : {n}".format(n=len(dict_code))) dict_whole = (dict_msg, dict_code) with open(path_output_data, 'wb') as handle: pickle.dump(dict_whole, handle, protocol=pickle.HIGHEST_PROTOCOL) if show_dict == True: