def padding_train_test_commits(train, test, params):
    """Build model inputs for a train/test split of commits.

    The message and code dictionaries are built from the training commits
    only, and those same dictionaries are then used to map the test commits.

    Args:
        train: Training commits, in whatever form ``extract_msg``,
            ``extract_code``, ``mapping_commit_code`` and
            ``load_label_commits`` accept.
        test: Testing commits, same form as ``train``.
        params: Object providing ``msg_length``, ``code_hunk``,
            ``code_line`` and ``code_length``.

    Returns:
        A tuple ``(train_data, test_data, dict_commit)``.  Each ``*_data`` is
        ``(labels, pad_msg, pad_added_code, pad_removed_code)`` and
        ``dict_commit`` is ``(train_dict_msg, train_dict_code)``.
    """
    # training data
    train_msgs = extract_msg(commits=train)
    # Bug fix: the code tokens used to be extracted from `test`, so the code
    # dictionary was built from the test set instead of the training set.
    train_codes = extract_code(commits=train)
    train_dict_msg = dictionary(data=train_msgs)
    train_dict_code = dictionary(data=train_codes)
    train_data = _pad_labelled_commits(commits=train,
                                       msgs=train_msgs,
                                       params=params,
                                       dict_msg=train_dict_msg,
                                       dict_code=train_dict_code)
    dict_commit = (train_dict_msg, train_dict_code)

    # testing data (mapped with the dictionaries built from the training set)
    test_msgs = extract_msg(commits=test)
    test_data = _pad_labelled_commits(commits=test,
                                      msgs=test_msgs,
                                      params=params,
                                      dict_msg=train_dict_msg,
                                      dict_code=train_dict_code)
    return train_data, test_data, dict_commit


def _pad_labelled_commits(commits, msgs, params, dict_msg, dict_code):
    """Map one commit set to (labels, pad_msg, pad_added_code, pad_removed_code)."""
    pad_msg = mapping_commit_msg(msgs=msgs,
                                 max_length=params.msg_length,
                                 dict_msg=dict_msg)
    # The added and removed passes differ only in the `type` argument.
    code_kwargs = dict(commits=commits,
                       max_hunk=params.code_hunk,
                       max_code_line=params.code_line,
                       max_code_length=params.code_length,
                       dict_code=dict_code)
    pad_added_code = mapping_commit_code(type="added", **code_kwargs)
    pad_removed_code = mapping_commit_code(type="removed", **code_kwargs)
    labels = load_label_commits(commits=commits)
    return labels, pad_msg, pad_added_code, pad_removed_code
Beispiel #2
0
def padding_testing_commit(commits, dictionary, params):
    """Map commits to model inputs using an already-built dictionary pair.

    Args:
        commits: Commits in the form accepted by the ``extract_*``,
            ``mapping_commit_code`` and ``load_label_commits`` helpers.
        dictionary: Two-element sequence ``(dict_msg, dict_code)``.
        params: Object providing ``msg_length``, ``code_file``,
            ``code_hunk``, ``code_line`` and ``code_length``.

    Returns:
        ``(pad_msg, pad_added_code, pad_removed_code, labels, ids)``.
    """
    commit_msgs = extract_msg(commits=commits)
    # NOTE(review): the extracted code is never used below; the call is kept
    # only so behaviour stays identical. Confirm whether it can be dropped.
    extract_code(commits=commits)
    patch_ids = extract_id(commits=commits)

    msg_vocab, code_vocab = dictionary

    # commit message
    padded_msg = mapping_commit_msg(
        msgs=commit_msgs,
        max_length=params.msg_length,
        dict_msg=msg_vocab,
    )

    # commit code: both passes share every argument except `type_`
    shared_code_args = {
        "commits": commits,
        "max_file": params.code_file,
        "max_hunk": params.code_hunk,
        "max_code_line": params.code_line,
        "max_code_length": params.code_length,
        "dict_code": code_vocab,
    }
    padded_added = mapping_commit_code(type_="added", **shared_code_args)
    padded_removed = mapping_commit_code(type_="removed", **shared_code_args)

    commit_labels = load_label_commits(commits=commits)
    return padded_msg, padded_added, padded_removed, commit_labels, patch_ids
def padding_commit_topwords(commits, params):
    """Build a code dictionary from ``commits`` and map their code with it.

    Args:
        commits: Commits in the form accepted by ``extract_code`` and
            ``mapping_commit_code``.
        params: Object providing ``code_hunk``, ``code_line`` and
            ``code_length``.

    Returns:
        ``(pad_added_code, pad_removed_code, dict_code)``.
    """
    code_vocab = dictionary(data=extract_code(commits=commits))

    def map_code(change_kind):
        # Identical call for both kinds of change; only `type` varies.
        return mapping_commit_code(
            type=change_kind,
            commits=commits,
            max_hunk=params.code_hunk,
            max_code_line=params.code_line,
            max_code_length=params.code_length,
            dict_code=code_vocab,
        )

    # Tuple elements evaluate left to right: "added" first, then "removed".
    return map_code("added"), map_code("removed"), code_vocab
Beispiel #4
0
def padding_pred_commit(commits, params, dict_msg, dict_code):
    """Map commits to model inputs using the supplied dictionaries.

    Args:
        commits: Commits in the form accepted by the ``extract_*``,
            ``mapping_commit_code`` and ``load_label_commits`` helpers.
        params: Object providing ``msg_length``, ``code_hunk``,
            ``code_line`` and ``code_length``.
        dict_msg: Dictionary passed to ``mapping_commit_msg``.
        dict_code: Dictionary passed to ``mapping_commit_code``.

    Returns:
        ``(pad_msg, pad_added_code, pad_removed_code, labels)``.
    """
    commit_msgs = extract_msg(commits=commits)
    # NOTE(review): the extracted code is never used below; the call is kept
    # only so behaviour stays identical. Confirm whether it can be dropped.
    extract_code(commits=commits)

    # commit message
    padded_msg = mapping_commit_msg(
        msgs=commit_msgs,
        max_length=params.msg_length,
        dict_msg=dict_msg,
    )

    # commit code: one pass per kind of change, "added" first
    padded_added, padded_removed = [
        mapping_commit_code(
            type=change_kind,
            commits=commits,
            max_hunk=params.code_hunk,
            max_code_line=params.code_line,
            max_code_length=params.code_length,
            dict_code=dict_code,
        )
        for change_kind in ("added", "removed")
    ]

    commit_labels = load_label_commits(commits=commits)
    return padded_msg, padded_added, padded_removed, commit_labels
Beispiel #5
0
    #path_input_data1 = "test_data.out"
    #path_input_data2 = "training_data.out"
    #path_output_data = "./try_data/dict_try.pkl"
    #show_dict = True
    params = read_args().parse_args() 
    path_input_data1 =  params.text_path1
    path_input_data2 =  params.text_path2
    path_output_data =  params.dict_path
    show_dict = params.print
    
    test_data = extract_commit(path_file=path_input_data1)
    train_data = extract_commit(path_file=path_input_data2)
    
    whole_data = train_data + test_data   # add  train data and test data together
    #whole_data = test_data
    msgs, codes = extract_msg(whole_data), extract_code(whole_data)
    dict_msg, dict_code = dictionary(data=msgs), dictionary(data=codes)
    
    #print(len(msgs))
    #print (msgs[1])
    #print(len(codes))
    #print(codes[1])
    print("the number of different tokens in message part is : {n}".format(n=len(dict_msg)))
    print("the number of different tokens in code part is : {n}".format(n=len(dict_code)))
   

    dict_whole = (dict_msg, dict_code)
    with open(path_output_data, 'wb') as handle:
        pickle.dump(dict_whole, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
    if show_dict == True: