def traverse_all_bugs(all_bugids, new_bugids, sign, train=True, weaks=None):
    """Extract and persist the developer activity sequence of every bug in ``new_bugids``.

    For each bug id the sequence is obtained from
    ``get_developer_active_sequence`` and handed to
    ``write_active_sequence_to_file``. The length of every sequence is also
    collected and written, one tab-separated ``<bug id> <count>`` pair per
    line, to ``../data/active_counts_<timestamp>.txt``.

    :param all_bugids: reference sequence of bug ids; must support ``index``
        and ``len``.
    :param new_bugids: the bug ids to process, in order.
    :param sign: tag identifying the experiment round; forwarded to
        ``write_active_sequence_to_file``. Per the original note, the
        extracted train/test activity data is stored in a folder named
        after it.
    :param train: when true, the index passed on is the one just before the
        bug's own position in ``all_bugids``; otherwise it is the last index
        of ``all_bugids``.
    :param weaks: forwarded unchanged to ``get_developer_active_sequence``
        (its meaning is not visible from this function).
    :return: None
    """
    start_time = time.time()
    print('start_time:', start_time)

    bug_msg_all, _ = data_helper.get_msg_all()
    bugs_counts = {}

    for progress, bug_id in enumerate(new_bugids):
        print(progress)
        if train:
            # NOTE(review): for the first element of all_bugids this is -1;
            # confirm how get_developer_active_sequence treats that value.
            last_index = all_bugids.index(bug_id) - 1
        else:
            last_index = len(all_bugids) - 1
        actives = get_developer_active_sequence(
            bug_msg_all, all_bugids, bug_id, last_index, train, weaks)
        bugs_counts[bug_id] = len(actives)
        write_active_sequence_to_file(actives, sign, bug_id)

    print('计算消耗时间:', time.time() - start_time)

    counts_path = '../data/active_counts_{}.txt'.format(time.time())
    with open(counts_path, 'w') as writer:
        for bug_id, n_actives in bugs_counts.items():
            writer.write(str(bug_id) + '\t' + str(n_actives) + '\n')

    print('最终结束时间:', time.time())
def _writer_little_bug_msg_all(little_bugids):
    """Dump the first field of each listed bug's record to a text file.

    Test-only helper: per the original note, the output is used to check by
    eye whether the training split was built correctly.

    :param little_bugids: mapping whose values are iterables of bug ids
        (the keys are not used).
    :return: None; writes ``../data/eval_little_bug_msg_all.txt`` with one
        tab-separated ``<bug id> <first record field>`` pair per line.
    """
    bug_msg_all, _ = get_msg_all()
    with open('../data/eval_little_bug_msg_all.txt', 'w') as writer:
        for bugids in little_bugids.values():
            for bugid in bugids:
                record = bug_msg_all.get(bugid)
                # Only the first field is written; the full record used to
                # be dumped here instead.
                writer.write('{}\t{}\n'.format(bugid, record[0]))
def count_bug_fixed_by_each_developer():
    """Group all bug ids by the developer who fixed them.

    The records come from ``get_msg_all()``; the first field of each record
    is taken as the fixer of that bug.

    :return: dict mapping developer name -> list of bug ids, each list in
        the iteration order of the underlying records.
    """
    bug_msg_all, _ = get_msg_all()

    bugs_by_fixer = {}  # key = developer name, value = [bug ids]
    for bugid, record in bug_msg_all.items():
        fixer = record[0]  # fixer of the current sample
        bugs_by_fixer.setdefault(fixer, []).append(bugid)

    # Disabled dump of the per-developer counts:
    # with open('../data/num_of_bugs_fixed_by_each_developer.txt', 'w') as writer:
    #     for name in bugs_by_fixer.keys():
    #         writer.write('{}\t{}\n'.format(name, len(bugs_by_fixer[name])))
    return bugs_by_fixer
import numpy as np
import util.PATH as PATH
from util.data_helper import get_msg_all
from util.data_helper import split_dataset_by_time_windows

# Extract the vocabulary of time window 0: every distinct (stripped) line
# found in the corpus files of that window, in first-seen order, written to
# ../data/windows/window_0_vocabulary.txt one entry per line.

# Dict keyed by bug id (the record layout is not visible from this script).
bug_msg_all, _ = get_msg_all()
windows = split_dataset_by_time_windows(bug_msg_all)

vocabulary = []     # distinct words in first-seen order; this is what gets written
seen_words = set()  # same words, kept only for O(1) membership tests

for i, bug_id in enumerate(windows[0]):
    print(i)  # progress indicator
    with open(PATH.path_corpus + str(bug_id), 'r') as reader:
        for line in reader:
            word = line.strip()
            # Testing against the set instead of the list avoids rescanning
            # the whole vocabulary for every corpus line.
            if word not in seen_words:
                seen_words.add(word)
                vocabulary.append(word)

with open('../data/windows/window_0_vocabulary.txt', 'w') as writer:
    for word in vocabulary:
        writer.write('{}\n'.format(word))
# NOTE(review): the next statement is the tail of a block that starts before
# this span; it is carried over unchanged.
writer.write('{}\n'.format(word))


def del_invalid_developers(bug_msg_all):
    """Report the bugs whose fixer looks like a placeholder account.

    Mainly a sanity check for whether invalid developers are still present
    in the data set. Despite the name, nothing is removed from
    ``bug_msg_all``; the counts are printed and the offending bug ids are
    returned.

    :param bug_msg_all: mapping of bug id -> record whose first field is the
        developer name (a string).
    :return: list of bug ids whose developer name contains, ignoring case,
        one of the placeholder markers below.
    """
    invalids = [
        "nobody", "inbox", "webmaster", "platform",
        "unassigned", "issues", "needsconfirm", "swneedsconfirm",
    ]

    invalid_bugs = []
    invalid_devs = set()
    for bug_id, record in bug_msg_all.items():
        developer = record[0]
        lowered = developer.lower()
        # Substring match: the bug was fixed by an invalid developer.
        if any(marker in lowered for marker in invalids):
            invalid_bugs.append(bug_id)
            invalid_devs.add(developer)

    print('无效的bug条数={}'.format(len(invalid_bugs)))
    print('无效的开发者数目={}'.format(len(invalid_devs)))
    return invalid_bugs


if __name__ == '__main__':
    # key = bug_id, value = {assign_to creation_ts delta_ts product component}
    bug_msg_all, _ = data_helper.get_msg_all()
    # extract_vocabulary(bug_msg_all, PATH.path_vocabulary)
    del_invalid_developers(bug_msg_all)
# NOTE(review): the statements down to the second print() are the tail of a
# function whose header lies before this span. They are kept token-for-token
# in their original order; their original nesting cannot be recovered from
# here, so they are laid out flat.
actives = get_developer_active_sequence(bug_msg_all, all_bugids, id, len(all_bugids) - 1, train)
bugs_counts[id] = len(actives)
write_active_sequence_to_file(actives, sign, id)
print('计算消耗时间:', time.time() - start_time)
with open('../data/active_counts_{}.txt'.format(time.time()), 'w') as writer:
    for key in bugs_counts.keys():
        writer.write(str(key) + '\t' + str(bugs_counts[key]) + '\n')
print('最终结束时间:', time.time())


if __name__ == '__main__':
    # Split all bugs into time windows, then extract activity data for
    # window 0 only: the window itself as the training set and the following
    # window as the test set.
    bug_msg_all, _ = data_helper.get_msg_all()
    time_windows = data_helper.split_dataset_by_time_windows(bug_msg_all)
    for window_idx in [0]:
        current_window = time_windows[window_idx]
        # training set
        traverse_all_bugs(current_window, current_window, sign=window_idx, train=True)
        # test set
        traverse_all_bugs(current_window, time_windows[window_idx + 1], sign=window_idx, train=False)

    # Disabled debugging aid:
    # bugids = sorted(bug_msg_all.keys())  # ascending order
    # actives = get_developer_active_sequence(bug_msg_all, bugids, 6533)
    # for i in actives:
    #     print(i)