def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False, is_split=False):
    """Run the CN (comment-network) reviewer recommendation for one project/date case.

    This interface can also be driven by the hybrid algorithm.

    Args:
        date: 4-tuple (start_year, start_month, end_year, end_month); the months
            strictly before the last one form the training window, the last
            month is the test set.
        project: repository name used to locate the pre-generated TSV files.
        recommendNum: number of reviewers to recommend per pull request.
        filter_train: when True, load the 'change_trigger' filtered variant of
            the monthly data files for the training months.
        filter_test: same, but for the test month.
        is_split: when True, use the community-split variant of the CN recommender.

    Returns:
        (prList, convertDict, trainSize, communities_data) where trainSize is
        (train_data.shape, test_data.shape).
    """
    print(date)
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Decompose the linear month index into (year, month); a remainder of 0
        # means December of the previous year.
        y = i // 12
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        # Months before the last index are training data, the last is test data.
        use_trigger = filter_train if i < date[2] * 12 + date[3] else filter_test
        infix = '_change_trigger' if use_trigger else ''
        filename = projectConfig.getCNDataPath() + os.sep + \
                   f'CN_{project}_data{infix}_{y}_{m}_to_{y}_{m}.tsv'
        # The TSV files carry their own header row.
        temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        # NOTE(review): DataFrame.append was removed in pandas 2.x; migrate to
        # pandas.concat when the project upgrades its pandas dependency.
        df = temp if df is None else df.append(temp)
    df.reset_index(inplace=True, drop=True)

    # Pre-process the merged frame; this also builds the person-name mapping dict.
    train_data, train_data_y, test_data, test_data_y, convertDict = CNTrain.preProcess(df, date)
    # Fix: the original also pre-computed a sorted prList from test_data here,
    # which was immediately overwritten by the recommender's return value.
    if not is_split:
        prList, communities_data = CNTrain.RecommendByCN(project, date, train_data, train_data_y, test_data,
                                                         test_data_y, convertDict, recommendNum=recommendNum)
    else:
        prList, communities_data = CNTrain.RecommendByCNSplit(project, date, train_data, train_data_y, test_data,
                                                              test_data_y, convertDict, recommendNum=recommendNum)
    # Persist the whole-community recommendation result locally.
    DataProcessUtils.saveRecommendList(prList, communities_data['whole']['recommend_list'],
                                       communities_data['whole']['answer_list'], convertDict,
                                       communities_data['whole']['author_list'],
                                       key=project + str(date) + str(filter_train) + str(filter_test))
    # Also return the train/test set sizes for statistics.
    trainSize = (train_data.shape, test_data.shape)
    print(trainSize)
    return prList, convertDict, trainSize, communities_data
def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False, a=0.5):
    """Run the EAREC reviewer recommendation for one project/date case.

    Loads the monthly EAREC TSV files covering the whole time span, merges
    them, pre-processes the data and produces the recommendation list plus the
    ground-truth answers. This interface can also be driven by the hybrid
    algorithm.
    """
    print(date)
    start = date[0] * 12 + date[1]
    end = date[2] * 12 + date[3]
    df = None
    for index in range(start, end + 1):
        # Map the linear month counter back to a calendar (year, month);
        # month 0 is December of the previous year.
        year, month = divmod(index, 12)
        if month == 0:
            year, month = year - 1, 12
        # Training months honour filter_train; the final (test) month honours filter_test.
        trigger = filter_train if index < end else filter_test
        middle = '_change_trigger' if trigger else ''
        path = projectConfig.getEARECDataPath() + os.sep + \
               f'EAREC_{project}_data{middle}_{year}_{month}_to_{year}_{month}.tsv'
        # Each TSV ships with its own header.
        monthly = pandasHelper.readTSVFile(path, pandasHelper.INT_READ_FILE_WITH_HEAD)
        df = monthly if df is None else df.append(monthly)
    df.reset_index(inplace=True, drop=True)

    # Pre-processing also yields the person-name conversion dictionary.
    train_data, train_data_y, test_data, test_data_y, convertDict = EARECTrain.preProcess(df, date)
    prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
    recommendList, answerList, = EARECTrain.RecommendByEAREC(train_data, train_data_y, test_data, test_data_y,
                                                             convertDict, recommendNum=recommendNum, a=a)
    # Persist the recommendation list locally.
    DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict, key=project + str(date))

    # Also return the train/test set sizes for statistics.
    trainSize = (train_data.shape, test_data.shape)
    print(trainSize)
    return recommendList, answerList, prList, convertDict, trainSize
def algorithmBody(date, project, algorithmType, recommendNum=5, featureType=3, filter_train=False, filter_test=False):
    """Run one multi-label ML recommendation experiment for a project/date case.

    Merges the monthly ML feature files over the whole span, pre-processes
    them and dispatches to the algorithm selected by ``algorithmType``.
    """
    begin, finish = date[0] * 12 + date[1], date[2] * 12 + date[3]
    frames = None
    # Stitch the per-month feature files into a single DataFrame.
    for idx in range(begin, finish + 1):
        yr, mo = divmod(idx, 12)
        if mo == 0:
            yr -= 1
            mo = 12
        # Training months honour filter_train; the test month honours filter_test.
        filtered = filter_train if idx < finish else filter_test
        tag = '_change_trigger' if filtered else ''
        fname = projectConfig.getMLDataPath() + os.sep + \
                f'ML_ALL_{project}_data{tag}_{yr}_{mo}_to_{yr}_{mo}.tsv'
        # The data files come with a header row.
        chunk = pandasHelper.readTSVFile(fname, pandasHelper.INT_READ_FILE_WITH_HEAD)
        frames = chunk if frames is None else frames.append(chunk)
    frames.reset_index(inplace=True, drop=True)

    # Pre-processing also yields the test pull-number list.
    train_data, train_data_y, test_data, test_data_y, convertDict, prList = MLTrain.preProcess(
        frames, date, project, featureType, isNOR=True)
    print("train data:", train_data.shape)
    print("test data:", test_data.shape)

    recommendList, answerList = MultipleLabelAlgorithm. \
        RecommendByAlgorithm(train_data, train_data_y, test_data, test_data_y, algorithmType)
    trainSize = (train_data.shape[0], test_data.shape[0])

    # Persist the recommendation list locally.
    DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict, key=project + str(date))
    return recommendList, answerList, prList, convertDict, trainSize
def algorithmBody(date, project, recommendNum=5, alpha=0.98, K=20, c=1):
    """Run the HG (hypergraph) reviewer recommendation for one project/date case.

    Returns the recommendation list and the answer list; this interface can
    also be driven by the hybrid algorithm.
    """
    print(date)
    lo = date[0] * 12 + date[1]
    hi = date[2] * 12 + date[3]
    df = None
    for step in range(lo, hi + 1):
        # Map the linear month counter back to a calendar (year, month);
        # month 0 is December of the previous year.
        year, month = divmod(step, 12)
        if month == 0:
            year, month = year - 1, 12
        path = projectConfig.getHGDataPath() + os.sep + f'HG_ALL_{project}_data_{year}_{month}_to_{year}_{month}.tsv'
        # The TSV files already contain a header.
        piece = pandasHelper.readTSVFile(path, pandasHelper.INT_READ_FILE_WITH_HEAD)
        df = piece if df is None else df.append(piece)
    df.reset_index(inplace=True, drop=True)

    # Pre-processing also builds the person-name conversion dictionary.
    train_data, train_data_y, test_data, test_data_y, convertDict = HGTrain.preProcess(df, date)
    prList = sorted(set(test_data['pr_number']))
    recommendList, answerList, authorList = HGTrain.RecommendByHG(train_data, train_data_y, test_data, test_data_y,
                                                                  date, project, convertDict,
                                                                  recommendNum=recommendNum, alpha=alpha, K=K, c=c,
                                                                  useLocalPrDis=False)
    # Persist the recommendation result for later statistics.
    DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict, key=project + str(date),
                                       authorList=authorList)
    # Also return the train/test set sizes.
    trainSize = (train_data.shape[0], test_data.shape[0])
    print(trainSize)
    return recommendList, answerList, prList, convertDict, trainSize
def testCHREVAlgorithm(project, dates, filter_train=False, filter_test=False, error_analysis=False):
    """Evaluate the CHREV recommender over several date cases and write results to Excel.

    Each tuple in ``dates`` covers a whole time span whose last month is the
    test set.  Per-case Top-k / MRR / precision / recall / F-measure rows are
    appended to an Excel sheet, and cumulative statistics are written at the
    end.  When ``error_analysis`` is True, recommendations are additionally
    compared against the change-trigger-filtered answer lists.

    Args:
        project: repository name, used in data paths and output file names.
        dates: iterable of 4-tuples (start_year, start_month, end_year, end_month).
        filter_train: forwarded to the algorithm body — use change-trigger
            filtered training data.
        filter_test: forwarded to the algorithm body — use change-trigger
            filtered test data.
        error_analysis: when True, compute the four recommendation-error ratios.
    """
    # Multiple cases; each tuple covers the whole time span, the last month is used for testing.
    recommendNum = 5  # number of reviewers recommended per PR
    excelName = f'outputCHREV_{project}_{filter_train}_{filter_test}_{error_analysis}.xlsx'
    sheetName = 'result'

    """计算累积数据"""
    # Accumulators for the cumulative (cross-case) statistics.
    topks = []
    mrrs = []
    precisionks = []
    recallks = []
    fmeasureks = []
    recommend_positive_success_pr_ratios = []  # ratio of PRs containing a successfully recommended reviewer
    recommend_positive_success_time_ratios = []  # frequency ratio (PR x person) of successful recommendations
    recommend_negative_success_pr_ratios = []  # ratio of PRs whose hit reviewers were filtered out
    recommend_negative_success_time_ratios = []  # frequency ratio (PR x person) of hit-but-filtered recommendations
    recommend_positive_fail_pr_ratios = []  # ratio of PRs containing a wrongly recommended reviewer
    recommend_positive_fail_time_ratios = []  # frequency ratio (PR x person) of wrong recommendations
    recommend_negative_fail_pr_ratios = []  # ratio of PRs whose recommendation correctness is unknown
    recommend_negative_fail_time_ratios = []  # frequency ratio (PR x person) of unknown-correctness recommendations
    error_analysis_datas = None

    """初始化excel文件"""
    ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])
    for date in dates:
        startTime = datetime.now()
        """根据推荐列表做评价"""
        # Run the recommender for this case and judge the recommendations.
        recommendList, answerList, prList, convertDict, trainSize = CHREVTrain.algorithmBody(date, project,
                                                                                             recommendNum,
                                                                                             filter_test=filter_test,
                                                                                             filter_train=filter_train)
        topk, mrr, precisionk, recallk, fmeasurek = \
            DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

        topks.append(topk)
        mrrs.append(mrr)
        precisionks.append(precisionk)
        recallks.append(recallk)
        fmeasureks.append(fmeasurek)

        error_analysis_data = None
        filter_answer_list = None
        if error_analysis:
            y = date[2]
            m = date[3]
            # Answer list rebuilt from the change-trigger data of the test month.
            filename = projectConfig.getCHREVDataPath() + os.sep + f'CHREV_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerData(project, date, prList,
                                                                                     convertDict, filename,
                                                                                     'review_user_login', 'pr_number')
            # NOTE: an 8-metric variant of errorAnalysis (including the time
            # ratios) exists but is disabled; only the four PR-ratio metrics
            # are collected here.
            recommend_positive_success_pr_ratio, recommend_negative_success_pr_ratio, recommend_positive_fail_pr_ratio, \
            recommend_negative_fail_pr_ratio = DataProcessUtils.errorAnalysis(
                recommendList, answerList, filter_answer_list, recommendNum)
            error_analysis_data = [recommend_positive_success_pr_ratio, recommend_negative_success_pr_ratio,
                                   recommend_positive_fail_pr_ratio, recommend_negative_fail_pr_ratio]
            recommend_positive_success_pr_ratios.append(recommend_positive_success_pr_ratio)
            recommend_negative_success_pr_ratios.append(recommend_negative_success_pr_ratio)
            recommend_positive_fail_pr_ratios.append(recommend_positive_fail_pr_ratio)
            recommend_negative_fail_pr_ratios.append(recommend_negative_fail_pr_ratio)

        if error_analysis_data:
            # Keep the cumulative error-analysis lists in the same 4-metric order.
            error_analysis_datas = [recommend_positive_success_pr_ratios, recommend_negative_success_pr_ratios,
                                    recommend_positive_fail_pr_ratios, recommend_negative_fail_pr_ratios]

        """结果写入excel"""
        DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, precisionk, recallk, fmeasurek, date,
                                    error_analysis_data)

        """保存推荐结果到本地"""
        DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict,
                                           filter_answer_list=filter_answer_list,
                                           key=project + str(date) + str(filter_train) + str(filter_test))

        """文件分割"""
        # Visual separator row between cases in the Excel sheet.
        content = ['']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        content = ['训练集', '测试集']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        print("cost time:", datetime.now() - startTime)

    """推荐错误可视化"""
    # Visualize the recommendation errors across all cases.
    DataProcessUtils.recommendErrorAnalyzer2(error_analysis_datas, project, f'CHREV_{filter_train}_{filter_test}')

    """计算历史累积数据"""
    # Write the cumulative historical statistics.
    DataProcessUtils.saveFinallyResult(excelName, sheetName, topks, mrrs, precisionks, recallks,
                                       fmeasureks, error_analysis_datas)