def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False, is_split=False):
    """Given a single date tuple and a project name, return the recommendation
    lists and the answers.  This interface can also be called by hybrid algorithms.
    """
    print(date)
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        # print(y, m)

        if i < date[2] * 12 + date[3]:
            if filter_train:
                filename = projectConfig.getCNDataPath() + os.sep + f'CN_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getCNDataPath() + os.sep + f'CN_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        else:
            if filter_test:
                filename = projectConfig.getCNDataPath() + os.sep + f'CN_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getCNDataPath() + os.sep + f'CN_{project}_data_{y}_{m}_to_{y}_{m}.tsv'

        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # concatenate

    df.reset_index(inplace=True, drop=True)

    """Pre-process df"""
    """Also builds the user-name mapping dictionary"""
    train_data, train_data_y, test_data, test_data_y, convertDict = CNTrain.preProcess(df, date)

    if not is_split:
        prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
        prList.sort()
        prList, communities_data = CNTrain.RecommendByCN(project, date, train_data, train_data_y, test_data,
                                                         test_data_y, convertDict, recommendNum=recommendNum)
    else:
        prList, communities_data = CNTrain.RecommendByCNSplit(project, date, train_data, train_data_y, test_data,
                                                              test_data_y, convertDict, recommendNum=recommendNum)

    """Save the recommendation result locally"""
    DataProcessUtils.saveRecommendList(prList,
                                       communities_data['whole']['recommend_list'],
                                       communities_data['whole']['answer_list'],
                                       convertDict,
                                       communities_data['whole']['author_list'],
                                       key=project + str(date) + str(filter_train) + str(filter_test))

    # from source.scikit.combine.CBTrain import CBTrain
    # recommendList, answerList = CBTrain.recoverName(recommendList, answerList, convertDict)

    """Also return the train/test set sizes for statistics"""
    trainSize = (train_data.shape, test_data.shape)
    print(trainSize)

    return prList, convertDict, trainSize, communities_data
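# --- Hedged usage sketch (not part of the original source) ---
# The date tuple reads as (start_year, start_month, test_year, test_month): the
# months before the last one form the training window and the last month is the
# test month.  The project name and the presence of the CN_* TSV files under
# projectConfig.getCNDataPath() are assumptions for illustration only.
if __name__ == "__main__":
    prList, convertDict, trainSize, communities_data = algorithmBody(
        (2018, 1, 2018, 6),      # train on 2018.1-2018.5, test on 2018.6
        'rails',                 # hypothetical project name
        recommendNum=5,
        filter_train=False, filter_test=False, is_split=False)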
def algorithmBody(date, project, recommendNum=5, filter_train=True, filter_test=True, disMapList=None):
    """Given a single date tuple and a project name, return the recommendation
    lists and the answers.  This interface can also be called by hybrid algorithms.
    """
    print(date)
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        # print(y, m)

        if i < date[2] * 12 + date[3]:
            if filter_train:
                filename = projectConfig.getFPSDataPath() + os.sep + f'FPS_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getFPSDataPath() + os.sep + f'FPS_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        else:
            if filter_test:
                filename = projectConfig.getFPSDataPath() + os.sep + f'FPS_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getFPSDataPath() + os.sep + f'FPS_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'

        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # concatenate

    df.reset_index(inplace=True, drop=True)

    """Pre-process df"""
    """Also builds the user-name mapping dictionary"""
    train_data, train_data_y, test_data, test_data_y, convertDict = FPSTrain.preProcess(df, date)

    prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
    """2020.8.1  The FPS pr order used to be descending; it is now ascending so the
    recommendation list can be compared with those of the other algorithms."""
    prList.sort()

    recommendList, answerList = FPSAlgorithm.RecommendByFPS(train_data, train_data_y, test_data,
                                                            test_data_y, recommendNum=recommendNum,
                                                            disMapList=disMapList)

    """Also return the train/test set sizes for statistics"""
    trainSize = (train_data.shape, test_data.shape)
    print(trainSize)

    # """Write the recommendation list to a file"""
    # DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict)

    return recommendList, answerList, prList, convertDict, trainSize
def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False, a=0.5):
    """Given a single date tuple and a project name, return the recommendation
    lists and the answers.  This interface can also be called by hybrid algorithms.
    """
    print(date)
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        # print(y, m)

        if i < date[2] * 12 + date[3]:
            if filter_train:
                filename = projectConfig.getEARECDataPath() + os.sep + f'EAREC_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getEARECDataPath() + os.sep + f'EAREC_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        else:
            if filter_test:
                filename = projectConfig.getEARECDataPath() + os.sep + f'EAREC_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getEARECDataPath() + os.sep + f'EAREC_{project}_data_{y}_{m}_to_{y}_{m}.tsv'

        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # concatenate

    df.reset_index(inplace=True, drop=True)

    """Pre-process df"""
    """Also builds the user-name mapping dictionary"""
    train_data, train_data_y, test_data, test_data_y, convertDict = EARECTrain.preProcess(df, date)

    prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
    # prList.sort()

    recommendList, answerList = EARECTrain.RecommendByEAREC(train_data, train_data_y, test_data,
                                                            test_data_y, convertDict,
                                                            recommendNum=recommendNum, a=a)

    """Save the recommendation result locally"""
    DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict,
                                       key=project + str(date))

    """Also return the train/test set sizes for statistics"""
    trainSize = (train_data.shape, test_data.shape)
    print(trainSize)

    return recommendList, answerList, prList, convertDict, trainSize
def checkPRTimeLineResult(owner, repo, limit=5):
    """Check whether the PRTimeline data has been fetched completely."""
    """1. Get all pr_nodes of the repository"""
    repo_fullname = owner + "/" + repo
    pr_nodes = AsyncProjectAllDataFetcher.getPullRequestNodes(repo_fullname)
    pr_nodes = list(pr_nodes)
    pr_nodes = [node[0] for node in pr_nodes]

    """2. Read the prtimeline file and compare it against the prs"""
    target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
    df = pandasHelper.readTSVFile(fileName=target_filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD)

    """3. Work out which prs still need to be fetched"""
    fetched_prs = list(df['pullrequest_node'])
    need_fetch_prs = list(set(pr_nodes).difference(set(fetched_prs)))
    Logger.logi("there are {0} pr_timeline need to fetch".format(need_fetch_prs.__len__()))

    """Set the fetch parameters"""
    pos = 0
    fetchLimit = 200
    size = need_fetch_prs.__len__()
    while pos < size:
        sub_need_fetch_prs = need_fetch_prs[pos:pos + fetchLimit]
        Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size))
        """4. Start fetching"""
        results = AsyncProjectAllDataFetcher.getPullRequestTimeLine(owner=owner, repo=repo,
                                                                    nodes=sub_need_fetch_prs)
        # clamp the progress count so the last batch does not over-report
        Logger.logi("successfully fetched {0} pr! ".format(min(pos + fetchLimit, size)))
        pos += fetchLimit
def loadLocalPrDistance(project):
    base = projectConfig.getPullRequestDistancePath() + os.sep

    def readDisMap(metric):
        """Read one pr-distance TSV and build a symmetric (p1, p2) -> distance map."""
        df = pandasHelper.readTSVFile(base + f"pr_distance_{project}_{metric}.tsv",
                                      header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        disMap = {}
        for row in df.itertuples(index=False, name='Pandas'):
            p1, p2, dis = row[0], row[1], row[2]
            disMap[(p1, p2)] = dis
            disMap[(p2, p1)] = dis
        return disMap

    DisMapLCP = readDisMap("LCP")
    DisMapLCS = readDisMap("LCS")
    DisMapLCSubseq = readDisMap("LCSubseq")
    DisMapLCSubstr = readDisMap("LCSubstr")
    # note the return order: LCS first, then LCP
    return [DisMapLCS, DisMapLCP, DisMapLCSubseq, DisMapLCSubstr]
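# --- Hedged usage sketch (not part of the original source) ---
# Because both key orders are inserted, lookups are symmetric; the project name
# and pr ids below are made up for illustration.
disMaps = loadLocalPrDistance('rails')       # hypothetical project
DisMapLCS = disMaps[0]
d = DisMapLCS.get((1024, 2048), 0)           # same value as DisMapLCS.get((2048, 1024), 0)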
def algorithmBody(date, project, algorithmType, recommendNum=5, featureType=3, filter_train=False,
                  filter_test=False):
    df = None
    """Merge the required monthly files"""
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1

        if i < date[2] * 12 + date[3]:
            if filter_train:
                filename = projectConfig.getMLDataPath() + os.sep + f'ML_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getMLDataPath() + os.sep + f'ML_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        else:
            if filter_test:
                filename = projectConfig.getMLDataPath() + os.sep + f'ML_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getMLDataPath() + os.sep + f'ML_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'

        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # concatenate

    df.reset_index(inplace=True, drop=True)

    """Pre-process df"""
    """Also returns the list of test pull numbers"""
    train_data, train_data_y, test_data, test_data_y, convertDict, prList = MLTrain.preProcess(df, date, project,
                                                                                               featureType,
                                                                                               isNOR=True)
    print("train data:", train_data.shape)
    print("test data:", test_data.shape)

    recommendList, answerList = MultipleLabelAlgorithm. \
        RecommendByAlgorithm(train_data, train_data_y, test_data, test_data_y, algorithmType)
    trainSize = (train_data.shape[0], test_data.shape[0])

    """Save the recommendation result locally"""
    DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict, key=project + str(date))

    return recommendList, answerList, prList, convertDict, trainSize
def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False):
    """Given a single date tuple and a project name, return the recommendation
    lists and the answers.  This interface can also be called by hybrid algorithms.
    """
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        print(y, m)

        if i < date[2] * 12 + date[3]:
            if filter_train:
                filename = projectConfig.getXFDataPath() + os.sep + f'XF_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getXFDataPath() + os.sep + f'XF_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        else:
            if filter_test:
                filename = projectConfig.getXFDataPath() + os.sep + f'XF_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getXFDataPath() + os.sep + f'XF_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'

        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # concatenate

    df.reset_index(inplace=True, drop=True)

    """Pre-process df"""
    """2020.4.11: preProcess now also returns the test pr list"""
    train_data, train_data_y, test_data, test_data_y, convertDict = XFTrain.preProcess(df, date)

    prList = list(set(test_data['pr_number']))
    prList.sort()

    """Get the recommendation list from the algorithm"""
    recommendList, answerList = XFTrain.RecommendByXF(train_data, train_data_y, test_data,
                                                      test_data_y, recommendNum=recommendNum)
    trainSize = (train_data.shape[0], test_data.shape[0])

    return recommendList, answerList, prList, convertDict, trainSize
def algorithmBody(date, project, recommendNum=5, alpha=0.98, K=20, c=1):
    """Given a single date tuple and a project name, return the recommendation
    lists and the answers.  This interface can also be called by hybrid algorithms.
    """
    print(date)
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        # print(y, m)

        filename = projectConfig.getHGDataPath() + os.sep + f'HG_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'

        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # concatenate

    df.reset_index(inplace=True, drop=True)

    """Pre-process df"""
    """Also builds the user-name mapping dictionary"""
    train_data, train_data_y, test_data, test_data_y, convertDict = HGTrain.preProcess(df, date)

    prList = list(set(test_data['pr_number']))
    prList.sort()

    recommendList, answerList, authorList = HGTrain.RecommendByHG(train_data, train_data_y, test_data,
                                                                  test_data_y, date, project, convertDict,
                                                                  recommendNum=recommendNum, alpha=alpha,
                                                                  K=K, c=c, useLocalPrDis=False)

    """Save the recommendation result for statistics"""
    DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict,
                                       key=project + str(date), authorList=authorList)

    """Also return the train/test set sizes"""
    trainSize = (train_data.shape[0], test_data.shape[0])
    print(trainSize)

    return recommendList, answerList, prList, convertDict, trainSize
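# --- Hedged refactoring sketch (not part of the original source) ---
# Every algorithmBody above decodes a linear month index i back into (year, month)
# with the same three lines.  A small generator would centralize that arithmetic;
# iterMonths is a suggested helper name, not an existing function in this repo.
def iterMonths(date):
    """Yield (year, month) for every month in the closed range described by date."""
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        y, m = divmod(i, 12)
        if m == 0:
            y, m = y - 1, 12
        yield y, m

# e.g. list(iterMonths((2019, 11, 2020, 2))) == [(2019, 11), (2019, 12), (2020, 1), (2020, 2)]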
def testMLAlgorithms(project, dates, algorithm):
    """
    Test-harness interface that unifies algorithms with a similar workflow.
    algorithm: svm, dt, rf
    """
    recommendNum = 5  # number of recommendations
    excelName = f'output{algorithm}.xlsx'
    sheetName = 'result'

    """Initialize the excel file"""
    ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName,
                                excel_key_list=['training set', 'test set'])

    for date in dates:
        startTime = datetime.now()

        """Read the file variant without path information"""
        filename = projectConfig.getRootPath() + os.sep + 'data' + os.sep + 'train' + os.sep + \
                   f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}.tsv'
        df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITHOUT_HEAD)
        print("raw df:", df.shape)

        # """Read the file variant that includes path information"""
        # filename = projectConfig.getRootPath() + os.sep + r'data' + os.sep + 'train' + os.sep + \
        #            f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}_include_filepath.csv'
        # df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD,
        #                               sep=StringKeyUtils.STR_SPLIT_SEP_CSV)

        """Pre-process df"""
        train_data, train_data_y, test_data, test_data_y = MLTrain.preProcessForSingleLabel(df, date, project,
                                                                                            isNOR=True)
        recommendList = None
        answerList = None

        """Get the recommendation list from the chosen algorithm"""
        if algorithm == StringKeyUtils.STR_ALGORITHM_SVM:  # support vector machine
            recommendList, answerList = MLTrain.RecommendBySVM(train_data, train_data_y, test_data,
                                                               test_data_y, recommendNum=recommendNum)
        elif algorithm == StringKeyUtils.STR_ALGORITHM_DT:  # decision tree
            recommendList, answerList = MLTrain.RecommendByDecisionTree(train_data, train_data_y, test_data,
                                                                        test_data_y, recommendNum=recommendNum)
        elif algorithm == StringKeyUtils.STR_ALGORITHM_RF:  # random forest
            recommendList, answerList = MLTrain.RecommendByRandomForest(train_data, train_data_y, test_data,
                                                                        test_data_y, recommendNum=recommendNum)

        """Evaluate the recommendation list"""
        topk, mrr = DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

        """Write the result to excel"""
        DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, date)

        """Separator rows"""
        content = ['']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        content = ['training set', 'test set']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())

        print("cost time:", datetime.now() - startTime)
def algorithmBody(date, project, recommendNum=5, response_limit_time=8, active_limit_time=10):
    """Given a single date tuple and a project name, return the recommendation
    lists and the answers.  This interface can also be called by hybrid algorithms.
    """
    print(date)
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        # print(y, m)

        filename = projectConfig.getGADataPath() + os.sep + f'GA_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'

        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # concatenate

    df.reset_index(inplace=True, drop=True)

    """Pre-process df"""
    """Also builds the user-name mapping dictionary"""
    train_data, train_data_y, test_data, test_data_y, convertDict = GATrain.preProcess(df, date)

    prList = list(test_data_y.keys())
    prList.sort(reverse=False)

    recommendList, answerList = GATrain.RecommendByGA(train_data, train_data_y, test_data, test_data_y,
                                                      recommendNum=recommendNum,
                                                      response_limit_time=response_limit_time,
                                                      active_limit_time=active_limit_time)

    """Also return the train/test set sizes for statistics"""
    trainSize = (list(set(train_data['pr_number'])).__len__(), list(set(test_data['pr_number'])).__len__())
    print(trainSize)

    return recommendList, answerList, prList, convertDict, trainSize
def processFileNameVector(filename):
    """
    Compute tf-idf by hand.
    @param filename: file to read (the data variant that includes "include_filepath")
    @return: df: dataframe with path weights attached, directly usable by ML algorithms
    """
    # read the df that contains the filename column
    df = pandasHelper.readTSVFile(fileName=filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD,
                                  sep=StringKeyUtils.STR_SPLIT_SEP_CSV)

    # prs that contain each sub-path s
    sub2pr = {}
    # occurrence count of each s per pr; structure: key: prNumber, value: {s: 2}
    pr2sub = {}
    for index, row in df.iterrows():
        subs = splitFileName(row['filename'])
        for sub in subs:
            if sub not in sub2pr:
                sub2pr[sub] = set()
            # record that sub appears in this pr
            sub2pr[sub].add(row['pr_number'])
            if row['pr_number'] not in pr2sub:
                pr2sub[row['pr_number']] = {}
            if sub not in pr2sub[row['pr_number']]:
                pr2sub[row['pr_number']][sub] = 0
            # occurrence count of sub in this pr += 1
            pr2sub[row['pr_number']][sub] += 1

    # every sub-path that ever appeared becomes a feature dimension
    path_vector = list(sub2pr.keys())

    # weight(pr, s) = (occurrences of s in pr) * (log(#prs / #prs containing s) + 1)
    # note: list.extend returns None, so the column list must be built with +
    pr_path_weight_df_columns = ['pr_number'] + path_vector
    pr_path_weight_df = pandas.DataFrame(columns=pr_path_weight_df_columns)
    # total number of prs
    nt = len(pr2sub.keys())
    for pr in pr2sub:
        new_row = {'pr_number': pr}
        for sub in sub2pr:
            # occurrences of s in this pr (tf); 0 when the pr does not touch s
            tf = 0
            if sub in pr2sub[pr]:
                tf = pr2sub[pr][sub]
            # number of prs in which s appears
            pr_cnt = len(sub2pr[sub])
            idf = math.log(nt / pr_cnt) + 1
            # weight of s for this pr
            pr_s_weight = tf * idf
            new_row[sub] = pr_s_weight
        pr_path_weight_df = pr_path_weight_df.append([new_row], ignore_index=True)

    # join pr_path_weight_df onto the original df via pr_number
    df = pandas.merge(df, pr_path_weight_df, on="pr_number", how="left")
    return df
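# --- Hedged worked example (not part of the original source) ---
# With 4 prs total (nt = 4) and a path token 'src' that occurs in 2 of them,
# a pr containing 'src' 3 times gets:
#   tf = 3, idf = log(4 / 2) + 1 ≈ 1.693, weight = 3 * 1.693 ≈ 5.079
import math
assert abs(3 * (math.log(4 / 2) + 1) - 5.0794) < 1e-3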
def algorithmBody(date, project, recommendNum=5):
    """Given a single date tuple and a project name, return the recommendation
    lists and the answers.  This interface can also be called by hybrid algorithms.
    """
    print(date)
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        # print(y, m)

        filename = projectConfig.getCFDataPath() + os.sep + f'CF_{project}_data_{y}_{m}_to_{y}_{m}.tsv'

        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # concatenate

    df.reset_index(inplace=True, drop=True)

    """Pre-process df"""
    """Also builds the user-name mapping dictionary"""
    train_data, train_data_y, test_data, test_data_y, convertDict = CFTrain.preProcess(df, date)

    prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
    prList.sort()

    recommendList, answerList = CFTrain.RecommendByCF(date, train_data, train_data_y, test_data,
                                                      test_data_y, convertDict, recommendNum=recommendNum)

    """Also return the train/test set sizes for statistics"""
    trainSize = (train_data.shape, test_data.shape)
    print(trainSize)

    return recommendList, answerList, prList, convertDict, trainSize
def algorithmBody(date, project, recommendNum=5):
    """Given a single date tuple and a project name, return the recommendation
    lists and the answers.  This interface can also be called by hybrid algorithms.
    """
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        print(y, m)

        filename = projectConfig.getCADataPath() + os.sep \
                   + f'CA_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # concatenate

    df.reset_index(inplace=True, drop=True)

    """Pre-process df"""
    """2020.4.11: preProcess now also returns the test pr list"""
    # train_data, train_data_y, test_data, test_data_y, convertDict = CATrain.preProcess(df, date)
    #
    # prList = list(test_data['pr_number'])
    #
    # """Get the recommendation list from the algorithm"""
    # recommendList, answerList = IRTrain.RecommendByIR(train_data, train_data_y, test_data,
    #                                                   test_data_y, recommendNum=recommendNum)
    # trainSize = (train_data.shape[0], test_data.shape[0])
    # return recommendList, answerList, prList, convertDict, trainSize

    CATrain.preProcess(df, date)
def loadLocalPrDistance(project):
    prDisDf_FPS = pandasHelper.readTSVFile(projectConfig.getPullRequestDistancePath() + os.sep +
                                           f"pr_distance_{project}_FPS.tsv",
                                           header=pandasHelper.INT_READ_FILE_WITH_HEAD)
    DisMapFPS = {}
    for row in prDisDf_FPS.itertuples(index=False, name='Pandas'):
        p1 = row[0]
        p2 = row[1]
        dis = row[2]
        DisMapFPS[(p1, p2)] = dis
    return DisMapFPS
def pr_review_ratio():
    """Plot the distribution of the number of reviews per pull request."""
    train_path = projectConfig.getDataTrainPath()
    filename = os.path.join(train_path, 'pr_review_ratio_akka.tsv')
    df = pandasHelper.readTSVFile(filename)
    # MLTrain.getSeriesBarPlot(df[1])

    import matplotlib.pyplot as plt

    fig = plt.figure()
    # fig.add_subplot(2, 1, 1)
    counts = df[1].value_counts()
    print(counts)
    counts.sort_index().plot(kind='bar')
    plt.title('Number of reviews per pull-request in project akka')
    # the x axis carries the review count and the y axis the number of pull-requests
    plt.xlabel('number of reviews')
    plt.ylabel('number of pull-requests')
    plt.show()
def attachFileNameToOriginData(project, date):
    """
    Attach file information to the training data.
    @rtype: None
    """
    print("-----------------start------------------")
    start_time = datetime.now()

    # training data path
    train_data_path = projectConfig.getRootPath() + os.sep + r'data' + os.sep + 'train' + os.sep
    # table file paths
    origin_filepath = train_data_path + f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}.tsv'
    target_filepath = train_data_path + f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}_include_filepath.csv'

    # read the original table
    origin_df = pandasHelper.readTSVFile(origin_filepath, pandasHelper.INT_READ_FILE_WITHOUT_HEAD)
    # original table header
    columns = ['reviewer_reviewer', 'pr_number', 'review_id', 'commit_sha', 'author', 'pr_created_at',
               'pr_commits', 'pr_additions', 'pr_deletions', 'pr_head_label', 'pr_base_label',
               'review_submitted_at', 'commit_status_total', 'commit_status_additions',
               'commit_status_deletions', 'commit_files', 'author_review_count', 'author_push_count',
               'author_submit_gap']
    origin_df.columns = columns
    print("fetch origin data success!")

    print("start fetching commit_file data from mysql......")
    # fetch the commitFiles DataFrame (file information for every commit) from the database
    results = query(project)
    commit_files = results[0]
    cur_time = datetime.now()
    print("fetch commit_file data success! cur_cost_time: ", cur_time - start_time)

    # merge the original data and the commit files on commit_sha
    new_df = pandas.merge(origin_df, commit_files, on="commit_sha", how="left")
    new_df.to_csv(target_filepath, encoding='utf-8', index=False, header=True)
    print("attach commit_file data to origin data success! result output to :" + target_filepath)
    print("-----------------finish------------------")
def testChangeTriggerAnalyzer(owner, repo, pull_request_node):
    AsyncApiHelper.setRepo(owner, repo)

    """Read PRTimeline to get the list of prs whose change_trigger needs analyzing"""
    pr_timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
    pr_timeline_df = pandasHelper.readTSVFile(fileName=pr_timeline_filename,
                                              header=pandasHelper.INT_READ_FILE_WITH_HEAD)
    pr_nodes = list(set(list(pr_timeline_df['pullrequest_node'])))
    pr_nodes.sort()

    """Take the subset for the requested pr"""
    pr_timeline_items = pr_timeline_df[pr_timeline_df['pullrequest_node'] == pull_request_node]

    """Group the subset by pull_request_node"""
    grouped_timeline = pr_timeline_items.groupby(['pullrequest_node'])

    """Store the grouping result as {pr -> pr_timeline_items}"""
    formated_data = []
    for pr, group in grouped_timeline:
        record = group.to_dict(orient='records')
        record = sorted(record, key=lambda x: int(x.get(StringKeyUtils.STR_KEY_POSITION)))
        formated_data.append(record)

    """Analyze the timelines of these prs"""
    pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formated_data)
    print("finish!")
def testBayesAlgorithms(project, dates):
    # Input: test dates and the corresponding file sequence; output: the performance of the whole algorithm
    recommendNum = 5  # number of recommendations
    excelName = 'outputNB.xlsx'
    sheetName = 'result'

    """Initialize the excel file"""
    ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName,
                                excel_key_list=['training set', 'test set'])

    for i in range(1, 4):  # Bayes has three model variants
        for date in dates:
            filename = projectConfig.getRootPath() + os.sep + 'data' + os.sep + 'train' + os.sep + \
                       f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}.tsv'
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITHOUT_HEAD)

            """Pre-process df"""
            isNOR = True
            if i == 1 or i == 3:
                isNOR = False  # no normalization for the Bernoulli variant
            train_data, train_data_y, test_data, test_data_y = MLTrain.preProcessForSingleLabel(df, date, project,
                                                                                                isNOR=isNOR)
            """Get the recommendation list from the algorithm"""
            recommendList, answerList = MLTrain.RecommendByNativeBayes(train_data, train_data_y, test_data,
                                                                       test_data_y, recommendNum, i)

            """Evaluate the recommendation list"""
            topk, mrr = DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

            """Write the result to excel"""
            DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, date)

        """Separator rows"""
        content = ['']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        content = ['training set', 'test set']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False,
                  test_type=StringKeyUtils.STR_TEST_TYPE_SLIDE):
    """Given a single date tuple and a project name, return the recommendation
    lists and the answers.  This interface can also be called by hybrid algorithms.
    """
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        print(y, m)

        filename = None
        if test_type == StringKeyUtils.STR_TEST_TYPE_SLIDE:
            if i < date[2] * 12 + date[3]:
                if filter_train:
                    filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            else:
                if filter_test:
                    filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        elif test_type == StringKeyUtils.STR_TEST_TYPE_INCREMENT:
            if filter_test:
                filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'

        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # concatenate

    df.reset_index(inplace=True, drop=True)

    if test_type == StringKeyUtils.STR_TEST_TYPE_SLIDE:
        """Pre-process df"""
        """2020.4.11: preProcess now also returns the test pr list"""
        train_data, train_data_y, test_data, test_data_y, convertDict = IR_ACTrain.preProcessBySlide(df, date)

        prList = list(test_data['pr_number'])

        """Get the recommendation list from the algorithm"""
        recommendList, answerList = IR_ACTrain.RecommendByIR_AC_SLIDE(train_data, train_data_y, test_data,
                                                                      test_data_y, recommendNum=recommendNum)
        trainSize = (train_data.shape[0], test_data.shape[0])
        return recommendList, answerList, prList, convertDict, trainSize

    elif test_type == StringKeyUtils.STR_TEST_TYPE_INCREMENT:
        """Pre-process df"""
        """Also builds the user-name mapping dictionary"""
        test_data, test_data_y, convertDict = IR_ACTrain.preProcessByIncrement(df, date)

        prList = list(test_data.drop_duplicates(['pr_number'])['pr_number'])
        """In incremental prediction the first pr is not predicted"""
        prList.sort()
        prList.pop(0)

        recommendList, answerList = IR_ACTrain.RecommendByIR_AC_INCREMENT(test_data, test_data_y,
                                                                          recommendNum=recommendNum)

        """Also return the test set size for statistics"""
        trainSize = test_data.shape
        print(trainSize)

        # """Write the recommendation list to a file"""
        # DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict)

        return recommendList, answerList, prList, convertDict, trainSize
def demo():
    data = pandasHelper.readTSVFile(projectConfig.getFPSTestData(), pandasHelper.INT_READ_FILE_WITHOUT_HEAD)
    print("input data:", data.shape)

    startTime = datetime.now()
    # print(DataFrameColumnUtils.COLUMN_REVIEW_FPS)

    """Load the pullrequest, review, file and commit data"""
    pullrequests, pullrequestsIndex = \
        BeanNumpyHelper.getBeansFromDataFrame(PullRequest(),
                                              DataFrameColumnUtils.COLUMN_REVIEW_FPS_PULL_REQUEST,
                                              data)
    # if configPraser.getPrintMode():
    #     print(pullrequests.__len__())
    #     print(pullrequestsIndex)
    time2 = datetime.now()
    print("pull request cost time:", time2 - startTime)

    reviews, reviewsIndex = BeanNumpyHelper.getBeansFromDataFrame(Review(),
                                                                  DataFrameColumnUtils.COLUMN_REVIEW_FPS_REVIEW,
                                                                  data)
    time3 = datetime.now()
    print("review cost time:", time3 - time2)
    if configPraser.getPrintMode():
        print(reviews)
        print(reviewsIndex)

    commits, commitsIndex = BeanNumpyHelper.getBeansFromDataFrame(Commit(),
                                                                  DataFrameColumnUtils.COLUMN_REVIEW_FPS_COMMIT,
                                                                  data)
    time4 = datetime.now()
    print("commits cost time:", time4 - time3)
    # if configPraser.getPrintMode():
    #     print(commits)
    #     print(commitsIndex)

    files, filesIndex = BeanNumpyHelper.getBeansFromDataFrame(File(),
                                                              DataFrameColumnUtils.COLUMN_REVIEW_FPS_FILE,
                                                              data)
    time5 = datetime.now()
    print("file cost time:", time5 - time4)
    # if configPraser.getPrintMode():
    #     print(files)
    #     print(filesIndex)

    pullrequestReviewIndex = BeanNumpyHelper.beanAssociate(pullrequests,
                                                           [StringKeyUtils.STR_KEY_REPO_FULL_NAME,
                                                            StringKeyUtils.STR_KEY_NUMBER],
                                                           reviews,
                                                           [StringKeyUtils.STR_KEY_REPO_FULL_NAME,
                                                            StringKeyUtils.STR_KEY_PULL_NUMBER])
    time6 = datetime.now()
    print("pull request index time:", time6 - time5)
    # if configPraser.getPrintMode():
    #     print(pullrequestReviewIndex)

    reviewCommitIndex = BeanNumpyHelper.beanAssociate(reviews, [StringKeyUtils.STR_KEY_COMMIT_ID],
                                                      commits, [StringKeyUtils.STR_KEY_SHA])
    time7 = datetime.now()
    print("commits index cost time:", time7 - time6)
    # if configPraser.getPrintMode():
    #     print(reviewCommitIndex)

    commitFileIndex = BeanNumpyHelper.beanAssociate(commits, [StringKeyUtils.STR_KEY_SHA],
                                                    files, [StringKeyUtils.STR_KEY_COMMIT_SHA])
    time8 = datetime.now()
    print("files index cost time:", time8 - time7)
    # if configPraser.getPrintMode():
    #     print(commitFileIndex)

    receiveTime = datetime.now()
    print("load cost time:", receiveTime - startTime)

    """Collect results for evaluation"""
    recommendList = []
    answerList = []

    testNumber = configPraser.getTestNumber()

    if configPraser.getFPSCtypes():
        """Call the dll library for better performance"""
        dll = CDLL("cFPS.dll")
        dll.addf.restype = c_float
        dll.addf.argtypes = [c_float, c_float]
        print(dll.addf(10, 30))

        c_prs = FPSClassCovert.convertPullRequest(pullrequests)
        c_reviews = FPSClassCovert.convertReview(reviews)
        c_commits = FPSClassCovert.convertCommit(commits)
        c_files = FPSClassCovert.convertFile(files)
        c_result = c_fps_result()

        print(c_prs)
        print(c_reviews)
        print(c_commits)
        print(c_files)

        dll.FPS.restype = None
        dll.FPS.argtypes = (POINTER(c_fps_pr), c_int, POINTER(c_fps_review), c_int,
                            POINTER(c_fps_commit), c_int, POINTER(c_fps_file), c_int,
                            POINTER(c_fps_result), c_int, c_int)

        prs_num = c_prs.__len__()
        p_c_prs = (c_fps_pr * prs_num)(*c_prs)
        reviews_num = c_reviews.__len__()
        p_c_reviews = (c_fps_review * reviews_num)(*c_reviews)
        commits_num = c_commits.__len__()
        p_c_commits = (c_fps_commit * commits_num)(*c_commits)
        files_num = c_files.__len__()
        p_c_files = (c_fps_file * files_num)(*c_files)

        dll.FPS(p_c_prs, prs_num, p_c_reviews, reviews_num, p_c_commits, commits_num,
                p_c_files, files_num, pointer(c_result), 0, 10, True)

        endTime = datetime.now()
        print("total cost time:", endTime - startTime, " recommend cost time:", endTime - receiveTime)
        print("answer:", str(c_result.answer, encoding='utf-8'))
        print("recommend:", str(c_result.recommend, encoding='utf-8'))
    else:
        """Pure-Python implementation of the algorithm"""
        for pos in range(0, testNumber):
            """Get the recommendation list from the review algorithm"""
            candicateList, authorList = FPSAlgorithm.reviewerRecommend(pullrequests, pullrequestsIndex,
                                                                       reviews, reviewsIndex,
                                                                       commits, commitsIndex,
                                                                       files, filesIndex,
                                                                       pullrequestReviewIndex,
                                                                       reviewCommitIndex,
                                                                       commitFileIndex, pos,
                                                                       configPraser.getReviewerNumber())
            print("candicateList", candicateList)

            endTime = datetime.now()
            print("total cost time:", endTime - startTime, " recommend cost time:", endTime - receiveTime)
            recommendList.append(candicateList)
            answerList.append(authorList)
def appendFilePathFeatureVector(inputDf, projectName, date, pull_number_name):
    """
    Use a tf-idf model over the paths of the files touched by all of a pr's commits.
    Note: the file changes come from the changeFiles directly associated with the pull request.  2020.7.7
    @description: appends the tf-idf feature vector built from the pr's file paths to the given dataframe
    @notice: the dataframe must have a pull_number id column; duplicates are allowed
    @param inputDf: pre-loaded dataframe
    @param projectName: project name
    @param date: four-tuple (start year, start month, end year, end month)
    @return: df: dataframe with path weights attached, directly usable by ML algorithms
    """
    """Check that the input df has a label column"""
    if 'label' not in inputDf.columns:
        raise Exception("label not in input dataframe!")

    df = inputDf[[pull_number_name]].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.columns = ['pr_number']

    """Read the commit-pr relation file"""
    time1 = datetime.now()
    pr_change_file_path = projectConfig.getPRChangeFilePath()

    """pr_change_file comes from the database and carries its own header"""
    prChangeFileData = pandasHelper.readTSVFile(
        os.path.join(pr_change_file_path, f'ALL_{projectName}_data_pr_change_file.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )

    """Join the tables"""
    df = pandas.merge(df, prChangeFileData, left_on='pr_number', right_on='pull_number')
    print("merge relation:", df.shape)
    df = df[['pr_number', 'filename']].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    print("after merge:", df.shape)

    """Build the filepath -> sub_filepath mapping"""
    file_path_list = set(df['filename'].copy(deep=True))
    file_path_dict = {}
    for file_path in file_path_list:
        sub_file_path = splitFileName(file_path)
        if file_path not in file_path_dict:
            file_path_dict[file_path] = set()
        file_path_dict[file_path] = file_path_dict[file_path].union(sub_file_path)

    """Build the pr_number -> sub_filepath corpus"""
    pr_to_file_path = df[['pr_number', 'filename']]
    # group by pr_number to get the raw corpus (filepaths before tokenization)
    groups = dict(list(pr_to_file_path.groupby('pr_number')))
    # build the target corpus (after custom tokenization)
    pr_file_path_corpora = []
    for pr in groups:
        paths = list(groups[pr]['filename'])
        sub_paths = list(map(lambda x: list(file_path_dict[x]), paths))
        sub_paths = reduce(lambda x, y: x + y, sub_paths)
        pr_file_path_corpora.append(sub_paths)

    """Compute tf-idf"""
    print("start tf_idf algorithm......")
    # build the dictionary
    dictionary = corpora.Dictionary(pr_file_path_corpora)
    # build the new corpus from the dictionary
    corpus = [dictionary.doc2bow(text) for text in pr_file_path_corpora]
    # train the TF-IDF model on the corpus
    tf_idf_model = models.TfidfModel(corpus)
    # get the weighted matrix
    path_tf_tdf = list(tf_idf_model[corpus])

    """Process path_tf_tdf to build the weighted pr-path matrix"""
    print("start merge tf_idf to origin_df......")
    pr_list = list(groups.keys())
    columns = ['pr_number']
    path_ids = list(dictionary.token2id.values())
    path_ids = list(map(lambda x: str(x), path_ids))
    columns.extend(path_ids)
    pr_path_weight_df = pandas.DataFrame(columns=columns).fillna(value=0)
    for index, row in enumerate(path_tf_tdf):
        """Fill the dataframe row from a dict"""
        new_row = {'pr_number': pr_list[index]}
        row = list(map(lambda x: (str(x[0]), x[1]), row))
        path_weight = dict(row)
        new_row = dict(new_row, **path_weight)
        pr_path_weight_df = pr_path_weight_df.append(new_row, ignore_index=True)
    pr_path_weight_df = pr_path_weight_df.fillna(value=0)
    print(pr_path_weight_df.shape)

    """Before the PCA reduction, pr_path_weight_df must be split so the training
    and test sets are handled separately"""
    tempData = pr_path_weight_df.copy(deep=True)
    labelData = inputDf[['pr_number', 'label']].drop_duplicates().copy(deep=True)
    tempData = pandas.merge(tempData, labelData, on='pr_number')
    tempData_train = tempData.loc[tempData['label'] == 0].copy(deep=True)
    tempData_test = tempData.loc[tempData['label'] == 1].copy(deep=True)
    tempData_train.drop(columns=['pr_number', 'label'], inplace=True)
    tempData_test.drop(columns=['pr_number', 'label'], inplace=True)
    # tempData.drop(columns=['pr_number'], inplace=True)

    """PCA reduction"""
    pca = PCA(n_components=0.95)
    tempData_train = pca.fit_transform(tempData_train)
    print("after pca :", tempData_train.shape)
    print(pca.explained_variance_ratio_)
    tempData_train = pandas.DataFrame(tempData_train)
    tempData_test = pca.transform(tempData_test)
    print("after pca :", tempData_test.shape)
    tempData_test = pandas.DataFrame(tempData_test)
    tempData = pandas.concat([tempData_train, tempData_test], axis=0)
    tempData.reset_index(drop=True, inplace=True)

    """Join back onto the provided data"""
    tempData['pr_number_t'] = list(pr_path_weight_df['pr_number'])
    inputDf = pandas.merge(inputDf, tempData, left_on=pull_number_name, right_on='pr_number_t')
    inputDf.drop(columns=['pr_number_t'], inplace=True)
    return inputDf
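# --- Hedged sketch of the train/test PCA discipline used above (not part of the original source) ---
# Fitting PCA on training rows only and re-using that projection for test rows
# avoids leaking test-set statistics into the reduced feature space; the matrices
# below are made-up stand-ins for the tf-idf weight tables.
from sklearn.decomposition import PCA
import numpy as np

X_train = np.random.rand(100, 50)
X_test = np.random.rand(20, 50)
pca = PCA(n_components=0.95)        # keep 95% of the variance, as above
X_train_red = pca.fit_transform(X_train)
X_test_red = pca.transform(X_test)  # transform only; never fit on test data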
        return math.ceil(2)
    else:
        return num

@staticmethod
def get_pronouncing_nums(words):
    counts = 0
    for word in words:
        counts += FleshReadableUtils.get_pronouncing_num(word)
    print('total syllables:', str(counts))
    return counts


if __name__ == "__main__":
    data = pandasHelper.readTSVFile(projectConfig.getReviewCommentTestData())
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the modern equivalent
    comments = data.values[:, (2, 4)]
    print(comments.shape)

    readable = []               # readability
    stopWordRate = []           # stop-word rate
    questionRatio = []          # question ratio
    codeElementRatio = []       # code-element ratio
    stopKeyRatio = []           # keyword ratio
    conceptualSimilarity = []   # conceptual similarity
    badCase = []

    stopwords = SplitWordHelper().getEnglishStopList()
    languageKeyWords = LanguageKeyWordHelper.LanguageKeyWordLanguage.getRubyKeyWordList()
    for line in comments:
def appendTextualFeatureVector(inputDf, projectName, date, pull_number_name):
    """
    Use a tf-idf model over each pr's title and body text; the pr information
    comes straight from the PR data file.
    @description: appends the tf-idf feature vector built from the pr text to the given dataframe
    @notice: the dataframe must have a pull_number_id column; duplicates are allowed
    @param inputDf: pre-loaded dataframe
    @param projectName: project name
    @param date: four-tuple (start year, start month, end year, end month)
    @return: df: dataframe with text weights attached, directly usable by ML algorithms
    """
    """Check that the input df has a label column"""
    if 'label' not in inputDf.columns:
        raise Exception("label not in input dataframe!")
    print("input shape:", inputDf.shape)
    print(date)

    df = inputDf[[pull_number_name]].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.columns = ['pr_number']

    """Read the pullrequestData file"""
    pull_request_path = projectConfig.getPullRequestPath()
    pullRequestData = pandasHelper.readTSVFile(
        os.path.join(pull_request_path, f'ALL_{projectName}_data_pullrequest.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )

    """Join pull_number with the pr review commit relation"""
    df = pandas.merge(df, pullRequestData, left_on='pr_number', right_on='number')
    df = df[['pr_number', 'title', 'body']].copy(deep=True)
    df.columns = ['pr_number', 'pr_title', 'pr_body']
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """Collect the tokenized text of every pr"""
    stopwords = SplitWordHelper().getEnglishStopList()  # common English stop words
    textList = []
    for row in df.itertuples(index=False, name='Pandas'):
        tempList = []
        """Get the pull request title"""
        pr_title = row[list(df.columns).index('pr_title')]
        pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

        """An initial attempt at stemming actually hurt performance ..."""
        """Stem the words"""
        pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
        tempList.extend(pr_title_word_list)

        """Get the pull request body"""
        pr_body = row[list(df.columns).index('pr_body')]
        pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]

        """Stem the words"""
        pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
        tempList.extend(pr_body_word_list)
        textList.append(tempList)

    print(textList.__len__())

    """Build a dictionary from the token lists and count the features"""
    dictionary = corpora.Dictionary(textList)
    print('dictionary:', dictionary)
    feature_cnt = len(dictionary.token2id)
    print("number of dictionary features:", feature_cnt)

    """Build the corpus from the dictionary"""
    corpus = [dictionary.doc2bow(text) for text in textList]
    # print('corpus:', corpus)

    """Train the TF-IDF model on the corpus"""
    tfidf = models.TfidfModel(corpus)

    """Walk the data again, building sparse word vectors"""
    wordVectors = []
    for i in range(0, df.shape[0]):
        wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

    """Expand the sparse vectors into a dense dataframe"""
    wordVectors = DataProcessUtils.convertFeatureDictToDataFrame(wordVectors, featureNum=feature_cnt)

    """Before the PCA reduction, the weight table must be split so the training
    and test sets are handled separately"""
    tempData = wordVectors.copy(deep=True)
    tempData['pr_number'] = df['pr_number']
    labelData = inputDf[['pr_number', 'label']].drop_duplicates().copy(deep=True)
    tempData = pandas.merge(tempData, labelData, on='pr_number')
    tempData_train = tempData.loc[tempData['label'] == 0].copy(deep=True)
    tempData_test = tempData.loc[tempData['label'] == 1].copy(deep=True)
    tempData_train.drop(columns=['pr_number', 'label'], inplace=True)
    tempData_test.drop(columns=['pr_number', 'label'], inplace=True)

    """PCA reduction"""
    pca = PCA(n_components=0.95)
    tempData_train = pca.fit_transform(tempData_train)
    print("after pca :", tempData_train.shape)
    print(pca.explained_variance_ratio_)
    tempData_train = pandas.DataFrame(tempData_train)
    tempData_test = pca.transform(tempData_test)
    print("after pca :", tempData_test.shape)
    tempData_test = pandas.DataFrame(tempData_test)
    tempData = pandas.concat([tempData_train, tempData_test], axis=0)
    tempData.reset_index(drop=True, inplace=True)
    tempData['pr_number_t'] = df['pr_number'].copy(deep=True)

    """Join back onto the original features"""
    inputDf = pandas.merge(inputDf, tempData, left_on=pull_number_name, right_on='pr_number_t')
    inputDf.drop(columns=['pr_number_t'], inplace=True)
    return inputDf
def preProcess(df, date, project, isSTD=False, isNOR=False):
    """Parameters:
    df: the dataframe that was read in
    date: the (year, month) used as the test month
    isSTD: whether to standardize the data
    isNOR: whether to normalize the data
    """
    print("start df shape:", df.shape)

    """Drop rows with NA values"""
    df.dropna(axis=0, how='any', inplace=True)
    print("after filter na:", df.shape)

    """Add a column that marks training vs test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
    df.reset_index(drop=True, inplace=True)

    """Map user names to numbers"""
    """Reviewers below the frequency threshold were already filtered out before
    numbering, so non-contiguous class ids are not a concern"""
    """review_user_login must come first, otherwise the candicateNum variable
    referenced below would be affected"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'author_user_login'])
    recoverDict = {v: k for k, v in convertDict.items()}
    print(df.shape)
    candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
    print("candicate Num:", candicateNum)

    """Trim the input down to the columns of interest"""
    df = df[['pr_number', 'review_user_login', 'author_user_login', 'author_association', 'commits',
             'deletions', 'additions', 'changed_files', 'label', 'merged']].copy(deep=True)
    print("before filter:", df.shape)
    df.drop_duplicates(inplace=True)
    print("after filter:", df.shape)

    """Compute the author's relationship to the project"""
    df['author_association'] = df['author_association'].apply(lambda x: x == 'MEMBER')

    """Compute cumulative historical features"""
    request_number_prs = []            # number of prs the author submitted before
    request_number_merged_prs = []     # number of the author's prs that were accepted
    request_number_rejected_prs = []   # number of the author's prs that were rejected
    request_accept_rate = []           # acceptance rate of the author's prs
    request_reject_rate = []           # rejection rate of the author's prs
    for row in df.itertuples():
        pr_num = getattr(row, 'pr_number')
        author = getattr(row, 'author_user_login')
        """Filter to earlier prs"""
        temp_df = df.loc[(df['pr_number'] < pr_num) & (df['author_user_login'] == author)]
        request_number_prs.append(temp_df.shape[0])
        accept_times = temp_df.loc[temp_df['merged'] == 1].shape[0]
        request_number_merged_prs.append(accept_times)
        request_number_rejected_prs.append(temp_df.shape[0] - accept_times)
        if temp_df.shape[0] > 0:
            request_accept_rate.append(accept_times / temp_df.shape[0])
            request_reject_rate.append(1 - accept_times / temp_df.shape[0])
        else:
            request_accept_rate.append(0)
            request_reject_rate.append(0)
    df['request_number_prs'] = request_number_prs
    df['request_number_merged_prs'] = request_number_merged_prs
    df['request_number_rejected_prs'] = request_number_rejected_prs
    df['request_accept_rate'] = request_accept_rate
    df['request_reject_rate'] = request_reject_rate

    """Add whether the author watches the project"""
    user_watch_repo_relation_path = projectConfig.getUserWatchRepoRelation()
    userWatchRepoRelation = pandasHelper.readTSVFile(
        os.path.join(user_watch_repo_relation_path, f'userWatchRepoRelation.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )
    watchRepoMap = {}
    for k in convertDict.keys():
        """Get the watch list of the user"""
        following_list = list(set(userWatchRepoRelation.loc[userWatchRepoRelation['login'] == k]['repo_full_name']))
        isFollow = False
        for repo in following_list:
            owner, name = repo.split('/')
            if name == project:
                isFollow = True
        watchRepoMap[convertDict[k]] = isFollow
    request_watches = []
    for row in df.itertuples():
        author = getattr(row, 'author_user_login')
        request_watches.append(watchRepoMap[author])
    df['request_watches'] = request_watches

    """Add the author's follower count, following count and whether the author follows a core member"""
    user_follow_relation_path = projectConfig.getUserFollowRelation()
    userFollowRelation = pandasHelper.readTSVFile(
        os.path.join(user_follow_relation_path, f'userFollowRelation.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )
    followMap = {}
    followerCountMap = {}
    followingCountMap = {}
    followCoreMemberMap = {}

    """Collect the core-member list"""
    coreMemberList = list(set(df.loc[df['author_association'] == 1]['author_user_login']))

    for k in convertDict.keys():
        """Get the following list of the user"""
        following_list = list(set(userFollowRelation.loc[userFollowRelation['login'] == k]['following_login']))
        followingCountMap[convertDict[k]] = following_list.__len__()
        isFollowCoreMember = False
        for f in following_list:
            if f in convertDict.keys():
                followMap[(convertDict[k], convertDict[f])] = 1
            if f in coreMemberList:
                isFollowCoreMember = True
        followCoreMemberMap[convertDict[k]] = isFollowCoreMember

        follower_list = list(set(userFollowRelation.loc[userFollowRelation['following_login'] == k]['login']))
        followerCountMap[convertDict[k]] = follower_list.__len__()
        # for f in follower_list:
        #     if f in convertDict.keys():
        #         followMap[(convertDict[f], convertDict[k])] = 1

    request_number_follows = []
    request_number_following = []
    request_follow_ct = []
    for row in df.itertuples():
        pr_num = getattr(row, 'pr_number')
        author = getattr(row, 'author_user_login')
        request_number_following.append(followingCountMap[author])
        request_number_follows.append(followerCountMap[author])
        request_follow_ct.append(followCoreMemberMap[author])
    df['request_number_following'] = request_number_following
    df['request_number_follows'] = request_number_follows
    df['request_follow_ct'] = request_follow_ct

    """Tally the correct answers in advance"""
    tagDict = dict(list(df.groupby('pr_number')))

    train_data = df.loc[df['label'] == 0].copy(deep=True)
    test_data = df.loc[df['label'] == 1].copy(deep=True)

    """Cast as a multi-label problem: train_data_y = {pull_number: [r1, r2, ...], ...}"""
    train_data_y = {}
    pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in pull_number_list:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    train_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
    train_data.drop_duplicates(inplace=True)
    train_data.drop_duplicates(subset=['pr_number'], inplace=True)
    train_data.drop(columns=['pr_number'], inplace=True)
    """Convert the training labels to the common multi-label format"""
    train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

    test_data_y = {}
    pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    test_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
    test_data.drop_duplicates(subset=['pr_number'], inplace=True)
    """Get the pr list"""
    prList = list(test_data['pr_number'])
    test_data.drop(columns=['pr_number'], inplace=True)
    test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

    """Feature scaling"""
    if isSTD:
        stdsc = StandardScaler()
        train_data_std = stdsc.fit_transform(train_data)
        test_data_std = stdsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    elif isNOR:
        maxminsc = MinMaxScaler()
        train_data_std = maxminsc.fit_transform(train_data)
        test_data_std = maxminsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    else:
        return train_data, train_data_y, test_data, test_data_y, convertDict, prList
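# --- Hedged usage sketch (not part of the original source) ---
# df is assumed to be the concatenated monthly TSV data described above; with
# isNOR=True the features are min-max scaled, with isSTD=True standardized.
# The project name is made up for illustration.
train_x, train_y, test_x, test_y, convertDict, prList = preProcess(
    df, (2019, 1, 2019, 6), 'rails', isSTD=False, isNOR=True)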
def commentAcceptRatioByReviewer(project):
    """Compute per-reviewer review-comment acceptance ratios for a project, split by time."""
    notesFileName = projectConfig.getNotesDataPath() + os.sep + f"notes_{project}.tsv"
    df_notes = pandasHelper.readTSVFile(notesFileName, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
    df_notes.drop_duplicates(subset=['id'], inplace=True, keep="last")
    df_notes.sort_values(by='merge_request_id', ascending=False, inplace=True)
    print(df_notes.shape)

    mrFileName = projectConfig.getMergeRequestDataPath() + os.sep + f"mergeRequest_{project}.tsv"
    df_mr = pandasHelper.readTSVFile(mrFileName, header=pandasHelper.INT_READ_FILE_WITH_HEAD)

    """Repair missing dates"""
    for index, row in df_mr.iterrows():
        if row["created_at"] is None:
            row["created_at"] = row["merged_at"]
    df_mr = df_mr[["iid", "created_at"]].copy(deep=True)
    df_mr["iid"] = df_mr["iid"].apply(lambda x: int(x))
    df_mr.drop_duplicates(subset=['iid'], inplace=True)
    print(df_mr.shape)

    # x = range(-2, 11)
    # y = []
    # for i in x:
    #     y.append(df_notes.loc[df_notes['change_trigger'] == i].shape[0])
    # plt.bar(x=x, height=y)
    # plt.title(f'review comment({project})')
    # for a, b in zip(x, y):
    #     plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=11)
    # print("review comment useful:", df_notes.shape[0] - df_notes.loc[df_notes['change_trigger'] < 0].shape[0])
    # plt.show()

    data = pandas.merge(left=df_notes, right=df_mr, left_on="merge_request_id", right_on="iid")
    data['label'] = data["created_at_y"].apply(lambda x: (time.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")))
    data['label_y'] = data['label'].apply(lambda x: x.tm_year)
    data['label_m'] = data['label'].apply(lambda x: x.tm_mon)
    data = data.loc[data["change_trigger"] != -2].copy(deep=True)
    # pandasHelper.writeTSVFile("comment.csv", df_notes)

    """Group by reviewer"""
    groups = dict(list(data.groupby('reviewer')))

    date = (2019, 5, 2020, 6)
    columns = ["reviewer"]
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        columns.append(str(f"{y}/{m}"))
    ratio_df = DataFrame(columns=columns)

    # reviewer_list = ["bidinger", "mbouaziz", "raphael-proust", "romain.nl", "vect0r", "rafoo_"]
    reviewer_list = []
    for reviewer, temp_df in groups.items():
        print(reviewer, temp_df.shape[0])
        if reviewer not in reviewer_list:
            tempDict = {"reviewer": reviewer}
            for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
                y = int((i - i % 12) / 12)
                m = i % 12
                if m == 0:
                    m = 12
                    y = y - 1
                df = temp_df.loc[(temp_df['label_y'] == y) & (temp_df['label_m'] == m)].copy(deep=True)
                sum = df.shape[0]
                if sum == 0:
                    pass
                    # tempDict[f'{y}/{m}'] = 0
                else:
                    valid = df.loc[df['change_trigger'] >= 0].shape[0]
                    tempDict[f'{y}/{m}'] = valid / sum
            ratio_df = ratio_df.append(tempDict, ignore_index=True)
    print(ratio_df.shape)
def commentAcceptRatioByProject(projects, date):
    """Compute per-project review-comment acceptance ratios, split by time.
    projects: the projects to include
    date: four-tuple giving the start and end of the measured window,
          (minYear, minMonth, maxYear, maxMonth), e.g. (2019, 10, 2020, 11); closed interval
    """
    columns = ["project"]
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        # Concatenate the monthly slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        columns.append(str(f"{y}/{m}"))
    result_df = DataFrame(columns=columns)  # dataframe for the final result

    for project in projects:
        notesFileName = projectConfig.getNotesDataPath() + os.sep + f"notes_{project}.tsv"
        df_notes = pandasHelper.readTSVFile(notesFileName, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        df_notes.drop_duplicates(subset=['id'], inplace=True, keep="last")
        df_notes.sort_values(by='merge_request_id', ascending=False, inplace=True)
        print(df_notes.shape)

        mrFileName = projectConfig.getMergeRequestDataPath() + os.sep + f"mergeRequest_{project}.tsv"
        df_mr = pandasHelper.readTSVFile(mrFileName, header=pandasHelper.INT_READ_FILE_WITH_HEAD)

        """Repair missing dates"""
        for index, row in df_mr.iterrows():
            if row["created_at"] is None:
                row["created_at"] = row["merged_at"]
        df_mr = df_mr[["iid", "created_at"]].copy(deep=True)
        df_mr["iid"] = df_mr["iid"].apply(lambda x: int(x))
        df_mr.drop_duplicates(subset=['iid'], inplace=True)
        print(df_mr.shape)

        # x = range(-2, 11)
        # y = []
        # for i in x:
        #     y.append(df_notes.loc[df_notes['change_trigger'] == i].shape[0])
        # plt.bar(x=x, height=y)
        # plt.title(f'review comment({project})')
        # for a, b in zip(x, y):
        #     plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=11)
        # print("review comment useful:", df_notes.shape[0] - df_notes.loc[df_notes['change_trigger'] < 0].shape[0])
        # plt.show()

        data = pandas.merge(left=df_notes, right=df_mr, left_on="merge_request_id", right_on="iid")
        data['label'] = data["created_at_y"].apply(lambda x: (time.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")))
        data['label_y'] = data['label'].apply(lambda x: x.tm_year)
        data['label_m'] = data['label'].apply(lambda x: x.tm_mon)
        data = data.loc[data["change_trigger"] != -2].copy(deep=True)
        # pandasHelper.writeTSVFile("comment.csv", df_notes)

        # """Split by time"""
        # minYear = min(data['label']).tm_year
        # minMonth = min(data['label']).tm_mon
        # maxYear = max(data['label']).tm_year
        # maxMonth = max(data['label']).tm_mon
        # date = (minYear, minMonth, maxYear, maxMonth)

        tempDict = {"project": project}
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1
            df = data.loc[(data['label_y'] == y) & (data['label_m'] == m)].copy(deep=True)
            commentCount = df.shape[0]
            if commentCount == 0:
                pass
            else:
                validCount = df.loc[df['change_trigger'] >= 0].shape[0]
                # the key must match the column labels built above
                tempDict[f'{y}/{m}'] = validCount / commentCount
        result_df = result_df.append(tempDict, ignore_index=True)
        print(result_df.shape)

    # result_df.to_excel("q5_change_trigger_ratio.xls")
    return result_df
def checkChangeTriggerResult(owner, repo):
    """Check whether the PR change_trigger computation is complete.
       When the proxy is switched, the database connection drops, so comment information
       cannot be queried and review comments may be missed.
       Here we check whether each pr's change_trigger contains review_comment data;
       if not, fetch and analyze it again."""

    """PRTimeLine header"""
    PR_CHANGE_TRIGGER_COLUMNS = ["pullrequest_node", "user_login", "comment_node",
                                 "comment_type", "change_trigger", "filepath"]
    """Initialize the target file. Note: use the repo parameter here; the original read
       the repo name from configPraser, which breaks when owner/repo are passed in."""
    target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_pr_change_trigger.tsv'

    """1. Collect all pr_nodes of the repository"""
    # repo_fullname = configPraser.getOwner() + "/" + configPraser.getRepo()
    # pr_nodes = AsyncProjectAllDataFetcher.getPullRequestNodes(repo_fullname)
    # pr_nodes = list(pr_nodes)
    # pr_nodes = [node[0] for node in pr_nodes]

    """The prs to fetch are narrowed to those whose timeline has issue or review items"""
    timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
    timeline_df = pandasHelper.readTSVFile(fileName=timeline_filename, header=0)
    timeline_df = timeline_df.loc[(timeline_df['typename'] == 'IssueComment')
                                  | (timeline_df['typename'] == 'PullRequestReview')].copy(deep=True)
    pr_nodes = list(set(timeline_df['pullrequest_node']))

    """2. Read the pr_change_trigger file"""
    change_trigger_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_pr_change_trigger.tsv'
    change_trigger_df = pandasHelper.readTSVFile(fileName=change_trigger_filename, header=0)
    change_nodes = list(set(change_trigger_df['pullrequest_node']))

    # """3. Read the pr_timeline file"""
    # timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{configPraser.getRepo()}_data_prtimeline.tsv'
    # timeline_df = pandasHelper.readTSVFile(fileName=timeline_filename, header=0)

    """4. Group change_trigger by pullrequest_node"""
    grouped_timeline = change_trigger_df.groupby(['pullrequest_node'])

    """5. Check whether each pullrequest_node's change_trigger info is complete,
       and collect the prs that need re-fetching"""
    re_analyze_prs = [x for x in pr_nodes if x not in change_nodes]
    # for pr, group in grouped_timeline:
    #     if pr not in pr_nodes:
    #         re_analyze_prs.append(pr)
    #     else:
    #         review_comment_trigger = group.loc[(group['comment_type'] == StringKeyUtils.STR_LABEL_REVIEW_COMMENT)
    #                                            & (group['change_trigger'] >= 0)]
    #         if review_comment_trigger is None or review_comment_trigger.empty:
    #             re_analyze_prs.append(pr)
    # Logger.logi("there are {0} prs need to re analyze".format(len(re_analyze_prs)))

    """Read PullRequestData to map each pr to its author"""
    pr_data_filename = projectConfig.getPullRequestPath() + os.sep + f'ALL_{repo}_data_pullrequest.tsv'
    pr_data_df = pandasHelper.readTSVFile(fileName=pr_data_filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
    """Collect the author of each pr, used later to filter out the author's own comments"""
    pr_author_map = {}
    for index, row in pr_data_df.iterrows():
        pr_author_map[row['node_id']] = row['user_login']

    """Set fetch parameters"""
    pos = 0
    fetchLimit = 200
    size = len(re_analyze_prs)
    while pos < size:
        Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size))
        sub_re_analyze_prs = re_analyze_prs[pos:pos + fetchLimit]
        """6. Re-fetch the timelines of these prs"""
        re_analyze_prs_timeline_df = timeline_df[timeline_df['pullrequest_node'].isin(sub_re_analyze_prs)]
        grouped_timeline = re_analyze_prs_timeline_df.groupby(['pullrequest_node'])
        formated_data = []
        for pr, group in grouped_timeline:
            formated_data.append(group.to_dict(orient='records'))
        """7. Start the analysis"""
        pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formated_data, pr_author_map)
        pr_change_trigger_comments = [x for y in pr_change_trigger_comments for x in y]
        """8. De-duplicate the analysis result and append it to the change_trigger table"""
        if pr_change_trigger_comments is not None and len(pr_change_trigger_comments) > 0:
            target_content = DataFrame()
            target_content = target_content.append(pr_change_trigger_comments, ignore_index=True)
            target_content = target_content[PR_CHANGE_TRIGGER_COLUMNS].copy(deep=True)
            target_content.drop_duplicates(subset=['pullrequest_node', 'comment_node'], inplace=True, keep='first')
            if not target_content.empty:
                pandasHelper.writeTSVFile(target_filename, target_content,
                                          pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
                                          header=pandasHelper.INT_WRITE_WITHOUT_HEADER)
        Logger.logi("successfully analyzed {0} prs".format(len(re_analyze_prs)))
        pos += fetchLimit
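# checkChangeTriggerResult (and getPRChangeTriggerData below) walk the pr list in
# fixed-size batches with a pos/fetchLimit cursor. The same slicing can be written
# as a small generator; `iter_batches` is a hypothetical helper, shown only to make
# the batching pattern explicit.
def iter_batches(items, batch_size):
    """Yield consecutive slices of `items` of at most `batch_size` elements."""
    for pos in range(0, len(items), batch_size):
        yield items[pos:pos + batch_size]

# usage sketch: for sub_prs in iter_batches(re_analyze_prs, 200): ...analyze one batch...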
def getPRChangeTriggerData(owner, repo):
    """Compute pr change_trigger data from ALL_{repo}_data_prtimeline.tsv"""
    AsyncApiHelper.setRepo(owner, repo)

    """PRTimeLine header"""
    PR_CHANGE_TRIGGER_COLUMNS = ["pullrequest_node", "user_login", "comment_node",
                                 "comment_type", "change_trigger", "filepath"]
    """Initialize the target file"""
    target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_pr_change_trigger.tsv'
    target_content = DataFrame(columns=PR_CHANGE_TRIGGER_COLUMNS)
    # pandasHelper.writeTSVFile(target_filename, target_content, pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
    #                           header=pandasHelper.INT_WRITE_WITH_HEADER)

    """Read PRTimeline to get the list of prs whose change_trigger needs analyzing"""
    pr_timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
    pr_timeline_df = pandasHelper.readTSVFile(fileName=pr_timeline_filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD)

    """Read PullRequestData to map each pr to its author"""
    pr_data_filename = projectConfig.getPullRequestPath() + os.sep + f'ALL_{repo}_data_pullrequest.tsv'
    pr_data_df = pandasHelper.readTSVFile(fileName=pr_data_filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
    """Collect the author of each pr, used later to filter out the author's own comments"""
    pr_author_map = {}
    for index, row in pr_data_df.iterrows():
        pr_author_map[row['node_id']] = row['user_login']

    pr_nodes = list(set(list(pr_timeline_df['pullrequest_node'])))
    pr_nodes.sort()
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MjE5MjEzOTc5']  # reopened 3 times
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MjA0MTk5ODkw']
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0NDQwOTAxMzk0']
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MzE1OTU0NDgw']  # review outside the pr
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MTQ3NDczNTIx']  # ordinary case
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0NDM4NjAzMjk2']  # a huge number of reviews
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0Mjg1NzExNTIx']
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MTAxNTUwMTcw']

    """Set fetch parameters"""
    pos = 0
    fetchLimit = 400
    size = len(pr_nodes)
    Logger.logi("there are {0} prs need to analyze".format(len(pr_nodes)))
    t1 = datetime.now()
    while pos < size:
        print("now:", pos, ' total:', size, 'cost time:', datetime.now() - t1)
        Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size))
        """Take a sub-list bounded by the fetch limit"""
        sub_prs = pr_nodes[pos:pos + fetchLimit]
        pr_timeline_items = pr_timeline_df[pr_timeline_df['pullrequest_node'].isin(sub_prs)]
        """Group the sub-list by pull_request_node"""
        grouped_timeline = pr_timeline_items.groupby(['pullrequest_node'])
        """Save the grouped result as a list {pr -> pr_timeline_items}"""
        formated_data = []
        for pr, group in grouped_timeline:
            record = group.to_dict(orient='records')
            record = sorted(record, key=lambda x: int(x.get(StringKeyUtils.STR_KEY_POSITION)))
            formated_data.append(record)
        """Analyze the timelines of these prs"""
        pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formated_data, pr_author_map)
        pr_change_trigger_comments = [x for y in pr_change_trigger_comments for x in y]
        """De-duplicate the analysis result and append it to the change_trigger table"""
        if len(pr_change_trigger_comments) > 0:
            target_content = DataFrame()
            target_content = target_content.append(pr_change_trigger_comments, ignore_index=True)
            target_content = target_content[PR_CHANGE_TRIGGER_COLUMNS].copy(deep=True)
            target_content.drop_duplicates(subset=['pullrequest_node', 'comment_node'], inplace=True, keep='first')
            if not target_content.empty:
                pandasHelper.writeTSVFile(target_filename, target_content,
                                          pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
                                          header=pandasHelper.INT_WRITE_WITHOUT_HEADER)
        pos += fetchLimit
        # log after advancing the cursor; the original logged the stale pos value
        Logger.logi("successfully analyzed {0} prs".format(min(pos, size)))
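# A hedged usage sketch of the two entry points above. The owner/repo values are
# placeholders, and the prerequisite ALL_{repo}_data_prtimeline.tsv and
# ALL_{repo}_data_pullrequest.tsv files are assumed to have been fetched already.
#
#     getPRChangeTriggerData("rails", "rails")    # first pass: analyze every pr timeline
#     checkChangeTriggerResult("rails", "rails")  # second pass: re-analyze missed prs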
def change_trigger_analyser(project):
    df_review = pandasHelper.readTSVFile(f"{project}_comment_1.cvs")
    df_review.columns = ["merge_request_id", "reviewer", "id", "change_trigger", "body"]
    df_review.drop_duplicates(subset=['id'], inplace=True, keep="last")
    df_review.sort_values(by='merge_request_id', ascending=False, inplace=True)
    print(df_review.shape)

    df_mr = pandasHelper.readTSVFile("mergeRequest.csv", sep=StringKeyUtils.STR_SPLIT_SEP_CSV)
    df_mr.columns = ["id", "number", "state", "merged_at", "created_at", "1", "2", "3", "4"]
    """Date repair: fall back to merged_at when created_at is missing.
       (The original assigned to the iterrows() row copy, which never writes back to the frame.)"""
    df_mr['created_at'] = df_mr['created_at'].fillna(df_mr['merged_at'])
    df_mr = df_mr[["number", "created_at"]].copy(deep=True)
    df_mr["number"] = df_mr["number"].apply(lambda x: int(x))
    df_mr.drop_duplicates(subset=['number'], inplace=True)
    print(df_mr.shape)

    x = range(-2, 11)
    y = []
    for i in x:
        y.append(df_review.loc[df_review['change_trigger'] == i].shape[0])
    plt.bar(x=x, height=y)
    plt.title(f'review comment({project})')
    for a, b in zip(x, y):
        plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=11)
    print("review comment useful:",
          df_review.shape[0] - df_review.loc[df_review['change_trigger'] == -1].shape[0])
    plt.show()

    data = pandas.merge(left=df_review, right=df_mr, left_on="merge_request_id", right_on="number")
    data['label'] = data["created_at"].apply(lambda x: (time.strptime(x, "%Y-%m-%dT%H:%M:%SZ")))
    data['label_y'] = data['label'].apply(lambda x: x.tm_year)
    data['label_m'] = data['label'].apply(lambda x: x.tm_mon)
    data = data.loc[data["change_trigger"] != -2].copy(deep=True)
    pandasHelper.writeTSVFile("comment.csv", df_review)

    """Group by reviewer"""
    groups = dict(list(data.groupby('reviewer')))

    date = (2019, 5, 2020, 6)
    columns = ["reviewer"]
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        columns.append(str(f"{y}/{m}"))
    ratio_df = DataFrame(columns=columns)

    # reviewer_list = ["bidinger", "mbouaziz", "raphael-proust", "romain.nl", "vect0r", "rafoo_"]
    reviewer_list = []
    for reviewer, temp_df in groups.items():
        print(reviewer, temp_df.shape[0])
        if reviewer not in reviewer_list:
            tempDict = {"reviewer": reviewer}
            for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
                y = int((i - i % 12) / 12)
                m = i % 12
                if m == 0:
                    m = 12
                    y = y - 1
                df = temp_df.loc[(temp_df['label_y'] == y) & (temp_df['label_m'] == m)].copy(deep=True)
                total = df.shape[0]  # renamed from `sum`, which shadowed the builtin
                if total == 0:
                    pass
                    # tempDict[f"{y}/{m}"] = 0
                else:
                    valid = df.loc[df['change_trigger'] >= 0].shape[0]
                    tempDict[f"{y}/{m}"] = valid / total
            ratio_df = ratio_df.append(tempDict, ignore_index=True)
    print(ratio_df.shape)
    return ratio_df  # the original computed ratio_df but never returned it
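# The nested per-reviewer/per-month loop above can also be expressed as a single
# pandas groupby. A sketch under the same column assumptions (label_y, label_m,
# change_trigger); `monthly_accept_ratio` is a hypothetical helper, not used elsewhere.
def monthly_accept_ratio(data):
    """Fraction of comments with change_trigger >= 0 per (year, month) group."""
    valid = data['change_trigger'] >= 0  # bool Series; its group mean is the acceptance ratio
    return valid.groupby([data['label_y'], data['label_m']]).mean()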
def preProcess(df, date, project, isSTD=False, isNOR=False, m=3):
    """Parameters:
       df: the dataframe that was read in
       date: (minYear, minMonth, maxYear, maxMonth); the last (year, month) is the test month
       isSTD: whether to standardize the data
       isNOR: whether to normalize the data
       m: hyper-parameter, the window size in months
    """
    print("start df shape:", df.shape)
    """Filter NA rows"""
    df.dropna(axis=0, how='any', inplace=True)
    print("after filter na:", df.shape)

    """Add a column that marks train vs. test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2]
                   and time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
    df['label_y'] = df['pr_created_at'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year)
    df['label_m'] = df['pr_created_at'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon)
    df.reset_index(drop=True, inplace=True)

    """Correction: PCA must not reduce train and test sets together, otherwise
       future information leaks in, so the two must be processed separately before
       appending. 4.13: the append helpers must be called after the label column exists."""
    """Add File Path Features"""
    df = appendFilePathFeatureVector(df, project, date, 'pr_number')

    """Read the User Follow relation"""
    user_follow_relation_path = projectConfig.getUserFollowRelation()
    userFollowRelation = pandasHelper.readTSVFile(
        os.path.join(user_follow_relation_path, 'userFollowRelation.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )

    def isInTimeGap(x, m, maxYear, maxMonth):
        d = x['label_y'] * 12 + x['label_m']
        d2 = maxYear * 12 + maxMonth
        return d >= d2 - m

    """Map user names to numbers"""
    """Reviewers below the frequency threshold were already filtered before numbering,
       so discontinuous class ids need not be considered"""
    """'review_user_login' must come first, otherwise the candicateNum variable
       referenced below would be affected"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'pr_user_login'])
    print(df.shape)
    candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
    print("candicate Num:", candicateNum)

    """Compute the contributor set"""
    contribute_list = list(set(df.loc[df['label'] == 1]['pr_user_login']))
    reviewer_list = list(set(df.loc[df['label'] == 0]['review_user_login']))

    """Add Relationship Features"""
    """Train and test rows are handled slightly differently: statistics for a train
       row are based on earlier prs, while statistics for a test row are restricted
       to the train set"""
    """Convert pr_created_at and comment_at to timestamps"""
    df['pr_created_at'] = df['pr_created_at'].apply(
        lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
    df['comment_at'] = df['comment_at'].apply(lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
    df['response_time'] = df['comment_at'] - df['pr_created_at']

    """Prior Evaluation    number of times reviewer cm reviewed contributor co before
       Recent Evaluation   number of times cm reviewed co in the last m months
       Follow Relation     whether co follows cm
       Follower Relation   whether cm follows co
    """
    startTime = datetime.now()
    prior_evaluation = {}
    recent_evaluation = {}
    follower_relation = {}
    following_relation = {}
    followMap = {}
    for k in convertDict.keys():
        """Collect the follow list of each user"""
        follower_list = list(set(userFollowRelation.loc[userFollowRelation['login'] == k]['following_login']))
        for f in follower_list:
            if f in convertDict.keys():
                followMap[(convertDict[k], convertDict[f])] = 1

    for reviewer in reviewer_list:
        prior_evaluation[reviewer] = []
        recent_evaluation[reviewer] = []
        follower_relation[reviewer] = []
        following_relation[reviewer] = []

    cols = list(df.columns)
    for data in df.itertuples(index=False, name='Pandas'):
        # namedtuple attribute access is only available for small column counts;
        # with many columns itertuples falls back to plain tuples, so index positionally
        if len(data) < 14:
            pullNumber = getattr(data, 'pr_number')
            author = getattr(data, 'pr_user_login')
            label = getattr(data, 'label')
            label_m = getattr(data, 'label_m')
            label_y = getattr(data, 'label_y')
        else:
            pullNumber = data[cols.index("pr_number")]
            author = data[cols.index("pr_user_login")]
            label = data[cols.index("label")]
            label_m = data[cols.index("label_m")]
            label_y = data[cols.index("label_y")]
        if label == 0:
            temp = df.loc[df['pr_number'] < pullNumber]
        else:
            temp = df.loc[df['label'] == 0]
        # filter on temp's own column (the original indexed temp with a mask built from df)
        temp = temp.loc[temp['pr_user_login'] == author].copy(deep=True)
        """Count the full history once per candidate"""
        prior_evaluation_dict = dict(temp['review_user_login'].value_counts())
        for r in reviewer_list:
            prior_evaluation[r].append(prior_evaluation_dict.get(r, 0))
        """Second filter on temp: keep only the last m months"""
        if temp.shape[0] > 0:
            if label == 0:
                temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, label_y, label_m), axis=1)
            else:
                temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, date[2], date[3]), axis=1)
            temp = temp.loc[temp['target'] == 1]
        """Count the recent history once per candidate"""
        recent_evaluation_dict = dict(temp['review_user_login'].value_counts())
        for r in reviewer_list:
            recent_evaluation[r].append(recent_evaluation_dict.get(r, 0))
        """Add follow / following information"""
        for r in reviewer_list:
            follower_relation[r].append(followMap.get((author, r), 0))
            following_relation[r].append(followMap.get((r, author), 0))

    """Append the relationship features to the dataframe"""
    for r in reviewer_list:
        df[f'prior_evaluation_{r}'] = prior_evaluation[r]
        df[f'recent_evaluation_{r}'] = recent_evaluation[r]
        df[f'follower_relation_{r}'] = follower_relation[r]
        df[f'following_relation_{r}'] = following_relation[r]
    print("prior cost time:", datetime.now() - startTime)

    startTime = datetime.now()
    # start time: one day before the dataset starts
    start_time = time.strptime(str(date[0]) + "-" + str(date[1]) + "-" + "01 00:00:00", "%Y-%m-%d %H:%M:%S")
    start_time = int(time.mktime(start_time) - 86400)
    # end time: the last second of the dataset (just before the test month begins)
    end_time = time.strptime(str(date[2]) + "-" + str(date[3]) + "-" + "01 00:00:00", "%Y-%m-%d %H:%M:%S")
    end_time = int(time.mktime(end_time) - 1)

    """Add Activeness Features"""
    total_pulls = {}     # all prs of the project so far
    evaluate_pulls = {}  # number of prs the candidate reviewed before
    recent_pulls = {}    # number of prs the candidate reviewed in the last m months
    evaluate_time = {}   # candidate's average response time
    last_time = {}       # time gap since the candidate's last review
    first_time = {}      # time gap since the candidate's first review
    for reviewer in reviewer_list:
        total_pulls[reviewer] = []
        evaluate_pulls[reviewer] = []
        recent_pulls[reviewer] = []
        evaluate_time[reviewer] = []
        last_time[reviewer] = []
        first_time[reviewer] = []

    count = 0
    cols = list(df.columns)
    index_pr_number = cols.index("pr_number")
    index_pr_label = cols.index("label")
    index_pr_label_m = cols.index("label_m")
    index_pr_label_y = cols.index("label_y")
    for data in df.itertuples(index=False):
        print("count for active:", count)
        count += 1
        pullNumber = data[index_pr_number]
        label = data[index_pr_label]
        label_m = data[index_pr_label_m]
        label_y = data[index_pr_label_y]
        if label == 0:
            temp = df.loc[df['pr_number'] < pullNumber].copy(deep=True)
        else:
            temp = df.loc[df['label'] == 0].copy(deep=True)
        """Count once per candidate"""
        total_pull_number = len(set(temp['pr_number']))
        res_reviewer_list = reviewer_list.copy()
        groups = dict(list(temp.groupby('review_user_login')))
        """First handle the reviewers that do have history"""
        for r, tempDf in groups.items():
            total_pulls[r].append(total_pull_number)
            res_reviewer_list.remove(r)
            if tempDf.shape[0] == 0:
                """No history: treat age as 0 and the gap as the maximum interval"""
                first_time[r].append(0)
                last_time[r].append(end_time - start_time)
            else:
                pr_created_time_list = list(tempDf['pr_created_at'])
                first_review_time = min(pr_created_time_list)
                last_review_time = max(pr_created_time_list)
                first_time[r].append(end_time - first_review_time)
                last_time[r].append(end_time - last_review_time)
            evaluate_pulls[r].append(tempDf.shape[0])
            """Average response time"""
            if tempDf.shape[0] > 0:
                evaluate_avg = sum(tempDf['response_time'])
                evaluate_avg /= tempDf.shape[0]
            else:
                evaluate_avg = end_time - start_time
            evaluate_time[r].append(evaluate_avg)
        for r in res_reviewer_list:
            total_pulls[r].append(total_pull_number)
            evaluate_pulls[r].append(0)
            first_time[r].append(0)
            last_time[r].append(end_time - start_time)
            evaluate_avg = end_time - start_time
            evaluate_time[r].append(evaluate_avg)
            # recent_pulls[r].append(0)

        """Filter the last m months and count again"""
        if temp.shape[0] > 0:
            if label == 0:
                temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, label_y, label_m), axis=1)
            else:
                temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, date[2], date[3]), axis=1)
            temp = temp.loc[temp['target'] == 1]
        res_reviewer_list = reviewer_list.copy()
        groups = dict(list(temp.groupby('review_user_login')))
        """First handle the reviewers that do have history"""
        for r, tempDf in groups.items():
            recent_pulls[r].append(tempDf.shape[0])
            res_reviewer_list.remove(r)
        for r in res_reviewer_list:
            recent_pulls[r].append(0)

    """Append the Activeness Features to the dataframe"""
    for r in reviewer_list:
        df[f'total_pulls_{r}'] = total_pulls[r]
        df[f'evaluate_pulls_{r}'] = evaluate_pulls[r]
        df[f'recent_pulls_{r}'] = recent_pulls[r]
        df[f'first_time_{r}'] = first_time[r]
        df[f'last_time_{r}'] = last_time[r]
        df[f'evaluate_time_{r}'] = evaluate_time[r]
    print("active cost time:", datetime.now() - startTime)

    tagDict = dict(list(df.groupby('pr_number')))

    """Split the existing feature vectors and labels into train and test sets"""
    train_data = df.loc[df['label'] == False].copy(deep=True)
    test_data = df.loc[df['label']].copy(deep=True)
    train_data.drop(columns=['label'], inplace=True)
    test_data.drop(columns=['label'], inplace=True)

    """Cast the problem as multi-label classification:
       train_data_y  {pull_number: [r1, r2, ...], ...}
    """
    train_data_y = {}
    pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in pull_number_list:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    train_data.drop(columns=['review_user_login'], inplace=True)
    train_data.drop_duplicates(inplace=True)
    train_data.drop_duplicates(subset=['pr_number'], inplace=True)
    """Convert the train labels to the common multi-label format"""
    train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

    test_data_y = {}
    pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    test_data.drop(columns=['review_user_login'], inplace=True)
    test_data.drop_duplicates(inplace=True)
    test_data.drop_duplicates(subset=['pr_number'], inplace=True)
    # test_data_y = DataProcessUtils.convertLabelListToDataFrame(test_data_y, pull_number_list, candicateNum)
    test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

    """Collect the pr list"""
    prList = list(test_data['pr_number'])

    """Drop pr_number and the helper columns"""
    test_data.drop(columns=['pr_number'], inplace=True)
    train_data.drop(columns=['pr_number'], inplace=True)
    test_data.drop(columns=['pr_created_at', 'pr_user_login', 'comment_at',
                            'label_y', 'label_m', 'response_time'], inplace=True)
    train_data.drop(columns=['pr_created_at', 'pr_user_login', 'comment_at',
                             'label_y', 'label_m', 'response_time'], inplace=True)

    """Feature scaling"""
    if isSTD:
        stdsc = StandardScaler()
        train_data_std = stdsc.fit_transform(train_data)
        test_data_std = stdsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    elif isNOR:
        maxminsc = MinMaxScaler()
        train_data_std = maxminsc.fit_transform(train_data)
        test_data_std = maxminsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    else:
        return train_data, train_data_y, test_data, test_data_y, convertDict, prList
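# A hedged end-to-end sketch of how preProcess is typically driven. The month-split
# TSV layout and the getCNDataPath()/'opencv' values below are assumptions borrowed
# from the loaders in this module; substitute this algorithm's real data directory
# and project name.
#
#     df = None
#     date = (2019, 1, 2019, 4)  # train on 2019.1-2019.3, test on 2019.4
#     for y, mon in [(2019, 1), (2019, 2), (2019, 3), (2019, 4)]:
#         filename = projectConfig.getCNDataPath() + os.sep + f'CN_opencv_data_{y}_{mon}_to_{y}_{mon}.tsv'
#         temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
#         df = temp if df is None else df.append(temp)
#     df.reset_index(inplace=True, drop=True)
#     train_X, train_y, test_X, test_y, convertDict, prList = preProcess(df, date, 'opencv', isSTD=True)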