Example 1
    def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False, is_split=False):

        """提供单个日期和项目名称
           返回推荐列表和答案
           这个接口可以被混合算法调用
        """
        print(date)
        df = None
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1

            # print(y, m)
            if i < date[2] * 12 + date[3]:
                if filter_train:
                    filename = projectConfig.getCNDataPath() + os.sep + f'CN_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getCNDataPath() + os.sep + f'CN_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            else:
                if filter_test:
                    filename = projectConfig.getCNDataPath() + os.sep + f'CN_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getCNDataPath() + os.sep + f'CN_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            """数据自带head"""
            if df is None:
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            else:
                temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
                df = df.append(temp)  # concatenate

        df.reset_index(inplace=True, drop=True)
        """df做预处理"""
        """新增人名映射字典"""
        train_data, train_data_y, test_data, test_data_y, convertDict = CNTrain.preProcess(df, date)

        if not is_split:
            prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
            prList.sort()

            prList, communities_data = CNTrain.RecommendByCN(project, date, train_data, train_data_y, test_data,
                                                              test_data_y, convertDict, recommendNum=recommendNum)
        else:
            prList, communities_data = CNTrain.RecommendByCNSplit(project, date, train_data, train_data_y,
                                                                  test_data, test_data_y, convertDict,
                                                                  recommendNum=recommendNum)
        """保存推荐结果到本地"""
        DataProcessUtils.saveRecommendList(prList, communities_data['whole']['recommend_list'],
                                           communities_data['whole']['answer_list'], convertDict,
                                           communities_data['whole']['author_list'],
                                           key=project + str(date) + str(filter_train) + str(filter_test))

        """新增返回测试 训练集大小,用于做统计"""
        # from source.scikit.combine.CBTrain import CBTrain
        # recommendList, answerList = CBTrain.recoverName(recommendList, answerList, convertDict)
        """新增返回训练集 测试集大小"""
        trainSize = (train_data.shape, test_data.shape)
        print(trainSize)

        return prList, convertDict, trainSize, communities_data
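
The data-splicing loop above (repeated in most of the examples below) linearizes a (year, month) pair into a single month index i = year * 12 + month, then recovers the pair again, wrapping m == 0 back to December of the previous year. A minimal sketch of that arithmetic, under a hypothetical helper name:

def month_range(date):
    """Yield (year, month) pairs from (date[0], date[1]) to (date[2], date[3]) inclusive.
    Mirrors the splicing loops: i = year * 12 + month, and i % 12 == 0 denotes
    December of the previous arithmetic year, hence the m == 0 wrap."""
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        y, m = divmod(i, 12)
        if m == 0:  # exact multiple of 12 -> December of year y - 1
            y, m = y - 1, 12
        yield y, m

# list(month_range((2019, 11, 2020, 2))) == [(2019, 11), (2019, 12), (2020, 1), (2020, 2)]
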
Example 2
    def algorithmBody(date, project, recommendNum=5, filter_train=True, filter_test=True, disMapList=None):

        """提供单个日期和项目名称
           返回推荐列表和答案
           这个接口可以被混合算法调用
        """
        print(date)
        df = None
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1

            # print(y, m)
            if i < date[2] * 12 + date[3]:
                if filter_train:
                    filename = projectConfig.getFPSDataPath() + os.sep + f'FPS_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getFPSDataPath() + os.sep + f'FPS_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            else:
                if filter_test:
                    filename = projectConfig.getFPSDataPath() + os.sep + f'FPS_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getFPSDataPath() + os.sep + f'FPS_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            """数据自带head"""
            if df is None:
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            else:
                temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
                df = df.append(temp)  # concatenate

        df.reset_index(inplace=True, drop=True)
        """df做预处理"""
        """新增人名映射字典"""
        train_data, train_data_y, test_data, test_data_y, convertDict = FPSTrain.preProcess(df, date)

        prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
        """2020.8.1 本来FPS的pr顺序是倒序,现在改为正序,便于和其他算法推荐名单比较"""
        prList.sort()

        recommendList, answerList = FPSAlgorithm.RecommendByFPS(train_data, train_data_y, test_data,
                                                                test_data_y, recommendNum=recommendNum, disMapList=disMapList)

        """新增返回测试 训练集大小,用于做统计"""

        """新增返回训练集 测试集大小"""
        trainSize = (train_data.shape, test_data.shape)
        print(trainSize)

        # """输出推荐名单到文件"""
        # DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict)

        return recommendList, answerList, prList, convertDict, trainSize
Example 3
    def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False, a=0.5):

        """提供单个日期和项目名称
           返回推荐列表和答案
           这个接口可以被混合算法调用
        """
        print(date)
        df = None
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1

            # print(y, m)
            if i < date[2] * 12 + date[3]:
                if filter_train:
                    filename = projectConfig.getEARECDataPath() + os.sep + f'EAREC_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getEARECDataPath() + os.sep + f'EAREC_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            else:
                if filter_test:
                    filename = projectConfig.getEARECDataPath() + os.sep + f'EAREC_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getEARECDataPath() + os.sep + f'EAREC_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            """数据自带head"""
            if df is None:
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            else:
                temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
                df = df.append(temp)  # concatenate

        df.reset_index(inplace=True, drop=True)
        """df做预处理"""
        """新增人名映射字典"""
        train_data, train_data_y, test_data, test_data_y, convertDict = EARECTrain.preProcess(df, date)

        prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
        # prList.sort()

        recommendList, answerList = EARECTrain.RecommendByEAREC(train_data, train_data_y, test_data,
                                                                test_data_y, convertDict, recommendNum=recommendNum,
                                                                a=a)

        """保存推荐结果到本地"""
        DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict, key=project + str(date))

        """新增返回训练集 测试集大小"""
        trainSize = (train_data.shape, test_data.shape)
        print(trainSize)

        return recommendList, answerList, prList, convertDict, trainSize
Example 4
    def checkPRTimeLineResult(owner, repo, limit=5):
        """检查PRTimeline数据是否完整爬取"""
        """1. 获取该仓库所有的pr_node"""
        repo_fullname = owner + "/" + repo
        pr_nodes = AsyncProjectAllDataFetcher.getPullRequestNodes(repo_fullname)
        pr_nodes = list(pr_nodes)
        pr_nodes = [node[0] for node in pr_nodes]
        """2. 读取prtimeline文件,对比pr"""
        target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
        df = pandasHelper.readTSVFile(fileName=target_filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        """3. 获取需要fetch的PR"""
        fetched_prs = list(df['pullrequest_node'])
        need_fetch_prs = list(set(pr_nodes).difference(set(fetched_prs)))
        Logger.logi("there are {0} pr_timeline need to fetch".format(need_fetch_prs.__len__()))

        """设置fetch参数"""
        pos = 0
        fetchLimit = 200
        size = len(need_fetch_prs)
        while pos < size:
            sub_need_fetch_prs = need_fetch_prs[pos:pos + fetchLimit]
            Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size))
            """4. 开始爬取"""
            results = AsyncProjectAllDataFetcher.getPullRequestTimeLine(owner=owner,
                                                                        repo=repo, nodes=sub_need_fetch_prs)
            Logger.logi("successfully fetched {0} pr! ".format(pos + fetchLimit))
            pos += fetchLimit
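
The while loop above pages through need_fetch_prs in fetchLimit-sized slices. The same paging pattern as a tiny standalone sketch:

def chunks(items, n):
    """Yield successive n-sized slices of items; the pos/fetchLimit
    bookkeeping above is this pattern inlined."""
    for pos in range(0, len(items), n):
        yield items[pos:pos + n]

# list(chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]
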
Example 5
    def loadLocalPrDistance(project):
        prDisDf_LCP = pandasHelper.readTSVFile(projectConfig.getPullRequestDistancePath() + os.sep +
                                               f"pr_distance_{project}_LCP.tsv",
                                               header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        prDisDf_LCS = pandasHelper.readTSVFile(projectConfig.getPullRequestDistancePath() + os.sep +
                                               f"pr_distance_{project}_LCS.tsv",
                                               header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        prDisDf_LCSubseq = pandasHelper.readTSVFile(projectConfig.getPullRequestDistancePath() + os.sep +
                                                    f"pr_distance_{project}_LCSubseq.tsv",
                                                    header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        prDisDf_LCSubstr = pandasHelper.readTSVFile(projectConfig.getPullRequestDistancePath() + os.sep +
                                                    f"pr_distance_{project}_LCSubstr.tsv",
                                                    header=pandasHelper.INT_READ_FILE_WITH_HEAD)

        DisMapLCP = {}
        DisMapLCS = {}
        DisMapLCSubseq = {}
        DisMapLCSubstr = {}
        for row in prDisDf_LCP.itertuples(index=False, name='Pandas'):
            p1 = row[0]
            p2 = row[1]
            dis = row[2]
            DisMapLCP[(p1, p2)] = dis
            DisMapLCP[(p2, p1)] = dis

        for row in prDisDf_LCS.itertuples(index=False, name='Pandas'):
            p1 = row[0]
            p2 = row[1]
            dis = row[2]
            DisMapLCS[(p1, p2)] = dis
            DisMapLCS[(p2, p1)] = dis

        for row in prDisDf_LCSubseq.itertuples(index=False, name='Pandas'):
            p1 = row[0]
            p2 = row[1]
            dis = row[2]
            DisMapLCSubseq[(p1, p2)] = dis
            DisMapLCSubseq[(p2, p1)] = dis

        for row in prDisDf_LCSubstr.itertuples(index=False, name='Pandas'):
            p1 = row[0]
            p2 = row[1]
            dis = row[2]
            DisMapLCSubstr[(p1, p2)] = dis
            DisMapLCSubstr[(p2, p1)] = dis

        return [DisMapLCS, DisMapLCP, DisMapLCSubseq, DisMapLCSubstr]
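
Each distance is inserted under both key orders, trading memory for order-independent lookups; note also that the returned list starts with DisMapLCS even though DisMapLCP is read first. The design choice standalone, with toy rows:

dis_map = {}
for p1, p2, dis in [(1, 2, 0.5), (2, 3, 0.8)]:  # toy (pr, pr, distance) triples
    dis_map[(p1, p2)] = dis
    dis_map[(p2, p1)] = dis
assert dis_map[(2, 1)] == dis_map[(1, 2)] == 0.5
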
Example 6
    def algorithmBody(date, project, algorithmType, recommendNum=5, featureType=3, filter_train=False,
                      filter_test=False):
        df = None
        """对需求文件做合并 """
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1

            if i < date[2] * 12 + date[3]:
                if filter_train:
                    filename = projectConfig.getMLDataPath() + os.sep + f'ML_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getMLDataPath() + os.sep + f'ML_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            else:
                if filter_test:
                    filename = projectConfig.getMLDataPath() + os.sep + f'ML_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getMLDataPath() + os.sep + f'ML_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            """数据自带head"""
            if df is None:
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            else:
                temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
                df = df.append(temp)  # concatenate

        df.reset_index(inplace=True, drop=True)
        """df做预处理"""
        """获取测试的 pull number列表"""
        train_data, train_data_y, test_data, test_data_y, convertDict, prList = MLTrain.preProcess(df, date, project,
                                                                                                   featureType,
                                                                                                   isNOR=True)
        print("train data:", train_data.shape)
        print("test data:", test_data.shape)

        recommendList, answerList = MultipleLabelAlgorithm. \
            RecommendByAlgorithm(train_data, train_data_y, test_data, test_data_y, algorithmType)

        trainSize = (train_data.shape[0], test_data.shape[0])

        """保存推荐结果到本地"""
        DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict, key=project + str(date))

        return recommendList, answerList, prList, convertDict, trainSize
Example 7
    def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False):

        """提供单个日期和项目名称
           返回推荐列表和答案
           这个接口可以被混合算法调用
        """
        df = None
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1

            print(y, m)

            if i < date[2] * 12 + date[3]:
                if filter_train:
                    filename = projectConfig.getXFDataPath() + os.sep + f'XF_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getXFDataPath() + os.sep + f'XF_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            else:
                if filter_test:
                    filename = projectConfig.getXFDataPath() + os.sep + f'XF_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getXFDataPath() + os.sep + f'XF_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'

            if df is None:
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            else:
                temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
                df = df.append(temp)  # concatenate

        df.reset_index(inplace=True, drop=True)
        """df做预处理"""
        """预处理新增返回测试pr列表 2020.4.11"""
        train_data, train_data_y, test_data, test_data_y, convertDict = XFTrain.preProcess(df, date)

        prList = list(set(test_data['pr_number']))
        prList.sort()

        """根据算法获得推荐列表"""
        recommendList, answerList = XFTrain.RecommendByXF(train_data, train_data_y, test_data,
                                                          test_data_y, recommendNum=recommendNum)
        trainSize = (train_data.shape[0], test_data.shape[0])
        return recommendList, answerList, prList, convertDict, trainSize
Example 8
    def algorithmBody(date, project, recommendNum=5, alpha=0.98, K=20, c=1):

        """提供单个日期和项目名称
           返回推荐列表和答案
           这个接口可以被混合算法调用
        """
        print(date)
        df = None
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1

            # print(y, m)
            filename = projectConfig.getHGDataPath() + os.sep + f'HG_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            """数据自带head"""
            if df is None:
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            else:
                temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
                df = df.append(temp)  # concatenate

        df.reset_index(inplace=True, drop=True)
        """df做预处理"""
        """新增人名映射字典"""
        train_data, train_data_y, test_data, test_data_y, convertDict = HGTrain.preProcess(df, date)

        prList = list(set(test_data['pr_number']))
        prList.sort()

        recommendList, answerList, authorList = HGTrain.RecommendByHG(train_data, train_data_y, test_data,
                                                                      test_data_y, date, project, convertDict,
                                                                      recommendNum=recommendNum, alpha=alpha,
                                                                      K=K, c=c, useLocalPrDis=False)

        """保存推荐结果,用于做统计"""
        DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict, key=project + str(date),
                                           authorList=authorList)

        """新增返回训练集 测试集大小"""
        trainSize = (train_data.shape[0], test_data.shape[0])
        print(trainSize)

        return recommendList, answerList, prList, convertDict, trainSize
Example 9
    def testMLAlgorithms(project, dates, algorithm):
        """
           测试算法接口,把流程相似的算法统一
           algorithm : svm, dt, rf
        """

        recommendNum = 5  # number of recommendations
        excelName = f'output{algorithm}.xlsx'
        sheetName = 'result'

        """初始化excel文件"""
        ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])

        for date in dates:
            startTime = datetime.now()

            """直接读取不带路径的信息"""
            filename = projectConfig.getRootPath() + os.sep + 'data' + os.sep + 'train' + os.sep + \
                       f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}.tsv'
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITHOUT_HEAD)
            print("raw df:", df.shape)

            # """读取带路径的文件信息"""
            # filename = projectConfig.getRootPath() + os.sep + r'data' + os.sep + 'train' + os.sep + \
            #            f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}_include_filepath.csv'
            # df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD,
            #                               sep=StringKeyUtils.STR_SPLIT_SEP_CSV)

            """df做预处理"""
            train_data, train_data_y, test_data, test_data_y = MLTrain.preProcessForSingleLabel(df, date, project,
                                                                                                isNOR=True)
            recommendList = None
            answerList = None
            """根据算法获得推荐列表"""
            if algorithm == StringKeyUtils.STR_ALGORITHM_SVM:  # support vector machine
                recommendList, answerList = MLTrain.RecommendBySVM(train_data, train_data_y, test_data,
                                                                   test_data_y, recommendNum=recommendNum)
            elif algorithm == StringKeyUtils.STR_ALGORITHM_DT:  # decision tree
                recommendList, answerList = MLTrain.RecommendByDecisionTree(train_data, train_data_y, test_data,
                                                                            test_data_y, recommendNum=recommendNum)
            elif algorithm == StringKeyUtils.STR_ALGORITHM_RF:  # random forest
                recommendList, answerList = MLTrain.RecommendByRandomForest(train_data, train_data_y, test_data,
                                                                            test_data_y, recommendNum=recommendNum)

            """根据推荐列表做评价"""
            topk, mrr = DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

            """结果写入excel"""
            DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, date)

            """文件分割"""
            content = ['']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            content = ['training set', 'test set']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())

            print("cost time:", datetime.now() - startTime)
Example 10
    def algorithmBody(date, project, recommendNum=5, response_limit_time=8, active_limit_time=10):

        """提供单个日期和项目名称
           返回推荐列表和答案
           这个接口可以被混合算法调用
        """
        print(date)
        df = None
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1

            # print(y, m)
            filename = projectConfig.getGADataPath() + os.sep + f'GA_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            """数据自带head"""
            if df is None:
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            else:
                temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
                df = df.append(temp)  # concatenate

        df.reset_index(inplace=True, drop=True)
        """df做预处理"""
        """新增人名映射字典"""
        train_data, train_data_y, test_data, test_data_y, convertDict = GATrain.preProcess(df, date)

        prList = list(test_data_y.keys())
        prList.sort()

        recommendList, answerList = GATrain.RecommendByGA(train_data, train_data_y, test_data,
                                                          test_data_y, recommendNum=recommendNum,
                                                          response_limit_time=response_limit_time,
                                                          active_limit_time=active_limit_time)

        """新增返回测试 训练集大小,用于做统计"""

        """新增返回训练集 测试集大小"""
        trainSize = (list(set(train_data['pr_number'])).__len__(), list(set(test_data['pr_number'])).__len__())
        print(trainSize)

        return recommendList, answerList, prList, convertDict, trainSize
Example 11
def processFileNameVector(filename):
    """
    手工计算tf-idf
    @param filename: 要读取的文件名(文件是带"include_filepath"的数据)
    @return: df: 添加路径权重后的dataframe,可直接用于机器学习算法
    """
    # read the df that contains the filename column
    df = pandasHelper.readTSVFile(fileName=filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD,
                                  sep=StringKeyUtils.STR_SPLIT_SEP_CSV)
    # prs that contain sub-path s
    sub2pr = {}
    # occurrences of s in each pr; structure: key: prNumber, value: {s: 2}
    pr2sub = {}
    for index, row in df.iterrows():
        subs = splitFileName(row['filename'])
        for sub in subs:
            if sub not in sub2pr:
                sub2pr[sub] = set()
            # record the pr in which sub appears
            sub2pr[sub].add(row['pr_number'])
            if row['pr_number'] not in pr2sub:
                pr2sub[row['pr_number']] = {}
            if sub not in pr2sub[row['pr_number']]:
                pr2sub[row['pr_number']][sub] = 0
            # increment the count of sub in this pr
            pr2sub[row['pr_number']][sub] += 1
    # collect every s that occurred and add them to the header as dimensions
    path_vector = list(sub2pr.keys())
    # compute weight(pr,s) = (occurrences of s in pr) * (log(total prs / number of prs containing s) + 1)
    pr_path_weight_df_columns = ['pr_number'] + path_vector
    pr_path_weight_df = pandas.DataFrame(columns=pr_path_weight_df_columns)
    # total number of prs
    nt = len(pr2sub.keys())
    for pr in pr2sub:
        new_row = {'pr_number': pr}
        for sub in sub2pr:
            # occurrences of s in this pr (term frequency)
            tf = 0
            if sub in pr2sub[pr]:
                tf = pr2sub[pr][sub]
            # number of prs in which s appears
            pr_cnt = len(sub2pr[sub])
            idf = math.log(nt / pr_cnt) + 1
            # weight of s in this pr
            pr_s_weight = tf * idf
            new_row[sub] = pr_s_weight
        pr_path_weight_df = pr_path_weight_df.append([new_row], ignore_index=True)

    # join pr_path_weight_df and the pr df on pr_number
    df = pandas.merge(df, pr_path_weight_df, on="pr_number", how="left")
    return df
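
As a sanity check on the weight formula implemented above, weight(pr, s) = tf(pr, s) * (log(N / df_s) + 1), a tiny worked example with made-up counts:

import math

nt = 4      # total number of prs
pr_cnt = 2  # prs in which sub-path "util" appears
tf = 3      # occurrences of "util" in the pr under consideration
weight = tf * (math.log(nt / pr_cnt) + 1)
print(round(weight, 2))  # 3 * (ln 2 + 1) ~= 5.08
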
Example 12
    def algorithmBody(date, project, recommendNum=5):

        """提供单个日期和项目名称
           返回推荐列表和答案
           这个接口可以被混合算法调用
        """
        print(date)
        df = None
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1

            # print(y, m)
            filename = projectConfig.getCFDataPath() + os.sep + f'CF_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            """数据自带head"""
            if df is None:
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            else:
                temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
                df = df.append(temp)  # concatenate

        df.reset_index(inplace=True, drop=True)
        """df做预处理"""
        """新增人名映射字典"""
        train_data, train_data_y, test_data, test_data_y, convertDict = CFTrain.preProcess(df, date)

        prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
        prList.sort()

        recommendList, answerList = CFTrain.RecommendByCF(date, train_data, train_data_y, test_data,
                                                          test_data_y, convertDict, recommendNum=recommendNum)

        """新增返回测试 训练集大小,用于做统计"""
        trainSize = (train_data.shape, test_data.shape)
        print(trainSize)

        return recommendList, answerList, prList, convertDict, trainSize
Example 13
    def algorithmBody(date, project, recommendNum=5):

        """提供单个日期和项目名称
           返回推荐列表和答案
           这个接口可以被混合算法调用
        """
        df = None
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1

            print(y, m)

            filename = projectConfig.getCADataPath() + os.sep \
                       + f'CA_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            if df is None:
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            else:
                temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
                df = df.append(temp)  # concatenate

        df.reset_index(inplace=True, drop=True)
        """df做预处理"""
        """预处理新增返回测试pr列表 2020.4.11"""
        # train_data, train_data_y, test_data, test_data_y, convertDict = CATrain.preProcess(df, date)
        #
        # prList = list(test_data['pr_number'])
        #
        # """根据算法获得推荐列表"""
        # recommendList, answerList = IRTrain.RecommendByIR(train_data, train_data_y, test_data,
        #                                                   test_data_y, recommendNum=recommendNum)
        # trainSize = (train_data.shape[0], test_data.shape[0])
        # return recommendList, answerList, prList, convertDict, trainSize

        CATrain.preProcess(df, date)
Example 14
    def loadLocalPrDistance(project):
        prDisDf_FPS = pandasHelper.readTSVFile(projectConfig.getPullRequestDistancePath() + os.sep +
                                               f"pr_distance_{project}_FPS.tsv",
                                               header=pandasHelper.INT_READ_FILE_WITH_HEAD)

        DisMapFPS = {}

        for row in prDisDf_FPS.itertuples(index=False, name='Pandas'):
            p1 = row[0]
            p2 = row[1]
            dis = row[2]
            DisMapFPS[(p1, p2)] = dis

        return DisMapFPS
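
Example 2's algorithmBody accepts a disMapList parameter, presumably produced by a loader like this one. A hedged wiring sketch (the class name, project, and date window are assumptions, hence left as comments):

# dis_map = FPSTrain.loadLocalPrDistance('akka')  # hypothetical: load once per project
# recommendList, answerList, prList, convertDict, trainSize = \
#     FPSTrain.algorithmBody((2019, 1, 2020, 1), 'akka', recommendNum=5, disMapList=dis_map)
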
Example 15
def pr_review_ratio():
    """绘制某个pr和对应review数量的分布"""
    train_path = projectConfig.getDataTrainPath()
    filename = os.path.join(train_path, 'pr_review_ratio_akka.tsv')
    df = pandasHelper.readTSVFile(filename)
    # MLTrain.getSeriesBarPlot(df[1])

    import matplotlib.pyplot as plt

    fig = plt.figure()
    # fig.add_subplot(2, 1, 1)
    counts = df[1].value_counts()
    print(counts)
    counts.sort_index().plot(kind='bar')
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.title('Number of reviews per pull request in project akka')
    plt.xlabel('number of reviews')  # value_counts puts the review count on the x axis
    plt.ylabel('number of pull requests')
    plt.show()
Example 16
def attachFileNameToOriginData(project, date):
    """
    在训练集中加入file信息
    @rtype: None
    """
    print("-----------------start------------------")
    start_time = datetime.now()

    # training data path
    train_data_path = projectConfig.getRootPath() + os.sep + r'data' + os.sep + 'train' + os.sep
    # table file paths
    origin_filepath = train_data_path + f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}.tsv'
    target_filepath = train_data_path + f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}_include_filepath.csv'
    # load the original table
    origin_df = pandasHelper.readTSVFile(origin_filepath, pandasHelper.INT_READ_FILE_WITHOUT_HEAD)

    # original table header
    columns = ['reviewer_reviewer', 'pr_number', 'review_id', 'commit_sha', 'author', 'pr_created_at',
               'pr_commits', 'pr_additions', 'pr_deletions', 'pr_head_label', 'pr_base_label',
               'review_submitted_at', 'commit_status_total', 'commit_status_additions',
               'commit_status_deletions', 'commit_files', 'author_review_count',
               'author_push_count', 'author_submit_gap']
    origin_df.columns = columns
    print("fetch origin data success!")

    print("start fetching commit_file data from mysql......")
    # fetch the commitFiles DataFrame from mysql; it contains the file info of every commit
    results = query(project)
    commit_files = results[0]
    cur_time = datetime.now()
    print("fetch commit_file data success! cur_cost_time: ", cur_time - start_time)

    # merge the original data with commitFile on commit_sha
    new_df = pandas.merge(origin_df, commit_files, on="commit_sha", how="left")
    new_df.to_csv(target_filepath, encoding='utf-8', index=False, header=True)
    print("attach commit_file data to origin data success! result output to :" + target_filepath)
    print("-----------------finish------------------")
    def testChangeTriggerAnalyzer(owner, repo, pull_request_node):
        AsyncApiHelper.setRepo(owner, repo)

        """读取PRTimeline,获取需要分析change_trigger的pr列表"""
        pr_timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
        pr_timeline_df = pandasHelper.readTSVFile(fileName=pr_timeline_filename,
                                                  header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        pr_nodes = list(set(list(pr_timeline_df['pullrequest_node'])))
        pr_nodes.sort()

        """按照爬取限制取子集"""
        pr_timeline_items = pr_timeline_df[pr_timeline_df['pullrequest_node'] == pull_request_node]
        """对子集按照pull_request_node分组"""
        grouped_timeline = pr_timeline_items.groupby((['pullrequest_node']))
        """将分组结果保存为字典{pr->pr_timeline_items}"""
        formated_data = []
        for pr, group in grouped_timeline:
            record = group.to_dict(orient='records')
            record = sorted(record, key=lambda x: int(x.get(StringKeyUtils.STR_KEY_POSITION)))
            formated_data.append(record)

        """分析这些pr的timeline"""
        pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formated_data)
        print("finish!")
Example 18
    def testBayesAlgorithms(project, dates):  # input: test dates and the corresponding file sequence; output: performance of the whole algorithm

        recommendNum = 5  # number of recommendations
        excelName = 'outputNB.xlsx'
        sheetName = 'result'

        """初始化excel文件"""
        ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])

        for i in range(1, 4):  # Bayes has three model variants
            for date in dates:
                filename = projectConfig.getRootPath() + os.sep + 'data' + os.sep + 'train' + os.sep + \
                           f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}.tsv'
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITHOUT_HEAD)
                """df做预处理"""
                isNOR = True
                if i == 1 or i == 3:
                    isNOR = False  # no normalization for Bernoulli
                train_data, train_data_y, test_data, test_data_y = MLTrain.preProcessForSingleLabel(df, date, project,
                                                                                                    isNOR=isNOR)

                """根据算法获得推荐列表"""
                recommendList, answerList = MLTrain.RecommendByNativeBayes(train_data, train_data_y, test_data,
                                                                           test_data_y, recommendNum, i)

                """根据推荐列表做评价"""
                topk, mrr = DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

                """结果写入excel"""
                DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, date)

            """文件分割"""
            content = ['']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            content = ['training set', 'test set']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
Example 19
    def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False,
                      test_type=StringKeyUtils.STR_TEST_TYPE_SLIDE):

        """提供单个日期和项目名称
           返回推荐列表和答案
           这个接口可以被混合算法调用
        """
        df = None
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the month-split data back together
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1

            print(y, m)
            filename = None
            if test_type == StringKeyUtils.STR_TEST_TYPE_SLIDE:
                if i < date[2] * 12 + date[3]:
                    if filter_train:
                        filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                    else:
                        filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    if filter_test:
                        filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                    else:
                        filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            elif test_type == StringKeyUtils.STR_TEST_TYPE_INCREMENT:
                if filter_test:
                    filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                else:
                    filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
            if df is None:
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            else:
                temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
                df = df.append(temp)  # concatenate

        df.reset_index(inplace=True, drop=True)
        if test_type == StringKeyUtils.STR_TEST_TYPE_SLIDE:
            """df做预处理"""
            """预处理新增返回测试pr列表 2020.4.11"""
            train_data, train_data_y, test_data, test_data_y, convertDict = IR_ACTrain.preProcessBySlide(df, date)

            prList = list(test_data['pr_number'])

            """根据算法获得推荐列表"""
            recommendList, answerList = IR_ACTrain.RecommendByIR_AC_SLIDE(train_data, train_data_y, test_data,
                                                                          test_data_y, recommendNum=recommendNum)
            trainSize = (train_data.shape[0], test_data.shape[0])
            return recommendList, answerList, prList, convertDict, trainSize
        elif test_type == StringKeyUtils.STR_TEST_TYPE_INCREMENT:
            """df做预处理"""
            """新增人名映射字典"""
            test_data, test_data_y, convertDict = IR_ACTrain.preProcessByIncrement(df, date)

            prList = list(test_data.drop_duplicates(['pr_number'])['pr_number'])
            """增量预测第一个pr不预测"""
            prList.sort()
            prList.pop(0)
            recommendList, answerList = IR_ACTrain.RecommendByIR_AC_INCREMENT(test_data, test_data_y,
                                                                              recommendNum=recommendNum)

            """新增返回测试 训练集大小,用于做统计"""

            """新增返回训练集 测试集大小"""
            trainSize = (test_data.shape)
            print(trainSize)

            # """输出推荐名单到文件"""
            # DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict)

            return recommendList, answerList, prList, convertDict, trainSize
Example 20
    def demo():
        data = pandasHelper.readTSVFile(projectConfig.getFPSTestData(), pandasHelper.INT_READ_FILE_WITHOUT_HEAD)
        print("input data:", data.shape)
        startTime = datetime.now()
        # print(DataFrameColumnUtils.COLUMN_REVIEW_FPS)

        """导入pullrequest, review,file,commit数据"""
        pullrequests, pullrequestsIndex = \
            BeanNumpyHelper.getBeansFromDataFrame(PullRequest(),
                                                  DataFrameColumnUtils.COLUMN_REVIEW_FPS_PULL_REQUEST,
                                                  data)
        # if configPraser.getPrintMode():
        #     print(pullrequests.__len__())
        #     print(pullrequestsIndex)

        time2 = datetime.now()
        print("pull request cost time:", time2 - startTime)

        reviews, reviewsIndex = BeanNumpyHelper.getBeansFromDataFrame(Review(),
                                                                      DataFrameColumnUtils.COLUMN_REVIEW_FPS_REVIEW,
                                                                      data)

        time3 = datetime.now()
        print("review cost time:", time3 - time2)

        if configPraser.getPrintMode():
            print(reviews)
            print(reviewsIndex)
        commits, commitsIndex = BeanNumpyHelper.getBeansFromDataFrame(Commit(),
                                                                      DataFrameColumnUtils.COLUMN_REVIEW_FPS_COMMIT,
                                                                      data)
        time4 = datetime.now()
        print("commits cost time:", time4 - time3)
        # if configPraser.getPrintMode():
        #     print(commits)
        #     print(commitsIndex)
        files, filesIndex = BeanNumpyHelper.getBeansFromDataFrame(File(),
                                                                  DataFrameColumnUtils.COLUMN_REVIEW_FPS_FILE,
                                                                  data)

        time5 = datetime.now()
        print("file cost time:", time5 - time4)
        # if configPraser.getPrintMode():
        #     print(files)
        #     print(filesIndex)

        pullrequestReviewIndex = BeanNumpyHelper.beanAssociate(pullrequests, [StringKeyUtils.STR_KEY_REPO_FULL_NAME,
                                                                              StringKeyUtils.STR_KEY_NUMBER],
                                                               reviews, [StringKeyUtils.STR_KEY_REPO_FULL_NAME,
                                                                         StringKeyUtils.STR_KEY_PULL_NUMBER])
        time6 = datetime.now()
        print("pull request index time:", time6 - time5)

        # if configPraser.getPrintMode():
        #     print(pullrequestReviewIndex)

        reviewCommitIndex = BeanNumpyHelper.beanAssociate(reviews, [StringKeyUtils.STR_KEY_COMMIT_ID],
                                                          commits, [StringKeyUtils.STR_KEY_SHA])
        time7 = datetime.now()
        print("commits index cost time:", time7 - time6)
        #
        # if configPraser.getPrintMode():
        #     print(reviewCommitIndex)

        commitFileIndex = BeanNumpyHelper.beanAssociate(commits, [StringKeyUtils.STR_KEY_SHA],
                                                        files, [StringKeyUtils.STR_KEY_COMMIT_SHA])

        time8 = datetime.now()
        print("files index cost time:", time8 - time7)

        # if configPraser.getPrintMode():
        #     print(commitFileIndex)

        receiveTime = datetime.now()
        print("load cost time:", receiveTime - startTime)

        """用于做评价的结果收集"""
        recommendList = []
        answerList = []

        testNumber = configPraser.getTestNumber()

        if configPraser.getFPSCtypes():
            """调用dll库实现增加运行速度"""
            dll = CDLL("cFPS.dll")
            dll.addf.restype = c_float
            dll.addf.argtypes = [c_float, c_float]
            print(dll.addf(10, 30))

            c_prs = FPSClassCovert.convertPullRequest(pullrequests)
            c_reviews = FPSClassCovert.convertReview(reviews)
            c_commits = FPSClassCovert.convertCommit(commits)
            c_files = FPSClassCovert.convertFile(files)

            c_result = c_fps_result()
            print(c_prs)
            print(c_reviews)
            print(c_commits)
            print(c_files)

            dll.FPS.restype = None
            dll.FPS.argtypes = (POINTER(c_fps_pr), c_int, POINTER(c_fps_review), c_int,
                                POINTER(c_fps_commit), c_int, POINTER(c_fps_file), c_int,
                                POINTER(c_fps_result), c_int, c_int)

            prs_num = c_prs.__len__()
            p_c_prs = (c_fps_pr * prs_num)(*c_prs)
            reviews_num = c_reviews.__len__()
            p_c_reviews = (c_fps_review * reviews_num)(*c_reviews)
            commits_num = c_commits.__len__()
            p_c_commits = (c_fps_commit * commits_num)(*c_commits)
            files_num = c_files.__len__()
            p_c_files = (c_fps_file * files_num)(*c_files)

            dll.FPS(p_c_prs, prs_num, p_c_reviews, reviews_num, p_c_commits,
                    commits_num, p_c_files, files_num, pointer(c_result), 0, 10, True)

            endTime = datetime.now()
            print("total cost time:", endTime - startTime, " recommend cost time:", endTime - receiveTime)

            print("answer:", str(c_result.answer, encoding='utf-8'))
            print("recommend:", str(c_result.recommend, encoding='utf-8'))

        else:
            """使用Python实现算法"""
            for pos in range(0, testNumber):
                """通过review算法获取推荐名单"""
                candicateList, authorList = FPSAlgorithm.reviewerRecommend(pullrequests, pullrequestsIndex,
                                                                           reviews, reviewsIndex, commits, commitsIndex,
                                                                           files, filesIndex,
                                                                           pullrequestReviewIndex,
                                                                           reviewCommitIndex, commitFileIndex,
                                                                           pos, configPraser.getReviewerNumber())

                print("candicateList", candicateList)
                endTime = datetime.now()
                print("total cost time:", endTime - startTime, " recommend cost time:", endTime - receiveTime)

                recommendList.append(candicateList)
                answerList.append(authorList)
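
The ctypes branch above relies on a non-obvious construction: (c_fps_pr * prs_num)(*c_prs) builds a C array type of length prs_num and instantiates it from Python struct objects. A minimal self-contained sketch of the same pattern (Point is a stand-in for the c_fps_* structs):

from ctypes import Structure, c_float

class Point(Structure):  # stand-in for c_fps_pr / c_fps_review / ...
    _fields_ = [("x", c_float), ("y", c_float)]

pts = [Point(1.0, 2.0), Point(3.0, 4.0)]
arr = (Point * len(pts))(*pts)  # C array type of length 2, filled from the list
print(arr[1].y)                 # 4.0; arr decays to a Point* when passed to a C function
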
Example 21
def appendFilePathFeatureVector(inputDf, projectName, date, pull_number_name):
    """
       用tf-idf模型计算pr的所有commit的设计的文件的路径
       注: 文件改动来源于 pullrequest直接关联的changeFile  2020.7.7

       @description: 给df, 在之前的dataframe的基础上面追加   pr路径形成的tf-idf特征向量
       @notice: datafrme 必须有pull_number id,可以重复
       @param origin_df: 预先读取好的dataframe
       @param projectName: 指定项目名
       @param date: 开始年,开始月,结束年,结束月的四元组
       @return: df: 添加路径权重后的dataframe,可直接用于机器学习算法
       """

    """对输入df做label存在检测"""
    if 'label' not in inputDf.columns:
        raise Exception("label not in input dataframe!")

    df = inputDf[[pull_number_name]].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.columns = ['pr_number']

    """读取commit pr relation文件"""
    time1 = datetime.now()
    pr_change_file_path = projectConfig.getPRChangeFilePath()

    """pr_change_file 数据库输出 自带抬头"""
    prChangeFileData = pandasHelper.readTSVFile(
        os.path.join(pr_change_file_path, f'ALL_{projectName}_data_pr_change_file.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )

    """做三者连接"""
    df = pandas.merge(df, prChangeFileData, left_on='pr_number', right_on='pull_number')
    print("merge relation:", df.shape)
    df = df[['pr_number', 'filename']].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    print("after merge:", df.shape)

    """获取filepath -> sub_filepath映射表"""
    file_path_list = set(df['filename'].copy(deep=True))
    file_path_dict = {}
    for file_path in file_path_list:
        sub_file_path = splitFileName(file_path)
        if file_path not in file_path_dict:
            file_path_dict[file_path] = set()
        file_path_dict[file_path] = file_path_dict[file_path].union(sub_file_path)

    """获取pr_number -> sub_filepath语料"""
    pr_to_file_path = df[['pr_number', 'filename']]
    # group by pr_number to get the raw corpus (filepaths before tokenization)
    groups = dict(list(pr_to_file_path.groupby('pr_number')))
    # build the target corpus (after custom tokenization)
    pr_file_path_corpora = []
    for pr in groups:
        paths = list(groups[pr]['filename'])
        sub_paths = list(map(lambda x: list(file_path_dict[x]), paths))
        sub_paths = reduce(lambda x, y: x + y, sub_paths)
        pr_file_path_corpora.append(sub_paths)

    """计算tf-idf"""
    print("start tf_idf algorithm......")
    # 建立词典
    dictionary = corpora.Dictionary(pr_file_path_corpora)
    # 基于词典建立新的语料库
    corpus = [dictionary.doc2bow(text) for text in pr_file_path_corpora]
    # 用语料库训练TF-IDF模型
    tf_idf_model = models.TfidfModel(corpus)
    # 得到加权矩阵
    path_tf_tdf = list(tf_idf_model[corpus])

    """处理path_tf_tdf,构造pr_path加权矩阵"""
    print("start merge tf_idf to origin_df......")
    pr_list = list(groups.keys())
    columns = ['pr_number']
    path_ids = list(dictionary.token2id.values())
    path_ids = list(map(lambda x: str(x), path_ids))
    columns.extend(path_ids)
    pr_path_weight_df = pandas.DataFrame(columns=columns).fillna(value=0)
    for index, row in enumerate(path_tf_tdf):
        """用字典的方式填充dataframe"""
        new_row = {'pr_number': pr_list[index]}
        row = list(map(lambda x: (str(x[0]), x[1]), row))
        path_weight = dict(row)
        new_row = dict(new_row, **path_weight)
        pr_path_weight_df = pr_path_weight_df.append(new_row, ignore_index=True)
    pr_path_weight_df = pr_path_weight_df.fillna(value=0)
    print(pr_path_weight_df.shape)

    """PCA 做缩减之前需要把pr_path_weight_df 做分割 训练集和测试集分别处理"""
    tempData = pr_path_weight_df.copy(deep=True)
    labelData = inputDf[['pr_number', 'label']].drop_duplicates().copy(deep=True)
    tempData = pandas.merge(tempData, labelData, on='pr_number')

    tempData_train = tempData.loc[tempData['label'] == 0].copy(deep=True)
    tempData_test = tempData.loc[tempData['label'] == 1].copy(deep=True)

    tempData_train.drop(columns=['pr_number', 'label'], inplace=True)
    tempData_test.drop(columns=['pr_number', 'label'], inplace=True)

    # tempData.drop(columns=['pr_number'], inplace=True)

    """PCA 做缩减"""
    pca = PCA(n_components=0.95)
    tempData_train = pca.fit_transform(tempData_train)
    print("after pca :", tempData_train.shape)
    print(pca.explained_variance_ratio_)
    tempData_train = pandas.DataFrame(tempData_train)

    tempData_test = pca.transform(tempData_test)
    print("after pca :", tempData_train.shape)
    tempData_test = pandas.DataFrame(tempData_test)

    tempData = pandas.concat([tempData_train, tempData_test], axis=0)
    tempData.reset_index(drop=True, inplace=True)

    """和提供的数据做拼接"""
    tempData['pr_number_t'] = list(pr_path_weight_df['pr_number'])
    inputDf = pandas.merge(inputDf, tempData, left_on=pull_number_name, right_on='pr_number_t')
    inputDf.drop(columns=['pr_number_t'], inplace=True)

    return inputDf
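
A minimal self-contained sketch of the gensim pipeline used above (Dictionary -> doc2bow -> TfidfModel), with toy sub-path tokens, showing that each row of path_tf_tdf is a sparse list of (token_id, weight) pairs:

from gensim import corpora, models

docs = [['src', 'util', 'io'], ['src', 'io'], ['docs', 'readme']]  # toy sub-path corpus
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
tfidf = models.TfidfModel(corpus)
for row in tfidf[corpus]:
    print(row)  # e.g. [(1, 0.71), (2, 0.71)] -> sparse (token_id, weight) pairs
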
Example 22
            return math.ceil(2)
        else:
            return num

    @staticmethod
    def get_pronouncing_nums(words):
        counts = 0
        for word in words:
            counts += FleshReadableUtils.get_pronouncing_num(word)
        print('total syllable count:', str(counts))
        return counts


if __name__ == "__main__":

    data = pandasHelper.readTSVFile(projectConfig.getReviewCommentTestData())
    comments = data.values[:, (2, 4)]
    print(comments.shape)

    readable = []  # readability
    stopWordRate = []  # stop-word rate
    questionRatio = []  # question ratio
    codeElementRatio = []  # code-element ratio
    stopKeyRatio = []  # keyword ratio
    conceptualSimilarity = []  # conceptual similarity
    badCase = []

    stopwords = SplitWordHelper().getEnglishStopList()
    languageKeyWords = LanguageKeyWordHelper.LanguageKeyWordLanguage.getRubyKeyWordList()

    for line in comments:
Example 23
def appendTextualFeatureVector(inputDf, projectName, date, pull_number_name):
    """
       用tf-idf模型计算pr的所有title,pr的文本
       pr的信息直接从PRDataFile 那里获取
       @description: 给df, 在之前的dataframe的基础上面追加   pr路径形成的tf-idf特征向量
       @notice: datafrme 必须有pull_number_id,可以重复
       @param origin_df: 预先读取好的dataframe
       @param projectName: 指定项目名
       @param date: 开始年,开始月,结束年,结束月的四元组
       @return: df: 添加路径权重后的dataframe,可直接用于机器学习算法
    """

    """对输入df做label存在检测"""
    if 'label' not in inputDf.columns:
        raise Exception("label not in input dataframe!")

    print("input shape:", inputDf.shape)
    print(date)

    df = inputDf[[pull_number_name]].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.columns = ['pr_number']

    """读取pullrequestData 文件"""
    pull_request_path = projectConfig.getPullRequestPath()

    pullRequestData = pandasHelper.readTSVFile(
        os.path.join(pull_request_path, f'ALL_{projectName}_data_pullrequest.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )
    """pull_number和pr review commit relation做拼接"""
    df = pandas.merge(df, pullRequestData, left_on='pr_number', right_on='number')
    df = df[['pr_number', 'title', 'body']].copy(deep=True)
    df.columns = ['pr_number', 'pr_title', 'pr_body']
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """用于收集所有文本向量分词"""
    stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

    textList = []
    for row in df.itertuples(index=False, name='Pandas'):
        tempList = []
        """获取pull request的标题"""
        pr_title = row[list(df.columns).index('pr_title')]
        pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

        """初步尝试提取词干效果反而下降了 。。。。"""

        """对单词做提取词干"""
        pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
        tempList.extend(pr_title_word_list)

        """pull request的body"""
        pr_body = row[list(df.columns).index('pr_body')]
        pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
        """对单词做提取词干"""
        pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
        tempList.extend(pr_body_word_list)
        textList.append(tempList)

    print(len(textList))
    """build a dictionary over the token lists and extract the feature count"""
    dictionary = corpora.Dictionary(textList)
    print('dictionary:', dictionary)

    feature_cnt = len(dictionary.token2id)
    print("词典特征数:", feature_cnt)

    """根据词典建立语料库"""
    corpus = [dictionary.doc2bow(text) for text in textList]
    # print('corpus:', corpus)
    """train the TF-IDF model on the corpus"""
    tfidf = models.TfidfModel(corpus)

    """再次遍历数据,形成向量,向量是稀疏矩阵的形式"""
    wordVectors = []
    for i in range(0, df.shape[0]):
        wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

    """填充为向量"""
    wordVectors = DataProcessUtils.convertFeatureDictToDataFrame(wordVectors, featureNum=feature_cnt)

    """PCA 做缩减之前需要把pr_path_weight_df 做分割 训练集和测试集分别处理"""
    tempData = wordVectors.copy(deep=True)
    tempData['pr_number'] = df['pr_number']
    labelData = inputDf[['pr_number', 'label']].drop_duplicates().copy(deep=True)
    tempData = pandas.merge(tempData, labelData, on='pr_number')

    tempData_train = tempData.loc[tempData['label'] == 0].copy(deep=True)
    tempData_test = tempData.loc[tempData['label'] == 1].copy(deep=True)

    tempData_train.drop(columns=['pr_number', 'label'], inplace=True)
    tempData_test.drop(columns=['pr_number', 'label'], inplace=True)


    """PAC 做缩减"""
    pca = PCA(n_components=0.95)
    tempData_train = pca.fit_transform(tempData_train)
    print("after pca :", tempData_train.shape)
    print(pca.explained_variance_ratio_)
    tempData_train = pandas.DataFrame(tempData_train)

    tempData_test = pca.transform(tempData_test)
    print("after pca :", tempData_train.shape)
    tempData_test = pandas.DataFrame(tempData_test)

    tempData = pandas.concat([tempData_train, tempData_test], axis=0)
    tempData.reset_index(drop=True, inplace=True)
    tempData['pr_number_t'] = df['pr_number'].copy(deep=True)

    """和原来特征做拼接"""
    inputDf = pandas.merge(inputDf, tempData, left_on=pull_number_name, right_on='pr_number_t')
    inputDf.drop(columns=['pr_number_t'], inplace=True)
    return inputDf
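
Both this function and appendFilePathFeatureVector fit PCA on the label == 0 (training) rows only and merely transform the label == 1 (test) rows, so no test information leaks into the learned components. The pattern standalone, with synthetic data:

import numpy
from sklearn.decomposition import PCA

train = numpy.random.rand(100, 50)  # synthetic stand-ins for the tf-idf matrices
test = numpy.random.rand(20, 50)
pca = PCA(n_components=0.95)        # keep components covering 95% of the variance
train_reduced = pca.fit_transform(train)  # components learned on the training rows only
test_reduced = pca.transform(test)        # test rows projected with the same components
print(train_reduced.shape, test_reduced.shape)
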
Example 24
    def preProcess(df, date, project, isSTD=False, isNOR=False):
        """参数说明
        df:读取的dataframe对象
        testDate:作为测试的年月 (year,month)
        isSTD:对数据是否标准化
        isNOR:对数据是否归一化
        """
        print("start df shape:", df.shape)
        """过滤NA的数据"""
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
        df.reset_index(drop=True, inplace=True)

        """对人名字做数字处理"""
        """频率不过的评审者在编号之前就已经过滤了,不用考虑分类不连续的情况"""
        """这里reviewer_user_login 放在 第一个否则会影响candicateNum这个变量在后面的引用"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'author_user_login'])
        recoverDict = {v: k for k, v in convertDict.items()}

        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        """先对输入数据做精简 只留下感兴趣的数据"""
        df = df[['pr_number', 'review_user_login', 'author_user_login', 'author_association', 'commits',
                 'deletions', 'additions', 'changed_files', 'label', 'merged']].copy(deep=True)

        print("before filter:", df.shape)
        df.drop_duplicates(inplace=True)
        print("after filter:", df.shape)

        """计算作者的关系"""
        df['author_association'] = df['author_association'].apply(lambda x: x == 'MEMBER')

        """计算累积的历史数据"""
        request_number_prs = []  # 作者之前发出的数量
        request_number_merged_prs = []  # 作者发出的被接受的数量
        request_number_rejected_prs = []  # 作者发出被拒绝的数量
        request_accept_rate = []  # 作者pr被接受的概率
        request_reject_rate = []  # 作者pr被拒绝的概率

        for row in df.itertuples():
            pr_num = getattr(row, 'pr_number')
            author = getattr(row, 'author_user_login')
            """过滤历史的pr"""
            temp_df = df.loc[(df['pr_number'] < pr_num)&(df['author_user_login'] == author)]
            request_number_prs.append(temp_df.shape[0])
            accept_times = temp_df.loc[temp_df['merged'] == 1].shape[0]
            request_number_merged_prs.append(accept_times)
            request_number_rejected_prs.append(temp_df.shape[0] - accept_times)
            if temp_df.shape[0] > 0:
                request_accept_rate.append(accept_times/temp_df.shape[0])
                request_reject_rate.append(1 - accept_times / temp_df.shape[0])
            else:
                request_accept_rate.append(0)
                request_reject_rate.append(0)

        df['request_number_prs'] = request_number_prs
        df['request_number_merged_prs'] = request_number_merged_prs
        df['request_number_rejected_prs'] = request_number_rejected_prs
        df['request_accept_rate'] = request_accept_rate
        df['request_reject_rate'] = request_reject_rate

        """添加作者是否关注项目"""
        user_watch_repo_relation_path = projectConfig.getUserWatchRepoRelation()
        userWatchRepoRelation = pandasHelper.readTSVFile(
            os.path.join(user_watch_repo_relation_path, f'userWatchRepoRelation.tsv'),
            pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
        )
        watchRepoMap = {}
        for k in convertDict.keys():
            """获取 reviewer 的 follow 列表"""
            following_list = list(set(userWatchRepoRelation.loc[userWatchRepoRelation['login'] == k]['repo_full_name']))
            isFollow = False
            for repo in following_list:
                owner, name = repo.split('/')
                if name == project:
                    isFollow = True
            watchRepoMap[convertDict[k]] = isFollow

        request_watches = []
        for row in df.itertuples():
            author = getattr(row, 'author_user_login')
            request_watches.append(watchRepoMap[author])
        df['request_watches'] = request_watches

        """添加作者follower数量, followings数量, 是否follow团队成员"""

        user_follow_relation_path = projectConfig.getUserFollowRelation()
        userFollowRelation = pandasHelper.readTSVFile(
            os.path.join(user_follow_relation_path, f'userFollowRelation.tsv'),
            pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
        )

        followMap = {}
        followerCountMap = {}
        followingCountMap = {}
        followCoreMemberMap = {}

        """收集核心成员列表"""
        coreMemberList = list(set(df.loc[df['author_association'] == 1]['author_user_login']))

        for k in convertDict.keys():
            """Get the list of users this user follows"""
            following_list = list(set(userFollowRelation.loc[userFollowRelation['login'] == k]['following_login']))
            followingCountMap[convertDict[k]] = len(following_list)
            isFollowCoreMember = False
            for f in following_list:
                if f in convertDict.keys():
                    followMap[(convertDict[k], convertDict[f])] = 1
                if f in coreMemberList:
                    isFollowCoreMember = True
            followCoreMemberMap[convertDict[k]] = isFollowCoreMember

            follower_list = list(set(userFollowRelation.loc[userFollowRelation['following_login'] == k]['login']))
            followerCountMap[convertDict[k]] = len(follower_list)
            # for f in follower_list:
            #     if f in convertDict.keys():
            #         followMap[(convertDict[f], convertDict[k])] = 1

        request_number_follows = []
        request_number_following = []
        request_follow_ct = []
        for row in df.itertuples():
            author = getattr(row, 'author_user_login')
            """Look up the author's follow statistics"""
            request_number_following.append(followingCountMap[author])
            request_number_follows.append(followerCountMap[author])
            request_follow_ct.append(followCoreMemberMap[author])

        df['request_number_following'] = request_number_following
        df['request_number_follows'] = request_number_follows
        df['request_follow_ct'] = request_follow_ct

        """先提前统计正确答案"""
        tagDict = dict(list(df.groupby('pr_number')))

        train_data = df.loc[df['label'] == 0].copy(deep=True)
        test_data = df.loc[df['label'] == 1].copy(deep=True)

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """
        train_data_y = {}
        pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            train_data_y[pull_number] = reviewers

        train_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
        train_data.drop_duplicates(inplace=True)
        train_data.drop_duplicates(subset=['pr_number'], inplace=True)
        train_data.drop(columns=['pr_number'], inplace=True)
        """训练集 结果做出多标签分类通用的模式"""
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        test_data_y = {}
        pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        test_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
        test_data.drop_duplicates(subset=['pr_number'], inplace=True)

        """获得pr list"""
        prList = list(test_data['pr_number'])
        test_data.drop(columns=['pr_number'], inplace=True)

        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        """参数规范化"""
        if isSTD:
            stdsc = StandardScaler()
            train_data_std = stdsc.fit_transform(train_data)
            test_data_std = stdsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        elif isNOR:
            maxminsc = MinMaxScaler()
            train_data_std = maxminsc.fit_transform(train_data)
            test_data_std = maxminsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList
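The exact output of DataProcessUtils.convertLabelListToDataFrame is not shown in this excerpt; assuming it produces a 0/1 indicator matrix (one row per pr, one column per candidate reviewer id), a hypothetical stand-in would look like this:

import pandas

def labels_to_indicator(label_dict, pr_list, num_classes):
    # Hypothetical stand-in for DataProcessUtils.convertLabelListToDataFrame:
    # one row per pr, one 0/1 column per reviewer id in [0, num_classes].
    rows = []
    for pr in pr_list:
        row = [0] * (num_classes + 1)
        for r in label_dict[pr]:
            row[r] = 1
        rows.append(row)
    return pandas.DataFrame(rows, index=pr_list)

print(labels_to_indicator({101: [0, 2], 102: [1]}, [101, 102], 2))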
    def commentAcceptRatioByReviewer(project):
        """计算以项目为粒度的评审意见认可度,通过时间来划分"""
        notesFileName = projectConfig.getNotesDataPath() + os.sep + f"notes_{project}.tsv"
        df_notes = pandasHelper.readTSVFile(notesFileName, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        df_notes.drop_duplicates(subset=['id'], inplace=True, keep="last")
        df_notes.sort_values(by='merge_request_id', ascending=False, inplace=True)
        print(df_notes.shape)

        mrFileName = projectConfig.getMergeRequestDataPath() + os.sep + f"mergeRequest_{project}.tsv"
        df_mr = pandasHelper.readTSVFile(mrFileName, header=pandasHelper.INT_READ_FILE_WITH_HEAD)

        """日期修补"""
        for index, row in df_mr.iterrows():
            if row["created_at"] is None:
                row["created_at"] = row["merged_at"]

        df_mr = df_mr[["iid", "created_at"]].copy(deep=True)
        df_mr["iid"] = df_mr["iid"].apply(lambda x: int(x))
        df_mr.drop_duplicates(subset=['iid'], inplace=True)

        print(df_mr.shape)

        # x = range(-2, 11)
        # y = []
        # for i in x:
        #     y.append(df_notes.loc[df_notes['change_trigger'] == i].shape[0])
        # plt.bar(x=x, height=y)
        # plt.title(f'review comment({project})')
        # for a, b in zip(x, y):
        #     plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=11)
        #
        # print("review comment useful:", df_notes.shape[0] - df_notes.loc[df_notes['change_trigger'] < 0].shape[0])
        # plt.show()

        data = pandas.merge(left=df_notes, right=df_mr, left_on="merge_request_id", right_on="iid")
        data['label'] = data["created_at_y"].apply(lambda x: (time.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")))
        data['label_y'] = data['label'].apply(lambda x: x.tm_year)
        data['label_m'] = data['label'].apply(lambda x: x.tm_mon)

        data = data.loc[data["change_trigger"] != -2].copy(deep=True)

        # pandasHelper.writeTSVFile("comment.csv", df_notes)

        """按照每个人分类"""
        groups = dict(list(data.groupby('reviewer')))
        # 获取目标语料(即经过自定义分词后的语料)

        date = (2019, 5, 2020, 6)

        columns = ["reviewer"]
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # iterate month by month over the window
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1
            columns.append(str(f"{y}年{m}月"))

        ratio_df = DataFrame(columns=columns)

        # reviewer_list = ["bidinger", "mbouaziz", "raphael-proust", "romain.nl", "vect0r", "rafoo_"]
        reviewer_list = []
        for reviewer, temp_df in groups.items():
            print(reviewer, temp_df.shape[0])
            if reviewer not in reviewer_list:
                tempDict = {"reviewer": reviewer}
                for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # iterate month by month over the window
                    y = int((i - i % 12) / 12)
                    m = i % 12
                    if m == 0:
                        m = 12
                        y = y - 1

                    df = temp_df.loc[(temp_df['label_y'] == y) & (temp_df['label_m'] == m)].copy(deep=True)
                    total = df.shape[0]
                    if total == 0:
                        pass
                        # tempDict[f'{y}-{m}'] = 0
                    else:
                        valid = df.loc[df['change_trigger'] >= 0].shape[0]
                        tempDict[f'{y}-{m}'] = valid / total
                ratio_df = ratio_df.append(tempDict, ignore_index=True)

        print(ratio_df.shape)
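The year/month arithmetic in the loops above recurs throughout these examples. A small helper that yields the same (year, month) sequence, shown only to illustrate the arithmetic:

def iter_year_month(date):
    """Enumerate (year, month) pairs over the closed window
    (date[0], date[1]) .. (date[2], date[3]), matching the loops above."""
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):
        y, m = divmod(i, 12)
        if m == 0:
            y, m = y - 1, 12
        yield y, m

print(list(iter_year_month((2019, 11, 2020, 2))))
# [(2019, 11), (2019, 12), (2020, 1), (2020, 2)]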
    def commentAcceptRatioByProject(projects, date):
        """计算以项目为粒度的评审意见认可度,通过时间来划分
           projects: 指定若干的项目
           date: 四元组,指定计算指标的开始时间和结束时间 (minYear, minMonth, maxYear, maxMonth)
           如(2019,10,2020,11) 是闭区间
        """
        columns = ["project"]
        for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # iterate month by month over the window
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1
            columns.append(str(f"{y}/{m}"))

        result_df = DataFrame(columns=columns)  # dataframe that stores the final result

        for project in projects:
            notesFileName = projectConfig.getNotesDataPath() + os.sep + f"notes_{project}.tsv"
            df_notes = pandasHelper.readTSVFile(notesFileName, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
            df_notes.drop_duplicates(subset=['id'], inplace=True, keep="last")
            df_notes.sort_values(by='merge_request_id', ascending=False, inplace=True)
            print(df_notes.shape)

            mrFileName = projectConfig.getMergeRequestDataPath() + os.sep + f"mergeRequest_{project}.tsv"
            df_mr = pandasHelper.readTSVFile(mrFileName, header=pandasHelper.INT_READ_FILE_WITH_HEAD)

            """日期修补"""
            for index, row in df_mr.iterrows():
                if row["created_at"] is None:
                    row["created_at"] = row["merged_at"]

            df_mr = df_mr[["iid", "created_at"]].copy(deep=True)
            df_mr["iid"] = df_mr["iid"].apply(lambda x: int(x))
            df_mr.drop_duplicates(subset=['iid'], inplace=True)

            print(df_mr.shape)

            # x = range(-2, 11)
            # y = []
            # for i in x:
            #     y.append(df_notes.loc[df_notes['change_trigger'] == i].shape[0])
            # plt.bar(x=x, height=y)
            # plt.title(f'review comment({project})')
            # for a, b in zip(x, y):
            #     plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=11)
            #
            # print("review comment useful:", df_notes.shape[0] - df_notes.loc[df_notes['change_trigger'] < 0].shape[0])
            # plt.show()

            data = pandas.merge(left=df_notes, right=df_mr, left_on="merge_request_id", right_on="iid")
            data['label'] = data["created_at_y"].apply(lambda x: (time.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")))
            data['label_y'] = data['label'].apply(lambda x: x.tm_year)
            data['label_m'] = data['label'].apply(lambda x: x.tm_mon)

            data = data.loc[data["change_trigger"] != -2].copy(deep=True)

            # pandasHelper.writeTSVFile("comment.csv", df_notes)

            # """按照时间拆分"""
            # minYear = min(data['label']).tm_year
            # minMonth = min(data['label']).tm_mon
            # maxYear = max(data['label']).tm_year
            # maxMonth = max(data['label']).tm_mon
            # date = (minYear, minMonth, maxYear, maxMonth)
            tempDict = {"project": project}

            for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # iterate month by month over the window
                y = int((i - i % 12) / 12)
                m = i % 12
                if m == 0:
                    m = 12
                    y = y - 1

                df = data.loc[(data['label_y'] == y) & (data['label_m'] == m)].copy(deep=True)
                commentCount = df.shape[0]
                if commentCount == 0:
                    pass
                else:
                    validCount = df.loc[df['change_trigger'] >= 0].shape[0]
                    tempDict[f'{y}/{m}'] = validCount / commentCount
            result_df = result_df.append(tempDict, ignore_index=True)

            print(result_df.shape)
            # result_df.to_excel("q5_change_trigger_ratio.xls")

        return result_df
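A hypothetical call of the method above; the project names are placeholders and the enclosing utility class is not shown in this excerpt:

# Hypothetical usage; 'projectA'/'projectB' are placeholders.
ratios = commentAcceptRatioByProject(projects=['projectA', 'projectB'],
                                     date=(2019, 10, 2020, 11))
print(ratios.head())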
    def checkChangeTriggerResult(owner, repo):
        """检查PRChangeTrigger是否计算完整"""
        """在切换代理的时候,数据库连接会断开,导致comments信息查不到,会遗漏review comment的情况"""
        """这里检查一遍pr的change_trigger里是否有review_comment数据,如果没有,重新获取一次"""

        """PRTimeLine表头"""
        PR_CHANGE_TRIGGER_COLUMNS = ["pullrequest_node", "user_login", "comment_node",
                                     "comment_type", "change_trigger", "filepath"]
        """初始化目标文件"""
        target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{configPraser.getRepo()}_data_pr_change_trigger.tsv'

        """1. 获取该仓库所有的pr_node"""
        # repo_fullname = configPraser.getOwner() + "/" + configPraser.getRepo()
        # pr_nodes = AsyncProjectAllDataFetcher.getPullRequestNodes(repo_fullname)
        # pr_nodes = list(pr_nodes)
        # pr_nodes = [node[0] for node in pr_nodes]
        """需要获取的prs改为有issue 额 review的timeline的pr"""
        timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
        timeline_df = pandasHelper.readTSVFile(fileName=timeline_filename, header=0)
        timeline_df = timeline_df.loc[(timeline_df['typename'] == 'IssueComment') \
                                      | (timeline_df['typename'] == 'PullRequestReview')].copy(deep=True)
        pr_nodes = list(set(timeline_df['pullrequest_node']))

        """2. 读取pr_change_trigger文件"""
        change_trigger_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_pr_change_trigger.tsv'
        change_trigger_df = pandasHelper.readTSVFile(fileName=change_trigger_filename, header=0)
        change_nodes = list(set(change_trigger_df['pullrequest_node']))

        # """3. 读取pr_timeline文件"""
        # timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{configPraser.getRepo()}_data_prtimeline.tsv'
        # timeline_df = pandasHelper.readTSVFile(fileName=timeline_filename, header=0)

        """4. 将change_trigger按照pull_request_node分组"""
        grouped_timeline = change_trigger_df.groupby((['pullrequest_node']))
        """5. 分析pullrequest_node的change_trigger信息是否完整,整理出需要重新获取的pr信息"""
        re_analyze_prs = [x for x in pr_nodes if x not in change_nodes]
        # for pr, group in grouped_timeline:
        #     if pr not in pr_nodes:
        #         re_analyze_prs.append(pr)
        #     else:
        #         review_comment_trigger = group.loc[(group['comment_type'] == StringKeyUtils.STR_LABEL_REVIEW_COMMENT) & (group['change_trigger'] >= 0)]
        #         if review_comment_trigger is None or review_comment_trigger.empty:
        #             re_analyze_prs.append(pr)
        # Logger.logi("there are {0} prs need to re analyze".format(re_analyze_prs.__len__()))

        """读取PullRequestData,获取pr所对应的作者"""
        pr_data_filename = projectConfig.getPullRequestPath() + os.sep + f'ALL_{repo}_data_pullrequest.tsv'
        pr_data_df = pandasHelper.readTSVFile(fileName=pr_data_filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        """收集pr已经对应的作者  用于后面过滤属于作者评论"""
        pr_author_map = {}
        for index, row in pr_data_df.iterrows():
            pr_author_map[row['node_id']] = row['user_login']


        """设置fetch参数"""
        pos = 0
        fetchLimit = 200
        size = re_analyze_prs.__len__()
        while pos < size:
            Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size))
            sub_re_analyze_prs = re_analyze_prs[pos:pos + fetchLimit]
            """6. 重新获取这些pr的timeline"""
            re_analyze_prs_timeline_df = timeline_df[timeline_df['pullrequest_node'].isin(sub_re_analyze_prs)]
            grouped_timeline = re_analyze_prs_timeline_df.groupby(['pullrequest_node'])
            formated_data = []
            for pr, group in grouped_timeline:
                formated_data.append(group.to_dict(orient='records'))

            """7. 开始分析"""
            pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formated_data, pr_author_map)
            pr_change_trigger_comments = [x for y in pr_change_trigger_comments for x in y]

            """8. 将分析结果去重并追加到change_trigger表中"""
            if pr_change_trigger_comments is not None and pr_change_trigger_comments.__len__() > 0:
                target_content = DataFrame()
                target_content = target_content.append(pr_change_trigger_comments, ignore_index=True)
                target_content = target_content[PR_CHANGE_TRIGGER_COLUMNS].copy(deep=True)
                target_content.drop_duplicates(subset=['pullrequest_node', 'comment_node'], inplace=True, keep='first')
                if not target_content.empty:
                    pandasHelper.writeTSVFile(target_filename, target_content, pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
                                              header=pandasHelper.INT_WRITE_WITHOUT_HEADER)
            Logger.logi("successfully analyzed {0} prs".format(re_analyze_prs.__len__()))
            pos += fetchLimit
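One remark on the re_analyze_prs computation above: membership tests against a plain list are linear, so the comprehension is O(n*m) for large repositories. An equivalent set-based sketch:

# Equivalent to: [x for x in pr_nodes if x not in change_nodes], but with
# O(1) membership tests; sorted() only makes the order deterministic.
re_analyze_prs = sorted(set(pr_nodes) - set(change_nodes))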
    def getPRChangeTriggerData(owner, repo):
        """ 根据
            ALL_{repo}_data_prtimeline.tsv
            获取pr change_trigger数据
        """
        AsyncApiHelper.setRepo(owner, repo)
        """PRTimeLine表头"""
        PR_CHANGE_TRIGGER_COLUMNS = ["pullrequest_node", "user_login", "comment_node",
                                     "comment_type", "change_trigger", "filepath"]
        """初始化目标文件"""
        target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_pr_change_trigger.tsv'
        target_content = DataFrame(columns=PR_CHANGE_TRIGGER_COLUMNS)
        # pandasHelper.writeTSVFile(target_filename, target_content, pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
        #                           header=pandasHelper.INT_WRITE_WITH_HEADER)

        """读取PRTimeline,获取需要分析change_trigger的pr列表"""
        pr_timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
        pr_timeline_df = pandasHelper.readTSVFile(fileName=pr_timeline_filename,
                                                  header=pandasHelper.INT_READ_FILE_WITH_HEAD)

        """读取PullRequestData,获取pr所对应的作者"""
        pr_data_filename = projectConfig.getPullRequestPath() + os.sep + f'ALL_{repo}_data_pullrequest.tsv'
        pr_data_df = pandasHelper.readTSVFile(fileName=pr_data_filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        """收集pr已经对应的作者  用于后面过滤属于作者评论"""
        pr_author_map = {}
        for index, row in pr_data_df.iterrows():
            pr_author_map[row['node_id']] = row['user_login']

        pr_nodes = list(set(list(pr_timeline_df['pullrequest_node'])))
        pr_nodes.sort()
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MjE5MjEzOTc5']  # reopened 3 times
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MjA0MTk5ODkw']
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0NDQwOTAxMzk0']
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MzE1OTU0NDgw']  # review outside the pr
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MTQ3NDczNTIx']  # ordinary case
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0NDM4NjAzMjk2']  # very many reviews
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0Mjg1NzExNTIx']
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MTAxNTUwMTcw']

        """设置fetch参数"""
        pos = 0
        fetchLimit = 400
        size = pr_nodes.__len__()
        Logger.logi("there are {0} prs need to analyze".format(pr_nodes.__len__()))
        t1 = datetime.now()

        while pos < size:
            print("now:", pos, ' total:', size, 'cost time:', datetime.now() - t1)
            Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size))

            """按照爬取限制取子集"""
            sub_prs = pr_nodes[pos:pos + fetchLimit]
            pr_timeline_items = pr_timeline_df[pr_timeline_df['pullrequest_node'].isin(sub_prs)]
            """对子集按照pull_request_node分组"""
            grouped_timeline = pr_timeline_items.groupby((['pullrequest_node']))
            """将分组结果保存为字典{pr->pr_timeline_items}"""
            formated_data = []
            for pr, group in grouped_timeline:
                record = group.to_dict(orient='records')
                record = sorted(record, key=lambda x: int(x.get(StringKeyUtils.STR_KEY_POSITION)))
                formated_data.append(record)

            """分析这些pr的timeline"""
            pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formated_data,
                                                                                             pr_author_map)
            pr_change_trigger_comments = [x for y in pr_change_trigger_comments for x in y]

            """将分析结果去重并追加到change_trigger表中"""
            if pr_change_trigger_comments.__len__() > 0:
                target_content = DataFrame()
                target_content = target_content.append(pr_change_trigger_comments, ignore_index=True)
                target_content = target_content[PR_CHANGE_TRIGGER_COLUMNS].copy(deep=True)
                target_content.drop_duplicates(subset=['pullrequest_node', 'comment_node'], inplace=True, keep='first')
                if not target_content.empty:
                    pandasHelper.writeTSVFile(target_filename, target_content, pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
                                              header=pandasHelper.INT_WRITE_WITHOUT_HEADER)
                Logger.logi("successfully analyzed {0} prs".format(pos))
            pos += fetchLimit
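The pos/fetchLimit loop above is a plain batching pattern. A generic sketch of the same slicing, separated from the fetching logic:

def iter_batches(items, batch_size):
    # Yield successive slices of items, mirroring the pos/fetchLimit loop above.
    for pos in range(0, len(items), batch_size):
        yield items[pos:pos + batch_size]

for batch in iter_batches(list(range(10)), 4):
    print(batch)  # [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]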
Esempio n. 29
0
    def change_trigger_analyser(project):
        df_review = pandasHelper.readTSVFile(f"{project}_comment_1.cvs")
        df_review.columns = [
            "merge_request_id", "reviewer", "id", "change_trigger", "body"
        ]
        df_review.drop_duplicates(subset=['id'], inplace=True, keep="last")
        df_review.sort_values(by='merge_request_id',
                              ascending=False,
                              inplace=True)
        print(df_review.shape)

        df_mr = pandasHelper.readTSVFile(f"mergeRequest.csv",
                                         sep=StringKeyUtils.STR_SPLIT_SEP_CSV)
        df_mr.columns = [
            "id", "number", "state", "merged_at", "created_at", "1", "2", "3",
            "4"
        ]
        """日期修补"""
        for index, row in df_mr.iterrows():
            if row["created_at"] is None:
                row["created_at"] = row["merged_at"]

        df_mr = df_mr[["number", "created_at"]].copy(deep=True)
        df_mr["number"] = df_mr["number"].apply(lambda x: int(x))
        df_mr.drop_duplicates(subset=['number'], inplace=True)

        print(df_mr.shape)

        x = range(-2, 11)
        y = []
        for i in x:
            y.append(df_review.loc[df_review['change_trigger'] == i].shape[0])
        plt.bar(x=x, height=y)
        plt.title(f'review comment({project})')
        for a, b in zip(x, y):
            plt.text(a, b, '%.0f' % b, ha='center', va='bottom', fontsize=11)

        print(
            "review comment useful:", df_review.shape[0] -
            df_review.loc[df_review['change_trigger'] == -1].shape[0])
        plt.show()

        data = pandas.merge(left=df_review,
                            right=df_mr,
                            left_on="merge_request_id",
                            right_on="number")
        data['label'] = data["created_at"].apply(
            lambda x: (time.strptime(x, "%Y-%m-%dT%H:%M:%SZ")))
        data['label_y'] = data['label'].apply(lambda x: x.tm_year)
        data['label_m'] = data['label'].apply(lambda x: x.tm_mon)

        data = data.loc[data["change_trigger"] != -2].copy(deep=True)

        pandasHelper.writeTSVFile("comment.csv", df_review)
        """按照每个人分类"""
        groups = dict(list(data.groupby('reviewer')))
        # 获取目标语料(即经过自定义分词后的语料)

        date = (2019, 5, 2020, 6)

        columns = ["reviewer"]
        for i in range(date[0] * 12 + date[1],
                       date[2] * 12 + date[3] + 1):  # iterate month by month over the window
            y = int((i - i % 12) / 12)
            m = i % 12
            if m == 0:
                m = 12
                y = y - 1
            columns.append(str(f"{y}年{m}月"))

        ratio_df = DataFrame(columns=columns)

        # reviewer_list = ["bidinger", "mbouaziz", "raphael-proust", "romain.nl", "vect0r", "rafoo_"]
        reviewer_list = []
        for reviewer, temp_df in groups.items():
            print(reviewer, temp_df.shape[0])
            if reviewer not in reviewer_list:
                tempDict = {"reviewer": reviewer}
                for i in range(date[0] * 12 + date[1],
                               date[2] * 12 + date[3] + 1):  # iterate month by month over the window
                    y = int((i - i % 12) / 12)
                    m = i % 12
                    if m == 0:
                        m = 12
                        y = y - 1

                    df = temp_df.loc[(temp_df['label_y'] == y)
                                     & (temp_df['label_m'] == m)].copy(
                                         deep=True)
                    total = df.shape[0]
                    if total == 0:
                        pass
                        # tempDict[f'{y}-{m}'] = 0
                    else:
                        valid = df.loc[df['change_trigger'] >= 0].shape[0]
                        tempDict[f'{y}-{m}'] = valid / total
                ratio_df = ratio_df.append(tempDict, ignore_index=True)

        print(ratio_df.shape)
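The per-reviewer/per-month loops above can also be expressed with a single groupby. A vectorized sketch, under the assumption that data carries the reviewer, label_y, label_m and change_trigger columns built earlier:

import pandas

def accept_ratio_table(data):
    # Share of comments with change_trigger >= 0 per (reviewer, year, month).
    grouped = data.groupby(['reviewer', 'label_y', 'label_m'])['change_trigger']
    ratio = grouped.apply(lambda s: (s >= 0).mean())
    return ratio.unstack(['label_y', 'label_m'])

demo = pandas.DataFrame({
    'reviewer': ['a', 'a', 'b'],
    'label_y': [2019, 2019, 2019],
    'label_m': [5, 5, 6],
    'change_trigger': [1, -1, 0],
})
print(accept_ratio_table(demo))  # a -> 0.5 for 2019/5, b -> 1.0 for 2019/6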
Esempio n. 30
0
    def preProcess(df, date, project, isSTD=False, isNOR=False, m=3):
        """参数说明
        df:读取的dataframe对象
        testDate:作为测试的年月 (year,month)
        isSTD:对数据是否标准化
        isNOR:对数据是否归一化
        m: 超参数,窗口时间
        """
        print("start df shape:", df.shape)
        """过滤NA的数据"""
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
        df['label_y'] = df['pr_created_at'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year)
        df['label_m'] = df['pr_created_at'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon)
        df.reset_index(drop=True, inplace=True)

        """更正说明:由于PCA不能训练集和测试集同时降维,否则相当于使用了后面的信息
            所以添加之前必须两者分别处理 4.13 
            append 函数必须在表明label后面使用"""

        """添加File Path Features"""
        df = appendFilePathFeatureVector(df, project, date, 'pr_number')


        """读取User Follow的信息"""
        user_follow_relation_path = projectConfig.getUserFollowRelation()
        userFollowRelation = pandasHelper.readTSVFile(
            os.path.join(user_follow_relation_path, f'userFollowRelation.tsv'),
            pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
        )

        def isInTimeGap(x, m, maxYear, maxMonth):
            d = x['label_y'] * 12 + x['label_m']
            d2 = maxYear * 12 + maxMonth
            return d >= d2 - m

        """对人名字做数字处理"""
        """频率不过的评审者在编号之前就已经过滤了,不用考虑分类不连续的情况"""
        """这里reviewer_user_login 放在 第一个否则会影响candicateNum这个变量在后面的引用"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'pr_user_login'])

        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        """计算contributor set"""
        contribute_list = list(set(df.loc[df['label'] == 1]['pr_user_login']))
        reviewer_list = list(set(df.loc[df['label'] == 0]['review_user_login']))

        """添加Relation ship Features"""
        """对 train set和test set的处理方式稍微不同   train set数据统计依照之前pr
            而训练集的统计数据只限制于trianset
        """

        """把  df 的pr_created_at 和 comment_at 转化为时间戳"""
        df['pr_created_at'] = df['pr_created_at'].apply(
            lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
        df['comment_at'] = df['comment_at'].apply(lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
        df['response_time'] = df['comment_at'] - df['pr_created_at']

        """Prior Evaluation  reviewer cm 之前 review co的次数
           Recent Evaluation reviewer cm 在 m 个月 reivew co的次数
           Follow Relation  co 是否follow cm
           Follower Relation  cm 是否follow co
        """
        startTime = datetime.now()
        prior_evaluation = {}
        recent_evaluation = {}
        follower_relation = {}
        following_relation = {}
        followMap = {}
        for k in convertDict.keys():
            """获取 reviewer 的 follow 列表"""
            follower_list = list(set(userFollowRelation.loc[userFollowRelation['login'] == k]['following_login']))
            for f in follower_list:
                if f in convertDict.keys():
                    followMap[(convertDict[k], convertDict[f])] = 1

        for reviewer in reviewer_list:
            prior_evaluation[reviewer] = []
            recent_evaluation[reviewer] = []
            follower_relation[reviewer] = []
            following_relation[reviewer] = []
        cols = list(df.columns)

        for data in df.itertuples(index=False, name='Pandas'):
            if len(data) < 14:
                pullNumber = getattr(data, 'pr_number')
                author = getattr(data, 'pr_user_login')
                label = getattr(data, 'label')
                label_m = getattr(data, 'label_m')
                label_y = getattr(data, 'label_y')
            else:
                pullNumber = data[cols.index("pr_number")]
                author = data[cols.index("pr_user_login")]
                label = data[cols.index("label")]
                label_m = data[cols.index("label_m")]
                label_y = data[cols.index("label_y")]

            temp = None
            if label == 0:
                temp = df.loc[df['pr_number'] < pullNumber]
            else:
                temp = df.loc[df['label'] == 0]
            temp = temp.loc[temp['pr_user_login'] == author].copy(deep=True)
            """依次遍历每个候选者统计"""
            prior_evaluation_dict = dict(temp['review_user_login'].value_counts())
            for r in reviewer_list:
                prior_evaluation[r].append(prior_evaluation_dict.get(r, 0))
            """temp 二次过滤  选m个月以内的"""
            if temp.shape[0] > 0:
                if label == 0:
                    temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, label_y, label_m), axis=1)
                else:
                    temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, date[2], date[3]), axis=1)
                temp = temp.loc[temp['target'] == 1]
            """依次遍历每个候选者统计"""
            recent_evaluation_dict = dict(temp['review_user_login'].value_counts())
            for r in reviewer_list:
                recent_evaluation[r].append(recent_evaluation_dict.get(r, 0))
            """添加 follow 和 following 信息"""
            for r in reviewer_list:
                follower_relation[r].append(followMap.get((author, r), 0))
                following_relation[r].append(followMap.get((r, author), 0))

        """添加"""
        for r in reviewer_list:
            df[f'prior_evaluation_{r}'] = prior_evaluation[r]
            df[f'recent_evaluation_{r}'] = recent_evaluation[r]
            df[f'follower_relation_{r}'] = follower_relation[r]
            df[f'following_relation_{r}'] = following_relation[r]

        print("prior cost time:", datetime.now() - startTime)
        startTime = datetime.now()

        # start time: one day before the data set's start
        start_time = time.strptime(str(date[0]) + "-" + str(date[1]) + "-" + "01 00:00:00", "%Y-%m-%d %H:%M:%S")
        start_time = int(time.mktime(start_time) - 86400)
        # end time: the last day of the data set
        end_time = time.strptime(str(date[2]) + "-" + str(date[3]) + "-" + "01 00:00:00", "%Y-%m-%d %H:%M:%S")
        end_time = int(time.mktime(end_time) - 1)

        """Activeness Feature 添加"""
        total_pulls = {}  # 项目有的所有pr
        evaluate_pulls = {}  # co 之前review的数量
        recent_pulls = {}  # co 最近m月 review的数量
        evaluate_time = {}  # co 平均回应时间
        last_time = {}  # co 最后一次reivew 的时间间隔
        first_time = {}  # co 第一次review的时间间隔
        for reviewer in reviewer_list:
            total_pulls[reviewer] = []
            evaluate_pulls[reviewer] = []
            recent_pulls[reviewer] = []
            evaluate_time[reviewer] = []
            last_time[reviewer] = []
            first_time[reviewer] = []
        count = 0
        cols = list(df.columns)

        index_pr_number = cols.index("pr_number")
        index_pr_label = cols.index("label")
        index_pr_label_m = cols.index("label_m")
        index_pr_label_y = cols.index("label_y")

        for data in df.itertuples(index=False):
            print("count for active:", count)
            count += 1
            pullNumber = data[index_pr_number]
            label = data[index_pr_label]
            label_m = data[index_pr_label_m]
            label_y = data[index_pr_label_y]
            temp = None
            if label == 0:
                temp = df.loc[df['pr_number'] < pullNumber].copy(deep=True)
            else:
                temp = df.loc[df['label'] == 0].copy(deep=True)
            """依次遍历每个候选者统计"""
            total_pull_number = list(set(temp['pr_number'])).__len__()
            res_reviewer_list = reviewer_list.copy()

            groups = dict(list(temp.groupby('review_user_login')))
            """先遍历有tempDf的reviewer"""
            for r, tempDf in groups.items():
                total_pulls[r].append(total_pull_number)
                res_reviewer_list.remove(r)
                if tempDf.shape[0] == 0:
                    """没有历史 认为age=0, 间隔是最大间隔"""
                    first_time[r].append(0)
                    last_time[r].append(end_time - start_time)
                else:
                    pr_created_time_list = list(tempDf['pr_created_at'])
                    first_review_time = min(pr_created_time_list)
                    last_review_time = max(pr_created_time_list)
                    first_time[r].append(end_time - first_review_time)
                    last_time[r].append(end_time - last_review_time)
                evaluate_pulls[r].append(tempDf.shape[0])

                """平均回应时间统计"""
                if tempDf.shape[0] > 0:
                    evaluate_avg = sum(tempDf['response_time'])
                    evaluate_avg /= tempDf.shape[0]
                else:
                    evaluate_avg = end_time - start_time
                evaluate_time[r].append(evaluate_avg)

            for r in res_reviewer_list:
                total_pulls[r].append(total_pull_number)
                evaluate_pulls[r].append(0)
                first_time[r].append(0)
                last_time[r].append(end_time - start_time)
                evaluate_avg = end_time - start_time
                evaluate_time[r].append(evaluate_avg)
                # recent_pulls[r].append(0)

            """过滤k个月 重新计算"""
            if label == 0:
                if temp.shape[0] > 0:
                    temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, label_y, label_m), axis=1)
                    temp = temp.loc[temp['target'] == 1]
            else:
                if temp.shape[0] > 0:
                    temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, date[2], date[3]), axis=1)
                    temp = temp.loc[temp['target'] == 1]

            res_reviewer_list = reviewer_list.copy()
            groups = dict(list(temp.groupby('review_user_login')))
            """先遍历有tempDf的reviewer"""
            for r, tempDf in groups.items():
                recent_pulls[r].append(tempDf.shape[0])
                res_reviewer_list.remove(r)

            for r in res_reviewer_list:
                recent_pulls[r].append(0)

        """Activeness Feature增加到 dataframe"""
        for r in reviewer_list:
            df[f'total_pulls_{r}'] = total_pulls[r]
            df[f'evaluate_pulls_{r}'] = evaluate_pulls[r]
            df[f'recent_pulls_{r}'] = recent_pulls[r]
            df[f'first_time_{r}'] = first_time[r]
            df[f'last_time_{r}'] = last_time[r]
            df[f'evaluate_time_{r}'] = evaluate_time[r]

        print("active cost time:", datetime.now() - startTime)

        tagDict = dict(list(df.groupby('pr_number')))

        """对已经有的特征向量和标签做训练集的拆分"""
        train_data = df.loc[df['label'] == False].copy(deep=True)
        test_data = df.loc[df['label']].copy(deep=True)

        train_data.drop(columns=['label'], inplace=True)
        test_data.drop(columns=['label'], inplace=True)

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """
        train_data_y = {}
        pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            train_data_y[pull_number] = reviewers

        train_data.drop(columns=['review_user_login'], inplace=True)
        train_data.drop_duplicates(inplace=True)
        train_data.drop_duplicates(subset=['pr_number'], inplace=True)
        """训练集 结果做出多标签分类通用的模式"""
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        test_data_y = {}
        pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        test_data.drop(columns=['review_user_login'], inplace=True)
        test_data.drop_duplicates(inplace=True)
        test_data.drop_duplicates(subset=['pr_number'], inplace=True)
        # test_data_y = DataProcessUtils.convertLabelListToDataFrame(test_data_y, pull_number_list, candicateNum)
        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        """获得pr list"""
        prList = list(test_data['pr_number'])

        """去除pr number"""
        test_data.drop(columns=['pr_number'], inplace=True)
        train_data.drop(columns=['pr_number'], inplace=True)

        test_data.drop(columns=['pr_created_at', 'pr_user_login',
                                'comment_at', 'label_y', 'label_m', 'response_time'], inplace=True)
        train_data.drop(columns=['pr_created_at', 'pr_user_login',
                                 'comment_at', 'label_y', 'label_m', 'response_time'], inplace=True)
        """参数规范化"""
        if isSTD:
            stdsc = StandardScaler()
            train_data_std = stdsc.fit_transform(train_data)
            test_data_std = stdsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        elif isNOR:
            maxminsc = MinMaxScaler()
            train_data_std = maxminsc.fit_transform(train_data)
            test_data_std = maxminsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList
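As in the earlier preProcess, the isSTD/isNOR branches fit the scaler on the training split and reuse it unchanged on the test split. A minimal sketch with synthetic numbers:

import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
test = np.array([[2.5, 25.0]])

stdsc = StandardScaler()
train_std = stdsc.fit_transform(train)  # mean/std estimated on the training rows only
test_std = stdsc.transform(test)        # the same affine map applied to the test rows

maxminsc = MinMaxScaler()
train_nor = maxminsc.fit_transform(train)
test_nor = maxminsc.transform(test)
print(test_std, test_nor)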