Example #1
    def preProcessByIncrement(df, dates):
        """参数说明
            df:读取的dataframe对象
            dates:四元组,后两位作为测试的年月 (,,year,month)
           """

        """注意: 输入文件中已经带有列名了"""

        """处理NAN"""
        df.dropna(how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pr_number')))

        print("before drop:", df.shape)
        df = df.copy(deep=True)
        df.drop(columns=['review_user_login'], inplace=True)
        df.drop_duplicates(['pr_number'], inplace=True)
        print("after drop:", df.shape)

        test_data = df

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """
        test_data_y = {}
        for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        return test_data, test_data_y, convertDict
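
A minimal, self-contained sketch of the ground-truth construction above: group once by pr_number, then collect the distinct reviewers per pull request. The toy dataframe values are hypothetical; only the pr_number/review_user_login column names come from the example.

    import pandas as pd

    df = pd.DataFrame({
        'pr_number': [1, 1, 2],
        'review_user_login': ['alice', 'bob', 'alice'],  # hypothetical values
    })

    # Mirror of the tagDict / test_data_y construction above.
    tagDict = dict(list(df.groupby('pr_number')))
    test_data_y = {
        pr: list(group['review_user_login'].drop_duplicates())
        for pr, group in tagDict.items()
    }
    print(test_data_y)  # {1: ['alice', 'bob'], 2: ['alice']}
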
Example #2
    def preProcessBySlide(df, dates):
        """参数说明
            df:读取的dataframe对象
            dates:四元组,后两位作为测试的年月 (,,year,month)
           """

        """注意: 输入文件中已经带有列名了"""

        """处理NAN"""
        df.dropna(how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))
        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])

        """时间转为时间戳"""
        df['test'] = df['pr_created_at']
        df['pr_created_at'] = df['pr_created_at'].apply(
            lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pull_number')))

        print("before drop:", df.shape)
        df = df.copy(deep=True)
        df.drop(columns=['review_user_login', 'repo_full_name'], inplace=True)
        df.drop_duplicates(['pull_number', 'commit_sha', 'file_filename'], inplace=True)
        print("after drop:", df.shape)

        """对已经有的特征向量和标签做训练集的拆分"""
        train_data = df.loc[df['label'] == False].copy(deep=True)
        test_data = df.loc[df['label']].copy(deep=True)

        train_data.drop(columns=['label'], inplace=True)
        test_data.drop(columns=['label'], inplace=True)

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """

        train_data_y = {}
        for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            train_data_y[pull_number] = reviewers

        test_data_y = {}
        for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        return train_data, train_data_y, test_data, test_data_y, convertDict
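
The train/test split above hinges on the boolean label column. A minimal sketch of just that step, with a hypothetical dates tuple and two sample timestamps:

    import time
    import pandas as pd

    dates = (2019, 1, 2019, 4)  # hypothetical four-tuple; the last two entries mark the test month
    df = pd.DataFrame({'pr_created_at': ['2019-03-01 12:00:00', '2019-04-02 08:30:00']})

    def is_test_month(ts):
        t = time.strptime(ts, "%Y-%m-%d %H:%M:%S")
        return t.tm_year == dates[2] and t.tm_mon == dates[3]

    df['label'] = df['pr_created_at'].apply(is_test_month)
    train = df.loc[~df['label']]  # everything outside the test month
    test = df.loc[df['label']]    # exactly the (year, month) pair
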
Example #3
    def preProcess(df, dates):
        """参数说明
                    df:读取的dataframe对象
                    dates:四元组,后两位作为测试的年月 (,,year,month)
                   """

        """注意: 输入文件中已经带有列名了"""

        """空comment的review包含na信息,但作为结果集是有用的,所以只对训练集去掉na"""
        # """处理NAN"""
        # df.dropna(how='any', inplace=True)
        # df.reset_index(drop=True, inplace=True)
        # df.fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))
        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['pr_author', 'reviewer'])

        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pull_number')))

        """对已经有的特征向量和标签做训练集的拆分"""
        train_data = df.loc[df['label'] == False].copy(deep=True)
        test_data = df.loc[df['label']].copy(deep=True)

        train_data.drop(columns=['label'], inplace=True)
        test_data.drop(columns=['label'], inplace=True)

        """8ii处理NAN"""
        train_data.dropna(how='any', inplace=True)
        train_data.reset_index(drop=True, inplace=True)
        train_data.fillna(value='', inplace=True)

        """过滤掉评论时间在数据集时间范围内之后的数据"""
        # 结束时间:数据集pr最晚的创建时间
        pr_created_time_data = train_data['pr_created_at']
        end_time = max(pr_created_time_data.to_list())
        train_data = train_data[train_data['comment_at'] <= end_time]
        train_data.reset_index(drop=True, inplace=True)

        test_data_y = {}
        for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
            test_data_y[pull_number] = reviewers

        train_data_y = {}
        for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
            train_data_y[pull_number] = reviewers

        return train_data, train_data_y, test_data, test_data_y, convertDict
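
The comment_at filter above drops any comment that happens after the latest pr creation time, so the training set never sees responses from beyond its own window. A small sketch with hypothetical timestamps (plain string comparison works because the "%Y-%m-%d %H:%M:%S" format sorts lexicographically):

    import pandas as pd

    train_data = pd.DataFrame({
        'pr_created_at': ['2019-03-01 10:00:00', '2019-03-05 10:00:00'],
        'comment_at':    ['2019-03-02 10:00:00', '2019-06-01 10:00:00'],
    })

    end_time = max(train_data['pr_created_at'].to_list())  # latest pr creation time
    train_data = train_data[train_data['comment_at'] <= end_time]
    train_data.reset_index(drop=True, inplace=True)  # keeps only the first row
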
Example #4
    def preProcessByIncrement(df, dates):
        """参数说明
            df:读取的dataframe对象
            dates:四元组,时间跨度相当于都是测试集, 没有作用
        """

        """注意: 输入文件中已经带有列名了"""

        """处理NAN"""
        df.dropna(how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pull_number')))

        """时间转为时间戳"""
        df['pr_created_at'] = df['pr_created_at'].apply(
            lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))

        print("before drop:", df.shape)
        df = df.copy(deep=True)
        df.drop(columns=['review_user_login', 'repo_full_name'], inplace=True)
        df.drop_duplicates(['pull_number', 'commit_sha', 'file_filename'], inplace=True)
        print("after drop:", df.shape)

        test_data = df
        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """

        test_data_y = {}
        for pull_number in df.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        return test_data, test_data_y, convertDict
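
The timestamp conversion above is the standard time.strptime/time.mktime pairing; note that mktime interprets the struct_time in local time. A one-column sketch:

    import time
    import pandas as pd

    df = pd.DataFrame({'pr_created_at': ['2019-04-02 08:30:00']})
    df['pr_created_at'] = df['pr_created_at'].apply(
        lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
    print(df['pr_created_at'][0])  # seconds since the epoch, as a float
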
Example #5
    def preProcessBySlide(df, dates):
        """参数说明
         df:读取的dataframe对象
         dates:作为测试的年月四元组
        """
        """注意: 输入文件中已经带有列名了"""

        """处理NAN"""
        df.dropna(how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

        """创建时间转化为时间戳"""
        df['pr_created_at'] = df['pr_created_at'].apply(lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
        df['pr_created_at'] = df['pr_created_at'] / (24 * 3600)

        """先对输入数据做精简 只留下感兴趣的数据"""
        df = df[['pr_number', 'pr_title', 'review_user_login', 'label', 'pr_created_at']].copy(deep=True)

        print("before filter:", df.shape)
        df.drop_duplicates(inplace=True)
        print("after filter:", df.shape)
        """对人名字做数字处理"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pr_number')))
        """先尝试所有信息团在一起"""
        df = df[['pr_number', 'pr_title', 'label', 'pr_created_at']].copy(deep=True)
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        textList = []
        for row in df.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

            """初步尝试提取词干效果反而下降了 。。。。"""

            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)
            textList.append(tempList)

        print(len(textList))
        """对分词列表建立字典 并提取特征数"""
        dictionary = corpora.Dictionary(textList)
        print('Dictionary:', dictionary)

        feature_cnt = len(dictionary.token2id)
        print("词典特征数:", feature_cnt)

        """根据词典建立语料库"""
        corpus = [dictionary.doc2bow(text) for text in textList]
        # print('语料库:', corpus)
        """语料库训练TF-IDF模型"""
        tfidf = models.TfidfModel(corpus)

        """再次遍历数据,形成向量,向量是稀疏矩阵的形式"""
        wordVectors = []
        for i in range(0, df.shape[0]):
            wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

        """对已经有的本文特征向量和标签做训练集和测试集的拆分"""

        trainData_index = df.loc[df['label'] == False].index
        testData_index = df.loc[df['label'] == True].index

        """训练集"""
        train_data = [wordVectors[x] for x in trainData_index]
        """测试集"""
        test_data = [wordVectors[x] for x in testData_index]
        """填充为向量"""
        train_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
        test_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)
        train_data['pr_number'] = list(df.loc[df['label'] == False]['pr_number'])
        test_data['pr_number'] = list(df.loc[df['label'] == True]['pr_number'])
        train_data['pr_created_at'] = list(df.loc[df['label'] == False]['pr_created_at'])
        test_data['pr_created_at'] = list(df.loc[df['label'] == True]['pr_created_at'])

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """

        train_data_y = {}
        for pull_number in df.loc[df['label'] == False]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            train_data_y[pull_number] = reviewers

        test_data_y = {}
        for pull_number in df.loc[df['label'] == True]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        """train_data ,test_data 最后一列是pr number test_data_y 的形式是dict"""
        return train_data, train_data_y, test_data, test_data_y, convertDict
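
The gensim pipeline above (Dictionary -> doc2bow -> TfidfModel) yields one sparse {token_id: weight} dict per document. A minimal sketch with two hypothetical token lists:

    from gensim import corpora, models

    texts = [['fix', 'bug'], ['add', 'feature', 'fix']]  # hypothetical tokenized titles
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]
    tfidf = models.TfidfModel(corpus)
    wordVectors = [dict(tfidf[bow]) for bow in corpus]  # {token_id: weight} per document
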
Example #6
    def preProcess(df, dates):
        """参数说明
            df:读取的dataframe对象
            dates:四元组,后两位作为测试的年月 (,,year,month)
           """

        """注意: 输入文件中已经带有列名了"""

        t1 = datetime.now()

        """处理NAN"""
        df.dropna(how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))
        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pr_number')))

        """先尝试所有信息团在一起"""
        df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True)
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        textList = []
        """由于特殊性  PB算法的训练集不是dataFrame
           { p1:set1, p2:set2, ... }
        """
        train_data = {}
        test_data = {}
        for row in df.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的number"""
            pr_num = getattr(row, 'pr_number')
            label = getattr(row, 'label')

            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

            """初步尝试提取词干效果反而下降了 。。。。"""

            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)

            """pull request的body"""
            pr_body = getattr(row, 'pr_body')
            pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
            """对单词做提取词干"""
            pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
            tempList.extend(pr_body_word_list)

            wordSet = MultisetHelper.WordMultiset()
            wordSet.add(tempList)

            if label == 0:
                train_data[pr_num] = wordSet
            else:
                test_data[pr_num] = wordSet

        print("train size:", train_data.items().__len__())
        print("test size:", test_data.items().__len__())

        """问题转化为多标签问题
            train_data_y   [{pull_number:[(r1, s1), (r2, s2), ...]}, ... ,{}]
            
            r 代表reviewer
            s 代表集合
        """

        train_data_y = {}
        for pull_number in df.loc[df['label'] == False]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            tempDf = tagDict[pull_number][['review_user_login', 'comment_body']].copy(deep=True)
            commentDict = dict(list(tempDf.groupby('review_user_login')))
            reviewerList = []
            for reviewer in reviewers:
                commentDf = commentDict[reviewer]
                wordSet = MultisetHelper.WordMultiset()
                for row in commentDf.itertuples(index=False, name='Pandas'):
                    comment = getattr(row, 'comment_body')
                    comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment) if x not in stopwords]
                    """对单词做提取词干"""
                    comment_body_word_list = nltkFunction.stemList(comment_body_word_list)
                    wordSet.add(comment_body_word_list)
                reviewerList.append((reviewer, wordSet))
            train_data_y[pull_number] = reviewerList

        test_data_y = {}
        for pull_number in df.loc[df['label'] == True]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            tempDf = tagDict[pull_number][['review_user_login', 'comment_body']].copy(deep=True)
            commentDict = dict(list(tempDf.groupby('review_user_login')))
            reviewerList = []
            for reviewer in reviewers:
                commentDf = commentDict[reviewer]
                wordSet = MultisetHelper.WordMultiset()
                for row in commentDf.itertuples(index=False, name='Pandas'):
                    comment = getattr(row, 'comment_body')
                    comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment) if x not in stopwords]
                    """对单词做提取词干"""
                    comment_body_word_list = nltkFunction.stemList(comment_body_word_list)
                    wordSet.add(comment_body_word_list)
                reviewerList.append((reviewer, wordSet))
            test_data_y[pull_number] = reviewerList

        print("preprocess cost time:", datetime.now() - t1)
        return train_data, train_data_y, test_data, test_data_y, convertDict
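
FleshReadableUtils.word_list and nltkFunction.stemList are project helpers whose internals are not shown. A rough stand-in for the tokenize/filter/stem chain using plain nltk (the regex tokenizer and PorterStemmer here are assumptions, not the helpers' actual implementation):

    import re
    from nltk.stem import PorterStemmer
    from nltk.corpus import stopwords  # requires nltk.download('stopwords') once

    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))

    def tokenize_and_stem(text):
        # Lowercase, split into alphabetic tokens, drop stop words, then stem.
        words = re.findall(r"[A-Za-z]+", text.lower())
        return [stemmer.stem(w) for w in words if w not in stop]

    print(tokenize_and_stem("Fixes the failing tests in the parser"))
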
Example #7
    def preProcess(df, dates):
        """参数说明
                    df:读取的dataframe对象
                    dates:四元组,后两位作为测试的年月 (,,year,month)
                   """

        """注意: 输入文件中已经带有列名了"""

        """空comment的review包含na信息,但作为结果集是有用的,所以只对训练集去掉na"""
        # """处理NAN"""
        # df.dropna(how='any', inplace=True)
        # df.reset_index(drop=True, inplace=True)
        df['pr_title'].fillna(value='', inplace=True)
        df['pr_body'].fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))
        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['pr_author', 'reviewer'])

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        """问题:lsi的过程不能在整个数据集上面做,不然会导致pr的文本重复问题"""
        df_pr = df.copy(deep=True)
        df_pr.drop_duplicates(subset=['pull_number'], keep='first', inplace=True)
        df_pr.reset_index(drop=True, inplace=True)

        # Track the word count per pr; prs with fewer than 10 words are dropped outright
        df_pr_word_count = []

        textList = []
        for row in df_pr.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

            """初步尝试提取词干效果反而下降了 。。。。"""

            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)

            """pull request的body"""
            pr_body = getattr(row, 'pr_body')
            pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
            """对单词做提取词干"""
            pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
            tempList.extend(pr_body_word_list)
            if len(tempList) >= 10 or getattr(row, 'label'):
                textList.append(tempList)
            if getattr(row, 'label'):
                df_pr_word_count.append(10)  # so test rows pass the later filter
            else:
                df_pr_word_count.append(len(tempList))

        """去除无用的训练pr"""
        df_pr['count'] = df_pr_word_count
        df_pr = df_pr.loc[df_pr['count'] >= 10].copy(deep=True)
        df_pr.reset_index(drop=True, inplace=True)
        df_pr.drop(['count'], inplace=True, axis=1)

        """保存只有pr的列表"""
        prList = list(df_pr['pull_number'])

        """对已经有的本文特征向量和标签做训练集和测试集的拆分"""
        trainData_index = df_pr.loc[df_pr['label'] == False].index
        testData_index = df_pr.loc[df_pr['label'] == True].index

        trainDataTextList = [textList[x] for x in trainData_index]
        testDataTextList = [textList[x] for x in testData_index]

        print(len(textList))
        """Build a dictionary over the token lists and take the feature count"""
        dictionary = corpora.Dictionary(trainDataTextList)
        print('Dictionary:', dictionary)

        """感觉有问题,tfidf模型不应该是在全数据集上面计算,而是在训练集上面计算,而测试集的向量就是
        单纯的带入模型的计算结果"""

        """根据词典建立语料库"""
        corpus = [dictionary.doc2bow(text) for text in trainDataTextList]
        # print('语料库:', corpus)
        """语料库训练TF-IDF模型"""
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]

        topic_num = 10
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_num)
        topic_list = lsi.print_topics()
        print("{0}个主题的单词分布为:\n".format(topic_num))
        for topic in topic_list:
            print(topic)

        """再次遍历数据,形成向量,向量是稀疏矩阵的形式"""
        wordVectors = []
        for i in range(len(trainDataTextList)):
            wordVectors.append(dict(lsi[dictionary.doc2bow(trainDataTextList[i])]))
        for i in range(len(testDataTextList)):
            wordVectors.append(dict(lsi[dictionary.doc2bow(testDataTextList[i])]))

        """训练集"""
        train_data = [wordVectors[x] for x in trainData_index]
        """测试集"""
        test_data = [wordVectors[x] for x in testData_index]
        """填充为向量"""
        train_v_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=topic_num)
        test_v_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=topic_num)

        lsi_data = pandas.concat([train_v_data, test_v_data], axis=0)  # concatenate along axis 0
        lsi_data['pull_number'] = prList
        lsi_data.reset_index(inplace=True, drop=True)

        train_data = df.loc[df['label'] == False]
        train_data.reset_index(drop=True, inplace=True)
        test_data = df.loc[df['label'] == True]
        test_data.reset_index(drop=True, inplace=True)

        train_data = train_data.merge(lsi_data, on="pull_number")
        train_data.drop(columns=['label'], inplace=True)

        test_data = test_data.merge(lsi_data, on="pull_number")
        test_data.drop(columns=['label'], inplace=True)

        """8ii处理NAN"""
        train_data.dropna(how='any', inplace=True)
        train_data.reset_index(drop=True, inplace=True)
        train_data.fillna(value='', inplace=True)

        """先对tag做拆分"""
        trainDict = dict(list(train_data.groupby('pull_number')))
        testDict = dict(list(test_data.groupby('pull_number')))

        test_data_y = {}
        for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(testDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
            test_data_y[pull_number] = reviewers

        train_data_y = {}
        for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(trainDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
            train_data_y[pull_number] = reviewers

        return train_data, train_data_y, test_data, test_data_y, convertDict
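
The key fix this example encodes is fitting the Dictionary, TfidfModel, and LsiModel on training documents only, then folding any document (train or test) into the topic space via doc2bow. A minimal sketch with hypothetical token lists:

    from gensim import corpora, models

    train_texts = [['fix', 'parser', 'bug'], ['add', 'parser', 'feature'], ['update', 'docs']]
    dictionary = corpora.Dictionary(train_texts)
    corpus = [dictionary.doc2bow(t) for t in train_texts]
    tfidf = models.TfidfModel(corpus)
    lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2)

    # A test document is simply projected; it never influences the fitted model.
    test_vec = dict(lsi[dictionary.doc2bow(['parser', 'fix'])])
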
Example #8
    def preProcess(df, date, project, isSTD=False, isNOR=False):
        """参数说明
        df:读取的dataframe对象
        testDate:作为测试的年月 (year,month)
        isSTD:对数据是否标准化
        isNOR:对数据是否归一化
        """
        print("start df shape:", df.shape)
        """过滤NA的数据"""
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
        df.reset_index(drop=True, inplace=True)

        """对人名字做数字处理"""
        """频率不过的评审者在编号之前就已经过滤了,不用考虑分类不连续的情况"""
        """这里reviewer_user_login 放在 第一个否则会影响candicateNum这个变量在后面的引用"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'author_user_login'])
        recoverDict = {v: k for k, v in convertDict.items()}

        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        """先对输入数据做精简 只留下感兴趣的数据"""
        df = df[['pr_number', 'review_user_login', 'author_user_login', 'author_association', 'commits',
                 'deletions', 'additions', 'changed_files', 'label', 'merged']].copy(deep=True)

        print("before filter:", df.shape)
        df.drop_duplicates(inplace=True)
        print("after filter:", df.shape)

        """计算作者的关系"""
        df['author_association'] = df['author_association'].apply(lambda x: x == 'MEMBER')

        """计算累积的历史数据"""
        request_number_prs = []  # 作者之前发出的数量
        request_number_merged_prs = []  # 作者发出的被接受的数量
        request_number_rejected_prs = []  # 作者发出被拒绝的数量
        request_accept_rate = []  # 作者pr被接受的概率
        request_reject_rate = []  # 作者pr被拒绝的概率

        for row in df.itertuples():
            pr_num = getattr(row, 'pr_number')
            author = getattr(row, 'author_user_login')
            """过滤历史的pr"""
            temp_df = df.loc[(df['pr_number'] < pr_num)&(df['author_user_login'] == author)]
            request_number_prs.append(temp_df.shape[0])
            accept_times = temp_df.loc[temp_df['merged'] == 1].shape[0]
            request_number_merged_prs.append(accept_times)
            request_number_rejected_prs.append(temp_df.shape[0] - accept_times)
            if temp_df.shape[0] > 0:
                request_accept_rate.append(accept_times/temp_df.shape[0])
                request_reject_rate.append(1 - accept_times / temp_df.shape[0])
            else:
                request_accept_rate.append(0)
                request_reject_rate.append(0)

        df['request_number_prs'] = request_number_prs
        df['request_number_merged_prs'] = request_number_merged_prs
        df['request_number_rejected_prs'] = request_number_rejected_prs
        df['request_accept_rate'] = request_accept_rate
        df['request_reject_rate'] = request_reject_rate

        """添加作者是否关注项目"""
        user_watch_repo_relation_path = projectConfig.getUserWatchRepoRelation()
        userWatchRepoRelation = pandasHelper.readTSVFile(
            os.path.join(user_watch_repo_relation_path, f'userWatchRepoRelation.tsv'),
            pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
        )
        watchRepoMap = {}
        for k in convertDict.keys():
            """获取 reviewer 的 follow 列表"""
            following_list = list(set(userWatchRepoRelation.loc[userWatchRepoRelation['login'] == k]['repo_full_name']))
            isFollow = False
            for repo in following_list:
                owner, name = repo.split('/')
                if name == project:
                    isFollow = True
            watchRepoMap[convertDict[k]] = isFollow

        request_watches = []
        for row in df.itertuples():
            author = getattr(row, 'author_user_login')
            request_watches.append(watchRepoMap[author])
        df['request_watches'] = request_watches

        """添加作者follower数量, followings数量, 是否follow团队成员"""

        user_follow_relation_path = projectConfig.getUserFollowRelation()
        userFollowRelation = pandasHelper.readTSVFile(
            os.path.join(user_follow_relation_path, f'userFollowRelation.tsv'),
            pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
        )

        followMap = {}
        followerCountMap = {}
        followingCountMap = {}
        followCoreMemberMap = {}

        """收集核心成员列表"""
        coreMemberList = list(set(df.loc[df['author_association'] == 1]['author_user_login']))

        for k in convertDict.keys():
            """获取 reviewer 的 follow 列表"""
            following_list = list(set(userFollowRelation.loc[userFollowRelation['login'] == k]['following_login']))
            followingCountMap[convertDict[k]] = following_list.__len__()
            isFollowCoreMember = False
            for f in following_list:
                if f in convertDict.keys():
                    followMap[(convertDict[k], convertDict[f])] = 1
                if f in coreMemberList:
                    isFollowCoreMember = True
            followCoreMemberMap[convertDict[k]] = isFollowCoreMember

            follower_list = list(set(userFollowRelation.loc[userFollowRelation['following_login'] == k]['login']))
            followerCountMap[convertDict[k]] = len(follower_list)
            # for f in follower_list:
            #     if f in convertDict.keys():
            #         followMap[(convertDict[f], convertDict[k])] = 1

        request_number_follows = []
        request_number_following = []
        request_follow_ct = []
        for row in df.itertuples():
            pr_num = getattr(row, 'pr_number')
            author = getattr(row, 'author_user_login')
            """过滤历史的pr"""
            request_number_following.append(followingCountMap[author])
            request_number_follows.append(followerCountMap[author])
            request_follow_ct.append(followCoreMemberMap[author])

        df['request_number_following'] = request_number_following
        df['request_number_follows'] = request_number_follows
        df['request_follow_ct'] = request_follow_ct

        """先提前统计正确答案"""
        tagDict = dict(list(df.groupby('pr_number')))

        train_data = df.loc[df['label'] == 0].copy(deep=True)
        test_data = df.loc[df['label'] == 1].copy(deep=True)

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """
        train_data_y = {}
        pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            train_data_y[pull_number] = reviewers

        train_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
        train_data.drop_duplicates(inplace=True)
        train_data.drop_duplicates(subset=['pr_number'], inplace=True)
        train_data.drop(columns=['pr_number'], inplace=True)
        """训练集 结果做出多标签分类通用的模式"""
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        test_data_y = {}
        pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        test_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
        test_data.drop_duplicates(subset=['pr_number'], inplace=True)

        """获得pr list"""
        prList = list(test_data['pr_number'])
        test_data.drop(columns=['pr_number'], inplace=True)

        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        """参数规范化"""
        if isSTD:
            stdsc = StandardScaler()
            train_data_std = stdsc.fit_transform(train_data)
            test_data_std = stdsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        elif isNOR:
            maxminsc = MinMaxScaler()
            train_data_std = maxminsc.fit_transform(train_data)
            test_data_std = maxminsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList
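
The cumulative history features above only ever look at rows with a smaller pr_number, so each pr's statistics come from its own past. A compact sketch of the accept-rate loop with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({
        'pr_number': [1, 2, 3],
        'author_user_login': [7, 7, 7],  # hypothetical encoded author
        'merged': [1, 0, 1],
    })

    request_accept_rate = []
    for row in df.itertuples():
        # Only prs strictly before this one, from the same author.
        hist = df.loc[(df['pr_number'] < row.pr_number) &
                      (df['author_user_login'] == row.author_user_login)]
        request_accept_rate.append(hist['merged'].mean() if hist.shape[0] > 0 else 0)
    df['request_accept_rate'] = request_accept_rate  # [0, 1.0, 0.5]
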
Example #9
    def preProcess(df, dates):
        """参数说明
         df:读取的dataframe对象
         dates:作为测试的年月四元组
        """
        """注意: 输入文件中已经带有列名了"""

        """issue comment 和  review comment关注的"""

        """处理NAN"""
        # df.dropna(how='any', inplace=True)
        # df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

        """对人名字做数字处理"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'author_user_login'])
        df['pr_created_at'] = df['pr_created_at'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S"))
        """对 comment_at 处理增加具体天数的标识"""
        df['day'] = df['pr_created_at'].apply(lambda x: 10000 * x.tm_year + 100 * x.tm_mon + x.tm_mday)  # 20200821

        """先对tag做拆分"""
        temp_df = df.copy(deep=True)
        temp_df.drop(columns=['filename'], inplace=True)
        temp_df.drop_duplicates(inplace=True)
        tagDict = dict(list(temp_df.groupby('pr_number')))

        """先尝试所有信息团在一起"""
        df = df[['pr_number', 'filename', 'label']].copy(deep=True)
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)

        """对已经有的特征向量和标签做训练集的拆分"""
        train_data = df.loc[df['label'] == False].copy(deep=True)
        test_data = df.loc[df['label']].copy(deep=True)

        train_data.drop(columns=['label'], inplace=True)
        test_data.drop(columns=['label'], inplace=True)

        """问题转化为多标签问题
            train_data_y   [{pull_number:[(r1, d1), (r2, d2), ...]}, ... ,{}]
        """

        """训练集存的是作者   测试集存的是评审者"""
        train_data_y = {}
        for pull_number in df.loc[df['label'] == False]['pr_number']:
            tempDf = tagDict[pull_number]
            author = []
            for row in tempDf.itertuples(index=False, name='Pandas'):
                a = getattr(row, 'author_user_login')
                day = getattr(row, 'day')
                author.append((a, None, day))
                break
            train_data_y[pull_number] = author

        test_data_y = {}
        for pull_number in df.loc[df['label'] == True]['pr_number']:
            tempDf = tagDict[pull_number]
            reviewers = []
            for row in tempDf.itertuples(index=False, name='Pandas'):
                r = getattr(row, 'review_user_login')
                comment_node_id = getattr(row, 'comment_node_id')
                day = getattr(row, 'day')
                reviewers.append((r, comment_node_id, day))
            test_data_y[pull_number] = reviewers

        """train_data ,test_data 最后一列是pr number test_data_y 的形式是dict"""
        return train_data, train_data_y, test_data, test_data_y, convertDict
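
The day column packs year, month, and day into one sortable integer (e.g. 2020-08-21 -> 20200821). A quick check of the encoding used above:

    import time

    t = time.strptime("2020-08-21 09:15:00", "%Y-%m-%d %H:%M:%S")
    day = 10000 * t.tm_year + 100 * t.tm_mon + t.tm_mday
    print(day)  # 20200821
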
Example #10
    def preProcess(df, date, project, featureType, isSTD=False, isNOR=False):
        """参数说明
         df:读取的dataframe对象
         testDate:作为测试的年月 (year,month)
         isSTD:对数据是否标准化
         isNOR:对数据是否归一化
        """
        print("start df shape:", df.shape)
        """过滤NA的数据"""
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))

        df.sort_values(by='pr_number', ascending=True, inplace=True)
        df.reset_index(drop=True, inplace=True)

        # """在现有的特征中添加文本路径特征"""
        """更正说明:由于PCA不能训练集和测试集同时降维,否则相当于使用了后面的信息
           所以添加之前必须两者分别处理 4.13 
           append 函数必须在表明label后面使用"""

        if featureType == 1 or featureType == 3:
            df = appendFilePathFeatureVector(df, project, date, 'pr_number')
        """在现有的特征中添加pr标题和内容文本特征"""
        if featureType == 2 or featureType == 3:
            df = appendTextualFeatureVector(df, project, date, 'pr_number')

        # """频率统计每一个reviewer的次数,排除数量过少的reviewer"""
        # freq = {}
        # for data in df.itertuples(index=False):
        #     name = data[list(df.columns).index('review_user_login')]
        #     if freq.get(name, None) is None:
        #         freq[name] = 0
        #     """训练集用户次数加一  测试集直接保留 """
        #     if not data[list(df.columns).index('label')]:
        #         freq[name] += 1
        #     else:
        #         freq[name] += 1
        #
        # num = 5
        # df['freq'] = df['review_user_login'].apply(lambda x: freq[x])
        # df = df.loc[df['freq'] > num].copy(deep=True)
        # df.drop(columns=['freq'], inplace=True)
        # df.reset_index(drop=True, inplace=True)
        # print("after lifter unexperienced user:"******"""对人名字做数字处理"""
        """频率不过的评审者在编号之前就已经过滤了,不用考虑分类不连续的情况"""
        """这里reviewer_user_login 放在 第一个否则会影响candicateNum这个变量在后面的引用"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'pr_user_login'])
        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        """对branch做处理  舍弃base,head做拆分 并数字化"""
        df.drop(axis=1, columns=['pr_base_label'], inplace=True)  # inplace 代表直接数据上面
        df['pr_head_tail'] = df['pr_head_label']
        df['pr_head_tail'] = df['pr_head_tail'].apply(lambda x: x.split(':')[1])
        df['pr_head_label'] = df['pr_head_label'].apply(lambda x: x.split(':')[0])

        df.drop(axis=1, columns=['pr_head_tail'], inplace=True)

        # MLTrain.changeStringToNumber(df, ['pr_head_tail'])
        DataProcessUtils.changeStringToNumber(df, ['pr_head_label'])

        """时间转时间戳处理"""
        df['pr_created_at'] = df['pr_created_at'].apply(
            lambda x: int(time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S"))))

        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pr_number')))

        """对已经有的特征向量和标签做训练集的拆分"""
        train_data = df.loc[df['label'] == False].copy(deep=True)
        test_data = df.loc[df['label']].copy(deep=True)

        train_data.drop(columns=['label'], inplace=True)
        test_data.drop(columns=['label'], inplace=True)

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """
        train_data_y = {}
        pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            train_data_y[pull_number] = reviewers

        train_data.drop(columns=['review_user_login'], inplace=True)
        train_data.drop_duplicates(inplace=True)
        """训练集 结果做出多标签分类通用的模式"""
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        test_data_y = {}
        pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        test_data.drop(columns=['review_user_login'], inplace=True)
        test_data.drop_duplicates(inplace=True)
        # test_data_y = DataProcessUtils.convertLabelListToDataFrame(test_data_y, pull_number_list, candicateNum)
        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        """获得pr list"""
        prList = list(test_data['pr_number'])

        """去除pr number"""
        test_data.drop(columns=['pr_number'], inplace=True)
        train_data.drop(columns=['pr_number'], inplace=True)

        """参数规范化"""
        if isSTD:
            stdsc = StandardScaler()
            train_data_std = stdsc.fit_transform(train_data)
            test_data_std = stdsc.transform(test_data)
            # print(train_data_std)
            # print(test_data_std.shape)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        elif isNOR:
            maxminsc = MinMaxScaler()
            train_data_std = maxminsc.fit_transform(train_data)
            test_data_std = maxminsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList
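
The isSTD/isNOR branches follow the standard scaler discipline: fit on the training split only, then reuse the fitted statistics on the test split. A minimal sketch with hypothetical feature matrices:

    import numpy as np
    from sklearn.preprocessing import StandardScaler, MinMaxScaler

    train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
    test = np.array([[2.5, 15.0]])

    scaler = StandardScaler()  # MinMaxScaler() for the isNOR branch
    train_std = scaler.fit_transform(train)  # learns mean/std from the training data only
    test_std = scaler.transform(test)        # applies the same statistics to the test data
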
Example #11
    def preProcess(df, date, project, isSTD=False, isNOR=False):
        """参数说明
        df:读取的dataframe对象
        testDate:作为测试的年月 (year,month)
        isSTD:对数据是否标准化
        isNOR:对数据是否归一化
        """
        print("start df shape:", df.shape)
        """过滤NA的数据"""
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
        df.reset_index(drop=True, inplace=True)

        """对人名字做数字处理"""
        """频率不过的评审者在编号之前就已经过滤了,不用考虑分类不连续的情况"""
        """这里reviewer_user_login 放在 第一个否则会影响candicateNum这个变量在后面的引用"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        """先对输入数据做精简 只留下感兴趣的数据"""
        df = df[['pr_number', 'pr_title', 'pr_body', 'review_user_login', 'label']].copy(deep=True)

        print("before filter:", df.shape)
        df.drop_duplicates(inplace=True)
        print("after filter:", df.shape)


        """先提前统计正确答案"""
        tagDict = dict(list(df.groupby('pr_number')))

        train_data = df.loc[df['label'] == 0].copy(deep=True)
        test_data = df.loc[df['label'] == 1].copy(deep=True)

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """
        train_data_y = {}
        pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            train_data_y[pull_number] = reviewers

        train_data.drop(columns=['review_user_login'], inplace=True)
        train_data.drop_duplicates(inplace=True)
        train_data.drop_duplicates(subset=['pr_number'], inplace=True)
        """训练集 结果做出多标签分类通用的模式"""
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        test_data_y = {}
        pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        test_data.drop(columns=['review_user_login'], inplace=True)
        test_data.drop_duplicates(inplace=True)
        """pr_number  经过去重"""
        test_data.drop_duplicates(subset=['pr_number'], inplace=True)
        # test_data_y = DataProcessUtils.convertLabelListToDataFrame(test_data_y, pull_number_list, candicateNum)
        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        """获得pr list"""
        prList = list(test_data['pr_number'])

        """先尝试所有信息团在一起"""
        df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True)
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        textList = []
        for row in df.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

            """初步尝试提取词干效果反而下降了 。。。。"""

            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)

            """pull request的body"""
            pr_body = getattr(row, 'pr_body')
            pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
            """对单词做提取词干"""
            pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
            tempList.extend(pr_body_word_list)
            textList.append(tempList)

        print(len(textList))
        """Build a dictionary over the token lists and take the feature count"""
        dictionary = corpora.Dictionary(textList)
        print('Dictionary:', dictionary)

        feature_cnt = len(dictionary.token2id)
        print("词典特征数:", feature_cnt)

        """根据词典建立语料库"""
        corpus = [dictionary.doc2bow(text) for text in textList]
        # print('语料库:', corpus)
        """语料库训练TF-IDF模型"""
        tfidf = models.TfidfModel(corpus)

        """再次遍历数据,形成向量,向量是稀疏矩阵的形式"""
        wordVectors = []
        for i in range(0, df.shape[0]):
            wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

        """对已经有的本文特征向量和标签做训练集和测试集的拆分"""

        trainData_index = df.loc[df['label'] == False].index
        testData_index = df.loc[df['label'] == True].index

        """训练集"""
        train_data = [wordVectors[x] for x in trainData_index]
        """测试集"""
        test_data = [wordVectors[x] for x in testData_index]
        """填充为向量"""
        train_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
        test_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)

        """参数规范化"""
        if isSTD:
            stdsc = StandardScaler()
            train_data_std = stdsc.fit_transform(train_data)
            test_data_std = stdsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        elif isNOR:
            maxminsc = MinMaxScaler()
            train_data_std = maxminsc.fit_transform(train_data)
            test_data_std = maxminsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList
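
convertLabelListToDataFrame / convertLabelListToListArray are project helpers whose exact output format is not shown here. A generic stand-in for turning the {pr: [reviewers]} dict into a binary label matrix, using sklearn's MultiLabelBinarizer (an assumption, not the project's helper):

    from sklearn.preprocessing import MultiLabelBinarizer

    train_data_y = {101: [3, 5], 102: [5]}  # hypothetical {pr_number: [reviewer ids]}
    mlb = MultiLabelBinarizer()
    Y = mlb.fit_transform(train_data_y.values())  # one row per pr, one column per reviewer
    print(mlb.classes_)  # [3 5]
    print(Y)             # [[1 1] [0 1]]
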
Example #12
    def preProcess(df, date, project, isSTD=False, isNOR=False, m=3):
        """参数说明
        df:读取的dataframe对象
        testDate:作为测试的年月 (year,month)
        isSTD:对数据是否标准化
        isNOR:对数据是否归一化
        m: 超参数,窗口时间
        """
        print("start df shape:", df.shape)
        """过滤NA的数据"""
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
        df['label_y'] = df['pr_created_at'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year)
        df['label_m'] = df['pr_created_at'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon)
        df.reset_index(drop=True, inplace=True)

        """更正说明:由于PCA不能训练集和测试集同时降维,否则相当于使用了后面的信息
            所以添加之前必须两者分别处理 4.13 
            append 函数必须在表明label后面使用"""

        """添加File Path Features"""
        df = appendFilePathFeatureVector(df, project, date, 'pr_number')


        """读取User Follow的信息"""
        user_follow_relation_path = projectConfig.getUserFollowRelation()
        userFollowRelation = pandasHelper.readTSVFile(
            os.path.join(user_follow_relation_path, f'userFollowRelation.tsv'),
            pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
        )

        def isInTimeGap(x, m, maxYear, maxMonth):
            d = x['label_y'] * 12 + x['label_m']
            d2 = maxYear * 12 + maxMonth
            return d >= d2 - m

        """对人名字做数字处理"""
        """频率不过的评审者在编号之前就已经过滤了,不用考虑分类不连续的情况"""
        """这里reviewer_user_login 放在 第一个否则会影响candicateNum这个变量在后面的引用"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'pr_user_login'])

        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        """计算contributor set"""
        contribute_list = list(set(df.loc[df['label'] == 1]['pr_user_login']))
        reviewer_list = list(set(df.loc[df['label'] == 0]['review_user_login']))

        """添加Relation ship Features"""
        """对 train set和test set的处理方式稍微不同   train set数据统计依照之前pr
            而训练集的统计数据只限制于trianset
        """

        """把  df 的pr_created_at 和 comment_at 转化为时间戳"""
        df['pr_created_at'] = df['pr_created_at'].apply(
            lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
        df['comment_at'] = df['comment_at'].apply(lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
        df['response_time'] = df['comment_at'] - df['pr_created_at']

        """Prior Evaluation  reviewer cm 之前 review co的次数
           Recent Evaluation reviewer cm 在 m 个月 reivew co的次数
           Follow Relation  co 是否follow cm
           Follower Relation  cm 是否follow co
        """
        startTime = datetime.now()
        prior_evaluation = {}
        recent_evaluation = {}
        follower_relation = {}
        following_relation = {}
        followMap = {}
        for k in convertDict.keys():
            """获取 reviewer 的 follow 列表"""
            follower_list = list(set(userFollowRelation.loc[userFollowRelation['login'] == k]['following_login']))
            for f in follower_list:
                if f in convertDict.keys():
                    followMap[(convertDict[k], convertDict[f])] = 1

        for reviewer in reviewer_list:
            prior_evaluation[reviewer] = []
            recent_evaluation[reviewer] = []
            follower_relation[reviewer] = []
            following_relation[reviewer] = []
        cols = list(df.columns)

        for data in df.itertuples(index=False, name='Pandas'):
            if len(data) < 14:
                pullNumber = getattr(data, 'pr_number')
                author = getattr(data, 'pr_user_login')
                label = getattr(data, 'label')
                label_m = getattr(data, 'label_m')
                label_y = getattr(data, 'label_y')
            else:
                pullNumber = data[cols.index("pr_number")]
                author = data[cols.index("pr_user_login")]
                label = data[cols.index("label")]
                label_m = data[cols.index("label_m")]
                label_y = data[cols.index("label_y")]

            temp = None
            if label == 0:
                temp = df.loc[df['pr_number'] < pullNumber]
            else:
                temp = df.loc[df['label'] == 0]
            temp = temp.loc[temp['pr_user_login'] == author].copy(deep=True)
            """Tally each candidate reviewer in turn"""
            prior_evaluation_dict = dict(temp['review_user_login'].value_counts())
            for r in reviewer_list:
                prior_evaluation[r].append(prior_evaluation_dict.get(r, 0))
            """temp 二次过滤  选m个月以内的"""
            if temp.shape[0] > 0:
                if label == 0:
                    temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, label_y, label_m), axis=1)
                else:
                    temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, date[2], date[3]), axis=1)
                temp = temp.loc[temp['target'] == 1]
            """依次遍历每个候选者统计"""
            recent_evaluation_dict = dict(temp['review_user_login'].value_counts())
            for r in reviewer_list:
                recent_evaluation[r].append(recent_evaluation_dict.get(r, 0))
            """添加 follow 和 following 信息"""
            for r in reviewer_list:
                follower_relation[r].append(followMap.get((author, r), 0))
                following_relation[r].append(followMap.get((r, author), 0))

        """添加"""
        for r in reviewer_list:
            df[f'prior_evaluation_{r}'] = prior_evaluation[r]
            df[f'recent_evaluation_{r}'] = recent_evaluation[r]
            df[f'follower_relation_{r}'] = follower_relation[r]
            df[f'following_relation_{r}'] = following_relation[r]

        print("prior cost time:", datetime.now() - startTime)
        startTime = datetime.now()

        # Start time: one day before the dataset's start
        start_time = time.strptime(str(date[0]) + "-" + str(date[1]) + "-" + "01 00:00:00", "%Y-%m-%d %H:%M:%S")
        start_time = int(time.mktime(start_time) - 86400)
        # End time: the last day of the dataset
        end_time = time.strptime(str(date[2]) + "-" + str(date[3]) + "-" + "01 00:00:00", "%Y-%m-%d %H:%M:%S")
        end_time = int(time.mktime(end_time) - 1)

        """Activeness Feature 添加"""
        total_pulls = {}  # 项目有的所有pr
        evaluate_pulls = {}  # co 之前review的数量
        recent_pulls = {}  # co 最近m月 review的数量
        evaluate_time = {}  # co 平均回应时间
        last_time = {}  # co 最后一次reivew 的时间间隔
        first_time = {}  # co 第一次review的时间间隔
        for reviewer in reviewer_list:
            total_pulls[reviewer] = []
            evaluate_pulls[reviewer] = []
            recent_pulls[reviewer] = []
            evaluate_time[reviewer] = []
            last_time[reviewer] = []
            first_time[reviewer] = []
        count = 0
        cols = list(df.columns)

        index_pr_number = cols.index("pr_number")
        index_pr_label = cols.index("label")
        index_pr_label_m = cols.index("label_m")
        index_pr_label_y = cols.index("label_y")

        for data in df.itertuples(index=False):
            print("count for active:", count)
            count += 1
            pullNumber = data[index_pr_number]
            label = data[index_pr_label]
            label_m = data[index_pr_label_m]
            label_y = data[index_pr_label_y]
            temp = None
            if label == 0:
                temp = df.loc[df['pr_number'] < pullNumber].copy(deep=True)
            else:
                temp = df.loc[df['label'] == 0].copy(deep=True)
            """依次遍历每个候选者统计"""
            total_pull_number = list(set(temp['pr_number'])).__len__()
            res_reviewer_list = reviewer_list.copy()

            groups = dict(list(temp.groupby('review_user_login')))
            """先遍历有tempDf的reviewer"""
            for r, tempDf in groups.items():
                total_pulls[r].append(total_pull_number)
                res_reviewer_list.remove(r)
                if tempDf.shape[0] == 0:
                    """没有历史 认为age=0, 间隔是最大间隔"""
                    first_time[r].append(0)
                    last_time[r].append(end_time - start_time)
                else:
                    pr_created_time_list = list(tempDf['pr_created_at'])
                    first_review_time = min(pr_created_time_list)
                    last_review_time = max(pr_created_time_list)
                    first_time[r].append(end_time - first_review_time)
                    last_time[r].append(end_time - last_review_time)
                evaluate_pulls[r].append(tempDf.shape[0])

                """平均回应时间统计"""
                if tempDf.shape[0] > 0:
                    evaluate_avg = sum(tempDf['response_time'])
                    evaluate_avg /= tempDf.shape[0]
                else:
                    evaluate_avg = end_time - start_time
                evaluate_time[r].append(evaluate_avg)

            for r in res_reviewer_list:
                total_pulls[r].append(total_pull_number)
                evaluate_pulls[r].append(0)
                first_time[r].append(0)
                last_time[r].append(end_time - start_time)
                evaluate_avg = end_time - start_time
                evaluate_time[r].append(evaluate_avg)
                # recent_pulls[r].append(0)

            """过滤k个月 重新计算"""
            if label == 0:
                if temp.shape[0] > 0:
                    temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, label_y, label_m), axis=1)
                    temp = temp.loc[temp['target'] == 1]
            else:
                if temp.shape[0] > 0:
                    temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, date[2], date[3]), axis=1)
                    temp = temp.loc[temp['target'] == 1]

            res_reviewer_list = reviewer_list.copy()
            groups = dict(list(temp.groupby('review_user_login')))
            """先遍历有tempDf的reviewer"""
            for r, tempDf in groups.items():
                recent_pulls[r].append(tempDf.shape[0])
                res_reviewer_list.remove(r)

            for r in res_reviewer_list:
                recent_pulls[r].append(0)

        """Activeness Feature增加到 dataframe"""
        for r in reviewer_list:
            df[f'total_pulls_{r}'] = total_pulls[r]
            df[f'evaluate_pulls_{r}'] = evaluate_pulls[r]
            df[f'recent_pulls_{r}'] = recent_pulls[r]
            df[f'first_time_{r}'] = first_time[r]
            df[f'last_time_{r}'] = last_time[r]
            df[f'evaluate_time_{r}'] = evaluate_time[r]

        print("active cost time:", datetime.now() - startTime)

        tagDict = dict(list(df.groupby('pr_number')))

        """对已经有的特征向量和标签做训练集的拆分"""
        train_data = df.loc[df['label'] == False].copy(deep=True)
        test_data = df.loc[df['label']].copy(deep=True)

        train_data.drop(columns=['label'], inplace=True)
        test_data.drop(columns=['label'], inplace=True)

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """
        train_data_y = {}
        pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            train_data_y[pull_number] = reviewers

        train_data.drop(columns=['review_user_login'], inplace=True)
        train_data.drop_duplicates(inplace=True)
        train_data.drop_duplicates(subset=['pr_number'], inplace=True)
        """训练集 结果做出多标签分类通用的模式"""
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        test_data_y = {}
        pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        test_data.drop(columns=['review_user_login'], inplace=True)
        test_data.drop_duplicates(inplace=True)
        test_data.drop_duplicates(subset=['pr_number'], inplace=True)
        # test_data_y = DataProcessUtils.convertLabelListToDataFrame(test_data_y, pull_number_list, candicateNum)
        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        """获得pr list"""
        prList = list(test_data['pr_number'])

        """去除pr number"""
        test_data.drop(columns=['pr_number'], inplace=True)
        train_data.drop(columns=['pr_number'], inplace=True)

        test_data.drop(columns=['pr_created_at', 'pr_user_login',
                                'comment_at', 'label_y', 'label_m', 'response_time'], inplace=True)
        train_data.drop(columns=['pr_created_at', 'pr_user_login',
                                 'comment_at', 'label_y', 'label_m', 'response_time'], inplace=True)
        """参数规范化"""
        if isSTD:
            stdsc = StandardScaler()
            train_data_std = stdsc.fit_transform(train_data)
            test_data_std = stdsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        elif isNOR:
            maxminsc = MinMaxScaler()
            train_data_std = maxminsc.fit_transform(train_data)
            test_data_std = maxminsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList
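The month-window filter above calls a helper `isInTimeGap` that is not part of this listing. A minimal sketch consistent with the call site (row, m, year, month) and the `target == 1` comparison, assuming `pr_created_at` already holds a Unix timestamp at that point, might be:

    import time

    def isInTimeGap(row, m, year, month):
        """Hypothetical reconstruction: return 1 when the row's creation time
           falls within the m months before (year, month), else 0."""
        t = time.localtime(row['pr_created_at'])
        gap = (year - t.tm_year) * 12 + (month - t.tm_mon)
        return 1 if 0 < gap <= m else 0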
Beispiel #13
0
    def preProcess(df, dates):
        """参数说明
                    df:读取的dataframe对象
                    dates:四元组,后两位作为测试的年月 (,,year,month)
                   """

        """注意: 输入文件中已经带有列名了"""

        """空comment的review包含na信息,但作为结果集是有用的,所以只对训练集去掉na"""
        # """处理NAN"""
        # df.dropna(how='any', inplace=True)
        # df.reset_index(drop=True, inplace=True)
        df['pr_title'].fillna(value='', inplace=True)
        df['pr_body'].fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))
        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['pr_author', 'reviewer'])

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词
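        # `FleshReadableUtils.word_list` and `nltkFunction.stemList` used below are
        # project helpers not included in this listing; presumably a word tokenizer
        # and an NLTK stemmer mapped over a token list (assumption).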

        textList = []
        for row in df.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

            """初步尝试提取词干效果反而下降了 。。。。"""

            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)

            """pull request的body"""
            pr_body = getattr(row, 'pr_body')
            pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
            """对单词做提取词干"""
            pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
            tempList.extend(pr_body_word_list)
            textList.append(tempList)

        print(len(textList))
        """Build a dictionary over the token lists and count the features"""
        dictionary = corpora.Dictionary(textList)
        print('dictionary:', dictionary)

        feature_cnt = len(dictionary.token2id)
        print("dictionary feature count:", feature_cnt)

        """根据词典建立语料库"""
        corpus = [dictionary.doc2bow(text) for text in textList]
        # print('语料库:', corpus)
        """语料库训练TF-IDF模型"""
        tfidf = models.TfidfModel(corpus)

        """再次遍历数据,形成向量,向量是稀疏矩阵的形式"""
        wordVectors = []
        for i in range(0, df.shape[0]):
            wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

        """对已经有的本文特征向量和标签做训练集和测试集的拆分"""
        trainData_index = df.loc[df['label'] == False].index
        testData_index = df.loc[df['label'] == True].index

        """训练集"""
        train_data = [wordVectors[x] for x in trainData_index]
        """测试集"""
        test_data = [wordVectors[x] for x in testData_index]
        """填充为向量"""
        train_v_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
        test_v_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)

        train_data = df.loc[df['label'] == False]
        train_data.reset_index(drop=True, inplace=True)
        test_data = df.loc[df['label'] == True]
        test_data.reset_index(drop=True, inplace=True)

        train_data = train_data.join(train_v_data)
        train_data.drop(columns=['label'], inplace=True)

        test_data = test_data.join(test_v_data)
        test_data.drop(columns=['label'], inplace=True)

        """8ii处理NAN"""
        train_data.dropna(how='any', inplace=True)
        train_data.reset_index(drop=True, inplace=True)
        train_data.fillna(value='', inplace=True)

        """先对tag做拆分"""
        trainDict = dict(list(train_data.groupby('pull_number')))
        testDict = dict(list(test_data.groupby('pull_number')))

        """过滤掉评论时间在数据集时间范围内之后的数据"""
        end_time = str(dates[2]) + "-" + str(dates[3]) + "-" + "01 00:00:00"
        train_data = train_data[train_data['commented_at'] < end_time]
        train_data.reset_index(drop=True, inplace=True)

        test_data_y = {}
        for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(testDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
            test_data_y[pull_number] = reviewers

        train_data_y = {}
        for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(trainDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
            train_data_y[pull_number] = reviewers

        return train_data, train_data_y, test_data, test_data_y, convertDict
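`DataProcessUtils.convertFeatureDictToDataFrame`, used above to densify the sparse gensim vectors, is not included in this listing. A plausible sketch, assuming the gensim token ids index the columns directly:

    import pandas as pd

    def convertFeatureDictToDataFrame(dictList, featureNum):
        """Hypothetical reconstruction: one row per document, one column per
           token id; token ids absent from a document get weight 0."""
        rows = []
        for d in dictList:
            row = [0.0] * featureNum
            for token_id, weight in d.items():
                row[token_id] = weight
            rows.append(row)
        return pd.DataFrame(rows)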
Beispiel #14
0
    def preProcess(df, dates):
        """参数说明
         df:读取的dataframe对象
         dates:作为测试的年月四元组
        """
        """注意: 输入文件中已经带有列名了"""

        """处理NAN"""
        df.dropna(how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

        """频率统计每一个reviewer的次数,排除数量过少的reviewer"""
        freq = {}
        for data in df.itertuples(index=False):
            name = data[list(df.columns).index('review_user_login')]
            if freq.get(name, None) is None:
                freq[name] = 0
            """训练集用户次数加一  测试集直接保留 """
            if not data[list(df.columns).index('label')]:
                freq[name] += 1
            else:
                freq[name] += 1

        num = 5
        df['freq'] = df['review_user_login'].apply(lambda x: freq[x])
        df = df.loc[df['freq'] > num].copy(deep=True)
        df.drop(columns=['freq'], inplace=True)
        df.reset_index(drop=True, inplace=True)
        print("after lifter unexperienced user:"******"""先对输入数据做精简 只留下感兴趣的数据"""
        df = df[['pull_number', 'review_user_login', 'commit_sha', 'file_filename', 'label']].copy(deep=True)

        print("before filter:", df.shape)
        df.drop_duplicates(inplace=True)
        print("after filter:", df.shape)
        """对人名字做数字处理"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        reviewer_num = convertDict.items().__len__()
        print("reviewer num:", convertDict.items().__len__())
        """测试集 训练集做拆分"""
        train_raw_df = df.loc[df['label'] == 0].copy(deep=True)
        # userDict = dict(list(train_raw_df.groupby('review_user_login')))
        # print(userDict)

        """建立用户历史review记录的TF-IDF模型"""

        """获取filepath -> sub_filepath映射表"""
        file_path_list = set(df['file_filename'].copy(deep=True))
        file_path_dict = {}
        for file_path in file_path_list:
            # sub_file_path = splitFileName(file_path)
            sub_file_path = file_path.split('/')
            if file_path not in file_path_dict:
                file_path_dict[file_path] = set()
            file_path_dict[file_path] = file_path_dict[file_path].union(sub_file_path)

        """获取pr_number -> sub_filepath语料"""
        reviewer_to_file_path = df[['review_user_login', 'file_filename']]
        # 按照reviewer分组,获得原始语料(未经过分词的filepath)"""
        groups = dict(list(reviewer_to_file_path.groupby('review_user_login')))
        # 获取目标语料(即经过自定义分词后的语料)
        reviewer_file_path_corpora = []
        for reviewer in groups:
            paths = list(groups[reviewer]['file_filename'])
            sub_paths = list(map(lambda x: list(file_path_dict[x]), paths))
            sub_paths = reduce(lambda x, y: x + y, sub_paths)
            reviewer_file_path_corpora.append(sub_paths)

        """计算tf-idf"""
        print("start tf_idf algorithm......")
        # 建立词典
        dictionary = corpora.Dictionary(reviewer_file_path_corpora)
        # 基于词典建立新的语料库
        corpus = [dictionary.doc2bow(text) for text in reviewer_file_path_corpora]
        # 用语料库训练TF-IDF模型
        tf_idf_model = models.TfidfModel(corpus)
        # 得到加权矩阵
        path_tf_tdf = list(tf_idf_model[corpus])
        print(path_tf_tdf)

        """处理path_tf_tdf,构造pr_path加权矩阵"""
        print("start merge tf_idf to origin_df......")
        reviewer_list = list(groups.keys())
        columns = ['review_user_login']
        path_ids = list(dictionary.token2id.values())
        path_ids = list(map(lambda x: str(x), path_ids))
        columns.extend(path_ids)
        reviewer_path_weight_df = pandas.DataFrame(columns=columns).fillna(value=0)
        for index, row in enumerate(path_tf_idf):
            """Fill the dataframe row by row from a dict"""
            new_row = {'review_user_login': reviewer_list[index]}
            row = list(map(lambda x: (str(x[0]), x[1]), row))
            path_weight = dict(row)
            new_row = dict(new_row, **path_weight)
            # note: DataFrame.append was removed in pandas 2.0; pandas.concat is the modern replacement
            reviewer_path_weight_df = reviewer_path_weight_df.append(new_row, ignore_index=True)
        reviewer_path_weight_df = reviewer_path_weight_df.fillna(value=0)
        print(reviewer_path_weight_df.shape)

        """去除用户名"""
        reviewer_path_weight_df.drop(columns=['review_user_login'], axis=1, inplace=True)
        print("before pca size:", reviewer_path_weight_df.shape)

        """PCA"""
        pca = PCA(n_components=0.95)
        reviewer_path_weight_df = pca.fit_transform(reviewer_path_weight_df)
        print("after pca size:", reviewer_path_weight_df.shape)

        """特征标准归一化"""
        stdsc = StandardScaler()
        reviewer_path_weight_df_std = stdsc.fit_transform(reviewer_path_weight_df)

        M = []  # silhouette scores
        N = []  # candidate cluster counts
        max_cluster = min(20, reviewer_num)
        for n in range(2, max_cluster):
            y_pred = KMeans(n_clusters=n, random_state=9).fit_predict(reviewer_path_weight_df_std)
            from sklearn import metrics
            # m = metrics.calinski_harabasz_score(reviewer_path_weight_df, y_pred)
            m = metrics.silhouette_score(reviewer_path_weight_df_std, y_pred,
                                         sample_size=len(reviewer_path_weight_df_std), metric='euclidean')
            M.append(m)
            N.append(n)
            print(n, m)
            print(y_pred)

            """聚类归属数量子图绘制"""
            nums = []
            x = []
            for i in range(0, n):
                nums.append(list(y_pred).count(i))
                x.append(i)
            print(x)
            print(nums)
            plt.subplot(5, 4, n-1)
            plt.bar(x=range(0, n), height=nums)
            # plt.title('cluster%d'%n)
        plt.savefig(f'项目{projectName}聚类分布.png')
        # plt.show()

        M = DataFrame(M)
        N = DataFrame(N)
        # x_major_locator = MultipleLocator(1)  # x-axis tick spacing of 1
        fig = plt.figure()
        # ax1 = fig.add_subplot(111)
        # ax1.scatter(N, M, s=40, marker='o')
        # ax1.xaxis.set_major_locator(x_major_locator)
        plt.plot(N, M, marker='o', markersize=5)
        plt.xlabel('cluster number')
        plt.ylabel('silhouette score')
        plt.xlim(-0.5, 22)
        # for a, b in zip(list(N), list(M)):
        #     plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
        # plt.legend()
        plt.savefig(f'project_{projectName}_silhouette_score.png')
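The silhouette curve above is only saved for manual inspection; if the cluster count were chosen programmatically, taking the n with the highest silhouette score is the usual heuristic. A sketch over M and N as plain lists (i.e. before they are wrapped in DataFrames):

    # assumption: M holds the silhouette scores, N the matching cluster counts
    best_n = N[M.index(max(M))]
    print("best cluster count by silhouette:", best_n)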
Beispiel #15
0
    def preProcess(df, dates):
        """参数说明
            df:读取的dataframe对象
            dates:四元组,后两位作为测试的年月 (,,year,month)
           """

        """注意: 输入文件中已经带有列名了"""

        t1 = datetime.now()

        """处理NAN"""
        df.dropna(how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))
        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pr_number')))

        commentDf = df[['pr_number', 'review_user_login', 'comment_body', 'label']].copy(deep=True)

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        """先尝试所有信息团在一起"""
        df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True)
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)

        """训练和测试做分割"""
        df_train = df.loc[df['label'] == 0].copy(deep=True)
        df_test = df.loc[df['label'] == 1].copy(deep=True)
        df_test.reset_index(drop=True, inplace=True)

        """收集训练集中的pr的文本作为 文档做LDA提取主题"""
        trainTextList = []
        testTextList = []
        for row in df.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的number"""
            pr_num = getattr(row, 'pr_number')
            label = getattr(row, 'label')

            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]
            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)

            """pull request的body"""
            pr_body = getattr(row, 'pr_body')
            pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
            """对单词做提取词干"""
            pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
            tempList.extend(pr_body_word_list)

            if label == 0:
                trainTextList.append(tempList)
            elif label == 1:
                testTextList.append(tempList)

        """收集 训练集中的comment"""
        trainCommentList = []
        review_comment_map = {}  # pr -> [(reviewer, [w1, w2, w3]), .....]
        for row in commentDf.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的number"""
            pr_num = getattr(row, 'pr_number')
            label = getattr(row, 'label')
            reviewer = getattr(row, 'review_user_login')

            """获取pull request的标题"""
            comment_body = getattr(row, 'comment_body')
            comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment_body) if x not in stopwords]
            """对单词做提取词干"""
            comment_body_word_list = nltkFunction.stemList(comment_body_word_list)
            tempList.extend(comment_body_word_list)

            if review_comment_map.get(pr_num, None) is None:
                review_comment_map[pr_num] = []

            if label == 0:
                review_comment_map[pr_num].append((reviewer, tempList.copy()))
                trainCommentList.append(tempList)

        """建立LDA模型提取数据"""
        # 接下来就是模型构建的步骤了,首先构建词频矩阵
        allTextList = []
        allTextList.extend(trainTextList)
        allTextList.extend(trainCommentList)
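        # note: allTextList is assembled here but not used below; the dictionary and
        # corpus are built from trainTextList only, so comment words that never
        # appear in a PR title or body fall out of the model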
        dictionary = corpora.Dictionary(trainTextList)
        corpus = [dictionary.doc2bow(text) for text in trainTextList]
        lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
        topic_list = lda.print_topics(20)
        print("20个主题的单词分布为:\n")
        for topic in topic_list:
            print(topic)

        """建立训练集和测试集所需的主题分布
           pr_num -> {[(t1, p1), (t2, p2), .....]}
        """
        train_data = {}
        test_data = {}
        for index, d in enumerate(lda.get_document_topics([dictionary.doc2bow(text) for text in trainTextList])):
            train_data[df_train['pr_number'][index]] = d
        for index, d in enumerate(lda.get_document_topics([dictionary.doc2bow(text) for text in testTextList])):
            test_data[df_test['pr_number'][index]] = d

        train_data_y = {}  # pr -> [(reviewer, [(comment1), (comment2) ...])]
        for pull_number in df.loc[df['label'] == False]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            reviewerList = []
            for reviewer in reviewers:
                commentTopicList = []
                for r, words in review_comment_map[pull_number]:
                    if r == reviewer:
                        commentTopicList.append(words)
                commentTopicList = lda.get_document_topics([dictionary.doc2bow(text) for text in commentTopicList])
                reviewerList.append((reviewer, [x for x in commentTopicList]))
            train_data_y[pull_number] = reviewerList

        test_data_y = {}
        for pull_number in df.loc[df['label'] == True]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            reviewerList = []
            for reviewer in reviewers:
                commentTopicList = []
                for r, words in review_comment_map[pull_number]:
                    if r == reviewer:
                        commentTopicList.append(words)
                commentTopicList = lda.get_document_topics([dictionary.doc2bow(text) for text in commentTopicList])
                reviewerList.append((reviewer, commentTopicList))
            test_data_y[pull_number] = reviewerList

        print("preprocess cost time:", datetime.now() - t1)
        return train_data, train_data_y, test_data, test_data_y, convertDict
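Every example in this listing leans on `DataProcessUtils.changeStringToNumber` to replace login strings with integer ids. Its implementation is not shown; judging from the call sites (it rewrites the named columns in place and returns the mapping), a minimal sketch could be:

    def changeStringToNumber(df, columns):
        """Hypothetical reconstruction: share one {string: id} mapping across all
           given columns, rewrite the columns in place, and return the mapping."""
        convertDict = {}
        for column in columns:
            for value in df[column]:
                if value not in convertDict:
                    convertDict[value] = len(convertDict) + 1
            df[column] = df[column].map(convertDict)
        return convertDict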