Example #1
    def preProcess(df, dates):
        """Parameter description
            df: the dataframe object that was read in
            dates: 4-tuple; the last two entries are the test year and month (,,year,month)
           """

        """Note: the input file already carries column names"""

        t1 = datetime.now()

        """处理NAN"""
        df.dropna(how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))
        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pr_number')))

        """先尝试所有信息团在一起"""
        df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True)
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        textList = []
        """由于特殊性  PB算法的训练集不是dataFrame
           { p1:set1, p2:set2, ... }
        """
        train_data = {}
        test_data = {}
        for row in df.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的number"""
            pr_num = getattr(row, 'pr_number')
            label = getattr(row, 'label')

            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

            """初步尝试提取词干效果反而下降了 。。。。"""

            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)

            """pull request的body"""
            pr_body = getattr(row, 'pr_body')
            pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
            """对单词做提取词干"""
            pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
            tempList.extend(pr_body_word_list)

            wordSet = MultisetHelper.WordMultiset()
            wordSet.add(tempList)

            if label == 0:
                train_data[pr_num] = wordSet
            else:
                test_data[pr_num] = wordSet

        print("train size:", train_data.items().__len__())
        print("test size:", test_data.items().__len__())

        """问题转化为多标签问题
            train_data_y   [{pull_number:[(r1, s1), (r2, s2), ...]}, ... ,{}]
            
            r 代表reviewer
            s 代表集合
        """

        train_data_y = {}
        for pull_number in df.loc[df['label'] == False]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            tempDf = tagDict[pull_number][['review_user_login', 'comment_body']].copy(deep=True)
            commentDict = dict(list(tempDf.groupby('review_user_login')))
            reviewerList = []
            for reviewer in reviewers:
                commentDf = commentDict[reviewer]
                wordSet = MultisetHelper.WordMultiset()
                for row in commentDf.itertuples(index=False, name='Pandas'):
                    comment = getattr(row, 'comment_body')
                    comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment) if x not in stopwords]
                    """对单词做提取词干"""
                    comment_body_word_list = nltkFunction.stemList(comment_body_word_list)
                    wordSet.add(comment_body_word_list)
                reviewerList.append((reviewer, wordSet))
            train_data_y[pull_number] = reviewerList

        test_data_y = {}
        for pull_number in df.loc[df['label'] == True]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            tempDf = tagDict[pull_number][['review_user_login', 'comment_body']].copy(deep=True)
            commentDict = dict(list(tempDf.groupby('review_user_login')))
            reviewerList = []
            for reviewer in reviewers:
                commentDf = commentDict[reviewer]
                wordSet = MultisetHelper.WordMultiset()
                for row in commentDf.itertuples(index=False, name='Pandas'):
                    comment = getattr(row, 'comment_body')
                    comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment) if x not in stopwords]
                    """对单词做提取词干"""
                    comment_body_word_list = nltkFunction.stemList(comment_body_word_list)
                    wordSet.add(comment_body_word_list)
                reviewerList.append((reviewer, wordSet))
            test_data_y[pull_number] = reviewerList

        print("preprocess cost time:", datetime.now() - t1)
        return train_data, train_data_y, test_data, test_data_y, convertDict
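
A minimal usage sketch for the preProcess variant above, assuming the project's helpers (DataProcessUtils, SplitWordHelper, FleshReadableUtils, nltkFunction, MultisetHelper) are importable; the file name, separator, and dates tuple below are hypothetical:

    # Hypothetical usage (file name and dates are assumptions, not from the source)
    import pandas

    df = pandas.read_csv('review_comments.tsv', sep='\t')  # expects pr_number, pr_title, pr_body, review_user_login, comment_body, pr_created_at
    dates = (2019, 1, 2019, 10)  # last two entries: test year and month
    train_data, train_data_y, test_data, test_data_y, convertDict = preProcess(df, dates)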
Example #2
    def preProcessBySlide(df, dates):
        """Parameter description
         df: the dataframe object that was read in
         dates: 4-tuple; the last two entries are the test year and month
        """
        """Note: the input file already carries column names"""

        """处理NAN"""
        df.dropna(how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

        """创建时间转化为时间戳"""
        df['pr_created_at'] = df['pr_created_at'].apply(lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
        df['pr_created_at'] = df['pr_created_at'] / (24 * 3600)

        """先对输入数据做精简 只留下感兴趣的数据"""
        df = df[['pr_number', 'pr_title', 'review_user_login', 'label', 'pr_created_at']].copy(deep=True)

        print("before filter:", df.shape)
        df.drop_duplicates(inplace=True)
        print("after filter:", df.shape)
        """对人名字做数字处理"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pr_number')))
        """先尝试所有信息团在一起"""
        df = df[['pr_number', 'pr_title', 'label', 'pr_created_at']].copy(deep=True)
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        textList = []
        for row in df.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

            """初步尝试提取词干效果反而下降了 。。。。"""

            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)
            textList.append(tempList)

        print(len(textList))
        """Build a dictionary from the token lists and count its features"""
        dictionary = corpora.Dictionary(textList)
        print('dictionary:', dictionary)

        feature_cnt = len(dictionary.token2id)
        print("number of dictionary features:", feature_cnt)

        """根据词典建立语料库"""
        corpus = [dictionary.doc2bow(text) for text in textList]
        # print('语料库:', corpus)
        """语料库训练TF-IDF模型"""
        tfidf = models.TfidfModel(corpus)

        """再次遍历数据,形成向量,向量是稀疏矩阵的形式"""
        wordVectors = []
        for i in range(0, df.shape[0]):
            wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

        """对已经有的本文特征向量和标签做训练集和测试集的拆分"""

        trainData_index = df.loc[df['label'] == False].index
        testData_index = df.loc[df['label'] == True].index

        """训练集"""
        train_data = [wordVectors[x] for x in trainData_index]
        """测试集"""
        test_data = [wordVectors[x] for x in testData_index]
        """填充为向量"""
        train_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
        test_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)
        train_data['pr_number'] = list(df.loc[df['label'] == False]['pr_number'])
        test_data['pr_number'] = list(df.loc[df['label'] == True]['pr_number'])
        train_data['pr_created_at'] = list(df.loc[df['label'] == False]['pr_created_at'])
        test_data['pr_created_at'] = list(df.loc[df['label'] == True]['pr_created_at'])

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """

        train_data_y = {}
        for pull_number in df.loc[df['label'] == False]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            train_data_y[pull_number] = reviewers

        test_data_y = {}
        for pull_number in df.loc[df['label'] == True]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        """train_data ,test_data 最后一列是pr number test_data_y 的形式是dict"""
        return train_data, train_data_y, test_data, test_data_y, convertDict
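
The dictionary -> doc2bow -> TfidfModel chain used above, isolated as a self-contained sketch; the toy token lists are hypothetical:

    # Minimal gensim TF-IDF round trip (toy data, not from the source)
    from gensim import corpora, models

    texts = [['fix', 'crash', 'parser'], ['add', 'parser', 'test']]
    dictionary = corpora.Dictionary(texts)            # token -> integer id
    corpus = [dictionary.doc2bow(t) for t in texts]   # sparse (id, count) pairs per document
    tfidf = models.TfidfModel(corpus)                 # fit IDF weights on the corpus
    print(dict(tfidf[dictionary.doc2bow(['parser', 'crash'])]))  # sparse TF-IDF vector as a dict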
Example #3
    def preProcess(df, dates):
        """Parameter description
                    df: the dataframe object that was read in
                    dates: 4-tuple; the last two entries are the test year and month (,,year,month)
                   """

        """Note: the input file already carries column names"""

        """Reviews with empty comments carry NaN, but they are useful as part of the result set, so drop NaN from the training set only"""
        # """Handle NaN values"""
        # df.dropna(how='any', inplace=True)
        # df.reset_index(drop=True, inplace=True)
        df['pr_title'].fillna(value='', inplace=True)
        df['pr_body'].fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))
        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['pr_author', 'reviewer'])

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        """问题:lsi的过程不能在整个数据集上面做,不然会导致pr的文本重复问题"""
        df_pr = df.copy(deep=True)
        df_pr.drop_duplicates(subset=['pull_number'], keep='first', inplace=True)
        df_pr.reset_index(drop=True, inplace=True)

        # record each pr's word count; prs with fewer than 10 words are simply dropped
        df_pr_word_count = []

        textList = []
        for row in df_pr.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

            """初步尝试提取词干效果反而下降了 。。。。"""

            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)

            """pull request的body"""
            pr_body = getattr(row, 'pr_body')
            pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
            """对单词做提取词干"""
            pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
            tempList.extend(pr_body_word_list)
            if len(tempList) >= 10 or getattr(row, 'label'):
                textList.append(tempList)
            if getattr(row, 'label'):
                df_pr_word_count.append(10)  # so test rows survive the filter below
            else:
                df_pr_word_count.append(len(tempList))

        """去除无用的训练pr"""
        df_pr['count'] = df_pr_word_count
        df_pr = df_pr.loc[df_pr['count'] >= 10].copy(deep=True)
        df_pr.reset_index(drop=True, inplace=True)
        df_pr.drop(['count'], inplace=True, axis=1)

        """保存只有pr的列表"""
        prList = list(df_pr['pull_number'])

        """对已经有的本文特征向量和标签做训练集和测试集的拆分"""
        trainData_index = df_pr.loc[df_pr['label'] == False].index
        testData_index = df_pr.loc[df_pr['label'] == True].index

        trainDataTextList = [textList[x] for x in trainData_index]
        testDataTextList = [textList[x] for x in testData_index]

        print(len(textList))
        """Build a dictionary from the token lists and count its features"""
        dictionary = corpora.Dictionary(trainDataTextList)
        print('dictionary:', dictionary)

        """感觉有问题,tfidf模型不应该是在全数据集上面计算,而是在训练集上面计算,而测试集的向量就是
        单纯的带入模型的计算结果"""

        """根据词典建立语料库"""
        corpus = [dictionary.doc2bow(text) for text in trainDataTextList]
        # print('语料库:', corpus)
        """语料库训练TF-IDF模型"""
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]

        topic_num = 10
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_num)
        topic_list = lsi.print_topics()
        print("{0}个主题的单词分布为:\n".format(topic_num))
        for topic in topic_list:
            print(topic)

        """再次遍历数据,形成向量,向量是稀疏矩阵的形式"""
        wordVectors = []
        for i in range(0, trainDataTextList.__len__()):
            wordVectors.append(dict(lsi[dictionary.doc2bow(trainDataTextList[i])]))
        for i in range(0, testDataTextList.__len__()):
            wordVectors.append(dict(lsi[dictionary.doc2bow(testDataTextList[i])]))

        """训练集"""
        train_data = [wordVectors[x] for x in trainData_index]
        """测试集"""
        test_data = [wordVectors[x] for x in testData_index]
        """填充为向量"""
        train_v_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=topic_num)
        test_v_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=topic_num)

        lsi_data = pandas.concat([train_v_data, test_v_data], axis=0)  # concatenate along axis 0
        lsi_data['pull_number'] = prList
        lsi_data.reset_index(inplace=True, drop=True)

        train_data = df.loc[df['label'] == False]
        train_data.reset_index(drop=True, inplace=True)
        test_data = df.loc[df['label'] == True]
        test_data.reset_index(drop=True, inplace=True)

        train_data = train_data.merge(lsi_data, on="pull_number")
        train_data.drop(columns=['label'], inplace=True)

        test_data = test_data.merge(lsi_data, on="pull_number")
        test_data.drop(columns=['label'], inplace=True)

        """8ii处理NAN"""
        train_data.dropna(how='any', inplace=True)
        train_data.reset_index(drop=True, inplace=True)
        train_data.fillna(value='', inplace=True)

        """先对tag做拆分"""
        trainDict = dict(list(train_data.groupby('pull_number')))
        testDict = dict(list(test_data.groupby('pull_number')))

        test_data_y = {}
        for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(testDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
            test_data_y[pull_number] = reviewers

        train_data_y = {}
        for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(trainDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
            train_data_y[pull_number] = reviewers

        return train_data, train_data_y, test_data, test_data_y, convertDict
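
As the comment in the vector-building loop notes, the LSI model above is trained on TF-IDF-weighted vectors but queried with raw bag-of-words counts. A sketch of the consistent variant, reusing the dictionary, tfidf, and lsi objects from the example:

    # Hedged sketch: apply the same TF-IDF weighting at query time before projecting
    def topic_vector(tokens, dictionary, tfidf, lsi):
        bow = dictionary.doc2bow(tokens)
        return dict(lsi[tfidf[bow]])  # weight first, then project into topic space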
Example #4
    def preProcess(df, date, project, isSTD=False, isNOR=False):
        """Parameter description
        df: the dataframe object that was read in
        date: 4-tuple; the last two entries are the test year and month (year, month)
        project: the project name
        isSTD: whether to standardize the data
        isNOR: whether to normalize the data
        """
        print("start df shape:", df.shape)
        """过滤NA的数据"""
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
        df.reset_index(drop=True, inplace=True)

        """对人名字做数字处理"""
        """频率不过的评审者在编号之前就已经过滤了,不用考虑分类不连续的情况"""
        """这里reviewer_user_login 放在 第一个否则会影响candicateNum这个变量在后面的引用"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        """先对输入数据做精简 只留下感兴趣的数据"""
        df = df[['pr_number', 'pr_title', 'pr_body', 'review_user_login', 'label']].copy(deep=True)

        print("before filter:", df.shape)
        df.drop_duplicates(inplace=True)
        print("after filter:", df.shape)


        """先提前统计正确答案"""
        tagDict = dict(list(df.groupby('pr_number')))

        train_data = df.loc[df['label'] == 0].copy(deep=True)
        test_data = df.loc[df['label'] == 1].copy(deep=True)

        """问题转化为多标签问题
            train_data_y   [{pull_number:[r1, r2, ...]}, ... ,{}]
        """
        train_data_y = {}
        pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            train_data_y[pull_number] = reviewers

        train_data.drop(columns=['review_user_login'], inplace=True)
        train_data.drop_duplicates(inplace=True)
        train_data.drop_duplicates(subset=['pr_number'], inplace=True)
        """训练集 结果做出多标签分类通用的模式"""
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        test_data_y = {}
        pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        for pull_number in pull_number_list:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            test_data_y[pull_number] = reviewers

        test_data.drop(columns=['review_user_login'], inplace=True)
        test_data.drop_duplicates(inplace=True)
        """pr_number  经过去重"""
        test_data.drop_duplicates(subset=['pr_number'], inplace=True)
        # test_data_y = DataProcessUtils.convertLabelListToDataFrame(test_data_y, pull_number_list, candicateNum)
        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        """获得pr list"""
        prList = list(test_data['pr_number'])

        """先尝试所有信息团在一起"""
        df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True)
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        textList = []
        for row in df.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

            """初步尝试提取词干效果反而下降了 。。。。"""

            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)

            """pull request的body"""
            pr_body = getattr(row, 'pr_body')
            pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
            """对单词做提取词干"""
            pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
            tempList.extend(pr_body_word_list)
            textList.append(tempList)

        print(len(textList))
        """Build a dictionary from the token lists and count its features"""
        dictionary = corpora.Dictionary(textList)
        print('dictionary:', dictionary)

        feature_cnt = len(dictionary.token2id)
        print("number of dictionary features:", feature_cnt)

        """根据词典建立语料库"""
        corpus = [dictionary.doc2bow(text) for text in textList]
        # print('语料库:', corpus)
        """语料库训练TF-IDF模型"""
        tfidf = models.TfidfModel(corpus)

        """再次遍历数据,形成向量,向量是稀疏矩阵的形式"""
        wordVectors = []
        for i in range(0, df.shape[0]):
            wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

        """对已经有的本文特征向量和标签做训练集和测试集的拆分"""

        trainData_index = df.loc[df['label'] == False].index
        testData_index = df.loc[df['label'] == True].index

        """训练集"""
        train_data = [wordVectors[x] for x in trainData_index]
        """测试集"""
        test_data = [wordVectors[x] for x in testData_index]
        """填充为向量"""
        train_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
        test_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)

        """参数规范化"""
        if isSTD:
            stdsc = StandardScaler()
            train_data_std = stdsc.fit_transform(train_data)
            test_data_std = stdsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        elif isNOR:
            maxminsc = MinMaxScaler()
            train_data_std = maxminsc.fit_transform(train_data)
            test_data_std = maxminsc.transform(test_data)
            return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList
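
The isSTD branch above fits the scaler on the training rows only and reuses those statistics for the test rows, which keeps test information out of the fit. The same pattern on toy data:

    # Fit on train, transform test with the training statistics (toy data)
    import numpy
    from sklearn.preprocessing import StandardScaler

    X_train = numpy.array([[0.0, 10.0], [2.0, 14.0], [4.0, 18.0]])
    X_test = numpy.array([[1.0, 12.0]])
    stdsc = StandardScaler()
    X_train_std = stdsc.fit_transform(X_train)  # learns mean/std from the training rows
    X_test_std = stdsc.transform(X_test)        # reuses the training mean/std unchanged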
Example #5
    def preProcess(df, dates):
        """Parameter description
                    df: the dataframe object that was read in
                    dates: 4-tuple; the last two entries are the test year and month (,,year,month)
                   """

        """Note: the input file already carries column names"""

        """Reviews with empty comments carry NaN, but they are useful as part of the result set, so drop NaN from the training set only"""
        # """Handle NaN values"""
        # df.dropna(how='any', inplace=True)
        # df.reset_index(drop=True, inplace=True)
        df['pr_title'].fillna(value='', inplace=True)
        df['pr_body'].fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))
        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['pr_author', 'reviewer'])

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        textList = []
        for row in df.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

            """初步尝试提取词干效果反而下降了 。。。。"""

            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)

            """pull request的body"""
            pr_body = getattr(row, 'pr_body')
            pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
            """对单词做提取词干"""
            pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
            tempList.extend(pr_body_word_list)
            textList.append(tempList)

        print(len(textList))
        """Build a dictionary from the token lists and count its features"""
        dictionary = corpora.Dictionary(textList)
        print('dictionary:', dictionary)

        feature_cnt = len(dictionary.token2id)
        print("number of dictionary features:", feature_cnt)

        """根据词典建立语料库"""
        corpus = [dictionary.doc2bow(text) for text in textList]
        # print('语料库:', corpus)
        """语料库训练TF-IDF模型"""
        tfidf = models.TfidfModel(corpus)

        """再次遍历数据,形成向量,向量是稀疏矩阵的形式"""
        wordVectors = []
        for i in range(0, df.shape[0]):
            wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

        """对已经有的本文特征向量和标签做训练集和测试集的拆分"""
        trainData_index = df.loc[df['label'] == False].index
        testData_index = df.loc[df['label'] == True].index

        """训练集"""
        train_data = [wordVectors[x] for x in trainData_index]
        """测试集"""
        test_data = [wordVectors[x] for x in testData_index]
        """填充为向量"""
        train_v_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
        test_v_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)

        train_data = df.loc[df['label'] == False]
        train_data.reset_index(drop=True, inplace=True)
        test_data = df.loc[df['label'] == True]
        test_data.reset_index(drop=True, inplace=True)

        train_data = train_data.join(train_v_data)
        train_data.drop(columns=['label'], inplace=True)

        test_data = test_data.join(test_v_data)
        test_data.drop(columns=['label'], inplace=True)

        """8ii处理NAN"""
        train_data.dropna(how='any', inplace=True)
        train_data.reset_index(drop=True, inplace=True)
        train_data.fillna(value='', inplace=True)

        """先对tag做拆分"""
        trainDict = dict(list(train_data.groupby('pull_number')))
        testDict = dict(list(test_data.groupby('pull_number')))

        """过滤掉评论时间在数据集时间范围内之后的数据"""
        end_time = str(dates[2]) + "-" + str(dates[3]) + "-" + "01 00:00:00"
        train_data = train_data[train_data['commented_at'] < end_time]
        train_data.reset_index(drop=True, inplace=True)

        test_data_y = {}
        for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(testDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
            test_data_y[pull_number] = reviewers

        train_data_y = {}
        for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']:
            reviewers = list(trainDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
            train_data_y[pull_number] = reviewers

        return train_data, train_data_y, test_data, test_data_y, convertDict
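
The cutoff filter above compares timestamp strings, which only stays chronological when every field is zero-padded. A sketch of a variant that compares parsed datetimes instead, reusing the dates tuple and the train_data frame from the example:

    # Hedged alternative: compare parsed datetimes rather than strings
    import pandas

    end_time = pandas.Timestamp(year=dates[2], month=dates[3], day=1)
    mask = pandas.to_datetime(train_data['commented_at']) < end_time
    train_data = train_data[mask].reset_index(drop=True)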
Example #6
def appendTextualFeatureVector(inputDf, projectName, date, pull_number_name):
    """
       Compute TF-IDF vectors from the titles and bodies of the prs.
       The pr information is fetched directly from the pull-request data file.
       @description: append the TF-IDF feature vectors built from the pr text to the given dataframe
       @notice: the dataframe must contain a pull-number column; duplicates are allowed
       @param inputDf: the dataframe read in beforehand
       @param projectName: the target project name
       @param date: 4-tuple of start year, start month, end year, end month
       @param pull_number_name: the name of the pull-number column in inputDf
       @return: df: the dataframe with the appended feature columns, ready for ML algorithms
    """

    """对输入df做label存在检测"""
    if 'label' not in inputDf.columns:
        raise Exception("label not in input dataframe!")

    print("input shape:", inputDf.shape)
    print(date)

    df = inputDf[[pull_number_name]].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.columns = ['pr_number']

    """读取pullrequestData 文件"""
    pull_request_path = projectConfig.getPullRequestPath()

    pullRequestData = pandasHelper.readTSVFile(
        os.path.join(pull_request_path, f'ALL_{projectName}_data_pullrequest.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )
    """pull_number和pr review commit relation做拼接"""
    df = pandas.merge(df, pullRequestData, left_on='pr_number', right_on='number')
    df = df[['pr_number', 'title', 'body']].copy(deep=True)
    df.columns = ['pr_number', 'pr_title', 'pr_body']
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """用于收集所有文本向量分词"""
    stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

    textList = []
    for row in df.itertuples(index=False, name='Pandas'):
        tempList = []
        """获取pull request的标题"""
        pr_title = row[list(df.columns).index('pr_title')]
        pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

        """初步尝试提取词干效果反而下降了 。。。。"""

        """对单词做提取词干"""
        pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
        tempList.extend(pr_title_word_list)

        """pull request的body"""
        pr_body = row[list(df.columns).index('pr_body')]
        pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
        """对单词做提取词干"""
        pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
        tempList.extend(pr_body_word_list)
        textList.append(tempList)

    print(len(textList))
    """Build a dictionary from the token lists and count its features"""
    dictionary = corpora.Dictionary(textList)
    print('dictionary:', dictionary)

    feature_cnt = len(dictionary.token2id)
    print("number of dictionary features:", feature_cnt)

    """根据词典建立语料库"""
    corpus = [dictionary.doc2bow(text) for text in textList]
    # print('语料库:', corpus)
    """语料库训练TF-IDF模型"""
    tfidf = models.TfidfModel(corpus)

    """再次遍历数据,形成向量,向量是稀疏矩阵的形式"""
    wordVectors = []
    for i in range(0, df.shape[0]):
        wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

    """填充为向量"""
    wordVectors = DataProcessUtils.convertFeatureDictToDataFrame(wordVectors, featureNum=feature_cnt)

    """PCA 做缩减之前需要把pr_path_weight_df 做分割 训练集和测试集分别处理"""
    tempData = wordVectors.copy(deep=True)
    tempData['pr_number'] = df['pr_number']
    labelData = inputDf[['pr_number', 'label']].drop_duplicates().copy(deep=True)
    tempData = pandas.merge(tempData, labelData, on='pr_number')

    tempData_train = tempData.loc[tempData['label'] == 0].copy(deep=True)
    tempData_test = tempData.loc[tempData['label'] == 1].copy(deep=True)

    tempData_train.drop(columns=['pr_number', 'label'], inplace=True)
    tempData_test.drop(columns=['pr_number', 'label'], inplace=True)


    """PAC 做缩减"""
    pca = PCA(n_components=0.95)
    tempData_train = pca.fit_transform(tempData_train)
    print("after pca :", tempData_train.shape)
    print(pca.explained_variance_ratio_)
    tempData_train = pandas.DataFrame(tempData_train)

    tempData_test = pca.transform(tempData_test)
    print("after pca :", tempData_train.shape)
    tempData_test = pandas.DataFrame(tempData_test)

    tempData = pandas.concat([tempData_train, tempData_test], axis=0)
    tempData.reset_index(drop=True, inplace=True)
    tempData['pr_number_t'] = df['pr_number'].copy(deep=True)  # assumes every training pr precedes every test pr in df, so the concat order lines up

    """和原来特征做拼接"""
    inputDf = pandas.merge(inputDf, tempData, left_on=pull_number_name, right_on='pr_number_t')
    inputDf.drop(columns=['pr_number_t'], inplace=True)
    return inputDf
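
A hypothetical call to appendTextualFeatureVector; the project name, date tuple, and column name below are assumptions, and inputDf must already carry a label column plus a pull-number column:

    # Hypothetical usage sketch (values are assumptions, not from the source)
    date = (2019, 1, 2019, 10)  # start year/month, end year/month
    enrichedDf = appendTextualFeatureVector(inputDf, 'rails', date, 'pull_number')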
Example #7
    def preProcess(df, dates):
        """Parameter description
            df: the dataframe object that was read in
            dates: 4-tuple; the last two entries are the test year and month (,,year,month)
           """

        """Note: the input file already carries column names"""

        t1 = datetime.now()

        """处理NAN"""
        df.dropna(how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.fillna(value='', inplace=True)

        """对df添加一列标识训练集和测试集"""
        df['label'] = df['pr_created_at'].apply(
            lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                       time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))
        """对reviewer名字数字化处理 存储人名映射字典做返回"""
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        """先对tag做拆分"""
        tagDict = dict(list(df.groupby('pr_number')))

        commentDf = df[['pr_number', 'review_user_login', 'comment_body', 'label']].copy(deep=True)

        """用于收集所有文本向量分词"""
        stopwords = SplitWordHelper().getEnglishStopList()  # 获取通用英语停用词

        """先尝试所有信息团在一起"""
        df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True)
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)

        """训练和测试做分割"""
        df_train = df.loc[df['label'] == 0].copy(deep=True)
        df_test = df.loc[df['label'] == 1].copy(deep=True)
        df_test.reset_index(drop=True, inplace=True)

        """收集训练集中的pr的文本作为 文档做LDA提取主题"""
        trainTextList = []
        testTextList = []
        for row in df.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的number"""
            pr_num = getattr(row, 'pr_number')
            label = getattr(row, 'label')

            """获取pull request的标题"""
            pr_title = getattr(row, 'pr_title')
            pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]
            """对单词做提取词干"""
            pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
            tempList.extend(pr_title_word_list)

            """pull request的body"""
            pr_body = getattr(row, 'pr_body')
            pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
            """对单词做提取词干"""
            pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
            tempList.extend(pr_body_word_list)

            if label == 0:
                trainTextList.append(tempList)
            elif label == 1:
                testTextList.append(tempList)

        """收集 训练集中的comment"""
        trainCommentList = []
        review_comment_map = {}  # pr -> [(reviewer, [w1, w2, w3]), .....]
        for row in commentDf.itertuples(index=False, name='Pandas'):
            tempList = []
            """获取pull request的number"""
            pr_num = getattr(row, 'pr_number')
            label = getattr(row, 'label')
            reviewer = getattr(row, 'review_user_login')

            """获取pull request的标题"""
            comment_body = getattr(row, 'comment_body')
            comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment_body) if x not in stopwords]
            """对单词做提取词干"""
            comment_body_word_list = nltkFunction.stemList(comment_body_word_list)
            tempList.extend(comment_body_word_list)

            if review_comment_map.get(pr_num, None) is None:
                review_comment_map[pr_num] = []

            if label == 0:
                review_comment_map[pr_num].append((reviewer, tempList.copy()))
                trainCommentList.append(tempList)

        """建立LDA模型提取数据"""
        # 接下来就是模型构建的步骤了,首先构建词频矩阵
        allTextList = []
        allTextList.extend(trainTextList)
        allTextList.extend(trainCommentList)
        dictionary = corpora.Dictionary(trainTextList)
        corpus = [dictionary.doc2bow(text) for text in trainTextList]
        lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
        topic_list = lda.print_topics(20)
        print("20个主题的单词分布为:\n")
        for topic in topic_list:
            print(topic)

        """建立训练集和测试集所需的主题分布
           pr_num -> {[(t1, p1), (t2, p2), .....]}
        """
        train_data = {}
        test_data = {}
        for index, d in enumerate(lda.get_document_topics([dictionary.doc2bow(text) for text in trainTextList])):
            train_data[df_train['pr_number'][index]] = d
        for index, d in enumerate(lda.get_document_topics([dictionary.doc2bow(text) for text in testTextList])):
            test_data[df_test['pr_number'][index]] = d

        train_data_y = {}  # pr -> [(reviewer, [(comment1), (comment2) ...])]
        for pull_number in df.loc[df['label'] == False]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            reviewerList = []
            for reviewer in reviewers:
                commentTopicList = []
                for r, words in review_comment_map[pull_number]:
                    if r == reviewer:
                        commentTopicList.append(words)
                commentTopicList = lda.get_document_topics([dictionary.doc2bow(text) for text in commentTopicList])
                reviewerList.append((reviewer, [x for x in commentTopicList]))
            train_data_y[pull_number] = reviewerList

        test_data_y = {}
        for pull_number in df.loc[df['label'] == True]['pr_number']:
            reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            reviewerList = []
            for reviewer in reviewers:
                commentTopicList = []
                for r, words in review_comment_map[pull_number]:
                    if r == reviewer:
                        commentTopicList.append(words)
                commentTopicList = lda.get_document_topics([dictionary.doc2bow(text) for text in commentTopicList])
                reviewerList.append((reviewer, commentTopicList))
            test_data_y[pull_number] = reviewerList

        print("preprocess cost time:", datetime.now() - t1)
        return train_data, train_data_y, test_data, test_data_y, convertDict
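
A self-contained sketch of the gensim LDA round trip this example leans on; the toy corpus is hypothetical:

    # Train LDA on toy token lists, then infer the topic mix of an unseen document
    from gensim import corpora, models

    texts = [['fix', 'crash', 'parser'], ['add', 'parser', 'test'], ['update', 'docs']]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]
    lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
    print(lda.get_document_topics(dictionary.doc2bow(['parser', 'fix'])))  # [(topic_id, probability), ...]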