def preProcess(df, date, project, isSTD=False, isNOR=False):
    """Parameter description
     df: the dataframe that was read in
     date: (train_year, train_month, test_year, test_month); the last two mark the test month
     isSTD: whether to standardize the data
     isNOR: whether to normalize the data
    """
    print("start df shape:", df.shape)
    """Filter rows containing NA"""
    df.dropna(axis=0, how='any', inplace=True)
    print("after filter na:", df.shape)

    """Add a column to df that marks training-set vs. test-set rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
    df.reset_index(drop=True, inplace=True)

    """Convert user names to numbers"""
    """Reviewers with too few occurrences were already filtered out before numbering,
       so discontinuous class ids need not be considered here"""
    """review_user_login must come first, otherwise the later use of candicateNum is affected"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'author_user_login'])
    recoverDict = {v: k for k, v in convertDict.items()}
    print(df.shape)
    candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
    print("candidate Num:", candicateNum)

    """Trim the input data first, keeping only the columns of interest"""
    df = df[['pr_number', 'review_user_login', 'author_user_login', 'author_association',
             'commits', 'deletions', 'additions', 'changed_files', 'label', 'merged']].copy(deep=True)
    print("before filter:", df.shape)
    df.drop_duplicates(inplace=True)
    print("after filter:", df.shape)

    """Compute the author's relation to the project"""
    df['author_association'] = df['author_association'].apply(lambda x: x == 'MEMBER')

    """Compute cumulative historical statistics"""
    request_number_prs = []  # number of PRs the author submitted before
    request_number_merged_prs = []  # number of the author's PRs that were accepted
    request_number_rejected_prs = []  # number of the author's PRs that were rejected
    request_accept_rate = []  # probability that the author's PRs get accepted
    request_reject_rate = []  # probability that the author's PRs get rejected
    for row in df.itertuples():
        pr_num = getattr(row, 'pr_number')
        author = getattr(row, 'author_user_login')
        """Filter to historical PRs"""
        temp_df = df.loc[(df['pr_number'] < pr_num) & (df['author_user_login'] == author)]
        request_number_prs.append(temp_df.shape[0])
        accept_times = temp_df.loc[temp_df['merged'] == 1].shape[0]
        request_number_merged_prs.append(accept_times)
        request_number_rejected_prs.append(temp_df.shape[0] - accept_times)
        if temp_df.shape[0] > 0:
            request_accept_rate.append(accept_times / temp_df.shape[0])
            request_reject_rate.append(1 - accept_times / temp_df.shape[0])
        else:
            request_accept_rate.append(0)
            request_reject_rate.append(0)
    df['request_number_prs'] = request_number_prs
    df['request_number_merged_prs'] = request_number_merged_prs
    df['request_number_rejected_prs'] = request_number_rejected_prs
    df['request_accept_rate'] = request_accept_rate
    df['request_reject_rate'] = request_reject_rate

    """Add whether the author watches the project"""
    user_watch_repo_relation_path = projectConfig.getUserWatchRepoRelation()
    userWatchRepoRelation = pandasHelper.readTSVFile(
        os.path.join(user_watch_repo_relation_path, 'userWatchRepoRelation.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )
    watchRepoMap = {}
    for k in convertDict.keys():
        """Fetch the list of repositories this user watches"""
        following_list = list(set(userWatchRepoRelation.loc[userWatchRepoRelation['login'] == k]['repo_full_name']))
        isFollow = False
        for repo in following_list:
            owner, name = repo.split('/')
            if name == project:
                isFollow = True
        watchRepoMap[convertDict[k]] = isFollow
    request_watches = []
    for row in df.itertuples():
        author = getattr(row, 'author_user_login')
        request_watches.append(watchRepoMap[author])
    df['request_watches'] = request_watches

    """Add the author's follower count, following count, and whether the author follows a core member"""
    user_follow_relation_path = projectConfig.getUserFollowRelation()
    userFollowRelation = pandasHelper.readTSVFile(
        os.path.join(user_follow_relation_path, 'userFollowRelation.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )
    followMap = {}
    followerCountMap = {}
    followingCountMap = {}
    followCoreMemberMap = {}

    """Collect the list of core members"""
    coreMemberList = list(set(df.loc[df['author_association'] == 1]['author_user_login']))

    for k in convertDict.keys():
        """Fetch this user's following list"""
        following_list = list(set(userFollowRelation.loc[userFollowRelation['login'] == k]['following_login']))
        followingCountMap[convertDict[k]] = len(following_list)
        isFollowCoreMember = False
        for f in following_list:
            if f in convertDict.keys():
                followMap[(convertDict[k], convertDict[f])] = 1
            if f in coreMemberList:
                isFollowCoreMember = True
        followCoreMemberMap[convertDict[k]] = isFollowCoreMember

        follower_list = list(set(userFollowRelation.loc[userFollowRelation['following_login'] == k]['login']))
        followerCountMap[convertDict[k]] = len(follower_list)
        # for f in follower_list:
        #     if f in convertDict.keys():
        #         followMap[(convertDict[f], convertDict[k])] = 1

    request_number_follows = []
    request_number_following = []
    request_follow_ct = []
    for row in df.itertuples():
        author = getattr(row, 'author_user_login')
        request_number_following.append(followingCountMap[author])
        request_number_follows.append(followerCountMap[author])
        request_follow_ct.append(followCoreMemberMap[author])
    df['request_number_following'] = request_number_following
    df['request_number_follows'] = request_number_follows
    df['request_follow_ct'] = request_follow_ct

    """Collect the ground-truth answers ahead of time"""
    tagDict = dict(list(df.groupby('pr_number')))

    train_data = df.loc[df['label'] == 0].copy(deep=True)
    test_data = df.loc[df['label'] == 1].copy(deep=True)

    """Recast the problem as multi-label:
       train_data_y  [{pull_number: [r1, r2, ...]}, ..., {}]
    """
    train_data_y = {}
    pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in pull_number_list:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    train_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
    train_data.drop_duplicates(inplace=True)
    train_data.drop_duplicates(subset=['pr_number'], inplace=True)
    train_data.drop(columns=['pr_number'], inplace=True)
    """Cast the training labels into the common multi-label classification format"""
    train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

    test_data_y = {}
    pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    test_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
    test_data.drop_duplicates(subset=['pr_number'], inplace=True)
    """Collect the pr list"""
    prList = list(test_data['pr_number'])
    test_data.drop(columns=['pr_number'], inplace=True)
    test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

    """Feature scaling"""
    if isSTD:
        stdsc = StandardScaler()
        train_data_std = stdsc.fit_transform(train_data)
        test_data_std = stdsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    elif isNOR:
        maxminsc = MinMaxScaler()
        train_data_std = maxminsc.fit_transform(train_data)
        test_data_std = maxminsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    else:
        return train_data, train_data_y, test_data, test_data_y, convertDict, prList
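
# --- Usage sketch (illustrative, not from the original file) ---
# A minimal call of the numeric-feature preProcess above. The TSV path, the
# project name, and the date tuple are assumptions made for illustration;
# date is read as (train_year, train_month, test_year, test_month), matching
# how date[2] and date[3] select the test month inside preProcess.
def _demoNumericPreProcess():
    df = pandasHelper.readTSVFile('data/opencv_pr_review.tsv',  # hypothetical path
                                  pandasHelper.INT_READ_FILE_WITH_HEAD,
                                  low_memory=False)
    train_x, train_y, test_x, test_y, convertDict, prList = preProcess(
        df, (2019, 1, 2019, 4), project='opencv', isSTD=True)
    recoverDict = {v: k for k, v in convertDict.items()}  # map ids back to login names
    print(train_x.shape, test_x.shape, len(prList))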
def preProcess(df, date, project, isSTD=False, isNOR=False):
    """Parameter description
     df: the dataframe that was read in
     date: (train_year, train_month, test_year, test_month); the last two mark the test month
     isSTD: whether to standardize the data
     isNOR: whether to normalize the data
    """
    print("start df shape:", df.shape)
    """Filter rows containing NA"""
    df.dropna(axis=0, how='any', inplace=True)
    print("after filter na:", df.shape)

    """Add a column to df that marks training-set vs. test-set rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
    df.reset_index(drop=True, inplace=True)

    """Convert user names to numbers"""
    """Reviewers with too few occurrences were already filtered out before numbering,
       so discontinuous class ids need not be considered here"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
    print(df.shape)
    candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
    print("candidate Num:", candicateNum)

    """Trim the input data first, keeping only the columns of interest"""
    df = df[['pr_number', 'pr_title', 'pr_body', 'review_user_login', 'label']].copy(deep=True)
    print("before filter:", df.shape)
    df.drop_duplicates(inplace=True)
    print("after filter:", df.shape)

    """Collect the ground-truth answers ahead of time"""
    tagDict = dict(list(df.groupby('pr_number')))

    train_data = df.loc[df['label'] == 0].copy(deep=True)
    test_data = df.loc[df['label'] == 1].copy(deep=True)

    """Recast the problem as multi-label:
       train_data_y  [{pull_number: [r1, r2, ...]}, ..., {}]
    """
    train_data_y = {}
    pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in pull_number_list:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    train_data.drop(columns=['review_user_login'], inplace=True)
    train_data.drop_duplicates(inplace=True)
    train_data.drop_duplicates(subset=['pr_number'], inplace=True)
    """Cast the training labels into the common multi-label classification format"""
    train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

    test_data_y = {}
    pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    test_data.drop(columns=['review_user_login'], inplace=True)
    test_data.drop_duplicates(inplace=True)
    """Deduplicate by pr_number"""
    test_data.drop_duplicates(subset=['pr_number'], inplace=True)
    # test_data_y = DataProcessUtils.convertLabelListToDataFrame(test_data_y, pull_number_list, candicateNum)
    test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

    """Collect the pr list"""
    prList = list(test_data['pr_number'])

    """First try lumping all the text together"""
    df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    """Collect the tokenized words of every document"""
    stopwords = SplitWordHelper().getEnglishStopList()  # common English stop words
    textList = []
    for row in df.itertuples(index=False, name='Pandas'):
        tempList = []
        """Pull request title"""
        pr_title = getattr(row, 'pr_title')
        pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

        """Stem each word"""
        """(An early experiment showed stemming actually made the results worse ...)"""
        pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
        tempList.extend(pr_title_word_list)

        """Pull request body"""
        pr_body = getattr(row, 'pr_body')
        pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
        """Stem each word"""
        pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
        tempList.extend(pr_body_word_list)
        textList.append(tempList)

    print(len(textList))

    """Build a dictionary over the token lists and count the features"""
    dictionary = corpora.Dictionary(textList)
    print('dictionary:', dictionary)
    feature_cnt = len(dictionary.token2id)
    print("dictionary feature count:", feature_cnt)

    """Build the corpus from the dictionary"""
    corpus = [dictionary.doc2bow(text) for text in textList]
    # print('corpus:', corpus)
    """Train the TF-IDF model on the corpus"""
    tfidf = models.TfidfModel(corpus)

    """Walk the data again to build the vectors, kept in sparse form"""
    wordVectors = []
    for i in range(0, df.shape[0]):
        wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

    """Split the text feature vectors and labels into training and test sets"""
    trainData_index = df.loc[df['label'] == False].index
    testData_index = df.loc[df['label'] == True].index

    """Training set"""
    train_data = [wordVectors[x] for x in trainData_index]
    """Test set"""
    test_data = [wordVectors[x] for x in testData_index]

    """Pad the sparse dicts into dense vectors"""
    train_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
    test_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)

    """Feature scaling"""
    if isSTD:
        stdsc = StandardScaler()
        train_data_std = stdsc.fit_transform(train_data)
        test_data_std = stdsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    elif isNOR:
        maxminsc = MinMaxScaler()
        train_data_std = maxminsc.fit_transform(train_data)
        test_data_std = maxminsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    else:
        return train_data, train_data_y, test_data, test_data_y, convertDict, prList
def preProcess(df, date, project, featureType, isSTD=False, isNOR=False):
    """Parameter description
     df: the dataframe that was read in
     date: (train_year, train_month, test_year, test_month); the last two mark the test month
     featureType: which appended features to use (1: file path, 2: title/body text, 3: both)
     isSTD: whether to standardize the data
     isNOR: whether to normalize the data
    """
    print("start df shape:", df.shape)
    """Filter rows containing NA"""
    df.dropna(axis=0, how='any', inplace=True)
    print("after filter na:", df.shape)

    """Add a column to df that marks training-set vs. test-set rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
    df.sort_values(by='pr_number', ascending=True, inplace=True)
    df.reset_index(drop=True, inplace=True)

    # """Append file path features to the existing features"""
    """Correction: PCA must not reduce the training and test sets together,
       otherwise information from the future leaks in, so the two have to be
       processed separately before appending. 4.13: the append functions must
       be called after the label column has been marked."""
    if featureType == 1 or featureType == 3:
        df = appendFilePathFeatureVector(df, project, date, 'pr_number')
    """Append pr title and body text features to the existing features"""
    if featureType == 2 or featureType == 3:
        df = appendTextualFeatureVector(df, project, date, 'pr_number')

    # """Count each reviewer's frequency and drop reviewers with too few reviews"""
    # freq = {}
    # for data in df.itertuples(index=False):
    #     name = data[list(df.columns).index('review_user_login')]
    #     if freq.get(name, None) is None:
    #         freq[name] = 0
    #     """Add one for training-set rows; test-set rows are kept regardless"""
    #     if not data[list(df.columns).index('label')]:
    #         freq[name] += 1
    #     else:
    #         freq[name] += 1
    #
    # num = 5
    # df['freq'] = df['review_user_login'].apply(lambda x: freq[x])
    # df = df.loc[df['freq'] > num].copy(deep=True)
    # df.drop(columns=['freq'], inplace=True)
    # df.reset_index(drop=True, inplace=True)
    # print("after filter unexperienced user:", df.shape)

    """Convert user names to numbers"""
    """Reviewers with too few occurrences were already filtered out before numbering,
       so discontinuous class ids need not be considered here"""
    """review_user_login must come first, otherwise the later use of candicateNum is affected"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'pr_user_login'])
    print(df.shape)
    candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
    print("candidate Num:", candicateNum)

    """Process the branch labels: drop base, split head, and digitize"""
    df.drop(axis=1, columns=['pr_base_label'], inplace=True)  # inplace modifies the data directly
    df['pr_head_tail'] = df['pr_head_label']
    df['pr_head_tail'] = df['pr_head_tail'].apply(lambda x: x.split(':')[1])
    df['pr_head_label'] = df['pr_head_label'].apply(lambda x: x.split(':')[0])
    df.drop(axis=1, columns=['pr_head_tail'], inplace=True)
    # MLTrain.changeStringToNumber(df, ['pr_head_tail'])
    DataProcessUtils.changeStringToNumber(df, ['pr_head_label'])

    """Convert times to timestamps"""
    df['pr_created_at'] = df['pr_created_at'].apply(
        lambda x: int(time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S"))))

    """Split out the tags first"""
    tagDict = dict(list(df.groupby('pr_number')))

    """Split the existing feature vectors and labels into training and test sets"""
    train_data = df.loc[df['label'] == False].copy(deep=True)
    test_data = df.loc[df['label']].copy(deep=True)

    train_data.drop(columns=['label'], inplace=True)
    test_data.drop(columns=['label'], inplace=True)

    """Recast the problem as multi-label:
       train_data_y  [{pull_number: [r1, r2, ...]}, ..., {}]
    """
    train_data_y = {}
    pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in pull_number_list:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    train_data.drop(columns=['review_user_login'], inplace=True)
    train_data.drop_duplicates(inplace=True)
    """Cast the training labels into the common multi-label classification format"""
    train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

    test_data_y = {}
    pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    test_data.drop(columns=['review_user_login'], inplace=True)
    test_data.drop_duplicates(inplace=True)
    # test_data_y = DataProcessUtils.convertLabelListToDataFrame(test_data_y, pull_number_list, candicateNum)
    test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

    """Collect the pr list"""
    prList = list(test_data['pr_number'])

    """Drop pr_number"""
    test_data.drop(columns=['pr_number'], inplace=True)
    train_data.drop(columns=['pr_number'], inplace=True)

    """Feature scaling"""
    if isSTD:
        stdsc = StandardScaler()
        train_data_std = stdsc.fit_transform(train_data)
        test_data_std = stdsc.transform(test_data)
        # print(train_data_std)
        # print(test_data_std.shape)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    elif isNOR:
        maxminsc = MinMaxScaler()
        train_data_std = maxminsc.fit_transform(train_data)
        test_data_std = maxminsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    else:
        return train_data, train_data_y, test_data, test_data_y, convertDict, prList
def preProcess(df, date, project, isSTD=False, isNOR=False, m=3):
    """Parameter description
     df: the dataframe that was read in
     date: (train_year, train_month, test_year, test_month); the last two mark the test month
     isSTD: whether to standardize the data
     isNOR: whether to normalize the data
     m: hyperparameter, the window size in months
    """
    print("start df shape:", df.shape)
    """Filter rows containing NA"""
    df.dropna(axis=0, how='any', inplace=True)
    print("after filter na:", df.shape)

    """Add a column to df that marks training-set vs. test-set rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
    df['label_y'] = df['pr_created_at'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year)
    df['label_m'] = df['pr_created_at'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon)
    df.reset_index(drop=True, inplace=True)

    """Correction: PCA must not reduce the training and test sets together,
       otherwise information from the future leaks in, so the two have to be
       processed separately before appending. 4.13: the append functions must
       be called after the label column has been marked."""
    """Add File Path Features"""
    df = appendFilePathFeatureVector(df, project, date, 'pr_number')

    """Read the user follow relation"""
    user_follow_relation_path = projectConfig.getUserFollowRelation()
    userFollowRelation = pandasHelper.readTSVFile(
        os.path.join(user_follow_relation_path, 'userFollowRelation.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )

    def isInTimeGap(x, m, maxYear, maxMonth):
        d = x['label_y'] * 12 + x['label_m']
        d2 = maxYear * 12 + maxMonth
        return d >= d2 - m

    """Convert user names to numbers"""
    """Reviewers with too few occurrences were already filtered out before numbering,
       so discontinuous class ids need not be considered here"""
    """review_user_login must come first, otherwise the later use of candicateNum is affected"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'pr_user_login'])
    print(df.shape)
    candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
    print("candidate Num:", candicateNum)

    """Compute the contributor set"""
    contribute_list = list(set(df.loc[df['label'] == 1]['pr_user_login']))
    reviewer_list = list(set(df.loc[df['label'] == 0]['review_user_login']))

    """Add Relationship Features"""
    """The training set and the test set are handled slightly differently:
       training-set statistics are computed from the preceding PRs, while
       test-set statistics are restricted to the training set."""

    """Convert pr_created_at and comment_at in df to timestamps"""
    df['pr_created_at'] = df['pr_created_at'].apply(
        lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
    df['comment_at'] = df['comment_at'].apply(lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
    df['response_time'] = df['comment_at'] - df['pr_created_at']

    """Prior Evaluation    how many times reviewer cm reviewed co before
       Recent Evaluation   how many times reviewer cm reviewed co in the last m months
       Follow Relation     whether co follows cm
       Follower Relation   whether cm follows co
    """
    startTime = datetime.now()
    prior_evaluation = {}
    recent_evaluation = {}
    follower_relation = {}
    following_relation = {}

    followMap = {}
    for k in convertDict.keys():
        """Fetch this user's following list"""
        follower_list = list(set(userFollowRelation.loc[userFollowRelation['login'] == k]['following_login']))
        for f in follower_list:
            if f in convertDict.keys():
                followMap[(convertDict[k], convertDict[f])] = 1

    for reviewer in reviewer_list:
        prior_evaluation[reviewer] = []
        recent_evaluation[reviewer] = []
        follower_relation[reviewer] = []
        following_relation[reviewer] = []

    cols = list(df.columns)
    for data in df.itertuples(index=False, name='Pandas'):
        """Attribute access works while the row is still a small namedtuple;
           otherwise fall back to positional access via the column index"""
        if len(data) < 14:
            pullNumber = getattr(data, 'pr_number')
            author = getattr(data, 'pr_user_login')
            label = getattr(data, 'label')
            label_m = getattr(data, 'label_m')
            label_y = getattr(data, 'label_y')
        else:
            pullNumber = data[cols.index("pr_number")]
            author = data[cols.index("pr_user_login")]
            label = data[cols.index("label")]
            label_m = data[cols.index("label_m")]
            label_y = data[cols.index("label_y")]

        temp = None
        if label == 0:
            temp = df.loc[df['pr_number'] < pullNumber]
        else:
            temp = df.loc[df['label'] == 0]
        temp = temp.loc[temp['pr_user_login'] == author].copy(deep=True)

        """Tally for every candidate in turn"""
        prior_evaluation_dict = dict(temp['review_user_login'].value_counts())
        for r in reviewer_list:
            prior_evaluation[r].append(prior_evaluation_dict.get(r, 0))

        """Second-pass filter on temp: keep only rows within the last m months"""
        if temp.shape[0] > 0:
            if label == 0:
                temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, label_y, label_m), axis=1)
            else:
                temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, date[2], date[3]), axis=1)
            temp = temp.loc[temp['target'] == 1]

        """Tally for every candidate in turn"""
        recent_evaluation_dict = dict(temp['review_user_login'].value_counts())
        for r in reviewer_list:
            recent_evaluation[r].append(recent_evaluation_dict.get(r, 0))

        """Append follow and following information"""
        for r in reviewer_list:
            follower_relation[r].append(followMap.get((author, r), 0))
            following_relation[r].append(followMap.get((r, author), 0))

    """Append the columns"""
    for r in reviewer_list:
        df[f'prior_evaluation_{r}'] = prior_evaluation[r]
        df[f'recent_evaluation_{r}'] = recent_evaluation[r]
        df[f'follower_relation_{r}'] = follower_relation[r]
        df[f'following_relation_{r}'] = following_relation[r]
    print("prior cost time:", datetime.now() - startTime)

    startTime = datetime.now()
    # start time: one day before the dataset begins
    start_time = time.strptime(str(date[0]) + "-" + str(date[1]) + "-" + "01 00:00:00", "%Y-%m-%d %H:%M:%S")
    start_time = int(time.mktime(start_time) - 86400)
    # end time: the last second before the test month
    end_time = time.strptime(str(date[2]) + "-" + str(date[3]) + "-" + "01 00:00:00", "%Y-%m-%d %H:%M:%S")
    end_time = int(time.mktime(end_time) - 1)

    """Add Activeness Features"""
    total_pulls = {}  # all PRs the project has so far
    evaluate_pulls = {}  # number of PRs co reviewed before
    recent_pulls = {}  # number of PRs co reviewed in the last m months
    evaluate_time = {}  # co's average response time
    last_time = {}  # interval since co's last review
    first_time = {}  # interval since co's first review
    for reviewer in reviewer_list:
        total_pulls[reviewer] = []
        evaluate_pulls[reviewer] = []
        recent_pulls[reviewer] = []
        evaluate_time[reviewer] = []
        last_time[reviewer] = []
        first_time[reviewer] = []

    count = 0
    cols = list(df.columns)
    index_pr_number = cols.index("pr_number")
    index_pr_label = cols.index("label")
    index_pr_label_m = cols.index("label_m")
    index_pr_label_y = cols.index("label_y")
    for data in df.itertuples(index=False):
        print("count for active:", count)
        count += 1
        pullNumber = data[index_pr_number]
        label = data[index_pr_label]
        label_m = data[index_pr_label_m]
        label_y = data[index_pr_label_y]

        temp = None
        if label == 0:
            temp = df.loc[df['pr_number'] < pullNumber].copy(deep=True)
        else:
            temp = df.loc[df['label'] == 0].copy(deep=True)

        """Tally for every candidate in turn"""
        total_pull_number = len(set(temp['pr_number']))
        res_reviewer_list = reviewer_list.copy()
        groups = dict(list(temp.groupby('review_user_login')))

        """First walk the reviewers that do have a tempDf"""
        for r, tempDf in groups.items():
            total_pulls[r].append(total_pull_number)
            res_reviewer_list.remove(r)
            if tempDf.shape[0] == 0:
                """No history: treat the age as 0 and the interval as the maximum"""
                first_time[r].append(0)
                last_time[r].append(end_time - start_time)
            else:
                pr_created_time_list = list(tempDf['pr_created_at'])
                first_review_time = min(pr_created_time_list)
                last_review_time = max(pr_created_time_list)
                first_time[r].append(end_time - first_review_time)
                last_time[r].append(end_time - last_review_time)
            evaluate_pulls[r].append(tempDf.shape[0])

            """Average response time"""
            if tempDf.shape[0] > 0:
                evaluate_avg = sum(tempDf['response_time'])
                evaluate_avg /= tempDf.shape[0]
            else:
                evaluate_avg = end_time - start_time
            evaluate_time[r].append(evaluate_avg)

        for r in res_reviewer_list:
            total_pulls[r].append(total_pull_number)
            evaluate_pulls[r].append(0)
            first_time[r].append(0)
            last_time[r].append(end_time - start_time)
            evaluate_avg = end_time - start_time
            evaluate_time[r].append(evaluate_avg)
            # recent_pulls[r].append(0)

        """Filter to the last m months and recount"""
        if label == 0:
            if temp.shape[0] > 0:
                temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, label_y, label_m), axis=1)
                temp = temp.loc[temp['target'] == 1]
        else:
            if temp.shape[0] > 0:
                temp['target'] = temp.apply(lambda x: isInTimeGap(x, m, date[2], date[3]), axis=1)
                temp = temp.loc[temp['target'] == 1]

        res_reviewer_list = reviewer_list.copy()
        groups = dict(list(temp.groupby('review_user_login')))
        """First walk the reviewers that do have a tempDf"""
        for r, tempDf in groups.items():
            recent_pulls[r].append(tempDf.shape[0])
            res_reviewer_list.remove(r)
        for r in res_reviewer_list:
            recent_pulls[r].append(0)

    """Append the Activeness Features to the dataframe"""
    for r in reviewer_list:
        df[f'total_pulls_{r}'] = total_pulls[r]
        df[f'evaluate_pulls_{r}'] = evaluate_pulls[r]
        df[f'recent_pulls_{r}'] = recent_pulls[r]
        df[f'first_time_{r}'] = first_time[r]
        df[f'last_time_{r}'] = last_time[r]
        df[f'evaluate_time_{r}'] = evaluate_time[r]
    print("active cost time:", datetime.now() - startTime)

    tagDict = dict(list(df.groupby('pr_number')))

    """Split the existing feature vectors and labels into training and test sets"""
    train_data = df.loc[df['label'] == False].copy(deep=True)
    test_data = df.loc[df['label']].copy(deep=True)

    train_data.drop(columns=['label'], inplace=True)
    test_data.drop(columns=['label'], inplace=True)

    """Recast the problem as multi-label:
       train_data_y  [{pull_number: [r1, r2, ...]}, ..., {}]
    """
    train_data_y = {}
    pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in pull_number_list:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    train_data.drop(columns=['review_user_login'], inplace=True)
    train_data.drop_duplicates(inplace=True)
    train_data.drop_duplicates(subset=['pr_number'], inplace=True)
    """Cast the training labels into the common multi-label classification format"""
    train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

    test_data_y = {}
    pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    test_data.drop(columns=['review_user_login'], inplace=True)
    test_data.drop_duplicates(inplace=True)
    test_data.drop_duplicates(subset=['pr_number'], inplace=True)
    # test_data_y = DataProcessUtils.convertLabelListToDataFrame(test_data_y, pull_number_list, candicateNum)
    test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

    """Collect the pr list"""
    prList = list(test_data['pr_number'])

    """Drop pr_number"""
    test_data.drop(columns=['pr_number'], inplace=True)
    train_data.drop(columns=['pr_number'], inplace=True)

    test_data.drop(columns=['pr_created_at', 'pr_user_login', 'comment_at', 'label_y', 'label_m', 'response_time'],
                   inplace=True)
    train_data.drop(columns=['pr_created_at', 'pr_user_login', 'comment_at', 'label_y', 'label_m', 'response_time'],
                    inplace=True)

    """Feature scaling"""
    if isSTD:
        stdsc = StandardScaler()
        train_data_std = stdsc.fit_transform(train_data)
        test_data_std = stdsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    elif isNOR:
        maxminsc = MinMaxScaler()
        train_data_std = maxminsc.fit_transform(train_data)
        test_data_std = maxminsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    else:
        return train_data, train_data_y, test_data, test_data_y, convertDict, prList
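
# --- Usage sketch (illustrative, not from the original file) ---
# Calling the relationship/activeness preProcess above. m is the sliding
# window in months used by isInTimeGap for the "recent" statistics; the path
# and dates below are hypothetical.
def _demoActivenessPreProcess():
    df = pandasHelper.readTSVFile('data/akka_pr_review.tsv',  # hypothetical path
                                  pandasHelper.INT_READ_FILE_WITH_HEAD,
                                  low_memory=False)
    train_x, train_y, test_x, test_y, convertDict, prList = preProcess(
        df, (2018, 1, 2019, 5), project='akka', isSTD=True, m=3)
    print(train_x.shape, test_x.shape, len(prList))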