def about_overlap(source_name, target_name, outputpath):
    '''Collect every review written by users who appear in both domains.

    Reads ``reviews_<source_name>_5.json`` and ``reviews_<target_name>_5.json``
    from the ``transform/`` directory under ``outputpath`` (through
    ``readJson``, which is defined outside this function), then keeps only
    the rows whose user has at least one record in each domain. Summary
    counts of single-domain and overlapping users are printed.

    Parameters
    ----------
    source_name : str
        Name of the source domain.
    target_name : str
        Name of the target domain.
    outputpath : str
        Directory that contains the ``transform/`` folder.

    Returns
    -------
    df_Src_over : pd.DataFrame
        Source-domain rows of overlapping users, with columns
        ["user", "sourceItem", "overall", "time"].
    df_Tgt_over : pd.DataFrame
        Target-domain rows of overlapping users, with columns
        ["user", "targetItem", "overall", "time"].
    '''
    json_path = os.path.join(outputpath, "transform/")
    sourceDomain = json_path + 'reviews_%s_5.json' % source_name
    targetDomain = json_path + 'reviews_%s_5.json' % target_name

    # Pull the needed fields straight off the reader so the full records
    # are never held in memory alongside the extracted tuples.
    source_rows = [
        (d["reviewerID"], d["asin"], d["overall"], d["unixReviewTime"])
        for d in readJson(sourceDomain)
    ]
    print('load %s' % sourceDomain)
    target_rows = [
        (d["reviewerID"], d["asin"], d["overall"], d["unixReviewTime"])
        for d in readJson(targetDomain)
    ]
    print('load %s' % targetDomain)

    uiS = pd.DataFrame(data=source_rows,
                       columns=["user", "sourceItem", "overall", 'time'])
    uiT = pd.DataFrame(data=target_rows,
                       columns=["user", "targetItem", "overall", 'time'])

    # Per-user record counts in each domain; a user missing from one side
    # shows up as NaN after the column-wise concat and is turned into 0.
    uS = uiS[['user', 'sourceItem']].groupby("user").count()
    uT = uiT[['user', 'targetItem']].groupby("user").count()
    uBoth = pd.concat([uS, uT], axis=1).fillna(0)

    # Users that come from the target domain and have no source records.
    coldUserSource = uBoth.query("sourceItem == 0")
    # Users that come from the source domain and have no target records.
    coldUserTarget = uBoth.query("targetItem == 0")
    overlapUser = uBoth.query("sourceItem != 0 and targetItem != 0")

    arr_UOver = list(overlapUser.index)
    df_Tgt_over = uiT[uiT['user'].isin(arr_UOver)]
    df_Src_over = uiS[uiS['user'].isin(arr_UOver)]

    # The count reported under source_name is the number of users with no
    # target-domain records, and vice versa for target_name.
    print("cold user count in %s is %d" % (source_name, len(coldUserTarget)))
    print("cold user count in %s is %d" % (target_name, len(coldUserSource)))
    print("%s and %s, overlap user count is %d" % (source_name, target_name,
                                                   len(arr_UOver)))
    print('')
    return df_Src_over, df_Tgt_over
def about_overlap(source_name, target_name, outputpath):
    '''Collect every review written by users who appear in both domains.

    Reads ``reviews_<source_name>_5.json`` and ``reviews_<target_name>_5.json``
    from the ``transform/`` directory under ``outputpath`` (through
    ``readJson``, which is defined outside this function) and keeps only
    the rows whose user occurs in both files. A consistency check and the
    number of overlapping users are printed.

    Parameters
    ----------
    source_name : str
        Name of the source domain.
    target_name : str
        Name of the target domain.
    outputpath : str
        Directory that contains the ``transform/`` folder.

    Returns
    -------
    df_Src_over : pd.DataFrame
        Source-domain rows of overlapping users, with columns
        ["user", "sourceItem", "overall", "time"].
    df_Tgt_over : pd.DataFrame
        Target-domain rows of overlapping users, with columns
        ["user", "targetItem", "overall", "time"].
    '''
    json_path = os.path.join(outputpath, "transform/")
    sourceDomain = json_path + 'reviews_%s_5.json' % source_name
    targetDomain = json_path + 'reviews_%s_5.json' % target_name

    # Pull the needed fields straight off the reader so the full records
    # are never held in memory alongside the extracted tuples.
    source_rows = [
        (d["reviewerID"], d["asin"], d["overall"], d["unixReviewTime"])
        for d in readJson(sourceDomain)
    ]
    print('load %s' % sourceDomain)
    target_rows = [
        (d["reviewerID"], d["asin"], d["overall"], d["unixReviewTime"])
        for d in readJson(targetDomain)
    ]
    print('load %s' % targetDomain)

    uiS = pd.DataFrame(data=source_rows,
                       columns=["user", "sourceItem", "overall", 'time'])
    uiT = pd.DataFrame(data=target_rows,
                       columns=["user", "targetItem", "overall", 'time'])

    # Overlapping users are simply the intersection of the two user sets.
    arr_UOver = set(uiS['user'].unique()) & set(uiT['user'].unique())
    df_Tgt_over = uiT[uiT['user'].isin(arr_UOver)]
    df_Src_over = uiS[uiS['user'].isin(arr_UOver)]

    # Sanity check: both filtered frames must cover the same set of users.
    a = sorted(df_Tgt_over['user'].unique())
    b = sorted(df_Src_over['user'].unique())
    print('if user in src and tgt is same:', a == b)
    print("%s and %s, overlap user count is %d" % (source_name, target_name,
                                                   len(arr_UOver)))
    print('')
    return df_Src_over, df_Tgt_over
def generateColdUser(sourceDomain, targetDomain):
    '''Split users into single-domain (cold) and overlapping groups.

    A user is cold in the source domain when they have records in the
    target domain but none in the source domain; cold users of the target
    domain are defined the same way with the roles swapped.

    Parameters
    ----------
    sourceDomain : str
        Path to the source-domain JSON data file.
    targetDomain : str
        Path to the target-domain JSON data file.

    Returns
    -------
    tuple of (list, list, list)
        In order: users with no target-domain records, users with no
        source-domain records, and users with records in both domains.
    '''
    source_pairs = [(rec["reviewerID"], rec["asin"])
                    for rec in readJson(sourceDomain)]
    print('load %s' % sourceDomain)
    target_pairs = [(rec["reviewerID"], rec["asin"])
                    for rec in readJson(targetDomain)]
    print('load %s' % targetDomain)

    # Number of records each user has in each domain.
    source_counts = pd.DataFrame(
        data=source_pairs, columns=["user", "sourceItem"]
    ).groupby("user").count()
    target_counts = pd.DataFrame(
        data=target_pairs, columns=["user", "targetItem"]
    ).groupby("user").count()

    # Align both count tables on user; absent users get a count of 0.
    per_user = pd.concat([source_counts, target_counts], axis=1).fillna(0)

    absent_in_source = per_user["sourceItem"] == 0
    absent_in_target = per_user["targetItem"] == 0
    in_both = ~absent_in_source & ~absent_in_target

    return (
        list(per_user.index[absent_in_target]),
        list(per_user.index[absent_in_source]),
        list(per_user.index[in_both]),
    )
def __init__(self, filename, fields=None, trainTestFrac=0.7):
    '''Load the preprocessed records and draw a random train/test split.

    Parameters
    ----------
    filename : str
        Path of the preprocessed file, read through ``readJson``.
    fields : list of str, optional
        Names of the fields to use; stored unchanged on the instance.
    trainTestFrac : float
        Fraction of the records placed in the training split; the
        remaining records form the test split.
    '''
    records = list(readJson(filename))
    self.data = np.array(records)
    n_records = len(self.data)
    print('len(data):', n_records)

    self.fields = fields
    self.trainTestFrac = trainTestFrac

    # Shuffle record positions in place, then cut the permutation in two.
    self.index = np.arange(n_records)
    np.random.shuffle(self.index)
    cut = int(n_records * self.trainTestFrac)
    self.trainIndex = self.index[:cut]
    self.testIndex = self.index[cut:]
def transCSV(fn):
    '''Convert one JSON file into comma-separated train and test files.

    Each record read from ``fn`` is passed through ``getter`` and written
    as one line of comma-joined values. Rows whose first value is in
    ``cold_user`` go to ``<stem>_test.csv``; all other rows go to
    ``<stem>_train.csv``. Values are joined with bare commas, without any
    quoting or escaping.

    ``transform``, ``output_dir``, ``getter``, ``cold_user`` and
    ``readJson`` are not defined in this function; they are taken from
    the surrounding scope.

    Parameters
    ----------
    fn : str
        File name, relative to the ``transform`` directory.
    '''
    inf = os.path.join(transform, fn)

    # Swap only the real extension of the file name. Running str.replace on
    # the joined path would also rewrite matching text inside the directory
    # or earlier in the name, and would mangle the whole path when the file
    # has no extension at all.
    stem = os.path.splitext(fn)[0]
    out_train = os.path.join(output_dir, stem + "_train.csv")
    out_test = os.path.join(output_dir, stem + "_test.csv")

    train, test = [], []
    for record in readJson(inf):
        row = getter(record)
        line = ",".join(map(str, row))
        # row[0] is presumably the user id -- confirm against getter.
        if row[0] in cold_user:
            test.append(line)
        else:
            train.append(line)

    with open(out_train, "w") as f:
        f.write("\n".join(train))
    with open(out_test, "w") as f:
        f.write("\n".join(test))