Example #1
0
def about_overlap(source_name, target_name, outputpath):
    '''Collect every review written by users present in both domains.

    Reads ``reviews_<name>_5.json`` for the source and the target domain
    from ``<outputpath>/transform/``, determines which users have at least
    one record in each file, and returns the rows of those users from both
    domains. The loaded paths and the user counts are printed.

    Parameters
    ----------
    source_name : str
        Name of the source domain.

    target_name : str
        Name of the target domain.

    outputpath : str
        Directory that contains the ``transform/`` sub-directory.

    Returns
    -------
        df_Src_over : pd.DataFrame
            Source-domain rows of the overlapping users, with columns
            ["user", "sourceItem", "overall", "time"].
        df_Tgt_over : pd.DataFrame
            Target-domain rows of the overlapping users, with columns
            ["user", "targetItem", "overall", "time"].
    '''
    json_path = os.path.join(outputpath, "transform/")
    sourceDomain = json_path + 'reviews_%s_5.json' % source_name
    targetDomain = json_path + 'reviews_%s_5.json' % target_name

    # Build the tuples straight from the reader so that the raw records and
    # the extracted tuples are never both held in memory at the same time.
    SourceUserItem = [(d["reviewerID"], d["asin"], d["overall"],
                       d["unixReviewTime"]) for d in readJson(sourceDomain)]
    print('load %s' % sourceDomain)
    TargetUserItem = [(d["reviewerID"], d["asin"], d["overall"],
                       d["unixReviewTime"]) for d in readJson(targetDomain)]
    print('load %s' % targetDomain)

    uiS = pd.DataFrame(data=SourceUserItem,
                       columns=["user", "sourceItem", "overall", 'time'])
    uiT = pd.DataFrame(data=TargetUserItem,
                       columns=["user", "targetItem", "overall", 'time'])

    # Per-user record counts in each domain; a user missing from one domain
    # gets NaN after the outer alignment, which fillna turns into 0.
    uS = uiS[['user', 'sourceItem']].groupby("user").count()
    uT = uiT[['user', 'targetItem']].groupby("user").count()
    uBoth = pd.concat([uS, uT], axis=1).fillna(0)

    # coldUserSource: users coming from the target domain that have zero
    # records in the source domain.
    coldUserSource = uBoth.query("sourceItem == 0")
    # coldUserTarget: users coming from the source domain that have zero
    # records in the target domain.
    coldUserTarget = uBoth.query("targetItem == 0")
    overlapUser = uBoth.query("sourceItem != 0 and targetItem != 0")

    # NOTE(review): arr_coldUSrc holds the users that exist only in the
    # source domain (coldUserTarget), and it is the count printed next to
    # source_name below. Confirm this labelling is the intended convention.
    arr_coldUSrc = list(coldUserTarget.index)
    arr_coldUTgt = list(coldUserSource.index)
    arr_UOver = list(overlapUser.index)

    df_Tgt_over = uiT[uiT['user'].isin(arr_UOver)]
    df_Src_over = uiS[uiS['user'].isin(arr_UOver)]
    print("cold user count in %s is %d" % (source_name, len(arr_coldUSrc)))
    print("cold user count in %s is %d" % (target_name, len(arr_coldUTgt)))
    print("%s and %s, overlap user count is %d" %
          (source_name, target_name, len(arr_UOver)))
    print('')
    return df_Src_over, df_Tgt_over
Example #2
0
def about_overlap(source_name, target_name, outputpath):
    '''Collect every review written by users present in both domains.

    Reads ``reviews_<name>_5.json`` for the source and the target domain
    from ``<outputpath>/transform/``, intersects the two user sets, and
    returns the rows of the shared users from both domains. The loaded
    paths, a sanity check and the overlap size are printed.

    Parameters
    ----------
    source_name : str
        Name of the source domain.

    target_name : str
        Name of the target domain.

    outputpath : str
        Directory that contains the ``transform/`` sub-directory.

    Returns
    -------
        df_Src_over : pd.DataFrame
            Source-domain rows of the overlapping users, with columns
            ["user", "sourceItem", "overall", "time"].
        df_Tgt_over : pd.DataFrame
            Target-domain rows of the overlapping users, with columns
            ["user", "targetItem", "overall", "time"].
    '''
    json_path = os.path.join(outputpath, "transform/")
    sourceDomain = json_path + 'reviews_%s_5.json' % source_name
    targetDomain = json_path + 'reviews_%s_5.json' % target_name

    # Build the tuples straight from the reader so that the raw records and
    # the extracted tuples are never both held in memory at the same time.
    SourceUserItem = [(d["reviewerID"], d["asin"], d["overall"],
                       d["unixReviewTime"]) for d in readJson(sourceDomain)]
    print('load %s' % sourceDomain)
    TargetUserItem = [(d["reviewerID"], d["asin"], d["overall"],
                       d["unixReviewTime"]) for d in readJson(targetDomain)]
    print('load %s' % targetDomain)

    uiS = pd.DataFrame(data=SourceUserItem,
                       columns=["user", "sourceItem", "overall", 'time'])
    uiT = pd.DataFrame(data=TargetUserItem,
                       columns=["user", "targetItem", "overall", 'time'])

    # Users that appear in both domains.
    arr_UOver = set(uiS['user'].unique()) & set(uiT['user'].unique())
    df_Tgt_over = uiT[uiT['user'].isin(arr_UOver)]
    df_Src_over = uiS[uiS['user'].isin(arr_UOver)]

    # Sanity check: both filtered frames must cover the same set of users.
    tgt_users = sorted(df_Tgt_over['user'].unique())
    src_users = sorted(df_Src_over['user'].unique())
    print('if user in src and tgt is same:', tgt_users == src_users)
    print("%s and %s, overlap user count is %d" %
          (source_name, target_name, len(arr_UOver)))
    print('')
    return df_Src_over, df_Tgt_over
Example #3
0
def generateColdUser(sourceDomain, targetDomain):
    '''Split the users of two domains into cold users and overlapping users.

    A user is cold for a domain when they have records in the other domain
    but none in this one. Users with records in both domains overlap.

    Parameters
    ----------
    sourceDomain : str
        Path of the source-domain JSON data file.

    targetDomain : str
        Path of the target-domain JSON data file.

    Returns
    -------
        tuple of three lists of user ids, in this order:

        1. users with zero records in the target domain
           (they appear only in the source file),
        2. users with zero records in the source domain
           (they appear only in the target file),
        3. users with records in both domains.
    '''
    # Build the pairs straight from the reader so that the raw records and
    # the extracted pairs are never both held in memory at the same time.
    SourceUserItem = [(d["reviewerID"], d["asin"])
                      for d in readJson(sourceDomain)]
    print('load %s' % sourceDomain)
    TargetUserItem = [(d["reviewerID"], d["asin"])
                      for d in readJson(targetDomain)]
    print('load %s' % targetDomain)

    uiS = pd.DataFrame(data=SourceUserItem, columns=["user", "sourceItem"])
    uiT = pd.DataFrame(data=TargetUserItem, columns=["user", "targetItem"])

    # Per-user record counts in each domain; a user missing from one domain
    # gets NaN after the outer alignment, which fillna turns into 0.
    uS = uiS.groupby("user").count()
    uT = uiT.groupby("user").count()
    uBoth = pd.concat([uS, uT], axis=1).fillna(0)

    # coldUserSource: users coming from the target domain that have zero
    # records in the source domain.
    coldUserSource = uBoth.query("sourceItem == 0")
    # coldUserTarget: users coming from the source domain that have zero
    # records in the target domain.
    coldUserTarget = uBoth.query("targetItem == 0")
    overlapUser = uBoth.query("sourceItem != 0 and targetItem != 0")
    return (list(coldUserTarget.index), list(coldUserSource.index),
            list(overlapUser.index))
Example #4
0
 def __init__(self, filename, fields=None, trainTestFrac=0.7):
     '''
     Load the preprocessed records and draw a random train/test split.

     Parameters
     ----------
     filename : str
         Path of the preprocessed file; it is read through readJson.
     fields : list of str
         Names of the fields to use; stored unchanged on the instance.
     trainTestFrac : float
         Fraction of the shuffled records assigned to the training split;
         the remainder forms the test split.
     '''
     records = list(readJson(filename))
     self.data = np.array(records)
     total = len(self.data)
     print('len(data):', total)
     self.fields = fields
     self.trainTestFrac = trainTestFrac
     # Shuffle positions instead of the records, so self.data keeps its
     # original order and both splits are plain index arrays into it.
     order = np.arange(total)
     np.random.shuffle(order)
     self.index = order
     cut = int(total * trainTestFrac)
     self.trainIndex, self.testIndex = order[:cut], order[cut:]
Example #5
0
        def transCSV(fn):
            '''Split one transformed JSON file into a train and a test CSV.

            Each record read from ``fn`` is converted by ``getter`` and
            written as one comma-separated line. Rows whose first element is
            in ``cold_user`` go to ``<stem>_test.csv``; all other rows go to
            ``<stem>_train.csv``. Both files are created in ``output_dir``.
            ``transform``, ``output_dir``, ``getter`` and ``cold_user`` are
            taken from the enclosing scope.

            Parameters
            ----------
            fn : str
                File name, relative to ``transform``, of the JSON file.
            '''
            inf = os.path.join(transform, fn)
            # Swap the extension on the file name only. A str.replace over
            # the joined path would also rewrite a matching directory name
            # and, for a name without extension (ext == ""), would insert
            # the suffix between every character of the path.
            stem = os.path.splitext(fn)[0]
            out_train = os.path.join(output_dir, stem + "_train.csv")
            out_test = os.path.join(output_dir, stem + "_test.csv")

            train, test = [], []
            for record in readJson(inf):
                row = getter(record)
                line = ",".join(map(str, row))
                # row[0] is presumably the user id - verify against getter.
                if row[0] in cold_user:
                    test.append(line)
                else:
                    train.append(line)

            with open(out_train, "w") as f:
                f.write("\n".join(train))

            with open(out_test, "w") as f:
                f.write("\n".join(test))