Example #1
import re
import multiprocessing

import numpy as np
import requests
from bs4 import BeautifulSoup

from sltools import save_pickle


def scrapingNotype(data):
    typeList = dict()
    processName = multiprocessing.current_process().name
    counter = 0
    np.random.shuffle(data)
    errorcount = 0
    for name in data:
        counter += 1
        try:
            #time.sleep(2)
            r = requests.get("http://www.baidu.com/s?wd=" + name)
            if r.status_code == 200:
                beautiful = BeautifulSoup(r.content.decode('utf-8'), "lxml")

                typeOfit = beautiful.find(
                    attrs={'tpl': re.compile(r'se_st_single_video_zhanzhang')})
                typeOfit = typeOfit.h3.a.text
                try:
                    if type(typeOfit) == str:
                        typeList[name] = (typeOfit)
                        print('process name is ', processName, ' name :', name,
                              ' type: ', typeOfit)
                    else:
                        typeList[name] = "no type"
                        print('process name is ', processName, name, '  ',
                              "no type")
                    #time.sleep(np.random.random()*4)

                except:
                    typeList[name] = "no type"
                    print('process name is ', processName, name, '  ',
                          "no type")
                    print("save it !")
                    save_pickle(typeList,
                                str(processName) + 'no_type_typeList.data')
                    save_pickle(counter,
                                str(processName) + 'counterRecord.data')

            else:
                typeList[name] = "no type"  # typeList is a dict, not a list; record the miss under this name
                print(name, '  ', "no type")
            if counter % 2000 == 0:
                print('process', str(processName), 'is now having a ',
                      "counter of", counter)
                save_pickle(typeList,
                            str(processName) + 'no_type_typeList.data')
                save_pickle(counter, str(processName) + 'counterRecord.data')

        except:
            typeList[name] = "no type"
            save_pickle(typeList, str(processName) + 'no_type_typeList.data')
            save_pickle(counter, str(processName) + 'counterRecord.data')

    print(typeList)
    save_pickle(typeList, str(processName) + 'no_type_typeList.data')
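
All of these examples revolve around the save_pickle / load_pickle helpers imported from sltools (see Example #3). Their implementation is not part of this listing; a minimal sketch, assuming they are thin wrappers around the standard pickle module, might look like this:

import pickle

def save_pickle(obj, filename):
    # serialize obj to disk with the highest pickle protocol available
    with open(filename, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle(filename):
    # read back an object previously written by save_pickle
    with open(filename, "rb") as f:
        return pickle.load(f)
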
Example #2
def extractItemInfo():
    ####################           change dir       ######################
    os.chdir("C:\\Users\\22560\\Desktop\\recommand Sys\\recommand Sys")

    # load item data
    item = pd.read_csv("songsCSV.csv",
                       encoding="UTF-8",
                       dtype={
                           "song_length": np.uint16,
                           "language": str,
                       },
                       iterator=True)

    # use for debug
    chunksize = 10000
    item = item.get_chunk(chunksize)
    item.loc[chunksize + 1, 'song_id'] = 'specialll'
    item.loc[chunksize + 2, 'song_id'] = 'special22'

    # fill NA with default values; these values are also used when building the social network.
    # Be careful: you only need to fill the columns that will be used to build the network,
    # and you must fill them in the same order as they appear in your data.
    fillnawith = collections.OrderedDict()
    fillnawith['genre_ids'] = '-1'
    fillnawith['language'] = '-1'
    fillnawith['artist_name'] = "no_artist"

    item = fillDscrtNAN(item, fillnawith)

    # fill na with special value calculated from data

    fillCntnueNAN(item, ['song_length'])
    scaleCntnueVariable(item, ['song_length'])

    # change primary key to ID
    item, item_id_dict = changeNameToID(item, 'song_id', plan="A")

    # split the dataframe in two: one part containing the continuous attributes,
    # the other containing the discrete attributes

    # Note: do not split the categories too finely. Before splitting, consider clustering
    # attributes such as artist first; otherwise there are too many categories: the huge
    # matrix computation becomes impractical, and an over-fine split makes the social
    # network meaningless.


    (itemCntnueAttr, itemDscrtAttr) = \
        splitDF(item, "song_id",
                ["song_length"],
                ["genre_ids", "language", "artist_name"]
                )
    del item
    gc.collect()

    # create the social network of items using dask

    # do the tag combine process

    id = "song_id"
    colList = itemDscrtAttr.columns.tolist()
    colList.remove(id)

    itemWithTag = tagCombine(itemDscrtAttr, id='song_id', tagColList=colList)

    (itemTagmatrix, itemNoAttr) = findNetwork(itemWithTag,
                                              fillnawith,
                                              split=r"&|\|")

    # For items that have no tag, give them a relationship with all other items.
    # The trick is to fill that item's row of itemTagmatrix with -1, so its cosine
    # similarity comes out negative; those entries can then be identified and set to 1
    # (see the short sketch after this function).

    for row in itemNoAttr:
        itemTagmatrix[row, :] = -1

    # If you want to run this in a loop, set num > 2; with num = 2 it runs only once.
    # save the social network here
    fileplace = "C:\\Users\\22560\\Desktop\\"
    LargeSparseMatrixCosine(itemTagmatrix,
                            itemNoAttr,
                            num=2,
                            fileplace=fileplace,
                            prefix="item")

    # prepare largeDisMatrix
    itemCntnueAttr.set_index("song_id", inplace=True)

    largeMatrixDis(itemCntnueAttr.values,
                   num=2,
                   netFilePlace=fileplace,
                   prefix="item")

    save_pickle(item_id_dict, fileplace + "item_id_dict")
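
The -1 trick used above for untagged items can be illustrated without the author's LargeSparseMatrixCosine helper. A minimal sketch with a hypothetical toy tag matrix, using plain numpy and scikit-learn: rows filled with -1 yield negative cosine similarities, which are then detected and set to 1.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# toy item-by-tag matrix; the last item has no tags at all
tag_matrix = np.array([[1.0, 0.0, 1.0],
                       [0.0, 1.0, 1.0],
                       [0.0, 0.0, 0.0]])
no_attr_rows = [2]

for row in no_attr_rows:
    tag_matrix[row, :] = -1          # same trick as in extractItemInfo

sim = cosine_similarity(tag_matrix)
sim[sim < 0] = 1                     # negative cosine marks the "no tag" rows; give them full similarity
print(sim)
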
Example #3
import os

import pandas as pd

from sltools import save_pickle

if __name__ == "__main__":

    os.chdir("C:\\Users\\22560\\Documents\\iptv")

    behavior_dateparse = lambda x: pd.datetime.strptime(x, '%Y%m%d%H')
    behavior = pd.read_csv("./behavior/behaviorHasName.csv")
    behavior['STATIS_TIME'] = pd.to_datetime(behavior['STATIS_TIME'],
                                             infer_datetime_format=True)

    dianbo_dateparse = lambda x: pd.datetime.strptime(x, '%Y/%m/%d %H:%M')
    dianbo = pd.read_csv("./behavior/dianbo.csv",
                         encoding='gbk',
                         date_parser=dianbo_dateparse,
                         parse_dates=['下单时间'])
    dianbo['下单时间'] = dianbo['下单时间'].map(
        lambda x: x.replace(minute=0, second=0))

    dinggou = pd.read_csv("./behavior/dinggou.csv", encoding='gbk')

    userHasPayHistory = set(dinggou['订购账号']) | set(dianbo['下单用户'])
    save_pickle(userHasPayHistory, "./temp/userHasPayHistory.data")

    # find the items that require payment

    itemNeedPay = dianbo['业务产品名称'].str.replace(r'[\((].*[\))]',
                                               '').drop_duplicates()

    save_pickle(itemNeedPay, "./temp/itemNeedPay.data")
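
The character classes [\((] and [\))] in the replace above cover both the ASCII and the full-width (Chinese) parentheses, so suffixes like (高清) are stripped before deduplication. A small standalone sketch with made-up product names:

import pandas as pd

names = pd.Series(["电影点播(高清)", "电影点播(标清)", "电视回看"])
itemNeedPay = names.str.replace(r'[\((].*[\))]', '', regex=True).drop_duplicates()
print(itemNeedPay.tolist())   # ['电影点播', '电视回看']
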
Example #4
            r = requests.get("http://www.baidu.com/s?wd=" + name)
            if r.status_code == 200:
                beautiful = BeautifulSoup(r.content.decode('utf-8'), "lxml")

                typeOfit = beautiful.find(attrs={'mu': re.compile(r'baike')})
                try:
                    if type(typeOfit.p.text) == str:
                        typeList[name] = (typeOfit.p.text)
                        print(name, '  ', typeOfit.p.text)
                    else:
                        typeList[name] = "no type"
                        print(name, '  ', "no type")
                    #time.sleep(np.random.random()*4)

                except:
                    typeList[name] = "no type"
                    print(name, '  ', "no type")
                    save_pickle(typeList, 'typeList.data')

            else:
                typeList[name] = "no type"  # typeList is a dict, not a list; record the miss under this name
                print(name, '  ', "no type")
            if counter % 10 == 0:
                print("counter", counter)
        except:
            save_pickle(typeList, 'typeList.data')
            save_pickle(counter, 'counterRecord.data')

    print(typeList)
    save_pickle(typeList, 'typeList.data')
Example #5
def dataPreparation(userName, itemName, targetName, userGroupName,
                    itemGroupName):

    data = []

    # divide the data by user class and by item class
    trainSet = pd.read_csv(
        "./originalData/trainSet.csv",
        usecols=[userName, itemName, targetName, userGroupName, itemGroupName])
    trainSet = trainSet.dropna(axis=0)

    # records tagged "no type" cannot be handled for now, so drop them
    trainSet = trainSet[trainSet.tag != "no type"]

    # use a log transform for stability
    trainSet[targetName] = np.log(trainSet[targetName])

    # encode user and item IDs

    behavior, mediaid = changeNameToID(trainSet, itemName, plan='B')
    behavior, userid = changeNameToID(behavior, userName, plan='B')
    behavior, userGroup = changeNameToID(behavior, userGroupName, plan='B')
    behavior, tag = changeNameToID(behavior, itemGroupName, plan='B')

    userBelong = behavior.groupby(
        [userName, userGroupName],
        as_index=False)[itemName].count()[[userName, userGroupName]]
    userBelong.to_csv("./temp/userBelong.csv", index=False)

    itemBelong = behavior.groupby(
        [itemName, itemGroupName],
        as_index=False)[userName].count()[[itemName, itemGroupName]]
    itemBelong.to_csv("./temp/itemBelong.csv", index=False)

    # saving original name

    save_pickle(mediaid, "./temp/mediaid.data")
    save_pickle(userid, "./temp/userid.data")
    save_pickle(userGroup, "./temp/userGroup.data")
    save_pickle(tag, "./temp/tag.data")

    print("encode is ready! starting to split data")

    # build a mapping that records which objects belong to each group

    userdict = objGroupMapGenerator(trainSet, userGroupName, userName)
    itemdict = objGroupMapGenerator(trainSet, itemGroupName, itemName)

    save_pickle(userdict, "./temp/userdict.data")
    save_pickle(itemdict, "./temp/itemdict.data")

    behavior, test = train_test_split(behavior, test_size=0.3)

    behavior.to_csv("./originalData/behavior.csv", index=False)
    print("behavior created successfully !")
    test.to_csv("./originalData/test.csv", index=False)
    print("behavior created successfully !")

    clearDir(".\\dividedByUser\\")
    clearDir(".\\dividedByItem\\")

    for _, data in behavior.groupby(userGroupName):
        data.to_csv(".\\dividedByUser\\" + str(data[userGroupName].iloc[0]) +
                    '.csv',
                    index=False)

    for _, data in behavior.groupby(itemGroupName):
        data.to_csv(".\\dividedByItem\\" + str(data[itemGroupName].iloc[0]) +
                    '.csv',
                    index=False)

    print("data has splited ! ,starting to initailizing paras matrix ! ")

    # initialize the parameters corresponding to P, Q, S, T (see the SVD sketch after this example)
    numOfUser = len(load_pickle("./temp/userid.data"))
    numOfItem = len(load_pickle("./temp/mediaid.data"))
    numOfUserClass = len(os.listdir(".\\dividedByUser"))
    numOfItemClass = len(os.listdir(".\\dividedByItem"))
    numOfK = 5


    userLatentFactor, itemLatentFactor, \
    userClassLatentFactor, itemClassLatentFactor = initailizerBySVD(
        behavior, targetName, numOfUser, numOfItem, numOfK,
        userBelong, itemBelong,
        userName, userGroupName,
        itemName, itemGroupName)

    userLatentFactor.to_csv(".\\oldPQST\\userLatentFactor.txt",
                            sep='\t',
                            index=False)
    userClassLatentFactor.to_csv(".\\oldPQST\\userClassLatentFactor.txt",
                                 sep='\t',
                                 index=False)
    itemLatentFactor.to_csv(".\\oldPQST\\itemLatentFactor.txt",
                            sep='\t',
                            index=False)
    itemClassLatentFactor.to_csv(".\\oldPQST\\itemClassLatentFactor.txt",
                                 sep='\t',
                                 index=False)
Example #6
def extractUserInfo():
    ####################           change dir       ######################
    os.chdir("C:\\Users\\22560\\Desktop\\recommand Sys\\recommand Sys")

    # The usual way to deal with datetime columns:
    ## Suppose you have a column 'datetime' holding strings, then:

    # dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    # df = pd.read_csv(infile, parse_dates=['datetime'], date_parser=dateparse)

    ## You can even combine multiple columns into a single datetime column;
    ## this merges a 'date' and a 'time' column into one 'datetime' column:

    # dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    # df = pd.read_csv(infile, parse_dates={'datetime': ['date', 'time']}, date_parser=dateparse)



    # note the way the date columns are handled here
    usertable = "members.csv"
    dateparse = lambda x: pd.datetime.strptime(x, '%Y%m%d')

    user = pd.read_csv(usertable,
                       encoding="UTF-8",
                       dtype={
                           "city": str,
                           "registered_via": str,
                           "gender": str
                       },
                       parse_dates=['registration_init_time', 'expiration_date'],
                       date_parser=dateparse,
                       iterator=True)


    # use for test
    chunksize = 10000
    user = user.get_chunk(chunksize)
    user.loc[chunksize + 1, 'msno'] = "special"
    user.loc[chunksize + 2, 'msno'] = 'special2'
    # use for test end

    fillnawith = collections.OrderedDict()
    fillnawith['city'] = "no city"
    fillnawith['gender'] = 'no sex'
    fillnawith['registered_via'] = "no via"


    user = fillDscrtNAN(user,fillnawith)

    # make a continuous variable for test purposes
    user['cntinue'] = user.expiration_date - user.registration_init_time
    user.cntinue = user.cntinue.dt.days
    # other info could be derived from the timedelta via .dt.components

    fillCntnueNAN(user,['cntinue'])
    scaleCntnueVariable(user,['cntinue'])



    user,user_id_dict = changeNameToID(user,'msno',plan='A')

    (userCntnueAttr, userDscrtAttr) = \
        splitDF(user, "msno",
                ["cntinue"],
                ["city","gender",  "registered_via"]
                )

    del user
    gc.collect()

    id = 'msno'
    colList = userDscrtAttr.columns.tolist()
    colList.remove(id)

    userWithTag = tagCombine(userDscrtAttr, id='msno', tagColList=colList)

    (userTagmatrix, userNoAttr) = findNetwork(userWithTag, fillnawith, split=r"&|\|")

    for row in userNoAttr:
        userTagmatrix[row, :] = -1

    fileplace = "C:\\Users\\22560\\Desktop\\"
    LargeSparseMatrixCosine(userTagmatrix, userNoAttr, num=2,
                            fileplace=fileplace, prefix="user")

    # prepare largeDisMatrix
    userCntnueAttr.set_index("msno", inplace=True)

    largeMatrixDis(userCntnueAttr.values, num=2,
                   netFilePlace=fileplace ,prefix="user")

    save_pickle(user_id_dict, fileplace + "user_id_dict")
    # return(user_id_dict)
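
tagCombine and findNetwork are the author's own helpers, used both here and in extractItemInfo. Judging from the split pattern r"&|\|" passed to findNetwork, the discrete attribute columns are simply concatenated into one tag string per row; a hypothetical sketch of that combination step, assuming '&' as the separator:

import pandas as pd

def tag_combine_sketch(df, id_col, tag_cols, sep="&"):
    # join every discrete attribute column into a single tag string per row
    out = df[[id_col]].copy()
    out["tag"] = df[tag_cols].astype(str).apply(sep.join, axis=1)
    return out

# e.g. tag_combine_sketch(userDscrtAttr, "msno", ["city", "gender", "registered_via"])
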
Example #7
    record = []
    counter = 0
    for parameters in iter:

        lambda_user, lambda_item, factor_num = parameters

        print("choosing parameters are : \n", "lambda_user : "******"\n", "lambda_item : ", lambda_item, "\n", "factor_num  : ",
              factor_num, "\n")
        # trainAndValidation returns the RMSE on the training set and the validation set
        rmse, rmse_vali = trainAndValidation(
            trainFilePlace, prepare_path, prepare_name, transpose_prepare_name,
            factor_num, method, iteration_num, user_loop_num, item_loop_num,
            lambda_user, lambda_item, latentFilePlace, validationFilePlace)
        result = OrderedDict()
        result['parameters'] = parameters
        result['rmse'] = rmse[-1]
        result['rmse_vali'] = rmse_vali[-1]
        counter = counter + 1
        record.append(result)
        if (counter % 5 == 0):
            save_pickle(record, "record.rcd")

    minRmse = min([i['rmse_vali'] for i in record])
    for i in record:
        if i['rmse_vali'] == minRmse:
            print(i)

    load_pickle("record.rcd")
    #
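
The `iter` consumed by the loop above is created outside this snippet. A plausible way to build it, as a sketch assuming a small grid of candidate values, is itertools.product over the three hyperparameters:

from itertools import product

lambda_users = [0.01, 0.05, 0.1]
lambda_items = [0.01, 0.05, 0.1]
factor_nums = [5, 10, 20]

# one (lambda_user, lambda_item, factor_num) tuple per grid point;
# the name shadows the builtin, but matches the snippet above
iter = product(lambda_users, lambda_items, factor_nums)
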