def scrapingNotype(data):
    # scrape Baidu search results and record a page type for each name
    typeList = dict()
    processName = multiprocessing.current_process().name
    counter = 0
    np.random.shuffle(data)
    errorcount = 0
    for name in data:
        counter += 1
        try:
            # time.sleep(2)
            r = requests.get("http://www.baidu.com/s?wd=" + name)
            if r.status_code == 200:
                beautiful = BeautifulSoup(r.content.decode('utf-8'), "lxml")
                typeOfit = beautiful.find(
                    attrs={'tpl': re.compile(r'se_st_single_video_zhanzhang')})
                typeOfit = typeOfit.h3.a.text
                try:
                    if type(typeOfit) == str:
                        typeList[name] = typeOfit
                        print('process name is ', processName,
                              ' name :', name, ' type: ', typeOfit)
                    else:
                        typeList[name] = "no type"
                        print('process name is ', processName, name, ' ', "no type")
                    # time.sleep(np.random.random()*4)
                except Exception:
                    typeList[name] = "no type"
                    print('process name is ', processName, name, ' ', "no type")
                    print("save it !")
                    save_pickle(typeList, str(processName) + 'no_type_typeList.data')
                    save_pickle(counter, str(processName) + 'counterRecord.data')
            else:
                # typeList is a dict, so record the failure under the name key
                typeList[name] = "no type"
                print(name, ' ', "no type")
            if counter % 2000 == 0:
                print('process', str(processName), 'now has a counter of', counter)
                save_pickle(typeList, str(processName) + 'no_type_typeList.data')
                save_pickle(counter, str(processName) + 'counterRecord.data')
        except Exception:
            typeList[name] = "no type"
            save_pickle(typeList, str(processName) + 'no_type_typeList.data')
            save_pickle(counter, str(processName) + 'counterRecord.data')
    print(typeList)
    save_pickle(typeList, str(processName) + 'no_type_typeList.data')
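# A minimal sketch of how scrapingNotype might be dispatched over several worker
# processes, since it keys its output files on the process name. The input file
# ('names.data') and the worker count are assumptions for illustration only.
if __name__ == "__main__":
    names = load_pickle('names.data')            # hypothetical list of names to scrape
    chunks = np.array_split(np.array(names), 4)  # one slice of names per worker
    workers = []
    for i, chunk in enumerate(chunks):
        p = multiprocessing.Process(target=scrapingNotype,
                                    args=(chunk,),
                                    name='scraper-' + str(i))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()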
def extractItemInfo():
    #################### change dir ######################
    os.chdir("C:\\Users\\22560\\Desktop\\recommand Sys\\recommand Sys")

    # load item data
    item = pd.read_csv("songsCSV.csv", encoding="UTF-8",
                       dtype={
                           "song_length": np.uint16,
                           "language": str,
                       },
                       iterator=True)

    # use for debug
    chunksize = 10000
    item = item.get_chunk(chunksize)
    item.loc[chunksize + 1, 'song_id'] = 'specialll'
    item.loc[chunksize + 2, 'song_id'] = 'special22'

    # Fill NAs with default values; the same values are also used when building
    # the social network. Be careful: only fill the columns that will be used to
    # build the network, and fill them in the same order as they appear in the data.
    fillnawith = collections.OrderedDict()
    fillnawith['genre_ids'] = '-1'
    fillnawith['language'] = '-1'
    fillnawith['artist_name'] = "no_artist"

    item = fillDscrtNAN(item, fillnawith)

    # fill NA with a special value calculated from the data
    fillCntnueNAN(item, ['song_length'])
    scaleCntnueVariable(item, ['song_length'])

    # change primary key to ID
    item, item_id_dict = changeNameToID(item, 'song_id', plan="A")

    # Split the dataframe in two: one part holds the continuous attributes,
    # the other the discrete attributes.
    # Note: do not split the categories too finely. Consider clustering attributes
    # such as artist first, otherwise there are too many categories: the huge
    # matrix computation becomes impractical, and an over-fragmented social
    # network is meaningless.
    (itemCntnueAttr, itemDscrtAttr) = \
        splitDF(item, "song_id",
                ["song_length"],
                ["genre_ids", "language", "artist_name"]
                )

    del item
    gc.collect()

    # create the social network of items using dask
    # do the tag-combining step
    id = "song_id"
    colList = itemDscrtAttr.columns.tolist()
    colList.remove(id)
    itemWithTag = tagCombine(itemDscrtAttr, id='song_id', tagColList=colList)

    (itemTagmatrix, itemNoAttr) = findNetwork(itemWithTag, fillnawith, split=r"&|\|")

    # For items that have no tag, relate them to all the others: set that row of
    # itemTagmatrix to -1 so the cosine value becomes negative; such entries can
    # then be identified and turned into 1.
    for row in itemNoAttr:
        itemTagmatrix[row, :] = -1

    # If you want to run this in a loop, set num > 2; with num = 2 it runs once.
    # Save the social network here.
    fileplace = "C:\\Users\\22560\\Desktop\\"
    LargeSparseMatrixCosine(itemTagmatrix, itemNoAttr, num=2,
                            fileplace=fileplace, prefix="item")

    # prepare largeDisMatrix
    itemCntnueAttr.set_index("song_id", inplace=True)
    largeMatrixDis(itemCntnueAttr.values, num=2,
                   netFilePlace=fileplace, prefix="item")

    save_pickle(item_id_dict, fileplace + "item_id_dict")
import os

import pandas as pd

from sltools import save_pickle

if __name__ == "__main__":
    os.chdir("C:\\Users\\22560\\Documents\\iptv")

    behavior_dateparse = lambda x: pd.datetime.strptime(x, '%Y%m%d%H')
    behavior = pd.read_csv("./behavior/behaviorHasName.csv")
    behavior['STATIS_TIME'] = pd.to_datetime(behavior['STATIS_TIME'],
                                             infer_datetime_format=True)

    dianbo_dateparse = lambda x: pd.datetime.strptime(x, '%Y/%m/%d %H:%M')
    dianbo = pd.read_csv("./behavior/dianbo.csv", encoding='gbk',
                         date_parser=dianbo_dateparse, parse_dates=['下单时间'])
    dianbo['下单时间'] = dianbo['下单时间'].map(
        lambda x: x.replace(minute=0, second=0))

    dinggou = pd.read_csv("./behavior/dinggou.csv", encoding='gbk')

    userHasPayHistory = set(dinggou['订购账号']) | set(dianbo['下单用户'])
    save_pickle(userHasPayHistory, "./temp/userHasPayHistory.data")

    # find the items that require payment
    itemNeedPay = dianbo['业务产品名称'].str.replace(r'[\((].*[\))]', '').drop_duplicates()
    save_pickle(itemNeedPay, "./temp/itemNeedPay.data")
# Single-process variant of the Baidu scraper that looks for a baike result.
# The enclosing function and loop are assumed here so the fragment is runnable;
# the function name scrapingBaike is hypothetical.
def scrapingBaike(data):
    typeList = dict()
    counter = 0
    for name in data:
        counter += 1
        try:
            r = requests.get("http://www.baidu.com/s?wd=" + name)
            if r.status_code == 200:
                beautiful = BeautifulSoup(r.content.decode('utf-8'), "lxml")
                typeOfit = beautiful.find(attrs={'mu': re.compile(r'baike')})
                try:
                    if type(typeOfit.p.text) == str:
                        typeList[name] = typeOfit.p.text
                        print(name, ' ', typeOfit.p.text)
                    else:
                        typeList[name] = "no type"
                        print(name, ' ', "no type")
                    # time.sleep(np.random.random()*4)
                except Exception:
                    typeList[name] = "no type"
                    print(name, ' ', "no type")
                    save_pickle(typeList, 'typeList.data')
            else:
                # typeList is a dict, so record the failure under the name key
                typeList[name] = "no type"
                print(name, ' ', "no type")
            if counter % 10 == 0:
                print("counter", counter)
        except Exception:
            save_pickle(typeList, 'typeList.data')
            save_pickle(counter, 'counterRecord.data')
    print(typeList)
    save_pickle(typeList, 'typeList.data')
def dataPreparation(userName, itemName, targetName, userGroupName, itemGroupName):
    # divide the data by user class and by item class
    trainSet = pd.read_csv(
        "./originalData/trainSet.csv",
        usecols=[userName, itemName, targetName, userGroupName, itemGroupName])
    trainSet = trainSet.dropna(axis=0)

    # records tagged "no type" cannot be handled yet
    # note: this assumes the item-group column is literally named 'tag'
    trainSet = trainSet[trainSet.tag != "no type"]

    # use a log transform for stability
    trainSet[targetName] = np.log(trainSet[targetName])

    # encode user and item
    behavior, mediaid = changeNameToID(trainSet, itemName, plan='B')
    behavior, userid = changeNameToID(behavior, userName, plan='B')
    behavior, userGroup = changeNameToID(behavior, userGroupName, plan='B')
    behavior, tag = changeNameToID(behavior, itemGroupName, plan='B')

    userBelong = behavior.groupby(
        [userName, userGroupName],
        as_index=False)[itemName].count()[[userName, userGroupName]]
    userBelong.to_csv("./temp/userBelong.csv", index=False)

    itemBelong = behavior.groupby(
        [itemName, itemGroupName],
        as_index=False)[userName].count()[[itemName, itemGroupName]]
    itemBelong.to_csv("./temp/itemBelong.csv", index=False)

    # save the original names
    save_pickle(mediaid, "./temp/mediaid.data")
    save_pickle(userid, "./temp/userid.data")
    save_pickle(userGroup, "./temp/userGroup.data")
    save_pickle(tag, "./temp/tag.data")

    print("encode is ready! starting to split data")

    # build maps recording which objects belong to each group
    userdict = objGroupMapGenerator(trainSet, userGroupName, userName)
    itemdict = objGroupMapGenerator(trainSet, itemGroupName, itemName)
    save_pickle(userdict, "./temp/userdict.data")
    save_pickle(itemdict, "./temp/itemdict.data")

    behavior, test = train_test_split(behavior, test_size=0.3)
    behavior.to_csv("./originalData/behavior.csv", index=False)
    print("behavior created successfully !")
    test.to_csv("./originalData/test.csv", index=False)
    print("test set created successfully !")

    clearDir(".\\dividedByUser\\")
    clearDir(".\\dividedByItem\\")

    for _, data in behavior.groupby(userGroupName):
        data.to_csv(".\\dividedByUser\\" +
                    str(data[userGroupName].iloc[0]) + '.csv', index=False)
    for _, data in behavior.groupby(itemGroupName):
        data.to_csv(".\\dividedByItem\\" +
                    str(data[itemGroupName].iloc[0]) + '.csv', index=False)

    print("data has been split! starting to initialize the parameter matrices")

    # initialize the parameters corresponding to P, Q, S, T
    numOfUser = len(load_pickle("./temp/userid.data").keys())
    numOfItem = len(load_pickle("./temp/mediaid.data").keys())
    numOfUserClass = len(os.listdir(".\\dividedByUser"))
    numOfItemClass = len(os.listdir(".\\dividedByItem"))
    numOfK = 5

    userLatentFactor, itemLatentFactor, \
        userClassLatentFactor, itemClassLatentFactor = initailizerBySVD(
            behavior, targetName,
            numOfUser, numOfItem, numOfK,
            userBelong, itemBelong,
            userName, userGroupName,
            itemName, itemGroupName)

    userLatentFactor.to_csv(".\\oldPQST\\userLatentFactor.txt", sep='\t', index=False)
    userClassLatentFactor.to_csv(".\\oldPQST\\userClassLatentFactor.txt", sep='\t', index=False)
    itemLatentFactor.to_csv(".\\oldPQST\\itemLatentFactor.txt", sep='\t', index=False)
    itemClassLatentFactor.to_csv(".\\oldPQST\\itemClassLatentFactor.txt", sep='\t', index=False)
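# A minimal usage sketch for dataPreparation. The column names below are
# placeholders for illustration; the real names must match trainSet.csv.
if __name__ == "__main__":
    dataPreparation(userName='user_id',        # hypothetical column names
                    itemName='media_id',
                    targetName='watch_time',
                    userGroupName='userGroup',
                    itemGroupName='tag')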
def extractUserInfo():
    #################### change dir ######################
    os.chdir("C:\\Users\\22560\\Desktop\\recommand Sys\\recommand Sys")

    # The usual way to deal with datetime columns:
    ## Suppose you have a column 'datetime' holding a string, then:
    # dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    # df = pd.read_csv(infile, parse_dates=['datetime'], date_parser=dateparse)
    ## You can even combine multiple columns into a single datetime column; this
    ## merges a 'date' and a 'time' column into one 'datetime' column:
    # dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    # df = pd.read_csv(infile, parse_dates={'datetime': ['date', 'time']}, date_parser=dateparse)

    # note how the date columns are handled here
    usertable = "members.csv"
    dateparse = lambda x: pd.datetime.strptime(x, '%Y%m%d')
    user = pd.read_csv(usertable, encoding="UTF-8",
                       dtype={
                           "city": str,
                           "registered_via": str,
                           "gender": str
                       },
                       parse_dates=['registration_init_time', 'expiration_date'],
                       date_parser=dateparse,
                       iterator=True)

    # use for test
    chunksize = 10000
    user = user.get_chunk(chunksize)
    user.loc[chunksize + 1, 'msno'] = "special"
    user.loc[chunksize + 2, 'msno'] = 'special2'
    # use for test end

    fillnawith = collections.OrderedDict()
    fillnawith['city'] = "no city"
    fillnawith['gender'] = 'no sex'
    fillnawith['registered_via'] = "no via"

    user = fillDscrtNAN(user, fillnawith)

    # make a continuous variable for testing
    user['cntinue'] = user.expiration_date - user.registration_init_time
    user.cntinue = user.cntinue.dt.days
    # other info can be derived from user.cntinue.dt.components

    fillCntnueNAN(user, ['cntinue'])
    scaleCntnueVariable(user, ['cntinue'])

    user, user_id_dict = changeNameToID(user, 'msno', plan='A')

    (userCntnueAttr, userDscrtAttr) = \
        splitDF(user, "msno",
                ["cntinue"],
                ["city", "gender", "registered_via"]
                )

    del user
    gc.collect()

    id = 'msno'
    colList = userDscrtAttr.columns.tolist()
    colList.remove(id)
    userWithTag = tagCombine(userDscrtAttr, id='msno', tagColList=colList)

    (userTagmatrix, userNoAttr) = findNetwork(userWithTag, fillnawith, split=r"&|\|")

    # users with no tag get -1 on their row so they relate to all the others
    for row in userNoAttr:
        userTagmatrix[row, :] = -1

    fileplace = "C:\\Users\\22560\\Desktop\\"
    LargeSparseMatrixCosine(userTagmatrix, userNoAttr, num=2,
                            fileplace=fileplace, prefix="user")

    # prepare largeDisMatrix
    userCntnueAttr.set_index("msno", inplace=True)
    largeMatrixDis(userCntnueAttr.values, num=2,
                   netFilePlace=fileplace, prefix="user")

    save_pickle(user_id_dict, fileplace + "user_id_dict")
    # return user_id_dict
record = []
counter = 0
# `iter` is expected to be an iterable of (lambda_user, lambda_item, factor_num) tuples
for parameters in iter:
    lambda_user, lambda_item, factor_num = parameters
    print("choosing parameters are : \n",
          "lambda_user : ", lambda_user, "\n",
          "lambda_item : ", lambda_item, "\n",
          "factor_num : ", factor_num, "\n")

    # trainAndValidation returns the RMSE on the training set and the validation set
    rmse, rmse_vali = trainAndValidation(
        trainFilePlace, prepare_path, prepare_name, transpose_prepare_name,
        factor_num, method, iteration_num, user_loop_num, item_loop_num,
        lambda_user, lambda_item, latentFilePlace, validationFilePlace)

    result = OrderedDict()
    result['parameters'] = parameters
    result['rmse'] = rmse[-1]
    result['rmse_vali'] = rmse_vali[-1]

    counter = counter + 1
    record.append(result)
    if counter % 5 == 0:
        save_pickle(record, "record.rcd")

minRmse = min([i['rmse_vali'] for i in record])
for i in record:
    if i['rmse_vali'] == minRmse:
        print(i)

load_pickle("record.rcd")
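# A minimal sketch of how the parameter iterable consumed by the grid-search loop
# above could be built. The grid values here are assumptions for illustration only;
# the name `iter` shadows the builtin, matching the loop's variable name.
import itertools

lambda_users = [0.01, 0.05, 0.1]
lambda_items = [0.01, 0.05, 0.1]
factor_nums = [5, 10, 20]

# each element is a (lambda_user, lambda_item, factor_num) tuple,
# matching the unpacking at the top of the grid-search loop
iter = itertools.product(lambda_users, lambda_items, factor_nums)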