def func_main():
    """Movie-based recall task.

    For every incremental comment/rating row, look up similar movies and
    insert each one the user has not already rated into the recmovie table.
    """
    print('start process movie based recall task:' + str(datetime.datetime.now()))
    start_time = datetime.datetime.now()
    # NOTE(review): this connection is never used -- it is overwritten two
    # lines below and leaks.
    conn = common.get_connection()
    # Incremental comment/rating data (ID, MOVIEID, USERID).
    df_comment_new_data_ldim = get_comment_data()
    conn = common.get_connection()
    for i in df_comment_new_data_ldim.index:
        print(df_comment_new_data_ldim.iloc[i]['MOVIEID'],
              df_comment_new_data_ldim.iloc[i]['USERID'])
        # Movies similar to the one just rated (item-based similarity table).
        ibmovie_list = get_ibmovie_by_movieid(
            df_comment_new_data_ldim.iloc[i]['MOVIEID'], conn)
        for j in ibmovie_list:
            # j[0] is presumably the candidate movie id -- confirm against
            # get_ibmovie_by_movieid's row layout.
            is_exist = exist_in_comment(
                j[0], df_comment_new_data_ldim.iloc[i]['USERID'], conn)
            if is_exist:
                # User already rated (i.e. watched) this candidate: skip it.
                print('exist in comment')
            else:
                insert_or_update_recmovie(
                    j[0], df_comment_new_data_ldim.iloc[i]['USERID'],
                    df_comment_new_data_ldim.iloc[i]['MOVIEID'], conn)
        #update_comment_new_data_flag(df_comment_new_data_ldim.iloc[i]['ID'], conn)
        # Commit once per processed comment row.
        conn.commit()
    end_time = datetime.datetime.now()
    print(end_time - start_time)
    print('finish process movie based recall task:' + str(datetime.datetime.now()))
def func_main():
    """Movie-based recall task (duplicate definition of func_main above;
    NOTE(review): in Python the later definition wins at import time)."""
    print('start process movie based recall task:' + str(datetime.datetime.now()))
    start_time = datetime.datetime.now()
    conn = common.get_connection()
    # Fetch the incremental comment/rating data.
    df_comment_new_data_ldim = get_comment_data()
    conn = common.get_connection()
    # Iterate over the rating rows.
    for i in df_comment_new_data_ldim.index:
        print(df_comment_new_data_ldim.iloc[i]['MOVIEID'],
              df_comment_new_data_ldim.iloc[i]['USERID'])
        # Look up similar movies from the similarity recommendations.
        ibmovie_list = get_ibmovie_by_movieid(
            df_comment_new_data_ldim.iloc[i]['MOVIEID'], conn)
        for j in ibmovie_list:
            is_exist = exist_in_comment(
                j[0], df_comment_new_data_ldim.iloc[i]['USERID'], conn)
            # If the user has already rated (watched) the candidate movie,
            # do not recommend it.  Otherwise insert it into the recmovie
            # table (the insert helper itself checks existence and updates
            # the row instead when it is already present).
            if is_exist:
                print('exist in comment')
            else:
                insert_or_update_recmovie(
                    j[0], df_comment_new_data_ldim.iloc[i]['USERID'],
                    df_comment_new_data_ldim.iloc[i]['MOVIEID'], conn)
        #update_comment_new_data_flag(df_comment_new_data_ldim.iloc[i]['ID'], conn)
        conn.commit()
    end_time = datetime.datetime.now()
    print(end_time - start_time)
    print('finish process movie based recall task:' + str(datetime.datetime.now()))
def insert_or_update_one_userproex(userid, rmax, rmin, ravg, rmedium, rcount, rsum):
    """Upsert one row of per-user rating statistics into userproex_new.

    If a row for *userid* already exists it is updated, otherwise a new row
    is inserted.  *rmedium* is written to the ``rmedian`` column.

    BUG FIX: the original closed the connection inside the second ``except``
    block AND again after it, so any write error double-closed the
    connection (a second close on an already-closed connection raises).
    The single close now lives in ``finally``.
    """
    print('insert_or_update_one_userproex start')
    is_insert = True
    connection = common.get_connection()
    try:
        try:
            # Existence probe decides between INSERT and UPDATE.
            # NOTE(review): SQL built with %-formatting; acceptable for
            # trusted ids, but parameterized queries would be safer.
            sql = 'select count(*) from userproex_new where userid=\'%s\'' % userid
            with connection.cursor() as cursor:
                cursor.execute(sql)
                connection.commit()
                res_cnt = cursor.fetchall()[0][0]
                if res_cnt > 0:
                    is_insert = False
        except Exception as e:
            # On probe failure fall through assuming insert (original behavior).
            print('query for exist info error' + str(e))
        if is_insert:
            sql = 'insert into userproex_new(userid, rmax, rmin, ravg, rcount, rsum, rmedian) values(\'%s\', %s, %s, %s, %s, %s, %s)' % (
                userid, rmax, rmin, ravg, rcount, rsum, rmedium)
        else:
            sql = 'update userproex_new set rmax=%s, rmin=%s, ravg=%s, rmedian=%s, rcount=%s, rsum=%s where userid=\'%s\'' % (
                rmax, rmin, ravg, rmedium, rcount, rsum, userid)
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql)
                connection.commit()
                print(('insert userid:' if is_insert else 'update userid:') +
                      str(userid) + ' success.')
        except Exception as e:
            print(e)
    finally:
        # Close exactly once on every path.
        connection.close()
def get_user_list_in_comment(is_all_or_sample, process_func, insert_or_update_func):
    """For each distinct commenting user whose userproex_new.rmedian is 0,
    invoke process_func(userid, insert_or_update_func, connection).

    is_all_or_sample: truthy -> process every matching user;
                      falsy  -> only the first 10 (sample run).
    """
    connection = common.get_connection()
    if is_all_or_sample:
        sql = 'select DISTINCT userid from comment_new where userid is not null and userid in (select userid from userproex_new where rmedian=0)'
    else:
        sql = 'select DISTINCT userid from comment_new where userid is not null and userid in (select userid from userproex_new where rmedian=0) limit 0,10'
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            connection.commit()
            process_size = 0
            # Stream rows one at a time; commit every 1000 processed users.
            while True:
                r = cursor.fetchone()
                if r is None:
                    break
                process_func(r[0], insert_or_update_func, connection)
                process_size += 1
                if process_size >= 1000:
                    connection.commit()
                    process_size = 0
            connection.commit()
    except Exception as e:
        print(e)
        # NOTE(review): closing here AND after the try double-closes the
        # connection on the error path -- the second close will raise.
        connection.close()
    connection.close()
def load_main_df_from_csv():
    """Load the latest joined comment/movie CSV (path taken from the
    'csv_last_url' config key) and reduce it to the model feature columns.

    Returns a DataFrame with RATING, movieid, USERID, the user statistics
    columns and a derived TIME_DIS column (rating year minus 2000).
    """
    conn = common.get_connection()
    csv_url = cfg.get_config_property('csv_last_url', conn)
    df = pd.read_csv(csv_url, sep='\t', encoding='utf-8')
    df = df.drop_duplicates()
    # 'Unnamed: 0' is the index column left behind by a previous to_csv.
    df = df.drop(['Unnamed: 0'], axis=1)
    df = df.drop_duplicates(['ID'])
    df = df.drop(['CONTENT'], axis=1)
    df = df.drop(['ADD_TIME_x', 'ADD_TIME_y'], axis=1)
    df = df.reset_index(drop=True)
    # Drop descriptive columns that carry no signal for the model.
    df_main = df.drop(
        ['name', 'CREATOR', 'description', 'img', 'ID', 'NEWDATA'], axis=1)
    df_main = df_main.rename(columns={'MOVIEID': 'movieid'})
    df_main = df_main.drop(['enable'], axis=1)
    # datetime.datetime.strptime(df['TIME'][0],'%Y-%m-%d %H:%M:%S').year - 2000
    df_main = df_main.dropna(subset=['USERID', 'rcount']).reset_index(
        drop=True)

    def process_time(t):
        # Years since 2000; an unparsable timestamp is printed and yields
        # None (NaN in the resulting column).
        try:
            return datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S').year - 2000
        except Exception as e:
            print(e)

    df_main['TIME_DIS'] = df_main['TIME'].apply(lambda x: process_time(x))
    df_main = df_main.drop(['TIME'], axis=1)
    df_main = df_main.drop(['userid'], axis=1)
    return df_main
def process_offline_compute_by_cosdis(rec_per_num, item_similarity, df_sim, df_orgin):
    """Offline pass: for every movie, persist its rec_per_num nearest
    neighbours (smallest values in distance matrix df_sim) into ibmovie,
    skipping movies that already have exactly 200 stored recommendations."""
    # +1 because a movie is its own nearest neighbour and gets filtered out.
    rec_per_num += 1
    connection = common.get_connection()
    for i in range(0, item_similarity.shape[0]):
        # Column i of the distance matrix: smaller value = more similar.
        df_sim_p = df_sim.nsmallest(rec_per_num, i)
        df_sim_p = df_sim_p[i]
        movie_id = df_orgin.iloc[i]['id']
        recmovie_cnt = get_recmovie_cnt_by_movieid(movie_id, connection)
        # NOTE(review): strict equality -- a movie with e.g. 199 or 201 rows
        # is recomputed; confirm that is intended.
        if recmovie_cnt == 200:
            print('pass...')
            continue
        print('new...')
        time_now = datetime.datetime.now()
        for rec_movie_item in df_sim_p.to_dict().items():
            # Skip the movie itself (index i).
            if rec_movie_item[0] != i:
                rec_movie_index = rec_movie_item[0]
                rec_movie_sim = rec_movie_item[1]
                rec_movie_id = df_orgin.iloc[rec_movie_index]['id']
                rec_movie_rat = df_orgin.iloc[rec_movie_index]['rat']
                insert_one_ibmovie(id_=uuid.uuid4(),
                                   movieid=movie_id,
                                   recmovieid=rec_movie_id,
                                   recmovierat=rec_movie_rat,
                                   simrat=rec_movie_sim,
                                   time=time_now,
                                   enable='1',
                                   connection=connection)
        connection.commit()
def get_dicts():
    """Build the actor and director frequency dictionaries from the movie
    table; the type/trait slots are currently disabled and returned as None."""
    connection = common.get_connection()
    movies = pd.read_sql_query("select * from movie", connection)
    actor_freq = get_dim_dict(movies, 'actors')
    director_freq = get_dim_dict(movies, 'director')
    # type_dict / trait_dict generation is switched off for now.
    return None, actor_freq, director_freq, None
def get_comment_data():
    """Fetch the incremental (newdata = 1) comment rows and keep only the
    ID, MOVIEID and USERID columns."""
    connection = common.get_connection()
    new_comments = pd.read_sql_query(
        "select * from comment_new where newdata = 1 ", connection)
    return new_comments.loc[:, ['ID', 'MOVIEID', 'USERID']]
def process_task(): _conn = common.get_connection() # update_recmovie_rat('1', '1', _conn, 'FM') # os._exit(0) train_file_scaling = cfg.get_config_property('train_file_fm_url', _conn) test_file_scaling = cfg.get_config_property('test_file_fm_url', _conn) # test_fm_by_test_data(train_file_scaling, test_file_scaling) df_data = get_recmovie_by_movie_based() actor_dict_data, director_dict_data, vectorizer, scaler = get_saved_actors_dict_director_dict_vectorizer( ) dict_list = convert_dataframe_2_dict_list(df_data, actor_dict_data, director_dict_data) X_predict = vectorizer.transform(dict_list) predict_file_ = cfg.get_config_property('dir_base_url', _conn) + 'X_predict.txt' # FM PART # 把 X_predict 处理成libsvm格式,供libfm使用 dump_svmlight_file(scaler.transform(X_predict), np.zeros(X_predict.shape[0]), predict_file_) libfm_predict_final = fm(train_file_scaling, predict_file_, classification=False) update_fm_rat(df_data, libfm_predict_final) # LR PART train_file_lr_path = cfg.get_config_property('train_file_lr_url', _conn) test_file_lr_path = cfg.get_config_property('test_file_lr_url', _conn) train_X_lr, train_y = get_data(train_file_lr_path) # test_X_lr, test_y = get_data(test_file_lr_path) print(train_X_lr.shape) lr = LogisticRegression(C=0.1, penalty='l2') lr.fit(train_X_lr, train_y) # test_predict = vectorizer.transform([{'尼泊尔': 1}, # {'赵本山': 1, '赵薇': 1, '张曼玉': 1, 'rat': '8.0', # 'ravg': 3.85714, # 'rcount': 7.0, # 'rmax': 5.0, # 'rmedian': 4.0, # 'rmin': 2.0, # 'rsum': 27.0}, # {'克里斯·派恩': 1, '扎克瑞·昆图': 1, '佐伊·索尔达娜': 1,'西蒙·佩吉':1, '安东·叶利钦':1, '林诣彬':1 , # '美国':1, # 'rat': '8.0', # 'ravg': 3.85714, # 'rcount': 7.0, # 'rmax': 5.0, # 'rmedian': 4.0, # 'rmin': 2.0, # 'rsum': 27.0}]) # print(lr.predict_proba(test_predict)) lr_predict_final = lr.predict_proba(X_predict) update_lr_rat(df_data, lr_predict_final.tolist()) print(lr.classes_)
def process_per_user(userid, insert_or_update_func):
    """Compute one user's rating statistics (median, max, min, avg, count,
    sum) from comment_new and hand them to insert_or_update_func."""
    connection = common.get_connection()
    rmedian = 0
    rmax = 0
    rmin = 0
    ravg = 0
    rcount = 0
    rsum = 0
    sql = 'select rating from comment_new where userid=\'%s\'' % userid
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            rlist = cursor.fetchall()
            # Median is computed client-side from all of the user's ratings.
            rmedian = np.median(
                np.array(list(map(lambda x: x[0], rlist))).astype(float))
    except Exception as e:
        print(e)
        # NOTE(review): the connection is closed here but is still used by
        # the aggregate query below -- on this error path that query fails.
        connection.close()
    sql = 'select max(rating) as rmax, min(rating) as rmin, avg(rating) as ravg, count(rating) as rcount, sum(rating) as rsum from comment_new where userid=\'%s\'' % userid
    try:
        count = 0
        with connection.cursor() as cursor:
            count = cursor.execute(sql)
            if count > 0:
                r = cursor.fetchone()
                rmax, rmin, ravg, rcount, rsum = r[0], r[1], r[2], r[3], r[4]
    except Exception as e:
        print(e)
        connection.close()
    # Normalize NULL aggregates (user with no ratings) and a NaN median to 0.
    if rmax is None:
        rmax = 0
    if rmin is None:
        rmin = 0
    if ravg is None:
        ravg = 0
    if rcount is None:
        rcount = 0
    if rsum is None:
        rsum = 0
    if math.isnan(rmedian):
        rmedian = 0
    insert_or_update_func(userid=userid,
                          rmax=rmax,
                          rmin=rmin,
                          ravg=ravg,
                          rmedium=rmedian,
                          rcount=rcount,
                          rsum=rsum)
    connection.close()
def update_fm_rat(df_data, libfm_predict_final):
    """Write libfm predictions back to recmovie rows as the 'FM' rating.

    NOTE(review): only the first character of the stringified prediction is
    kept (str(r)[0:1]), so e.g. '3.75' is stored as '3'.  A later definition
    of update_fm_rat in this file passes the raw value instead -- confirm
    which behavior is intended (the later definition wins at import time).
    """
    conn = common.get_connection()
    index_ = 0
    for r in libfm_predict_final:
        r_temp = str(r)
        try:
            # Truncate to the leading digit of the prediction.
            r_temp = str(r)[0:1]
        except Exception as e:
            print("Update FM rat exception")
            print(e)
        # Rows of df_data are assumed to line up 1:1 with the predictions.
        update_recmovie_rat(df_data.iloc[index_]['id'], r_temp, conn, 'FM')
        index_ += 1
def process_per_movie(movieid, sim_matrix, df_origin, rec_per_num=201):
    """Incremental similarity update for a single movie.

    First stores the rec_per_num nearest neighbours of *movieid* as its own
    recommendations, then walks every other movie whose distance to it is
    below 0.80482 and inserts *movieid* into THAT movie's recommendation
    list whenever it beats the movie's current weakest recommendation.
    """
    df_sim_matrix = pd.DataFrame(data=sim_matrix)
    print(df_sim_matrix.shape)
    connection = common.get_connection()
    time_now = datetime.datetime.now()
    for i in range(0, sim_matrix.shape[0]):
        # Locate the row of df_origin that corresponds to movieid.
        if df_origin.iloc[i]['id'] == movieid:
            print(df_origin.iloc[i])
            # Nearest neighbours of the movie itself (smallest distances).
            df_sim_p_self = df_sim_matrix.nsmallest(rec_per_num, i)
            df_sim_p_self = df_sim_p_self[i]
            for rec_movie_item in df_sim_p_self.to_dict().items():
                if rec_movie_item[0] != i:
                    rec_movie_index = rec_movie_item[0]
                    rec_movie_dis = rec_movie_item[1]
                    rec_movie_id = df_origin.iloc[rec_movie_index]['id']
                    rec_movie_rat = df_origin.iloc[rec_movie_index]['rat']
                    insert_one_ibmovie(id_=uuid.uuid4(),
                                       movieid=movieid,
                                       recmovieid=rec_movie_id,
                                       recmovierat=rec_movie_rat,
                                       simrat=rec_movie_dis,
                                       time=time_now,
                                       enable='1',
                                       connection=connection)
            # Reverse direction: movies close enough to this one
            # (distance below the 0.80482 threshold) may want movieid
            # in their own recommendation lists.
            df_sim_p = df_sim_matrix[i]
            sim_item_list = df_sim_p.to_dict().items()
            sim_item_list_new = list(
                filter(lambda x: True if float(x[1]) < 0.80482 else False,
                       sim_item_list))
            for sim_movie_item in sim_item_list_new:
                if sim_movie_item[0] != i:
                    sim_movie_index = sim_movie_item[0]
                    sim_movie_dis = sim_movie_item[1]
                    sime_movie_id = df_origin.iloc[sim_movie_index]['id']
                    sim_movie_rat = df_origin.iloc[sim_movie_index]['rat']
                    # Current weakest stored recommendation of that movie;
                    # index [2] is presumably the DECIMAL(5,5) distance --
                    # confirm against get_min_sim_movie_by_movieid's SELECT.
                    min_sim_movie4self = get_min_sim_movie_by_movieid(
                        sime_movie_id)
                    print(min_sim_movie4self)
                    print(sim_movie_dis)
                    if sim_movie_dis < min_sim_movie4self[2]:
                        print('insert as recmovie...')
                        insert_one_ibmovie(id_=uuid.uuid4(),
                                           movieid=sime_movie_id,
                                           recmovieid=movieid,
                                           recmovierat=sim_movie_rat,
                                           simrat=sim_movie_dis,
                                           time=time_now,
                                           enable='1',
                                           connection=connection)
def get_recmovie_by_movie_based():
    """Load recmovie rows joined with movie metadata and per-user statistics,
    cleaned up into the shape the prediction models expect."""
    conn = common.get_connection()
    sql = 'select * from recmovie left join movie on movie.id=recmovie.movieid left join userproex_new on userproex_new.userid=recmovie.userid'
    df_data = pd.read_sql_query(sql, conn)
    # NOTE(review): drops a column literally named 0 -- presumably an
    # artifact of the join; confirm against the actual result schema.
    df_data = df_data.drop([0], axis=1)
    #df_data = df_data.rename(columns = {'userid':'USERID'})
    #df_data['USERID'].fillna(0)
    print(df_data.columns)
    # The multi-join produces duplicate column names; keep the first of each.
    df_data = df_data.loc[:, ~df_data.columns.duplicated()]
    #print(df_data.shape)
    df_data = df_data.rename(columns={'userid': 'USERID'})
    # TIME_DIS is not derivable here; zero-fill to match the training schema.
    df_data['TIME_DIS'] = np.zeros(df_data.shape[0])
    df_data = df_data.fillna(0)
    return df_data
def test_fm_by_test_data(train_file, test_file):
    """Evaluate the FM model on held-out data, printing the MSE of both the
    raw predictions and their integer-truncated form."""
    connection = common.get_connection()
    raw_predictions = pd.Series(fm(train_file, test_file, classification=False))
    truncated_predictions = raw_predictions.apply(lambda value: int(value))
    _, labels = get_data(test_file)
    print('MSE(to int):' +
          str(mean_squared_error(labels, truncated_predictions.tolist())))
    print('MSE(origin):' +
          str(mean_squared_error(labels, raw_predictions.tolist())))
def get_min_sim_movie_by_movieid(movieid):
    """Return the (id, simrat, simrat_5) row with the largest similarity
    distance among the stored recommendations of *movieid*, or None when the
    movie has no rows (or the query fails).

    BUG FIX: the original returned from inside the try block without ever
    closing the connection (leak on every successful call), and on the error
    path closed it twice (the second close raises on an already-closed
    connection).  Cleanup now happens exactly once in ``finally``.
    """
    connection = common.get_connection()
    # NOTE(review): %-formatted SQL; parameterized queries would be safer.
    sql = 'select id,simrat,convert(simrat , DECIMAL(5,5)) as simrat_5 from ibmovie where movieid = \'%s\' order by convert(simrat , DECIMAL(5,5)) desc limit 0,10' % movieid
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            # fetchone() already yields None when there is no row.
            return cursor.fetchone()
    except Exception as e:
        print(e)
        return None
    finally:
        connection.close()
def fm(train_file, test_file, classification=True, rank=10, n_iter=150):
    """Run the external libfm binary (MCMC method) on the given libsvm files
    and return its predictions as a flat numpy array.

    classification: True -> libfm task 'c', False -> regression task 'r'.
    rank: latent-factor dimensionality (the third value of -dim).
    n_iter: number of MCMC iterations.
    """
    conn = common.get_connection()
    # Path of the libfm executable and the working/output directory come
    # from the config table.
    libfm = cfg.get_config_property('lib_fm_path', conn)
    task = 'c' if classification else 'r'
    base_dir = cfg.get_config_property('dir_base_url', conn)
    cmd_ = '%s -task %s -method mcmc -train %s -test %s -iter %s -dim \'1,1,%s\' -out %soutput_.libfm' % (
        libfm, task, train_file, test_file, n_iter, rank, base_dir)
    #console_output = !$LIBFM_PATH -task $task -method als -regular '0,0,10' -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -save_model recsysmode.fm -out output_.libfm
    #console_output = !$LIBFM_PATH -task $task -method sgd -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -save_model recsysmode.fm -out output_.libfm
    print(libfm)
    console_output = run(cmd_)
    print(console_output)
    # libfm writes one prediction per line to output_.libfm.
    libfm_predict = pd.read_csv('%soutput_.libfm' % base_dir,
                                header=None).values.flatten()
    return libfm_predict
def get_recmovie_cnt_by_movieid(movieid, connection):
    """Return how many ibmovie rows exist for *movieid*, 0 when there are
    none, or implicitly None after a database error."""
    sql = 'select count(*) from ibmovie group by movieid having movieid=\'%s\'' % movieid
    try:
        with connection.cursor() as cursor:
            cout = cursor.execute(sql)
            if cout == 0:
                # GROUP BY produced no group for this movieid.
                return 0
            return cursor.fetchone()[0]
    except Exception as e:
        print('exception@get_recmovie_by_movieid:' + str(e))
        try:
            # Best-effort cleanup of the caller's (possibly broken) connection.
            connection.cursor().close()
            connection.close()
            print('closed')
        except Exception as e1:
            print('exception1@get_recmovie_by_movieid:' + str(e1))
        # NOTE(review): rebinding the parameter has no effect outside this
        # function -- the caller is left holding the closed connection, and
        # this path returns None rather than a count.
        connection = common.get_connection()
def get_saved_actors_dict_director_dict_vectorizer():
    """Load the pickled actors dict, director dict, DictVectorizer and
    scaler from the file paths stored in the config table.

    Returns (actors_dict, director_dict, vectorizer, scaler).
    """
    connection = common.get_connection()
    vectorizer_path = cfg.get_config_property('dict2vec', connection)
    actors_path = cfg.get_config_property('actors_dict', connection)
    director_path = cfg.get_config_property('director_dict', connection)
    scaler_path = cfg.get_config_property('scaler', connection)

    def _unpickle(path):
        # Deserialize one persisted artifact.
        with open(path, 'rb') as fh:
            return pkl.load(fh)

    vectorizer = _unpickle(vectorizer_path)
    return (_unpickle(actors_path), _unpickle(director_path), vectorizer,
            _unpickle(scaler_path))
def process_by_movie_change_log(process_func, sim_matrix, df_origin):
    """Consume up to 10 unpulled movie-change ('m') events from the mqlog
    table, run process_func(movieid, sim_matrix, df_origin) for each, and
    mark the event as pulled."""
    connection = common.get_connection()
    sql = 'select * from mqlog where logtype = \'m\' and pulled = 0 limit 0,10'
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            while True:
                r = cursor.fetchone()
                if r is None:
                    break
                # r[2] is presumably the JSON message payload; r[0] the log
                # row id -- confirm against the mqlog schema.
                message = json.loads(r[2])
                movieid = message['movieid']
                print('process movie\'s id is:' + movieid)
                process_func(movieid, sim_matrix, df_origin)
                # Mark the event as consumed with a second cursor so the
                # outer fetch loop is not disturbed.
                with connection.cursor() as cursor4update:
                    update_sql = 'update mqlog set pulled=1 where id=\'%s\'' % r[0]
                    cursor4update.execute(update_sql)
                    connection.commit()
    except Exception as e:
        print(e)
        # NOTE(review): closing here AND after the try double-closes the
        # connection on the error path.
        connection.close()
    connection.close()
def run(cmd):
    """Run an external command, echo its combined stdout/stderr, and return
    that output as a string (the original printed it but returned None).

    The command executes with shell=False from <dir_base_url>/tmp/, with
    output captured through a spooled temporary file.

    Raises RuntimeError when the process cannot be started or its output
    cannot be read.
    """
    import shlex

    conn = common.get_connection()
    base_dir = cfg.get_config_property('dir_base_url', conn)
    temp_dir = base_dir + os.sep + 'tmp' + os.sep
    out_temp = tempfile.SpooledTemporaryFile(max_size=10 * 1000 * 1000)
    final_temp_dir = temp_dir + os.sep
    try:
        fileno = out_temp.fileno()
        # BUG FIX: with shell=False, a command *string* that contains
        # arguments is treated as a single executable name on POSIX and the
        # launch fails; split it into an argv list (shlex honors the quoted
        # '1,1,rank' argument built by fm()).
        p = subprocess.Popen(shlex.split(cmd),
                             shell=False,
                             cwd=final_temp_dir,
                             stdout=fileno,
                             stderr=fileno,
                             universal_newlines=True)
        p.wait()
        out_temp.seek(0)
        output = out_temp.read().decode('utf8', 'replace')
        print(output)
        return output
    except Exception as e:
        raise RuntimeError('run error: %s' % str(e))
    finally:
        if out_temp:
            out_temp.close()
def get_user_list_in_log(process_func, insert_or_update_func):
    """Consume up to 10 unpulled user-change ('u') events from the mqlog
    table, run process_func(userid, insert_or_update_func) for each, and
    mark the event as pulled."""
    connection = common.get_connection()
    sql = 'select * from mqlog where logtype = \'u\' and pulled = 0 limit 0,10'
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            while True:
                r = cursor.fetchone()
                if r is None:
                    break
                # r[2] is presumably the JSON message payload; r[0] the log
                # row id -- confirm against the mqlog schema.
                message = json.loads(r[2])
                userid = message['userid']
                print('process user\'s id is:' + userid)
                process_func(userid, insert_or_update_func)
                # Mark the event as consumed with a second cursor so the
                # outer fetch loop is not disturbed.
                with connection.cursor() as cursor4update:
                    update_sql = 'update mqlog set pulled=1 where id=\'%s\'' % r[0]
                    cursor4update.execute(update_sql)
                    connection.commit()
    except Exception as e:
        print(e)
        # NOTE(review): closing here AND after the try double-closes the
        # connection on the error path.
        connection.close()
    connection.close()
def update_lr_rat(df_data, lr_predict_final):
    """Persist each LR positive-class probability as the 'LR' rating of the
    matching recmovie row (rows of df_data line up 1:1 with predictions)."""
    connection = common.get_connection()
    for row_pos, proba_pair in enumerate(lr_predict_final):
        # proba_pair[1] is the probability of the positive class.
        update_recmovie_rat(df_data.iloc[row_pos]['id'], proba_pair[1],
                            connection, 'LR')
def update_fm_rat(df_data, libfm_predict_final):
    """Persist each raw FM prediction as the 'FM' rating of the matching
    recmovie row (rows of df_data line up 1:1 with predictions)."""
    connection = common.get_connection()
    for row_pos, prediction in enumerate(libfm_predict_final):
        update_recmovie_rat(df_data.iloc[row_pos]['id'], prediction,
                            connection, 'FM')
def process_task():
    """Rebuild the FM and LR train/test datasets from the newest comment CSV,
    fit the DictVectorizer and scaler, dump libsvm files, and store all
    artifact paths in the config table.  Skips work when the CSV has not
    changed since the last run (tracked via the module-level cache)."""
    global csv_url_cache
    start_time = datetime.datetime.now()
    print('start process comment to libsvm task:' + str(datetime.datetime.now()))
    conn = common.get_connection()
    csv_url = cfg.get_config_property('csv_last_url', conn)
    if csv_url_cache is None:
        csv_url_cache = csv_url
    elif csv_url_cache == csv_url:
        print('there is no new comment csv...')
        return
    # Load the dataset from the csv file.
    data_frame_main = load_main_df_from_csv()
    conn = common.get_connection()
    # Load the dimension-frequency dictionary objects and persist them.
    _, actors_dict_, director_dict_, _ = get_dicts()
    actors_dict_save_url = cfg.get_config_property('actors_dict', conn)
    director_dict_save_url = cfg.get_config_property('director_dict', conn)
    with open(actors_dict_save_url, 'wb') as f:
        pkl.dump(actors_dict_, f)
    with open(director_dict_save_url, 'wb') as f:
        pkl.dump(director_dict_, f)
    train_y = data_frame_main['RATING']
    data_frame_main = data_frame_main.drop(['RATING'], axis=1)
    # Get the whole dataset in dict-of-features form.
    dict_data_list = get_dict_list(data_frame_main, actors_dict_,
                                   director_dict_)
    # Vectorize the dict-shaped samples.
    v = DictVectorizer()
    train_X = v.fit_transform(dict_data_list)
    # Fixed positional split: first 280k rows train, remainder test.
    train_X_ = train_X[0:280000]
    train_y_ = train_y[:280000]
    test_X_ = train_X[280000:]
    test_y_ = train_y[280000:]
    print(train_X_.shape)
    # LR train/test labels: a rating greater than 3 means the user likes
    # the movie.
    train_y_lr_ = train_y_.apply(lambda x: 1 if int(x) > 3 else 0)
    test_y_lr_ = test_y_.apply(lambda x: 1 if int(x) > 3 else 0)
    # Max-abs normalization (fitted on the full matrix).
    scaler = preprocessing.MaxAbsScaler()
    scaler.fit(train_X)
    train_X_scaling = scaler.transform(train_X_)
    test_X_scaling = scaler.transform(test_X_)
    train_X_lr = train_X_  # no scale
    test_X_lr = test_X_  # no scale
    time_now_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    train_file_fm_base_url = cfg.get_config_property('train_file_fm_t_url',
                                                     conn)
    test_file_fm_base_url = cfg.get_config_property('test_file_fm_t_url',
                                                    conn)
    train_file_fm = train_file_fm_base_url % time_now_str
    test_file_fm = test_file_fm_base_url % time_now_str
    # Dump the FM datasets in libsvm format.
    dump_svmlight_file(train_X_scaling, train_y_, train_file_fm)
    dump_svmlight_file(test_X_scaling, test_y_, test_file_fm)
    train_file_lr_base_url = cfg.get_config_property('train_file_lr_t_url',
                                                     conn)
    test_file_lr_base_url = cfg.get_config_property('test_file_lr_t_url',
                                                    conn)
    train_file_lr = train_file_lr_base_url % time_now_str
    test_file_lr = test_file_lr_base_url % time_now_str
    # Dump the LR datasets in libsvm format.
    dump_svmlight_file(train_X_lr, train_y_lr_, train_file_lr)
    dump_svmlight_file(test_X_lr, test_y_lr_, test_file_lr)
    # Record the freshly written artifact paths for the scoring task.
    cfg.set_config_property(train_file_fm, 'train_file_fm_url', conn)
    cfg.set_config_property(test_file_fm, 'test_file_fm_url', conn)
    cfg.set_config_property(train_file_lr, 'train_file_lr_url', conn)
    cfg.set_config_property(test_file_lr, 'test_file_lr_url', conn)
    dict2vec_save_url = cfg.get_config_property('dict2vec', conn)
    with open(dict2vec_save_url, 'wb') as f:
        pkl.dump(v, f)
    scaler_save_url = cfg.get_config_property('scaler', conn)
    with open(scaler_save_url, 'wb') as f:
        pkl.dump(scaler, f)
    end_time = datetime.datetime.now()
    print(end_time - start_time)
    print('finish process comment to libsvm task:' + str(datetime.datetime.now()))
def get_movie_data():
    """Load the movie table and return (full_frame, feature_frame), where
    the feature frame drops the non-feature columns."""
    full_frame = pd.read_sql_query("select * from movie",
                                   common.get_connection())
    feature_frame = full_frame.drop(columns=['ADD_TIME', 'enable', 'rat',
                                             'id', 'name'])
    return full_frame, feature_frame
# Module-level smoke test: vectorize a hand-built feature dict with the
# unpickled DictVectorizer and show its non-zero entries.
predict_x = v_from_pkl.transform({'美国': 1, '日本1': 1})
print(predict_x[predict_x != 0])


def run(cmd):
    """Execute *cmd* from <dir_base_url>/tmp/ and print its combined
    stdout/stderr (duplicate of the run() defined earlier in this file).

    NOTE(review): with shell=False a command *string* containing arguments
    is treated as a single executable name on POSIX, so a full libfm
    command line with flags would fail to launch -- confirm how cmd is
    formed before relying on this.

    Raises RuntimeError when the process cannot be started or read.
    """
    conn = common.get_connection()
    base_dir = cfg.get_config_property('dir_base_url', conn)
    temp_dir = base_dir + os.sep + 'tmp' + os.sep
    # Spool output in memory up to ~10 MB before falling back to disk.
    out_temp = tempfile.SpooledTemporaryFile(max_size=10 * 1000 * 1000)
    final_temp_dir = temp_dir + os.sep
    try:
        fileno = out_temp.fileno()
        p = subprocess.Popen(cmd,
                             shell=False,
                             cwd=final_temp_dir,
                             stdout=fileno,
                             stderr=fileno,
                             universal_newlines=True)
        p.wait()
        out_temp.seek(0)
        print(out_temp.read().decode('utf8', 'replace'))
    except Exception as e:
        raise RuntimeError('run error: %s' % str(e))
    finally:
        if out_temp:
            out_temp.close()


# Launch the configured libfm binary once at module import time.
cmd = cfg.get_config_property('lib_fm_path', common.get_connection())
run(cmd)