def process_task():
    _conn = common.get_connection()
    train_file_scaling = cfg.get_config_property('train_file_fm_url', _conn)
    test_file_scaling = cfg.get_config_property('test_file_fm_url', _conn)
    # test_fm_by_test_data(train_file_scaling, test_file_scaling)
    df_data = get_recmovie_by_movie_based()
    actor_dict_data, director_dict_data, vectorizer, scaler = \
        get_saved_actors_dict_director_dict_vectorizer()
    dict_list = convert_dataframe_2_dict_list(df_data, actor_dict_data,
                                              director_dict_data)
    X_predict = vectorizer.transform(dict_list)
    predict_file_ = cfg.get_config_property('dir_base_url', _conn) + 'X_predict.txt'

    # FM part: dump the scaled X_predict in libsvm format for libFM.
    # The target column is unused at predict time, so zeros are fine.
    dump_svmlight_file(scaler.transform(X_predict),
                       np.zeros(X_predict.shape[0]), predict_file_)
    libfm_predict_final = fm(train_file_scaling, predict_file_,
                             classification=False)
    update_fm_rat(df_data, libfm_predict_final)

    # LR part: train a logistic regression on the unscaled libsvm data.
    train_file_lr_path = cfg.get_config_property('train_file_lr_url', _conn)
    train_X_lr, train_y = get_data(train_file_lr_path)
    print(train_X_lr.shape)
    lr = LogisticRegression(C=0.1, penalty='l2')
    lr.fit(train_X_lr, train_y)
    lr_predict_final = lr.predict_proba(X_predict)
    update_lr_rat(df_data, lr_predict_final.tolist())
    print(lr.classes_)
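
# A hedged, self-contained sketch (not part of the pipeline) of the libsvm
# round-trip process_task relies on: dump_svmlight_file writes a sparse matrix
# plus a target column to disk, and load_svmlight_file reads it back. At
# predict time the target is unused, so a zero vector is a valid placeholder.
# The path and toy data are illustrative only.
def _demo_libsvm_roundtrip():
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file

    X_toy = csr_matrix([[0.0, 1.5], [2.0, 0.0]])
    dump_svmlight_file(X_toy, np.zeros(X_toy.shape[0]), '/tmp/X_toy.txt')
    X_back, y_back = load_svmlight_file('/tmp/X_toy.txt')
    print(X_back.toarray(), y_back)  # values round-trip; y is all zeros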
def fm(train_file, test_file, classification=True, rank=10, n_iter=150):
    conn = common.get_connection()
    libfm = cfg.get_config_property('lib_fm_path', conn)
    task = 'c' if classification else 'r'
    base_dir = cfg.get_config_property('dir_base_url', conn)
    # Run libFM with MCMC; -dim '1,1,rank' enables the global bias, the
    # one-way interactions, and rank-dimensional pairwise factors.
    cmd_ = "%s -task %s -method mcmc -train %s -test %s -iter %s " \
           "-dim '1,1,%s' -out %soutput_.libfm" % (
               libfm, task, train_file, test_file, n_iter, rank, base_dir)
    # Alternative solvers tried earlier (kept for reference):
    #   -method als -regular '0,0,10'
    #   -method sgd
    print(libfm)
    console_output = run(cmd_)
    print(console_output)
    # libFM writes one prediction per line to the -out file.
    libfm_predict = pd.read_csv('%soutput_.libfm' % base_dir,
                                header=None).values.flatten()
    return libfm_predict
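
# Hypothetical usage sketch for fm(). The file paths are assumptions (any
# libsvm-format train/predict pair written by dump_svmlight_file will do),
# and the libFM binary must exist at the configured lib_fm_path.
def _demo_fm_usage():
    train_file = '/tmp/train_fm.txt'      # assumed libsvm-format file
    predict_file = '/tmp/predict_fm.txt'  # assumed libsvm-format file
    # Regression task, 10 latent factors, 100 MCMC iterations.
    preds = fm(train_file, predict_file, classification=False,
               rank=10, n_iter=100)
    print(preds[:5])  # one float prediction per row of the predict file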
def load_main_df_from_csv():
    conn = common.get_connection()
    csv_url = cfg.get_config_property('csv_last_url', conn)
    df = pd.read_csv(csv_url, sep='\t', encoding='utf-8')
    df = df.drop_duplicates()
    df = df.drop(['Unnamed: 0'], axis=1)
    df = df.drop_duplicates(['ID'])
    df = df.drop(['CONTENT'], axis=1)
    df = df.drop(['ADD_TIME_x', 'ADD_TIME_y'], axis=1)
    df = df.reset_index(drop=True)
    df_main = df.drop(
        ['name', 'CREATOR', 'description', 'img', 'ID', 'NEWDATA'], axis=1)
    df_main = df_main.rename(columns={'MOVIEID': 'movieid'})
    df_main = df_main.drop(['enable'], axis=1)
    df_main = df_main.dropna(subset=['USERID', 'rcount']).reset_index(
        drop=True)

    def process_time(t):
        # Discretize the timestamp to "years since 2000"; malformed
        # timestamps are logged and yield None rather than raising.
        try:
            return datetime.datetime.strptime(
                t, '%Y-%m-%d %H:%M:%S').year - 2000
        except Exception as e:
            print(e)
            return None

    df_main['TIME_DIS'] = df_main['TIME'].apply(process_time)
    df_main = df_main.drop(['TIME', 'userid'], axis=1)
    return df_main
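
# A quick check of the TIME_DIS discretization used above: timestamps are
# reduced to "years since 2000", so '2016-05-01 12:00:00' maps to 16.
def _demo_time_dis():
    import datetime
    t = '2016-05-01 12:00:00'
    year_offset = datetime.datetime.strptime(
        t, '%Y-%m-%d %H:%M:%S').year - 2000
    print(year_offset)  # -> 16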
def get_saved_actors_dict_director_dict_vectorizer():
    conn = common.get_connection()
    dict2vec_url = cfg.get_config_property('dict2vec', conn)
    actors_dict_url = cfg.get_config_property('actors_dict', conn)
    director_dict_url = cfg.get_config_property('director_dict', conn)
    scaler_url = cfg.get_config_property('scaler', conn)
    # Load the pickled artifacts produced by the training task.
    with open(dict2vec_url, 'rb') as f:
        v_from_pkl = pkl.load(f)
    with open(actors_dict_url, 'rb') as f:
        actors_dict = pkl.load(f)
    with open(director_dict_url, 'rb') as f:
        director_dict = pkl.load(f)
    with open(scaler_url, 'rb') as f:
        scaler = pkl.load(f)
    return actors_dict, director_dict, v_from_pkl, scaler
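
# A minimal sketch of why reloading the fitted DictVectorizer matters:
# transform() keeps the feature indices learned at fit time and silently
# ignores keys it never saw, so train-time and predict-time columns line up.
# The toy dicts below are illustrative only.
def _demo_dictvectorizer_stability():
    from sklearn.feature_extraction import DictVectorizer

    v = DictVectorizer()
    v.fit([{'美国': 1, 'rcount': 7.0}])
    x = v.transform({'美国': 1, 'unseen_actor': 1})  # unseen key is dropped
    print(v.feature_names_, x.toarray())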
import shlex  # needed to split the command string in run() below


def run(cmd):
    conn = common.get_connection()
    base_dir = cfg.get_config_property('dir_base_url', conn)
    final_temp_dir = os.path.join(base_dir, 'tmp') + os.sep
    os.makedirs(final_temp_dir, exist_ok=True)
    # Buffer the child's output in memory, spilling to disk past ~10 MB.
    out_temp = tempfile.SpooledTemporaryFile(max_size=10 * 1000 * 1000)
    try:
        fileno = out_temp.fileno()
        # shell=False needs an argv list, so split the command string;
        # shlex also strips the quoting around arguments like -dim '1,1,10'.
        p = subprocess.Popen(shlex.split(cmd), shell=False,
                             cwd=final_temp_dir, stdout=fileno,
                             stderr=fileno)
        p.wait()
        out_temp.seek(0)
        return out_temp.read().decode('utf8', 'replace')
    except Exception as e:
        raise RuntimeError('run error: %s' % str(e))
    finally:
        out_temp.close()
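
# Self-contained sketch of the capture pattern run() uses: the child's
# stdout/stderr are pointed at a SpooledTemporaryFile (in memory up to
# max_size, then spilled to disk), which is read back after the process
# exits. 'echo' is a stand-in command, not part of the pipeline.
def _demo_capture_pattern():
    import subprocess
    import tempfile

    with tempfile.SpooledTemporaryFile(max_size=1024) as buf:
        p = subprocess.Popen(['echo', 'hello'], stdout=buf.fileno(),
                             stderr=buf.fileno())
        p.wait()
        buf.seek(0)
        print(buf.read().decode('utf8', 'replace'))  # -> hello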
def process_task():
    global csv_url_cache
    start_time = datetime.datetime.now()
    print('start process comment to libsvm task:' +
          str(datetime.datetime.now()))
    conn = common.get_connection()
    csv_url = cfg.get_config_property('csv_last_url', conn)
    # Skip the run entirely if the csv has not changed since last time.
    if csv_url_cache is None:
        csv_url_cache = csv_url
    elif csv_url_cache == csv_url:
        print('there is no new comment csv...')
        return

    # Load the dataset from the csv file.
    data_frame_main = load_main_df_from_csv()
    conn = common.get_connection()

    # Load the actor/director frequency dicts and persist them for the
    # prediction task.
    _, actors_dict_, director_dict_, _ = get_dicts()
    actors_dict_save_url = cfg.get_config_property('actors_dict', conn)
    director_dict_save_url = cfg.get_config_property('director_dict', conn)
    with open(actors_dict_save_url, 'wb') as f:
        pkl.dump(actors_dict_, f)
    with open(director_dict_save_url, 'wb') as f:
        pkl.dump(director_dict_, f)

    train_y = data_frame_main['RATING']
    data_frame_main = data_frame_main.drop(['RATING'], axis=1)

    # Turn the whole dataset into dict-form records, then vectorize them.
    dict_data_list = get_dict_list(data_frame_main, actors_dict_,
                                   director_dict_)
    v = DictVectorizer()
    train_X = v.fit_transform(dict_data_list)

    # Fixed-index train/test split at row 280000.
    train_X_ = train_X[0:280000]
    train_y_ = train_y[:280000]
    test_X_ = train_X[280000:]
    test_y_ = train_y[280000:]
    print(train_X_.shape)

    # For logistic regression, a rating above 3 marks a movie the user likes.
    train_y_lr_ = train_y_.apply(lambda x: 1 if int(x) > 3 else 0)
    test_y_lr_ = test_y_.apply(lambda x: 1 if int(x) > 3 else 0)

    # Max-abs scaling, used for the FM data only.
    scaler = preprocessing.MaxAbsScaler()
    scaler.fit(train_X)
    train_X_scaling = scaler.transform(train_X_)
    test_X_scaling = scaler.transform(test_X_)
    train_X_lr = train_X_  # no scaling for LR
    test_X_lr = test_X_    # no scaling for LR

    time_now_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    train_file_fm_base_url = cfg.get_config_property('train_file_fm_t_url',
                                                     conn)
    test_file_fm_base_url = cfg.get_config_property('test_file_fm_t_url',
                                                    conn)
    train_file_fm = train_file_fm_base_url % time_now_str
    test_file_fm = test_file_fm_base_url % time_now_str
    # Dump the FM train/test sets in libsvm format.
    dump_svmlight_file(train_X_scaling, train_y_, train_file_fm)
    dump_svmlight_file(test_X_scaling, test_y_, test_file_fm)

    train_file_lr_base_url = cfg.get_config_property('train_file_lr_t_url',
                                                     conn)
    test_file_lr_base_url = cfg.get_config_property('test_file_lr_t_url',
                                                    conn)
    train_file_lr = train_file_lr_base_url % time_now_str
    test_file_lr = test_file_lr_base_url % time_now_str
    # Dump the LR train/test sets in libsvm format.
    dump_svmlight_file(train_X_lr, train_y_lr_, train_file_lr)
    dump_svmlight_file(test_X_lr, test_y_lr_, test_file_lr)

    # Record the freshly written file paths for the downstream tasks.
    cfg.set_config_property(train_file_fm, 'train_file_fm_url', conn)
    cfg.set_config_property(test_file_fm, 'test_file_fm_url', conn)
    cfg.set_config_property(train_file_lr, 'train_file_lr_url', conn)
    cfg.set_config_property(test_file_lr, 'test_file_lr_url', conn)

    # Persist the vectorizer and scaler for the prediction task.
    dict2vec_save_url = cfg.get_config_property('dict2vec', conn)
    with open(dict2vec_save_url, 'wb') as f:
        pkl.dump(v, f)
    scaler_save_url = cfg.get_config_property('scaler', conn)
    with open(scaler_save_url, 'wb') as f:
        pkl.dump(scaler, f)

    end_time = datetime.datetime.now()
    print(end_time - start_time)
    print('finish process comment to libsvm task:' +
          str(datetime.datetime.now()))
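
# Why MaxAbsScaler here: it divides each column by its maximum absolute
# value, which never shifts zeros, so the sparse DictVectorizer output stays
# sparse (MinMaxScaler does not accept sparse input at all). A small sketch:
def _demo_maxabs_on_sparse():
    from scipy.sparse import csr_matrix
    from sklearn import preprocessing

    X = csr_matrix([[0.0, 4.0], [2.0, 0.0]])
    scaled = preprocessing.MaxAbsScaler().fit_transform(X)
    print(scaled.toarray())  # columns divided by 2.0 and 4.0; zeros kept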
# Ad-hoc smoke test: vectorize a dict with one known and one unknown key
# (DictVectorizer silently drops features it did not see during fit), then
# launch libFM once via run() to check that the binary is reachable.
predict_x = v_from_pkl.transform({'美国': 1, '日本1': 1})
print(predict_x[predict_x != 0])

cmd = cfg.get_config_property('lib_fm_path', common.get_connection())
run(cmd)