Example #1
def process_task():
    _conn = common.get_connection()
    # update_recmovie_rat('1', '1', _conn, 'FM')
    # os._exit(0)
    train_file_scaling = cfg.get_config_property('train_file_fm_url', _conn)
    test_file_scaling = cfg.get_config_property('test_file_fm_url', _conn)
    # test_fm_by_test_data(train_file_scaling, test_file_scaling)
    df_data = get_recmovie_by_movie_based()
    actor_dict_data, director_dict_data, vectorizer, scaler = get_saved_actors_dict_director_dict_vectorizer(
    )
    dict_list = convert_dataframe_2_dict_list(df_data, actor_dict_data,
                                              director_dict_data)
    X_predict = vectorizer.transform(dict_list)
    predict_file_ = cfg.get_config_property('dir_base_url',
                                            _conn) + 'X_predict.txt'

    # FM PART
    # Convert X_predict to libsvm format so that libfm can consume it
    dump_svmlight_file(scaler.transform(X_predict),
                       np.zeros(X_predict.shape[0]), predict_file_)
    libfm_predict_final = fm(train_file_scaling,
                             predict_file_,
                             classification=False)
    update_fm_rat(df_data, libfm_predict_final)

    # LR PART
    train_file_lr_path = cfg.get_config_property('train_file_lr_url', _conn)
    test_file_lr_path = cfg.get_config_property('test_file_lr_url', _conn)

    train_X_lr, train_y = get_data(train_file_lr_path)
    # test_X_lr, test_y = get_data(test_file_lr_path)
    print(train_X_lr.shape)

    lr = LogisticRegression(C=0.1, penalty='l2')
    lr.fit(train_X_lr, train_y)

    # test_predict = vectorizer.transform([{'尼泊尔': 1},
    #                                      {'赵本山': 1, '赵薇': 1, '张曼玉': 1, 'rat': '8.0',
    #                                                   'ravg': 3.85714,
    #                                                   'rcount': 7.0,
    #                                                   'rmax': 5.0,
    #                                                   'rmedian': 4.0,
    #                                                   'rmin': 2.0,
    #                                                   'rsum': 27.0},
    #                                      {'克里斯·派恩': 1, '扎克瑞·昆图': 1, '佐伊·索尔达娜': 1,'西蒙·佩吉':1, '安东·叶利钦':1, '林诣彬':1 ,
    #                                                  '美国':1,
    #                                                   'rat': '8.0',
    #                                                   'ravg': 3.85714,
    #                                                   'rcount': 7.0,
    #                                                   'rmax': 5.0,
    #                                                   'rmedian': 4.0,
    #                                                   'rmin': 2.0,
    #                                                   'rsum': 27.0}])
    # print(lr.predict_proba(test_predict))

    lr_predict_final = lr.predict_proba(X_predict)
    update_lr_rat(df_data, lr_predict_final.tolist())
    print(lr.classes_)
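# Example #1 reads the LR training set back through a project helper named
# get_data that is not shown in these snippets. A minimal sketch of what such a
# reader presumably looks like, using sklearn's load_svmlight_file (the real
# helper may differ):
from sklearn.datasets import load_svmlight_file


def get_data(path):
    # Hypothetical stand-in for the project's get_data: load a libsvm-format
    # file and return (features, labels) the way Example #1 unpacks it.
    X, y = load_svmlight_file(path)
    return X, y


# Illustrative path; the real one comes from cfg.get_config_property('train_file_lr_url', ...).
train_X_lr, train_y = get_data('/data/recsys/train_lr.libsvm')
print(train_X_lr.shape, train_y.shape)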
Example #2
def fm(train_file, test_file, classification=True, rank=10, n_iter=150):
    conn = common.get_connection()
    libfm = cfg.get_config_property('lib_fm_path', conn)
    task = 'c' if classification else 'r'
    base_dir = cfg.get_config_property('dir_base_url', conn)
    cmd_ = '%s -task %s -method mcmc -train %s -test %s -iter %s -dim \'1,1,%s\' -out %soutput_.libfm' % (
        libfm, task, train_file, test_file, n_iter, rank, base_dir)
    #console_output = !$LIBFM_PATH -task $task -method als -regular '0,0,10' -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -save_model recsysmode.fm -out output_.libfm
    #console_output = !$LIBFM_PATH -task $task -method sgd -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -save_model recsysmode.fm -out output_.libfm
    print(libfm)
    console_output = run(cmd_)
    print(console_output)
    libfm_predict = pd.read_csv('%soutput_.libfm' % base_dir,
                                header=None).values.flatten()
    return libfm_predict
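# A hedged usage sketch for fm(): once Example #6 has written the scaled libsvm
# training file and Example #1 has dumped the prediction matrix, the call
# reduces to the following. File names are illustrative; the real ones come
# from the config table via cfg.get_config_property.
train_file = '/data/recsys/train_fm_2024-01-01-00-00-00.libsvm'
predict_file = '/data/recsys/X_predict.txt'

# Regression task, 150 MCMC iterations, rank-10 pairwise factors.
preds = fm(train_file, predict_file, classification=False, rank=10, n_iter=150)
print(preds[:10])  # numpy array with one predicted rating per row of predict_file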
Example #3
def load_main_df_from_csv():
    conn = common.get_connection()
    csv_url = cfg.get_config_property('csv_last_url', conn)
    df = pd.read_csv(csv_url, sep='\t', encoding='utf-8')
    df = df.drop_duplicates()
    df = df.drop(['Unnamed: 0'], axis=1)
    df = df.drop_duplicates(['ID'])
    df = df.drop(['CONTENT'], axis=1)
    df = df.drop(['ADD_TIME_x', 'ADD_TIME_y'], axis=1)
    df = df.reset_index(drop=True)
    df_main = df.drop(
        ['name', 'CREATOR', 'description', 'img', 'ID', 'NEWDATA'], axis=1)
    df_main = df_main.rename(columns={'MOVIEID': 'movieid'})
    df_main = df_main.drop(['enable'], axis=1)
    # datetime.datetime.strptime(df['TIME'][0],'%Y-%m-%d %H:%M:%S').year - 2000
    df_main = df_main.dropna(subset=['USERID', 'rcount']).reset_index(
        drop=True)

    def process_time(t):
        try:
            return datetime.datetime.strptime(t,
                                              '%Y-%m-%d %H:%M:%S').year - 2000
        except Exception as e:
            print(e)

    df_main['TIME_DIS'] = df_main['TIME'].apply(lambda x: process_time(x))
    df_main = df_main.drop(['TIME'], axis=1)
    df_main = df_main.drop(['userid'], axis=1)
    return df_main
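# The TIME_DIS column above encodes a timestamp as "years since 2000". A
# self-contained check of that transform on a toy frame (values invented):
import datetime

import pandas as pd


def process_time(t):
    # Same transform as in load_main_df_from_csv: year of the timestamp minus 2000.
    try:
        return datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S').year - 2000
    except Exception as e:
        print(e)


toy = pd.DataFrame({'TIME': ['2016-05-01 12:30:00', 'not a timestamp']})
print(toy['TIME'].apply(process_time))
# 0    16.0
# 1     NaN   (the parse error is printed and None becomes NaN)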
Example #4
def get_saved_actors_dict_director_dict_vectorizer():
    conn = common.get_connection()
    dict2vec_url = cfg.get_config_property('dict2vec', conn)
    actors_dict_url = cfg.get_config_property('actors_dict', conn)
    director_dict_url = cfg.get_config_property('director_dict', conn)
    scaler_url = cfg.get_config_property('scaler', conn)

    with open(dict2vec_url, 'rb') as f:
        v_from_pkl = pkl.load(f)

    with open(actors_dict_url, 'rb') as f:
        actors_dict = pkl.load(f)

    with open(director_dict_url, 'rb') as f:
        director_dict = pkl.load(f)

    with open(scaler_url, 'rb') as f:
        scaler = pkl.load(f)

    return actors_dict, director_dict, v_from_pkl, scaler
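# A hedged usage sketch for the loader above: the pickled DictVectorizer and
# MaxAbsScaler were fitted in Example #6, so new samples should only ever be
# pushed through transform, never refitted. The feature dict is illustrative
# and the pickles are assumed to already exist on disk.
actors_dict, director_dict, vectorizer, scaler = \
    get_saved_actors_dict_director_dict_vectorizer()

sample = [{'美国': 1, 'ravg': 3.9, 'rcount': 7.0}]
X_new = scaler.transform(vectorizer.transform(sample))
print(X_new.shape)  # (1, n_features) sparse row, ready for prediction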
Example #5
def run(cmd):
    conn = common.get_connection()
    base_dir = cfg.get_config_property('dir_base_url', conn)
    temp_dir = base_dir + os.sep + 'tmp' + os.sep
    out_temp = tempfile.SpooledTemporaryFile(max_size=10 * 1000 * 1000)
    final_temp_dir = temp_dir + os.sep
    try:
        fileno = out_temp.fileno()
        # With shell=False, Popen needs an argument list rather than a single
        # command string (a bare string only works for an executable path with
        # no flags), so split string commands first; shlex is in the standard library.
        args = shlex.split(cmd) if isinstance(cmd, str) else cmd
        p = subprocess.Popen(args,
                             shell=False,
                             cwd=final_temp_dir,
                             stdout=fileno,
                             stderr=fileno,
                             universal_newlines=True)
        p.wait()
        out_temp.seek(0)
        # Return the captured console output as well as printing it, so callers
        # such as fm() actually receive something in console_output.
        console_output = out_temp.read().decode('utf8', 'replace')
        print(console_output)
        return console_output
    except Exception as e:
        raise RuntimeError('run error: %s' % str(e))
    finally:
        if out_temp:
            out_temp.close()
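# A quick usage sketch for run(): because shell=False is used, the helper
# splits string commands into an argument list before executing them, so a full
# libFM command line can be passed as a single string. Paths are illustrative
# and the <dir_base_url>/tmp working directory must already exist.
output = run("/opt/libfm/bin/libFM -task r -method mcmc "
             "-train train_fm.libsvm -test predict_fm.libsvm "
             "-iter 150 -dim '1,1,10' -out output_.libfm")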
Example #6
def process_task():
    global csv_url_cache
    start_time = datetime.datetime.now()
    print('start process comment to libsvm task:' +
          str(datetime.datetime.now()))

    conn = common.get_connection()
    csv_url = cfg.get_config_property('csv_last_url', conn)
    if csv_url_cache is None:
        csv_url_cache = csv_url
    elif csv_url_cache == csv_url:
        print('there is no new comment csv...')
        return
    # Load the dataset from the csv file
    data_frame_main = load_main_df_from_csv()
    conn = common.get_connection()
    # Load the actor/director frequency dictionaries
    _, actors_dict_, director_dict_, _ = get_dicts()
    actors_dict_save_url = cfg.get_config_property('actors_dict', conn)
    director_dict_save_url = cfg.get_config_property('director_dict', conn)
    with open(actors_dict_save_url, 'wb') as f:
        pkl.dump(actors_dict_, f)
    with open(director_dict_save_url, 'wb') as f:
        pkl.dump(director_dict_, f)

    train_y = data_frame_main['RATING']
    data_frame_main = data_frame_main.drop(['RATING'], axis=1)
    # Get the full dataset as a list of feature dicts
    dict_data_list = get_dict_list(data_frame_main, actors_dict_,
                                   director_dict_)

    # Vectorize the dict-form data
    v = DictVectorizer()
    train_X = v.fit_transform(dict_data_list)

    train_X_ = train_X[0:280000]
    train_y_ = train_y[:280000]
    test_X_ = train_X[280000:]
    test_y_ = train_y[280000:]

    print(train_X_.shape)

    # Build logistic-regression labels for the train/test splits: a rating above 3 means the user liked the movie
    train_y_lr_ = train_y_.apply(lambda x: 1 if int(x) > 3 else 0)
    test_y_lr_ = test_y_.apply(lambda x: 1 if int(x) > 3 else 0)

    # Scale features by their maximum absolute value (MaxAbsScaler)
    scaler = preprocessing.MaxAbsScaler()
    scaler.fit(train_X)
    train_X_scaling = scaler.transform(train_X_)
    test_X_scaling = scaler.transform(test_X_)

    train_X_lr = train_X_  # no scale
    test_X_lr = test_X_  # no scale

    time_now_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    train_file_fm_base_url = cfg.get_config_property('train_file_fm_t_url',
                                                     conn)
    test_file_fm_base_url = cfg.get_config_property('test_file_fm_t_url', conn)
    train_file_fm = train_file_fm_base_url % time_now_str
    test_file_fm = test_file_fm_base_url % time_now_str

    # Dump to libsvm-format files for libFM
    dump_svmlight_file(train_X_scaling, train_y_, train_file_fm)
    dump_svmlight_file(test_X_scaling, test_y_, test_file_fm)

    train_file_lr_base_url = cfg.get_config_property('train_file_lr_t_url',
                                                     conn)
    test_file_lr_base_url = cfg.get_config_property('test_file_lr_t_url', conn)
    train_file_lr = train_file_lr_base_url % time_now_str
    test_file_lr = test_file_lr_base_url % time_now_str

    # Dump to libsvm-format files for logistic regression
    dump_svmlight_file(train_X_lr, train_y_lr_, train_file_lr)
    dump_svmlight_file(test_X_lr, test_y_lr_, test_file_lr)

    cfg.set_config_property(train_file_fm, 'train_file_fm_url', conn)
    cfg.set_config_property(test_file_fm, 'test_file_fm_url', conn)
    cfg.set_config_property(train_file_lr, 'train_file_lr_url', conn)
    cfg.set_config_property(test_file_lr, 'test_file_lr_url', conn)

    dict2vec_save_url = cfg.get_config_property('dict2vec', conn)
    with open(dict2vec_save_url, 'wb') as f:
        pkl.dump(v, f)

    scaler_save_url = cfg.get_config_property('scaler', conn)
    with open(scaler_save_url, 'wb') as f:
        pkl.dump(scaler, f)

    end_time = datetime.datetime.now()
    print(end_time - start_time)
    print('finish process comment to libsvm task:' +
          str(datetime.datetime.now()))
predict_x = v_from_pkl.transform({'美国': 1, '日本1': 1})
print(predict_x[predict_x != 0])
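# The two module-level lines above sanity-check the unpickled DictVectorizer:
# transform only activates columns for keys seen during fit, so a key that was
# presumably never seen (such as '日本1') silently contributes nothing. A toy
# demonstration with an invented vocabulary:
from sklearn.feature_extraction import DictVectorizer

v = DictVectorizer()
v.fit([{'美国': 1, '日本': 1, 'ravg': 3.9}])   # toy vocabulary

x = v.transform({'美国': 1, '日本1': 1})       # '日本1' was never seen at fit time
print(x[x != 0])             # only the '美国' column is non-zero
print(v.feature_names_)      # ['ravg', '日本', '美国']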


def run(cmd):
    conn = common.get_connection()
    base_dir = cfg.get_config_property('dir_base_url', conn)
    temp_dir = base_dir + os.sep + 'tmp' + os.sep
    out_temp = tempfile.SpooledTemporaryFile(max_size=10 * 1000 * 1000)
    final_temp_dir = temp_dir + os.sep
    try:
        fileno = out_temp.fileno()
        # With shell=False, Popen needs an argument list rather than a single
        # command string (a bare string only works for an executable path with
        # no flags), so split string commands first; shlex is in the standard library.
        args = shlex.split(cmd) if isinstance(cmd, str) else cmd
        p = subprocess.Popen(args,
                             shell=False,
                             cwd=final_temp_dir,
                             stdout=fileno,
                             stderr=fileno,
                             universal_newlines=True)
        p.wait()
        out_temp.seek(0)
        # Return the captured console output as well as printing it.
        console_output = out_temp.read().decode('utf8', 'replace')
        print(console_output)
        return console_output
    except Exception as e:
        raise RuntimeError('run error: %s' % str(e))
    finally:
        if out_temp:
            out_temp.close()


cmd = cfg.get_config_property('lib_fm_path', common.get_connection())
run(cmd)
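# Taken together, the examples read like a two-stage batch job: process_task in
# Example #6 rebuilds the libsvm files whenever a new comment csv appears (the
# csv_url_cache check), and process_task in Example #1 retrains FM/LR and
# refreshes the stored ratings. A hedged sketch of how the two tasks might be
# driven; the module names and the interval are invented for illustration.
import time

import comment_to_libsvm_task as prepare   # hypothetical module for Example #6
import train_and_predict_task as rate      # hypothetical module for Example #1

while True:
    prepare.process_task()   # refresh libsvm files only if a new csv arrived
    rate.process_task()      # retrain FM/LR and push updated movie ratings
    time.sleep(60 * 60)      # hourly; the real schedule is an assumption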