Example #1
def func_main():
    print('start process movie based recall task:' +
          str(datetime.datetime.now()))
    start_time = datetime.datetime.now()

    conn = common.get_connection()
    # Fetch the incremental comment/rating data
    df_comment_new_data_ldim = get_comment_data()
    # Iterate over the rating data
    for i in df_comment_new_data_ldim.index:
        row = df_comment_new_data_ldim.iloc[i]
        print(row['MOVIEID'], row['USERID'])
        # Look up similar movies from the item-based similarity table
        ibmovie_list = get_ibmovie_by_movieid(row['MOVIEID'], conn)
        for j in ibmovie_list:
            is_exist = exist_in_comment(j[0], row['USERID'], conn)
            # If the user has already rated (i.e. watched) the recommended
            # movie, skip it; otherwise insert it into the recmovie table
            # (the insert also checks for existence and updates if present).
            if is_exist:
                print('exist in comment')
            else:
                insert_or_update_recmovie(j[0], row['USERID'], row['MOVIEID'],
                                          conn)
        #update_comment_new_data_flag(row['ID'], conn)
        conn.commit()

    end_time = datetime.datetime.now()
    print(end_time - start_time)
    print('finish process movie based recall task:' +
          str(datetime.datetime.now()))
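All the examples on this page share a common.get_connection() helper that is not shown here. A minimal sketch of what it plausibly looks like, assuming PyMySQL (consistent with the with connection.cursor() usage and explicit commits below); host, credentials, and database name are placeholders:

import pymysql


def get_connection():
    # Hypothetical connection parameters; the real common module will differ.
    return pymysql.connect(host='localhost',
                           user='recsys',
                           password='recsys',
                           db='recsys',
                           charset='utf8mb4')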
Example #3
def insert_or_update_one_userproex(userid, rmax, rmin, ravg, rmedium, rcount,
                                   rsum):
    print('insert_or_update_one_userproex start')
    is_insert = True
    connection = common.get_connection()
    try:
        sql = 'select count(*) from userproex_new where userid=%s'
        with connection.cursor() as cursor:
            cursor.execute(sql, (userid,))
            # Fetch inside the with-block, before the cursor is closed
            res_cnt = cursor.fetchall()[0][0]
        connection.commit()
        if res_cnt > 0:
            is_insert = False
    except Exception as e:
        print('query for exist info error: ' + str(e))

    if is_insert:
        sql = ('insert into userproex_new(userid, rmax, rmin, ravg, rcount, '
               'rsum, rmedian) values(%s, %s, %s, %s, %s, %s, %s)')
        params = (userid, rmax, rmin, ravg, rcount, rsum, rmedium)
    else:
        sql = ('update userproex_new set rmax=%s, rmin=%s, ravg=%s, '
               'rmedian=%s, rcount=%s, rsum=%s where userid=%s')
        params = (rmax, rmin, ravg, rmedium, rcount, rsum, userid)

    try:
        with connection.cursor() as cursor:
            cursor.execute(sql, params)
        connection.commit()
        print(('insert userid:' if is_insert else 'update userid:') +
              str(userid) + ' success.')
    except Exception as e:
        print(e)
    finally:
        connection.close()
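As a side note, the select-then-insert/update above can be collapsed into one statement with MySQL's upsert syntax. A sketch, assuming userproex_new has a unique index on userid (not confirmed by this page):

sql = ('insert into userproex_new(userid, rmax, rmin, ravg, rcount, rsum, rmedian) '
       'values(%s, %s, %s, %s, %s, %s, %s) '
       'on duplicate key update rmax=values(rmax), rmin=values(rmin), '
       'ravg=values(ravg), rcount=values(rcount), rsum=values(rsum), '
       'rmedian=values(rmedian)')
with connection.cursor() as cursor:
    cursor.execute(sql, (userid, rmax, rmin, ravg, rcount, rsum, rmedium))
connection.commit()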
Example #4
def get_user_list_in_comment(is_all_or_sample, process_func,
                             insert_or_update_func):
    connection = common.get_connection()
    sql = ('select distinct userid from comment_new where userid is not null '
           'and userid in (select userid from userproex_new where rmedian=0)')
    if not is_all_or_sample:
        # Sample mode: only look at the first 10 users
        sql += ' limit 0,10'

    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            process_size = 0
            while True:
                r = cursor.fetchone()
                if r is None:
                    break
                process_func(r[0], insert_or_update_func, connection)
                process_size += 1
                # Commit every 1000 processed users
                if process_size >= 1000:
                    connection.commit()
                    process_size = 0
        connection.commit()
    except Exception as e:
        print(e)
    finally:
        connection.close()
Example #5
def load_main_df_from_csv():
    conn = common.get_connection()
    csv_url = cfg.get_config_property('csv_last_url', conn)
    df = pd.read_csv(csv_url, sep='\t', encoding='utf-8')
    df = df.drop_duplicates()
    df = df.drop(['Unnamed: 0'], axis=1)
    df = df.drop_duplicates(['ID'])
    df = df.drop(['CONTENT'], axis=1)
    df = df.drop(['ADD_TIME_x', 'ADD_TIME_y'], axis=1)
    df = df.reset_index(drop=True)
    df_main = df.drop(
        ['name', 'CREATOR', 'description', 'img', 'ID', 'NEWDATA'], axis=1)
    df_main = df_main.rename(columns={'MOVIEID': 'movieid'})
    df_main = df_main.drop(['enable'], axis=1)
    # datetime.datetime.strptime(df['TIME'][0],'%Y-%m-%d %H:%M:%S').year - 2000
    df_main = df_main.dropna(subset=['USERID', 'rcount']).reset_index(
        drop=True)

    def process_time(t):
        try:
            return datetime.datetime.strptime(t,
                                              '%Y-%m-%d %H:%M:%S').year - 2000
        except Exception as e:
            print(e)

    df_main['TIME_DIS'] = df_main['TIME'].apply(process_time)
    df_main = df_main.drop(['TIME', 'userid'], axis=1)
    return df_main
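For reference, process_time compresses the comment timestamp into a two-digit year offset:

# process_time('2016-03-08 20:15:00') returns 2016 - 2000 = 16;
# an unparseable value prints the exception and yields None (NaN in TIME_DIS).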
Example #6
def process_offline_compute_by_cosdis(rec_per_num, item_similarity, df_sim,
                                      df_orgin):
    # +1 because nsmallest also returns the movie itself, which is skipped
    # in the loop below.
    rec_per_num += 1
    connection = common.get_connection()
    for i in range(0, item_similarity.shape[0]):
        # The rec_per_num movies with the smallest cosine distance to movie i
        df_sim_p = df_sim.nsmallest(rec_per_num, i)[i]
        movie_id = df_orgin.iloc[i]['id']
        recmovie_cnt = get_recmovie_cnt_by_movieid(movie_id, connection)
        # Skip movies that already have a full (200-entry) recommendation list
        if recmovie_cnt == 200:
            print('pass...')
            continue
        print('new...')
        time_now = datetime.datetime.now()
        for rec_movie_item in df_sim_p.to_dict().items():
            if rec_movie_item[0] != i:
                rec_movie_index = rec_movie_item[0]
                rec_movie_sim = rec_movie_item[1]
                rec_movie_id = df_orgin.iloc[rec_movie_index]['id']
                rec_movie_rat = df_orgin.iloc[rec_movie_index]['rat']
                insert_one_ibmovie(id_=uuid.uuid4(),
                                   movieid=movie_id,
                                   recmovieid=rec_movie_id,
                                   recmovierat=rec_movie_rat,
                                   simrat=rec_movie_sim,
                                   time=time_now,
                                   enable='1',
                                   connection=connection)
        connection.commit()
Example #7
def get_dicts():
    conn = common.get_connection()
    df_movie = pd.read_sql_query("select * from movie", conn)
    #type_dict = get_dim_dict(df_movie, 'type')
    actors_dict = get_dim_dict(df_movie, 'actors')
    director_dict = get_dim_dict(df_movie, 'director')
    #trait_dict = get_dim_dict(df_movie, 'trait')
    return None, actors_dict, director_dict, None
Example #8
def get_comment_data():
    conn = common.get_connection()
    df_comment_new_data = pd.read_sql_query(
        "select * from comment_new where newdata = 1 ", conn)
    df_comment_new_data_ldim = df_comment_new_data.loc[:, [
        'ID', 'MOVIEID', 'USERID'
    ]]
    return df_comment_new_data_ldim
Example #9
def process_task():
    _conn = common.get_connection()
    # update_recmovie_rat('1', '1', _conn, 'FM')
    # os._exit(0)
    train_file_scaling = cfg.get_config_property('train_file_fm_url', _conn)
    test_file_scaling = cfg.get_config_property('test_file_fm_url', _conn)
    # test_fm_by_test_data(train_file_scaling, test_file_scaling)
    df_data = get_recmovie_by_movie_based()
    (actor_dict_data, director_dict_data, vectorizer,
     scaler) = get_saved_actors_dict_director_dict_vectorizer()
    dict_list = convert_dataframe_2_dict_list(df_data, actor_dict_data,
                                              director_dict_data)
    X_predict = vectorizer.transform(dict_list)
    predict_file_ = cfg.get_config_property('dir_base_url',
                                            _conn) + 'X_predict.txt'

    # FM PART
    # Convert X_predict to libsvm format for libfm to consume
    dump_svmlight_file(scaler.transform(X_predict),
                       np.zeros(X_predict.shape[0]), predict_file_)
    libfm_predict_final = fm(train_file_scaling,
                             predict_file_,
                             classification=False)
    update_fm_rat(df_data, libfm_predict_final)

    # LR PART
    train_file_lr_path = cfg.get_config_property('train_file_lr_url', _conn)
    test_file_lr_path = cfg.get_config_property('test_file_lr_url', _conn)

    train_X_lr, train_y = get_data(train_file_lr_path)
    # test_X_lr, test_y = get_data(test_file_lr_path)
    print(train_X_lr.shape)

    lr = LogisticRegression(C=0.1, penalty='l2')
    lr.fit(train_X_lr, train_y)

    # test_predict = vectorizer.transform([{'尼泊尔': 1},
    #                                      {'赵本山': 1, '赵薇': 1, '张曼玉': 1, 'rat': '8.0',
    #                                                   'ravg': 3.85714,
    #                                                   'rcount': 7.0,
    #                                                   'rmax': 5.0,
    #                                                   'rmedian': 4.0,
    #                                                   'rmin': 2.0,
    #                                                   'rsum': 27.0},
    #                                      {'克里斯·派恩': 1, '扎克瑞·昆图': 1, '佐伊·索尔达娜': 1,'西蒙·佩吉':1, '安东·叶利钦':1, '林诣彬':1 ,
    #                                                  '美国':1,
    #                                                   'rat': '8.0',
    #                                                   'ravg': 3.85714,
    #                                                   'rcount': 7.0,
    #                                                   'rmax': 5.0,
    #                                                   'rmedian': 4.0,
    #                                                   'rmin': 2.0,
    #                                                   'rsum': 27.0}])
    # print(lr.predict_proba(test_predict))

    lr_predict_final = lr.predict_proba(X_predict)
    update_lr_rat(df_data, lr_predict_final.tolist())
    print(lr.classes_)
Example #10
def process_per_user(userid, insert_or_update_func):
    connection = common.get_connection()

    rmedian = 0
    rmax = 0
    rmin = 0
    ravg = 0
    rcount = 0
    rsum = 0

    sql = 'select rating from comment_new where userid=%s'
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql, (userid,))
            rlist = cursor.fetchall()
        rmedian = np.median(
            np.array([x[0] for x in rlist]).astype(float))
    except Exception as e:
        print(e)

    sql = ('select max(rating) as rmax, min(rating) as rmin, '
           'avg(rating) as ravg, count(rating) as rcount, '
           'sum(rating) as rsum from comment_new where userid=%s')
    try:
        with connection.cursor() as cursor:
            count = cursor.execute(sql, (userid,))
            if count > 0:
                rmax, rmin, ravg, rcount, rsum = cursor.fetchone()
    except Exception as e:
        print(e)

    # SQL aggregates are NULL for a user with no ratings; fall back to 0.
    rmax, rmin, ravg, rcount, rsum = [
        0 if v is None else v for v in (rmax, rmin, ravg, rcount, rsum)
    ]
    if math.isnan(rmedian):
        rmedian = 0

    insert_or_update_func(userid=userid,
                          rmax=rmax,
                          rmin=rmin,
                          ravg=ravg,
                          rmedium=rmedian,
                          rcount=rcount,
                          rsum=rsum)
    connection.close()
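An equivalent way to get the same per-user statistics in a single round trip is to pull the ratings once and let pandas compute the aggregates. A hypothetical sketch, not part of the original pipeline (user_rating_stats is an illustrative name):

import pandas as pd


def user_rating_stats(userid, connection):
    # One query; all aggregates computed client-side
    df = pd.read_sql_query(
        'select rating from comment_new where userid=%(u)s',
        connection, params={'u': userid})
    r = df['rating'].astype(float)
    if r.empty:
        return dict(rmax=0, rmin=0, ravg=0, rmedium=0, rcount=0, rsum=0)
    return dict(rmax=r.max(), rmin=r.min(), ravg=r.mean(),
                rmedium=r.median(), rcount=int(r.count()), rsum=r.sum())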
Example #11
def update_fm_rat(df_data, libfm_predict_final):
    conn = common.get_connection()
    for index_, r in enumerate(libfm_predict_final):
        # Keep only the first character of the prediction, i.e. the integer
        # part for ratings in the 1-5 range.
        r_temp = str(r)[0:1]
        update_recmovie_rat(df_data.iloc[index_]['id'], r_temp, conn, 'FM')
Example #12
def process_per_movie(movieid, sim_matrix, df_origin, rec_per_num=201):
    df_sim_matrix = pd.DataFrame(data=sim_matrix)
    print(df_sim_matrix.shape)
    connection = common.get_connection()
    time_now = datetime.datetime.now()
    for i in range(0, sim_matrix.shape[0]):
        if df_origin.iloc[i]['id'] == movieid:
            print(df_origin.iloc[i])
            df_sim_p_self = df_sim_matrix.nsmallest(rec_per_num, i)
            df_sim_p_self = df_sim_p_self[i]
            for rec_movie_item in df_sim_p_self.to_dict().items():
                if rec_movie_item[0] != i:
                    rec_movie_index = rec_movie_item[0]
                    rec_movie_dis = rec_movie_item[1]
                    rec_movie_id = df_origin.iloc[rec_movie_index]['id']
                    rec_movie_rat = df_origin.iloc[rec_movie_index]['rat']
                    insert_one_ibmovie(id_=uuid.uuid4(),
                                       movieid=movieid,
                                       recmovieid=rec_movie_id,
                                       recmovierat=rec_movie_rat,
                                       simrat=rec_movie_dis,
                                       time=time_now,
                                       enable='1',
                                       connection=connection)

            # All movies whose cosine distance to movie i is below an
            # empirical threshold
            df_sim_p = df_sim_matrix[i]
            sim_item_list = df_sim_p.to_dict().items()
            sim_item_list_new = list(
                filter(lambda x: float(x[1]) < 0.80482, sim_item_list))
            for sim_movie_item in sim_item_list_new:
                if sim_movie_item[0] != i:
                    sim_movie_index = sim_movie_item[0]
                    sim_movie_dis = sim_movie_item[1]
                    sim_movie_id = df_origin.iloc[sim_movie_index]['id']
                    sim_movie_rat = df_origin.iloc[sim_movie_index]['rat']

                    min_sim_movie4self = get_min_sim_movie_by_movieid(
                        sim_movie_id)
                    print(min_sim_movie4self)
                    print(sim_movie_dis)
                    # Only insert if this movie is closer than the weakest
                    # existing recommendation (guard against a missing row).
                    if (min_sim_movie4self is not None
                            and sim_movie_dis < min_sim_movie4self[2]):
                        print('insert as recmovie...')
                        insert_one_ibmovie(id_=uuid.uuid4(),
                                           movieid=sim_movie_id,
                                           recmovieid=movieid,
                                           recmovierat=sim_movie_rat,
                                           simrat=sim_movie_dis,
                                           time=time_now,
                                           enable='1',
                                           connection=connection)
Example #13
def get_recmovie_by_movie_based():
    conn = common.get_connection()
    sql = 'select * from recmovie left join movie on movie.id=recmovie.movieid left join userproex_new on userproex_new.userid=recmovie.userid'
    df_data = pd.read_sql_query(sql, conn)
    df_data = df_data.drop([0], axis=1)
    #df_data = df_data.rename(columns = {'userid':'USERID'})
    #df_data['USERID'].fillna(0)
    print(df_data.columns)
    # The three-way join yields duplicate column labels (e.g. userid appears
    # in two tables); keep only the first occurrence of each.
    df_data = df_data.loc[:, ~df_data.columns.duplicated()]
    #print(df_data.shape)
    df_data = df_data.rename(columns={'userid': 'USERID'})
    df_data['TIME_DIS'] = np.zeros(df_data.shape[0])
    df_data = df_data.fillna(0)
    return df_data
Example #14
def test_fm_by_test_data(train_file, test_file):
    libfm_predict = fm(train_file, test_file, classification=False)

    libfm_predict_series = pd.Series(libfm_predict)
    libfm_predict_series_int = libfm_predict_series.apply(int)

    _, y = get_data(test_file)

    print('MSE(to int):' +
          str(mean_squared_error(y, libfm_predict_series_int.tolist())))
    print('MSE(origin):' +
          str(mean_squared_error(y, libfm_predict_series.tolist())))
Example #15
def get_min_sim_movie_by_movieid(movieid):
    connection = common.get_connection()
    sql = ('select id, simrat, convert(simrat, DECIMAL(5,5)) as simrat_5 '
           'from ibmovie where movieid=%s '
           'order by convert(simrat, DECIMAL(5,5)) desc limit 0,10')
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql, (movieid,))
            # fetchone() returns None when the movie has no entries yet
            return cursor.fetchone()
    except Exception as e:
        print(e)
        return None
    finally:
        connection.close()
Example #16
def fm(train_file, test_file, classification=True, rank=10, n_iter=150):
    conn = common.get_connection()
    libfm = cfg.get_config_property('lib_fm_path', conn)
    task = 'c' if classification else 'r'
    base_dir = cfg.get_config_property('dir_base_url', conn)
    cmd_ = '%s -task %s -method mcmc -train %s -test %s -iter %s -dim \'1,1,%s\' -out %soutput_.libfm' % (
        libfm, task, train_file, test_file, n_iter, rank, base_dir)
    #console_output = !$LIBFM_PATH -task $task -method als -regular '0,0,10' -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -save_model recsysmode.fm -out output_.libfm
    #console_output = !$LIBFM_PATH -task $task -method sgd -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -save_model recsysmode.fm -out output_.libfm
    print(libfm)
    console_output = run(cmd_)
    print(console_output)
    libfm_predict = pd.read_csv('%soutput_.libfm' % base_dir,
                                header=None).values.flatten()
    return libfm_predict
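For concreteness, with the default rank=10 and n_iter=150 the formatted cmd_ comes out roughly like this (binary and base paths are illustrative); run() (Example #20) splits the string into argv before executing it:

# /opt/libfm/bin/libFM -task r -method mcmc -train train.libfm \
#     -test test.libfm -iter 150 -dim '1,1,10' -out /data/output_.libfm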
Example #17
def get_recmovie_cnt_by_movieid(movieid, connection):
    sql = 'select count(*) from ibmovie where movieid=%s'
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql, (movieid,))
            return cursor.fetchone()[0]
    except Exception as e:
        print('exception@get_recmovie_cnt_by_movieid:' + str(e))
        # Treat errors as "no recommendations yet" so the caller proceeds.
        return 0
Example #18
def get_saved_actors_dict_director_dict_vectorizer():
    conn = common.get_connection()
    dict2vec_url = cfg.get_config_property('dict2vec', conn)
    actors_dict_url = cfg.get_config_property('actors_dict', conn)
    director_dict_url = cfg.get_config_property('director_dict', conn)
    scaler_url = cfg.get_config_property('scaler', conn)

    with open(dict2vec_url, 'rb') as f:
        v_from_pkl = pkl.load(f)

    with open(actors_dict_url, 'rb') as f:
        actors_dict = pkl.load(f)

    with open(director_dict_url, 'rb') as f:
        director_dict = pkl.load(f)

    with open(scaler_url, 'rb') as f:
        scaler = pkl.load(f)

    return actors_dict, director_dict, v_from_pkl, scaler
Example #19
def process_by_movie_change_log(process_func, sim_matrix, df_origin):
    connection = common.get_connection()
    sql = "select * from mqlog where logtype = 'm' and pulled = 0 limit 0,10"
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            while True:
                r = cursor.fetchone()
                if r is None:
                    break
                message = json.loads(r[2])
                movieid = message['movieid']
                print('process movie\'s id is:' + movieid)
                process_func(movieid, sim_matrix, df_origin)
                # Mark this log entry as consumed
                with connection.cursor() as cursor4update:
                    update_sql = 'update mqlog set pulled=1 where id=%s'
                    cursor4update.execute(update_sql, (r[0],))
        connection.commit()
    except Exception as e:
        print(e)
    finally:
        connection.close()
Example #20
def run(cmd):
    conn = common.get_connection()
    base_dir = cfg.get_config_property('dir_base_url', conn)
    temp_dir = base_dir + os.sep + 'tmp' + os.sep
    # Make sure the working directory exists before using it as cwd
    os.makedirs(temp_dir, exist_ok=True)
    out_temp = tempfile.SpooledTemporaryFile(max_size=10 * 1000 * 1000)
    try:
        fileno = out_temp.fileno()
        # shell=False expects an argv list, so split a command string first
        import shlex
        args = shlex.split(cmd) if isinstance(cmd, str) else cmd
        p = subprocess.Popen(args,
                             shell=False,
                             cwd=temp_dir,
                             stdout=fileno,
                             stderr=fileno)
        p.wait()
        out_temp.seek(0)
        print(out_temp.read().decode('utf8', 'replace'))
    except Exception as e:
        raise RuntimeError('run error: %s' % str(e))
    finally:
        out_temp.close()
Example #21
def get_user_list_in_log(process_func, insert_or_update_func):
    connection = common.get_connection()
    sql = "select * from mqlog where logtype = 'u' and pulled = 0 limit 0,10"

    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            while True:
                r = cursor.fetchone()
                if r is None:
                    break
                message = json.loads(r[2])
                userid = message['userid']
                print('process user\'s id is:' + userid)
                process_func(userid, insert_or_update_func)
                # Mark this log entry as consumed
                with connection.cursor() as cursor4update:
                    update_sql = 'update mqlog set pulled=1 where id=%s'
                    cursor4update.execute(update_sql, (r[0],))
        connection.commit()
    except Exception as e:
        print(e)
    finally:
        connection.close()
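Judging from this consumer and Example #19, the mqlog payload in r[2] is a small JSON document, presumably of the form:

# logtype 'u': {"userid": "..."}
# logtype 'm': {"movieid": "..."}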
Example #22
def update_lr_rat(df_data, lr_predict_final):
    conn = common.get_connection()
    for index_, r in enumerate(lr_predict_final):
        # r[1] is the predicted probability of the positive ("liked") class
        update_recmovie_rat(df_data.iloc[index_]['id'], r[1], conn, 'LR')
Example #23
def update_fm_rat(df_data, libfm_predict_final):
    conn = common.get_connection()
    for index_, r in enumerate(libfm_predict_final):
        update_recmovie_rat(df_data.iloc[index_]['id'], r, conn, 'FM')
Example #24
def process_task():
    global csv_url_cache
    start_time = datetime.datetime.now()
    print('start process comment to libsvm task:' +
          str(datetime.datetime.now()))

    conn = common.get_connection()
    csv_url = cfg.get_config_property('csv_last_url', conn)
    if csv_url_cache == csv_url:
        print('there is no new comment csv...')
        return
    # Remember which CSV we are processing so the next run can skip it
    csv_url_cache = csv_url
    # Load the dataset from the CSV file
    data_frame_main = load_main_df_from_csv()
    conn = common.get_connection()
    # Load the dictionary frequency objects (actors/directors)
    _, actors_dict_, director_dict_, _ = get_dicts()
    actors_dict_save_url = cfg.get_config_property('actors_dict', conn)
    director_dict_save_url = cfg.get_config_property('director_dict', conn)
    with open(actors_dict_save_url, 'wb') as f:
        pkl.dump(actors_dict_, f)
    with open(director_dict_save_url, 'wb') as f:
        pkl.dump(director_dict_, f)

    train_y = data_frame_main['RATING']
    data_frame_main = data_frame_main.drop(['RATING'], axis=1)
    # Get the whole dataset in dict form
    dict_data_list = get_dict_list(data_frame_main, actors_dict_,
                                   director_dict_)

    # Vectorize the dict-form data
    v = DictVectorizer()
    train_X = v.fit_transform(dict_data_list)

    # Hold out everything after the first 280,000 rows as the test split
    train_X_ = train_X[:280000]
    train_y_ = train_y[:280000]
    test_X_ = train_X[280000:]
    test_y_ = train_y[280000:]

    print(train_X_.shape)

    # Labels for the logistic-regression train/test sets: a rating above 3
    # marks a movie the user likes
    train_y_lr_ = train_y_.apply(lambda x: 1 if int(x) > 3 else 0)
    test_y_lr_ = test_y_.apply(lambda x: 1 if int(x) > 3 else 0)

    # Max-abs normalization
    scaler = preprocessing.MaxAbsScaler()
    scaler.fit(train_X)
    train_X_scaling = scaler.transform(train_X_)
    test_X_scaling = scaler.transform(test_X_)

    train_X_lr = train_X_  # no scale
    test_X_lr = test_X_  # no scale

    time_now_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    train_file_fm_base_url = cfg.get_config_property('train_file_fm_t_url',
                                                     conn)
    test_file_fm_base_url = cfg.get_config_property('test_file_fm_t_url', conn)
    train_file_fm = train_file_fm_base_url % time_now_str
    test_file_fm = test_file_fm_base_url % time_now_str

    # Dump to libsvm-format files
    dump_svmlight_file(train_X_scaling, train_y_, train_file_fm)
    dump_svmlight_file(test_X_scaling, test_y_, test_file_fm)

    train_file_lr_base_url = cfg.get_config_property('train_file_lr_t_url',
                                                     conn)
    test_file_lr_base_url = cfg.get_config_property('test_file_lr_t_url', conn)
    train_file_lr = train_file_lr_base_url % time_now_str
    test_file_lr = test_file_lr_base_url % time_now_str

    # Dump to libsvm-format files
    dump_svmlight_file(train_X_lr, train_y_lr_, train_file_lr)
    dump_svmlight_file(test_X_lr, test_y_lr_, test_file_lr)

    cfg.set_config_property(train_file_fm, 'train_file_fm_url', conn)
    cfg.set_config_property(test_file_fm, 'test_file_fm_url', conn)
    cfg.set_config_property(train_file_lr, 'train_file_lr_url', conn)
    cfg.set_config_property(test_file_lr, 'test_file_lr_url', conn)

    dict2vec_save_url = cfg.get_config_property('dict2vec', conn)
    with open(dict2vec_save_url, 'wb') as f:
        pkl.dump(v, f)

    scaler_save_url = cfg.get_config_property('scaler', conn)
    with open(scaler_save_url, 'wb') as f:
        pkl.dump(scaler, f)

    end_time = datetime.datetime.now()
    print(end_time - start_time)
    print('finish process comment to libsvm task:' +
          str(datetime.datetime.now()))
Example #25
def get_movie_data():
    df = pd.read_sql_query("select * from movie", common.get_connection())
    df_imp = df.drop(['ADD_TIME', 'enable', 'rat', 'id', 'name'], axis=1)
    return df, df_imp


# Module-level scratch code: probe the DictVectorizer loaded in Example #18
# with two feature keys ('日本1' looks like a deliberately unknown key).
predict_x = v_from_pkl.transform({'美国': 1, '日本1': 1})
print(predict_x[predict_x != 0])


# Smoke test: invoke the libFM binary by itself via run() (Example #20)
cmd = cfg.get_config_property('lib_fm_path', common.get_connection())
run(cmd)