def predict(clf, f_predict_vect, f_predict_id_set, f_predict_out):
    """
    Predict with ``clf`` and write one result row per id row.

    Args:
        clf: classifier; only its ``predict`` method is called here.
        f_predict_vect: path of the input file handed to
            ``generate_X_y_arrays`` to obtain the data to predict on.
        f_predict_id_set: path of the input file whose rows (after one
            header line, which is skipped) correspond one-to-one to the
            predictions; per the header written below they are expected to
            hold ``user_id,item_id``.
        f_predict_out: path of the output file for the prediction results.
    Returns:
        f_predict_out: the output path, unchanged.
    Raises:
        AssertionError: if the number of id rows differs from the number of
            predictions. The output file has already been written (with the
            rows that could be paired) when this is raised.
    """
    # The second value returned by generate_X_y_arrays is not needed: the
    # predictions computed below are what gets written out.
    predict_X, _ = generate_X_y_arrays(f_predict_vect)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    n_predictions = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (n_predictions))

    counter = 0
    with open(f_predict_id_set, 'r') as fin, open(f_predict_out, 'w') as fout:
        fin.readline()    # skip the header line of the id file
        # The header needs its own line terminator, otherwise the first
        # data row is appended to the header line.
        fout.write('user_id,item_id,tag\n')

        logger.debug('start store predict result')
        for line in fin:
            # Keep counting surplus id rows (instead of failing with a bare
            # IndexError) so the mismatch is reported with both sizes below.
            if counter < n_predictions:
                fout.write('%s,%s\n' % (line.strip(), predict_y[counter]))
            counter += 1

    if counter != n_predictions:
        message = 'predict result size:%s, but uid_iid_set size:%s' % (n_predictions, counter)
        # Log first, then fail explicitly: a plain `assert` would skip the
        # log entry and would be stripped entirely under `python -O`.
        logger.error(message)
        raise AssertionError(message)

    logger.info('predict success, generate predict result in %s' % (f_predict_out))

    return f_predict_out
Ejemplo n.º 2
0
def train(clf, f_train_set):
    """
    Fit the classifier on the data loaded from a training-set file.

    Args:
        clf: classifier; its ``fit(X, y)`` method is called.
        f_train_set: training-set file, passed to ``generate_X_y_arrays``.
    Returns:
        clf: the same classifier object, after ``fit`` has been called.
    """
    # NOTE: no scikit-learn import is needed here. The previous
    # `from sklearn import cross_validation` served only disabled
    # cross-validation code, and that module was removed in
    # scikit-learn 0.20, so the import alone made this function fail.
    X, y = generate_X_y_arrays(f_train_set)

    # Train
    clf.fit(X, y)
    logger.info('Classifier(%s) fit Done.' % (clf))

    return clf
Ejemplo n.º 3
0
def generate_predict_result(f_predict='%s/predict_set/predict_result.csv' % (data_path),
                            f_vec_set='%s/predict_set/predict_combined_vec_data.csv' % (data_path),
                            f_uid_iid_set='%s/predict_set/predict_set.csv' % (data_path)):
    """
    Predict with the module-level ``clf`` and store the results in a file.

    Args:
        f_predict: string, path of the file the prediction results are
            written to.
        f_vec_set: string, path of the file holding the vectors to predict,
            loaded through ``generate_X_y_arrays``.
        f_uid_iid_set: string, path of the file whose rows (after one header
            line, which is skipped) correspond to the vectors; per the header
            written below they are expected to hold ``user_id,item_id``.
    Returns:
        None. A mismatch between the number of id rows and the number of
        predictions is reported through ``logger.error`` only.
    """
    # The second value returned by generate_X_y_arrays is not needed: the
    # predictions computed below are what gets written out.
    predict_X, _ = generate_X_y_arrays(f_vec_set)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    n_predictions = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (n_predictions))

    counter = 0
    with open(f_uid_iid_set, 'r') as fin, open(f_predict, 'w') as fout:
        fin.readline()    # skip the header line of the id file
        # The header needs its own line terminator, otherwise the first
        # data row is appended to the header line.
        fout.write('user_id,item_id,tag\n')

        logger.debug('start store predict result')
        for line in fin:
            # Surplus id rows are counted but not written, so the size
            # mismatch reaches the error log below instead of surfacing as
            # an IndexError in the middle of the loop.
            if counter < n_predictions:
                fout.write('%s,%s\n' % (line.strip(), predict_y[counter]))
            counter += 1

    if counter != n_predictions:
        logger.error('predict result size:%s, but uid_iid_set size:%s' % (n_predictions, counter))
    else:
        logger.info('predict success, generate predict result in %s' % (f_predict))
def train(clf, f_train_set):
    """
    Train the given classifier on a training-set file.

    Args:
        clf: classifier; its ``fit(X, y)`` method is called.
        f_train_set: training-set file, passed to ``generate_X_y_arrays``
            to obtain the ``(X, y)`` pair used for fitting.
    Returns:
        clf: the same classifier object, after ``fit`` has been called.
    """
    # NOTE: the earlier function-level `from sklearn import cross_validation`
    # is intentionally gone. Nothing active used it, and the module no longer
    # exists in scikit-learn >= 0.20, so it raised ImportError before the
    # classifier could be fitted.
    (X, y) = generate_X_y_arrays(f_train_set)

    # Train
    clf.fit(X, y)
    logger.info('Classifier(%s) fit Done.' % (clf))

    return clf
Ejemplo n.º 5
0
def train_svm():
    """
    Fit the module-level classifier ``clf`` on the combined training vectors.

    The training data is loaded with ``generate_X_y_arrays`` from
    ``train_combined_vec_data.csv`` under ``data_path``. After fitting, the
    classifier's ``best_params_`` and ``best_score_`` are logged.

    Args:
        None
    Returns:
        None
    """
    train_file = '%s/train_combined_vec_data.csv' % (data_path)
    features, labels = generate_X_y_arrays(train_file)
    clf.fit(features, labels)

    summary = 'Classifier fit Done. Best params are %s with a best score of %0.2f' % (
        clf.best_params_, clf.best_score_)
    logger.info(summary)
Ejemplo n.º 6
0
def train_svm():
    """
    Train the module-level ``clf`` and log its best parameters and score.

    Loads the training pair from ``train_combined_vec_data.csv`` (located
    under ``data_path``) via ``generate_X_y_arrays``, calls ``clf.fit`` on
    it, then logs ``clf.best_params_`` and ``clf.best_score_``.

    Args:
        None
    Returns:
        None
    """
    dataset = generate_X_y_arrays(
        '%s/train_combined_vec_data.csv' % (data_path))
    samples, targets = dataset
    clf.fit(samples, targets)

    best = (clf.best_params_, clf.best_score_)
    logger.info(
        'Classifier fit Done. Best params are %s with a best score of %0.2f' %
        best)
def train_svm(clf,
              f_train_set='%s/train_combined_vec_data.csv' % (data_path)):
    """
    Cross-validate and then fit an SVM classifier on a training-set file.

    Args:
        clf: classifier; after ``fit`` it must expose ``best_params_`` and
            ``best_score_``, which are read for the log message.
        f_train_set: string, training-set file passed to
            ``generate_X_y_arrays``.
    Returns:
        clf: the same classifier object, after ``fit`` has been called.
    """
    # `sklearn.cross_validation` was removed in scikit-learn 0.20; its
    # replacement `sklearn.model_selection` exists since 0.18. Prefer the new
    # location and fall back to the old one for legacy installations.
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        from sklearn.cross_validation import cross_val_score

    (X, y) = generate_X_y_arrays(f_train_set)
    # Quick sanity check: 5-fold cross-validation on the training data.
    scores = cross_val_score(clf, X, y, cv=5)
    logger.info('SVM classifier simple cross-validated scores ars %s' % (scores))

    # Train
    clf.fit(X, y)
    logger.info('SVM classifier(%s) fit Done. Best params are %s with a best score of %0.2f' % (clf, clf.best_params_, clf.best_score_))

    return clf
def train_clf(clf,
              f_train_set='%s/train_combined_vec_data.csv' % (data_path)):
    """
    Cross-validate and then fit a classifier on a training-set file.

    Args:
        clf: classifier; it is scored with 5-fold cross-validation and then
            fitted with ``fit(X, y)``.
        f_train_set: string, training-set file passed to
            ``generate_X_y_arrays``.
    Returns:
        clf: the same classifier object, after ``fit`` has been called.
    """
    # `sklearn.cross_validation` was removed in scikit-learn 0.20; its
    # replacement `sklearn.model_selection` exists since 0.18. Prefer the new
    # location and fall back to the old one for legacy installations.
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        from sklearn.cross_validation import cross_val_score

    (X, y) = generate_X_y_arrays(f_train_set)

    # Quick sanity check: 5-fold cross-validation on the training data.
    scores = cross_val_score(clf, X, y, cv=5)
    logger.info('Classifier simple cross-validated scores ars %s' % (scores))

    # Train
    clf.fit(X, y)
    logger.info('Classifier(%s) fit Done.' % (clf))

    return clf
Ejemplo n.º 9
0
def predict(clf, f_predict_vect, f_predict_id_set, f_predict_out):
    """
    Run ``clf.predict`` on a vector file and store id rows with their result.

    Args:
        clf: classifier; only its ``predict`` method is called here.
        f_predict_vect: path of the input file handed to
            ``generate_X_y_arrays`` to obtain the data to predict on.
        f_predict_id_set: path of the input file whose rows (after one
            header line, which is skipped) correspond one-to-one to the
            predictions; per the header written below they are expected to
            hold ``user_id,item_id``.
        f_predict_out: path of the output file for the prediction results.
    Returns:
        f_predict_out: the output path, unchanged.
    Raises:
        AssertionError: if the number of id rows differs from the number of
            predictions. The output file has already been written (with the
            rows that could be paired) when this is raised.
    """
    # Only the first returned value is used; the predictions computed below
    # are what ends up in the output file.
    predict_X, _ = generate_X_y_arrays(f_predict_vect)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    n_predictions = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (n_predictions))

    counter = 0
    with open(f_predict_id_set, 'r') as fin, open(f_predict_out, 'w') as fout:
        fin.readline()  # skip the header line of the id file
        # Terminate the header line; without the newline the first data row
        # would be written onto the same line as the header.
        fout.write('user_id,item_id,tag\n')

        logger.debug('start store predict result')
        for line in fin:
            # Surplus id rows are only counted, so that the mismatch is
            # reported with both sizes below rather than as an IndexError.
            if counter < n_predictions:
                fout.write('%s,%s\n' % (line.strip(), predict_y[counter]))
            counter += 1

    if counter != n_predictions:
        message = ('predict result size:%s, but uid_iid_set size:%s' %
                   (n_predictions, counter))
        # Log before failing, and raise explicitly: an `assert` placed first
        # hides the log entry and disappears under `python -O`.
        logger.error(message)
        raise AssertionError(message)

    logger.info('predict success, generate predict result in %s' %
                (f_predict_out))

    return f_predict_out
Ejemplo n.º 10
0
def generate_predict_result(
        f_predict='%s/predict_set/predict_result.csv' % (data_path),
        f_vec_set='%s/predict_set/predict_combined_vec_data.csv' % (data_path),
        f_uid_iid_set='%s/predict_set/predict_set.csv' % (data_path)):
    """
    Generate the prediction result file using the module-level ``clf``.

    Args:
        f_predict: string, path of the file the prediction results are
            written to.
        f_vec_set: string, path of the file holding the vectors to predict,
            loaded through ``generate_X_y_arrays``.
        f_uid_iid_set: string, path of the file whose rows (after one header
            line, which is skipped) correspond to the vectors; per the header
            written below they are expected to hold ``user_id,item_id``.
    Returns:
        None. A mismatch between the number of id rows and the number of
        predictions is reported through ``logger.error`` only.
    """
    # Only the first returned value is used; the predictions computed below
    # are what ends up in the output file.
    predict_X, _ = generate_X_y_arrays(f_vec_set)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    n_predictions = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (n_predictions))

    counter = 0
    with open(f_uid_iid_set, 'r') as fin, open(f_predict, 'w') as fout:
        fin.readline()  # skip the header line of the id file
        # Terminate the header line; without the newline the first data row
        # would be written onto the same line as the header.
        fout.write('user_id,item_id,tag\n')

        logger.debug('start store predict result')
        for line in fin:
            # Rows beyond the available predictions are counted but not
            # written, so the mismatch is logged below instead of raising
            # IndexError part-way through the file.
            if counter < n_predictions:
                fout.write('%s,%s\n' % (line.strip(), predict_y[counter]))
            counter += 1

    if counter != n_predictions:
        logger.error('predict result size:%s, but uid_iid_set size:%s' %
                     (n_predictions, counter))
    else:
        logger.info('predict success, generate predict result in %s' %
                    (f_predict))