def predict(clf, f_predict_vect, f_predict_id_set, f_predict_out):
    """
    Predict labels for the given vectors and store them next to their ids.

    Args:
        clf: classifier; only its ``predict`` method is used here
        f_predict_vect: fin, file holding the vectors to predict
        f_predict_id_set: fin, file holding the user_id, item_id that belong
            to each vector; its first line is skipped as a header
        f_predict_out: fout, file the prediction result is written to
    Returns:
        f_predict_out: fout, file the prediction result was written to
    Raises:
        AssertionError: the number of id rows read differs from the number
            of predictions (the mismatch is logged first)
        IndexError: typically raised when the id file holds more data rows
            than there are predictions, because predictions are looked up
            by row number
    """
    # The second array returned by the loader is not needed for prediction.
    predict_X, _ = generate_X_y_arrays(f_predict_vect)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    logger.debug('predict done, predict result size=%s' % (len(predict_y)))

    with open(f_predict_id_set, 'r') as fin, open(f_predict_out, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header row of the id file
        # The header must end with a newline, otherwise the first data row
        # is appended to the header line.
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            line_result = line.strip() + ',%s\n' % (predict_y[counter])
            fout.write(line_result)
            counter += 1

    if counter != len(predict_y):
        # Log before failing so the mismatch is recorded; raise explicitly
        # because a bare assert is stripped when Python runs with -O.
        message = 'predict result size:%s, but uid_iid_set size:%s' % (len(predict_y), counter)
        logger.error(message)
        raise AssertionError(message)

    logger.info('predict success, generate predict result in %s' % (f_predict_out))
    return f_predict_out
def train(clf, f_train_set):
    """
    Train the classifier.

    No cross-validation is run here; the classifier is fitted directly on
    the whole training set.

    Args:
        clf: classifier; only its ``fit`` method is used here
        f_train_set: fin, training set file
    Returns:
        clf: the fitted classifier
    """
    # The former ``from sklearn import cross_validation`` was unused in this
    # function and that module no longer exists in scikit-learn >= 0.20, so
    # the import only served to raise ImportError; it has been dropped.
    (X, y) = generate_X_y_arrays(f_train_set)

    # Fit on the full training set.
    clf.fit(X, y)
    logger.info('Classifier(%s) fit Done.' % (clf,))
    return clf
def generate_predict_result(f_predict='%s/predict_set/predict_result.csv' % (data_path),
                            f_vec_set='%s/predict_set/predict_combined_vec_data.csv' % (data_path),
                            f_uid_iid_set='%s/predict_set/predict_set.csv' % (data_path)):
    """
    Generate the prediction result file using the module-level ``clf``.

    Args:
        f_predict: string, file the prediction result is written to
        f_vec_set: string, name of the file holding the vectors to predict
        f_uid_iid_set: string, file holding the user_id, item_id that belong
            to each vector; its first line is skipped as a header
    Returns:
        None. A size mismatch between the id rows read and the predictions
        is only logged as an error.
    """
    # The second array returned by the loader is not needed for prediction.
    predict_X, _ = generate_X_y_arrays(f_vec_set)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    logger.debug('predict done, predict result size=%s' % (len(predict_y)))

    with open(f_uid_iid_set, 'r') as fin, open(f_predict, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header row of the id file
        # The header must end with a newline, otherwise the first data row
        # is appended to the header line.
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            line_result = line.strip() + ',%s\n' % (predict_y[counter])
            fout.write(line_result)
            counter += 1

    if counter != len(predict_y):
        logger.error('predict result size:%s, but uid_iid_set size:%s' % (len(predict_y), counter))
    else:
        logger.info('predict success, generate predict result in %s' % (f_predict))
def train_svm():
    """
    Fit the module-level classifier ``clf`` on the combined training vectors.

    The training file is ``train_combined_vec_data.csv`` under ``data_path``.
    After fitting, the classifier's ``best_params_`` and ``best_score_`` are
    logged.

    Args:
        None
    Returns:
        None
    """
    train_file = '%s/train_combined_vec_data.csv' % (data_path)
    features, labels = generate_X_y_arrays(train_file)
    clf.fit(features, labels)

    best = (clf.best_params_, clf.best_score_)
    logger.info('Classifier fit Done. Best params are %s with a best score of %0.2f' % best)
def train_svm():
    """
    Train the module-level classifier ``clf`` and log its best parameters.

    Loads the vectors from ``train_combined_vec_data.csv`` under
    ``data_path``, fits ``clf`` on them and logs ``clf.best_params_``
    together with ``clf.best_score_``.

    Args:
        None
    Returns:
        None
    """
    samples, targets = generate_X_y_arrays('%s/train_combined_vec_data.csv' % (data_path))
    clf.fit(samples, targets)

    report = 'Classifier fit Done. Best params are %s with a best score of %0.2f' % (
        clf.best_params_, clf.best_score_)
    logger.info(report)
def train_svm(clf, f_train_set='%s/train_combined_vec_data.csv' % (data_path)):
    """
    Train the SVM classifier.

    Runs a 5-fold cross-validation on the training set, logs the scores,
    then fits the classifier on the whole training set.

    Args:
        clf: classifier; after ``fit`` it must expose ``best_params_`` and
            ``best_score_`` (presumably a grid-search style wrapper --
            verify against the caller)
        f_train_set: string, training set file
    Returns:
        clf: the fitted classifier
    """
    # ``sklearn.cross_validation`` was removed in scikit-learn 0.20; use its
    # replacement and fall back to the legacy module on old installations.
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        from sklearn.cross_validation import cross_val_score

    (X, y) = generate_X_y_arrays(f_train_set)

    # Simple validation.
    scores = cross_val_score(clf, X, y, cv=5)
    logger.info('SVM classifier simple cross-validated scores ars %s' % (scores,))

    # Training.
    clf.fit(X, y)
    logger.info('SVM classifier(%s) fit Done. Best params are %s with a best score of %0.2f'
                % (clf, clf.best_params_, clf.best_score_))
    return clf
def train_clf(clf, f_train_set='%s/train_combined_vec_data.csv' % (data_path)):
    """
    Train the classifier.

    Runs a 5-fold cross-validation on the training set, logs the scores,
    then fits the classifier on the whole training set.

    Args:
        clf: classifier; its ``fit`` method is used here and it is passed to
            ``cross_val_score``
        f_train_set: string, training set file
    Returns:
        clf: the fitted classifier
    """
    # ``sklearn.cross_validation`` was removed in scikit-learn 0.20; use its
    # replacement and fall back to the legacy module on old installations.
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        from sklearn.cross_validation import cross_val_score

    (X, y) = generate_X_y_arrays(f_train_set)

    # Simple validation.
    scores = cross_val_score(clf, X, y, cv=5)
    logger.info('Classifier simple cross-validated scores ars %s' % (scores,))

    # Training.
    clf.fit(X, y)
    logger.info('Classifier(%s) fit Done.' % (clf,))
    return clf
def predict(clf, f_predict_vect, f_predict_id_set, f_predict_out):
    """
    Predict labels for the given vectors and store them next to their ids.

    Args:
        clf: classifier; only its ``predict`` method is used here
        f_predict_vect: fin, file holding the vectors to predict
        f_predict_id_set: fin, file holding the user_id, item_id that belong
            to each vector; its first line is skipped as a header
        f_predict_out: fout, file the prediction result is written to
    Returns:
        f_predict_out: fout, file the prediction result was written to
    Raises:
        AssertionError: the number of id rows read differs from the number
            of predictions (the mismatch is logged first)
        IndexError: typically raised when the id file holds more data rows
            than there are predictions, because predictions are looked up
            by row number
    """
    # The second array returned by the loader is not needed for prediction.
    predict_X, _ = generate_X_y_arrays(f_predict_vect)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    logger.debug('predict done, predict result size=%s' % (len(predict_y)))

    with open(f_predict_id_set, 'r') as fin, open(f_predict_out, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header row of the id file
        # The header must end with a newline, otherwise the first data row
        # is appended to the header line.
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            line_result = line.strip() + ',%s\n' % (predict_y[counter])
            fout.write(line_result)
            counter += 1

    if counter != len(predict_y):
        # Log before failing so the mismatch is recorded; raise explicitly
        # because a bare assert is stripped when Python runs with -O.
        message = 'predict result size:%s, but uid_iid_set size:%s' % (len(predict_y), counter)
        logger.error(message)
        raise AssertionError(message)

    logger.info('predict success, generate predict result in %s' % (f_predict_out))
    return f_predict_out
def generate_predict_result(
        f_predict='%s/predict_set/predict_result.csv' % (data_path),
        f_vec_set='%s/predict_set/predict_combined_vec_data.csv' % (data_path),
        f_uid_iid_set='%s/predict_set/predict_set.csv' % (data_path)):
    """
    Generate the prediction result file using the module-level ``clf``.

    Args:
        f_predict: string, file the prediction result is written to
        f_vec_set: string, name of the file holding the vectors to predict
        f_uid_iid_set: string, file holding the user_id, item_id that belong
            to each vector; its first line is skipped as a header
    Returns:
        None. A size mismatch between the id rows read and the predictions
        is only logged as an error.
    """
    # The second array returned by the loader is not needed for prediction.
    predict_X, _ = generate_X_y_arrays(f_vec_set)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    logger.debug('predict done, predict result size=%s' % (len(predict_y)))

    with open(f_uid_iid_set, 'r') as fin, open(f_predict, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header row of the id file
        # The header must end with a newline, otherwise the first data row
        # is appended to the header line.
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            line_result = line.strip() + ',%s\n' % (predict_y[counter])
            fout.write(line_result)
            counter += 1

    if counter != len(predict_y):
        logger.error('predict result size:%s, but uid_iid_set size:%s' % (len(predict_y), counter))
    else:
        logger.info('predict success, generate predict result in %s' % (f_predict))