def predict(clf, f_predict_vect, f_predict_id_set, f_predict_out):
    """
    Run the classifier on the prediction vectors and store the labelled rows.

    Args:
        clf: classifier exposing ``predict(X)``
        f_predict_vect: fin, file holding the vectors to predict (loaded
            through ``generate_X_y_arrays``)
        f_predict_id_set: fin, file whose rows (after one header line) hold
            the user_id, item_id matching each vector, in the same order
        f_predict_out: fout, file that receives the ``user_id,item_id,tag`` rows

    Returns:
        f_predict_out: fout, path of the file holding the prediction result

    Raises:
        AssertionError: if the number of id rows differs from the number of
            predictions (the mismatch is logged first)
    """
    # The label column loaded from the vector file is not needed here.
    predict_X, _ = generate_X_y_arrays(f_predict_vect)

    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    predict_size = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (predict_size))

    with open(f_predict_id_set, 'r') as fin, open(f_predict_out, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header line
        # The header must end with a newline, otherwise the first data row is
        # glued onto it and lost for every reader that skips the header.
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            # Surplus id rows are only counted so the size check below can
            # report them instead of failing with an IndexError here.
            if counter < predict_size:
                fout.write(line.strip() + ',%s\n' % (predict_y[counter]))
            counter += 1

    if counter != predict_size:
        message = 'predict result size:%s, but uid_iid_set size:%s' % (predict_size, counter)
        # Log before raising; the former assert fired first and hid this message.
        logger.error(message)
        raise AssertionError(message)

    logger.info('predict success, generate predict result in %s' % (f_predict_out))
    return f_predict_out
def combine_data(userbehavior_filepath='%s/train_set_calUserBehavior.csv' % (data_path),
                 tail_filepath='%s/vecvalues_tail.csv' % (data_path),
                 csv_output_path='%s/combined_vec_data.csv' % (data_path),
                 svm_output_path='%s/svmdata.dat' % (data_path)):
    """
    Merge the user-behavior counts and the vector "tail" values row by row.

    Row ``i`` of both inputs is combined into one output row; the first line
    of each input is treated as a header and skipped.

    Args:
        userbehavior_filepath: string, csv whose columns 2-5 are read as
            see, favorite, cart, buy
        tail_filepath: string, csv whose columns 0-3 are read as
            tag, popularity, desire, behavior_rate
        csv_output_path: string, receives the combined rows as csv
        svm_output_path: string, receives the same rows formatted as
            ``tag 0:see 1:favorite 2:cart 3:buy 4:popularity 5:desire 6:behavior_rate``

    Returns:
        None

    Raises:
        IndexError: if the tail file has fewer rows than the behavior file,
            or a row has too few columns
    """
    logger.info('start combining data')
    # All four handles are managed by ``with`` so the outputs are flushed and
    # closed even when a malformed row aborts the loop.
    with open(userbehavior_filepath, 'r') as userbehavior_file, \
            open(tail_filepath, 'r') as tail_file, \
            open(csv_output_path, 'w') as csvout, \
            open(svm_output_path, 'w') as svmout:
        csvout.write('tag,see,favorite,cart,buy,popularity,desire,behavior_rate\n')
        behaviors = userbehavior_file.readlines()
        tails = tail_file.readlines()
        for index, behavior_line in enumerate(behaviors):
            if index == 0:
                continue  # header row
            data_tail = tails[index].replace('\n', '').split(',')
            data_behavior = behavior_line.replace('\n', '').split(',')
            values = (
                data_tail[0],      # tag
                data_behavior[2],  # see
                data_behavior[3],  # favorite
                data_behavior[4],  # cart
                data_behavior[5],  # buy
                data_tail[1],      # popularity
                data_tail[2],      # desire
                data_tail[3],      # behavior_rate
            )
            # Vector layout used for the svm input file
            svmout.write('%s 0:%s 1:%s 2:%s 3:%s 4:%s 5:%s 6:%s\n' % values)
            csvout.write('%s,%s,%s,%s,%s,%s,%s,%s\n' % values)
    logger.info('combine_data done')
def insert_train_user_2table(connect, fin='tianchi_mobile_recommend_train_user.csv'):
    """
    Insert the content of train_user.csv into the ``train_user_new`` table.

    Args:
        connect: Mysqldb.connect(), database connection handle
        fin: string, csv file of user operations on items; after one header
            line, columns 0-5 are read as user_id, item_id, behavior_type,
            user_geohash, item_category, time ('YYYY-MM-DD HH')

    Returns:
        None
    """
    # Values are bound by the driver instead of being pasted into the SQL
    # text, so a quote inside a field cannot break or alter the statement.
    sql = ("INSERT INTO train_user_new SET user_id=%s, item_id=%s,"
           "behavior_type=%s, user_geohash=%s, item_category=%s,"
           "time=%s;")
    cursor = connect.cursor()
    counter = 0
    try:
        with open(fin, 'rb') as f:
            f.readline()  # skip the header line
            for line in f:
                cols = line.strip().split(',')
                params = (cols[0], cols[1], cols[2], cols[3], cols[4],
                          arrow.get(cols[5], 'YYYY-MM-DD HH').timestamp)
                cursor.execute(sql, params)
                counter += 1
                # Commit in batches of 5000 rows
                if counter % 5000 == 0:
                    connect.commit()
                    logger.debug('Insert counter:%s' % (counter))
        connect.commit()
        logger.info('Done, and insert counter:%s' % (counter))
    finally:
        # Release the cursor even when a row fails to insert
        cursor.close()
def insert_train_item_2table(connect, fin='tianchi_mobile_recommend_train_item.csv'):
    """
    Insert the content of train_item.csv into the ``train_item_new`` table.

    Args:
        connect: Mysqldb.connect(), database connection handle
        fin: string, csv file; after one header line, columns 0-2 are read as
            item_id, item_geohash, item_category

    Returns:
        None
    """
    # Values are bound by the driver instead of being pasted into the SQL
    # text, so a quote inside a field cannot break or alter the statement.
    sql = ("INSERT INTO train_item_new SET item_id=%s, item_geohash=%s,"
           "item_category=%s")
    cursor = connect.cursor()
    counter = 0
    try:
        with open(fin, 'rb') as f:
            f.readline()  # skip the header line
            for line in f:
                cols = line.strip().split(',')
                cursor.execute(sql, (cols[0], cols[1], cols[2]))
                counter += 1
                # Commit in batches of 5000 rows
                if counter % 5000 == 0:
                    connect.commit()
                    logger.debug('Insert counter:%s' % (counter))
        connect.commit()
        logger.info('Done, and insert counter:%s' % (counter))
    finally:
        # Release the cursor even when a row fails to insert
        cursor.close()
def generate_predict_result(f_predict='%s/predict_set/predict_result.csv' % (data_path),
                            f_vec_set='%s/predict_set/predict_combined_vec_data.csv' % (data_path),
                            f_uid_iid_set='%s/predict_set/predict_set.csv' % (data_path)):
    """
    Predict with the module-level classifier ``clf`` and store the result.

    Args:
        f_predict: string, file that receives the ``user_id,item_id,tag`` rows
        f_vec_set: string, file holding the vectors to predict
        f_uid_iid_set: string, file whose rows (after one header line) hold
            the user_id, item_id matching each vector, in the same order

    Returns:
        None
    """
    # The label column loaded from the vector file is not needed here.
    predict_X, _ = generate_X_y_arrays(f_vec_set)

    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    predict_size = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (predict_size))

    with open(f_uid_iid_set, 'r') as fin, open(f_predict, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header line
        # The header must end with a newline, otherwise the first data row is
        # glued onto it.
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            # Surplus id rows are only counted so the mismatch is reported
            # below instead of failing with an IndexError here.
            if counter < predict_size:
                fout.write(line.strip() + ',%s\n' % (predict_y[counter]))
            counter += 1

    if counter != predict_size:
        logger.error('predict result size:%s, but uid_iid_set size:%s' % (predict_size, counter))
    else:
        logger.info('predict success, generate predict result in %s' % (f_predict))
def insert_train_item_2table(connect, fin='tianchi_mobile_recommend_train_item.csv'):
    """
    Insert the content of train_item.csv into the ``train_item_new`` table.

    Args:
        connect: Mysqldb.connect(), database connection handle
        fin: string, csv file; after one header line, columns 0-2 are read as
            item_id, item_geohash, item_category

    Returns:
        None
    """
    # Values are bound by the driver instead of being pasted into the SQL
    # text, so a quote inside a field cannot break or alter the statement.
    sql = ("INSERT INTO train_item_new SET item_id=%s, item_geohash=%s,"
           "item_category=%s")
    cursor = connect.cursor()
    counter = 0
    try:
        with open(fin, 'rb') as f:
            f.readline()  # skip the header line
            for line in f:
                cols = line.strip().split(',')
                cursor.execute(sql, (cols[0], cols[1], cols[2]))
                counter += 1
                # Commit in batches of 5000 rows
                if counter % 5000 == 0:
                    connect.commit()
                    logger.debug('Insert counter:%s' % (counter))
        connect.commit()
        logger.info('Done, and insert counter:%s' % (counter))
    finally:
        # Release the cursor even when a row fails to insert
        cursor.close()
def generate_positive_userset(foutpath='../data/positive_userset.json'):
    """
    Write, for every user with a purchase record (behavior_type "4") between
    2014-12-18 00h and 2014-12-19 00h, one line ``user_id,item_id[,item_id...]``.

    NOTE(review): the default output name ends in ``.json`` but the content
    written is comma-separated text.

    Args:
        foutpath: string, path of the output file

    Returns:
        None
    """
    # Imported here on purpose: some package dependencies are incomplete on
    # the server, so the import must not run at module load time.
    from data_preprocess.MongoDB_Utils import MongodbUtils
    with open('../conf/DB_Address.conf', 'r') as conf_file:
        db_address = json.loads(conf_file.read())['MongoDB_Address']

    logger.info('start generate_positive_userset')
    mongodb = MongodbUtils(db_address, 27017)
    train_user = mongodb.get_db().train_user

    startTime = datetime.strptime('2014-12-18 00', '%Y-%m-%d %H')
    stopTime = datetime.strptime('2014-12-19 00', '%Y-%m-%d %H')
    bought_filter = {"behavior_type": "4",
                     "time": {"$gt": startTime, "$lt": stopTime}}
    user_ids = train_user.find(bought_filter).distinct("user_id")

    # ``with`` guarantees the output is flushed and closed.
    with open(foutpath, 'w') as fout:
        for userid in user_ids:
            user_filter = dict(bought_filter, user_id=userid)
            bought_item_ids = train_user.find(
                user_filter, {'item_id': 1, '_id': 0}).distinct("item_id")
            fout.write(userid)
            for itemid in bought_item_ids:
                fout.write(',' + itemid)
            fout.write('\n')
    logger.info('generate_positive_userset done,output path = ' + foutpath)
def get_buy_list(train_user_connect, timerange=('2014-12-18', '2014-12-19')):
    """
    Group the purchase records (behavior_type '4') inside a time range.

    Args:
        train_user_connect: collection-like object whose ``find(query)``
            yields documents with 'user_id', 'item_id' and 'item_category'
        timerange: tuple, (start, stop) as 'YYYY-MM-DD'; both bounds are
            exclusive in the query

    Returns:
        buy_dict: dict, {user_id: {item_category: [item_id, ...]}}
    """
    from datetime import datetime
    logger.info('get_buy_list start,timerange = %s to %s' % (timerange[0], timerange[1]))
    starttime = datetime.strptime(str(timerange[0]), '%Y-%m-%d')
    stoptime = datetime.strptime(str(timerange[1]), '%Y-%m-%d')
    buys = train_user_connect.find({'behavior_type': '4',
                                    'time': {'$gt': starttime, '$lt': stoptime}})
    logger.debug('database qury done')

    buy_dict = {}
    count = 0
    for doc in buys:
        # setdefault replaces the nested has_key() checks (dict.has_key no
        # longer exists in Python 3) with a single lookup per level.
        category_inbuy = buy_dict.setdefault(doc['user_id'], {})
        category_inbuy.setdefault(doc['item_category'], []).append(doc['item_id'])
        count += 1
        if count % 1000 == 0:
            logger.debug('No.%s done' % count)
    return buy_dict
def get_train_vecdata(train_set_path='%s/train/train_set.csv' % data_path,
                      combined_out_path='%s/train/combined_out.csv' % data_path,
                      svmdata_out_path='%s/train/svmdata.dat' % data_path,
                      set_timerange=('2014-12-18', '2014-12-19'),
                      behavior_timerange=('2014-12-12', '2014-12-19')):
    """
    Generate the training data set.

    Runs, in order: ``generate_userset.generate_train_set``,
    ``cal_vecvalues_tail``, ``cal_user_behavior`` and ``combine_data``.
    The vector-tail file is ``<train_set>_vectail.csv``; the behavior file
    handed to ``combine_data`` is ``<train_set>_calUserBehavior.csv``
    (presumably written by ``cal_user_behavior`` -- confirm against it).
    """
    import MySQLdb
    from data_preprocess import generate_userset
    from data_preprocess.MongoDB_Utils import MongodbUtils

    start_message = 'start get_train_vecdata, timerange=%s to %s' % (set_timerange[0], set_timerange[1])
    logger.info(start_message)

    mysql_conn = MySQLdb.connect(host='10.108.192.119',
                                 user='******',
                                 passwd='tianchi_data',
                                 db='tianchi')
    user_collection = MongodbUtils(db_address, 27017).get_db().train_user

    # Step 1: build the sample set (the same range is passed twice)
    generate_userset.generate_train_set(mysql_conn, set_timerange, set_timerange, train_set_path)

    # Step 2: last three vector dimensions
    tail_path = train_set_path.replace('.csv', '_vectail.csv')
    cal_vecvalues_tail(user_collection, train_set_path, tail_path, behavior_timerange)

    # Step 3: behavior counts, then merge both parts
    behavior_path = train_set_path.replace('.csv', '_calUserBehavior.csv')
    cal_user_behavior(mysql_conn, behavior_timerange, train_set_path)
    combine_data(behavior_path, tail_path, combined_out_path, svmdata_out_path)
def test(f_recommend_intersect_set, f_real_buy_intersect_set):
    """
    Score the recommendation result against the real purchases.

    Both files are read as one record per line after a header line; records
    are compared as whole stripped lines.

    Args:
        f_recommend_intersect_set: fin, recommendation result after intersection
        f_real_buy_intersect_set: fin, real purchase result after intersection

    Returns:
        scores: list, [f1_score, precision, recall]; a ratio whose
            denominator would be zero is reported as 0.0
    """
    def _read_records(path):
        # One record per line, header skipped
        with open(path, 'r') as fin:
            fin.readline()
            return set(line.strip() for line in fin)

    real_buy_set = _read_records(f_real_buy_intersect_set)
    recommend_set = _read_records(f_recommend_intersect_set)

    hit_count = float(len(real_buy_set & recommend_set))
    # Guard every division: an empty file or a zero overlap used to raise
    # ZeroDivisionError.
    precision = hit_count / len(recommend_set) if recommend_set else 0.0
    recall = hit_count / len(real_buy_set) if real_buy_set else 0.0
    if precision + recall > 0:
        f1_score = (2.0 * precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    logger.info('[test result] f1_score=%s, precision=%s, recall=%s' % (f1_score, precision, recall))
    return [f1_score, precision, recall]
def get_predict_vecdata(set_timerange=('2014-12-18', '2014-12-19'),
                        behavior_timerange=('2014-12-12', '2014-12-19'),
                        predict_set_path='%s/predict/predict_set.csv' % (data_path),
                        predict_vectail_path='%s/predict/predict_vectail.csv' % (data_path),
                        csv_output_path='%s/predict/combined_vec_data.csv' % (data_path),
                        svm_output_path='%s/predict/svmdata.dat' % (data_path)):
    """
    Generate the prediction set; the time ranges and every output path must
    be specified.

    :param set_timerange: time range of the potentially purchased items
    :param behavior_timerange: time range used when computing the vector values
    """
    from data_preprocess import generate_userset
    import MySQLdb
    from data_preprocess.MongoDB_Utils import MongodbUtils

    start_message = 'start get_predict_vecdata, set_timerange=%s to %s, behavior_timerange = %s to %s' % (
        set_timerange[0], set_timerange[1], behavior_timerange[0], behavior_timerange[1])
    logger.info(start_message)

    mysql_conn = MySQLdb.connect(host='10.108.192.119',
                                 user='******',
                                 passwd='tianchi_data',
                                 db='tianchi')
    user_collection = MongodbUtils(db_address, 27017).get_db().train_user

    # Step 1: build the candidate (user, item) set
    generate_userset.generate_predict_set(mysql_conn, set_timerange, predict_set_path)

    # Step 2: last three vector dimensions
    cal_vecvalues_tail(user_collection, predict_set_path, predict_vectail_path, behavior_timerange)

    # Step 3: behavior counts, then merge both parts
    behavior_path = predict_set_path.replace('.csv', '_calUserBehavior.csv')
    cal_user_behavior(mysql_conn, behavior_timerange, predict_set_path)
    combine_data(behavior_path, predict_vectail_path, csv_output_path, svm_output_path)
def cal_positive_userset_vecvalues(
        fin_path='../data/positive_userset_2015-04-12-14-32-11.csv',
        fout_path='../data/popularity_desire_behaviorRate_data.csv'):
    """
    Compute the remaining three vector dimensions (item popularity, user
    purchase desire, behavior rate) and store them in a csv file.

    Output row format: [user_id]_[item_id],popularity,desire,behavior_rate

    :param fin_path: csv file of the positive training samples; every row is
        read as ``user_id,item_id[,item_id...]``
    :param fout_path: path of the result file
    :return: None
    """
    # ``with`` guarantees both files are closed and the output is flushed.
    with open(fin_path, 'r') as fin, open(fout_path, 'w') as fout:
        logger.info('cal_positive_userset_vecvalues start')
        fout.write('user_id_item_id,popularity,desire,behavior_rate\n')
        for line in fin:
            ids = line.replace('\n', '').split(',')
            user_id = ids[0]
            # The desire value depends on the user only: compute it once per row
            desire = cal_user_desire(user_id)
            for item_id in ids[1:]:
                popularity = cal_item_popularity(item_id)
                behavior_rate = cal_useritem_behavior_rate(user_id, item_id)
                fout.write(user_id + '_' + item_id + ',')
                fout.write('%s,%s,%s\n' % (popularity, desire, behavior_rate))
    logger.info('cal_positive_userset_vecvalues done,output path=' + fout_path)
def train(clf, f_train_set):
    """
    Fit the classifier on the training set.

    Args:
        clf: classifier exposing ``fit(X, y)``
        f_train_set: fin, training set file (loaded through
            ``generate_X_y_arrays``)

    Returns:
        clf: the same classifier object, fitted in place
    """
    # The function used to import sklearn.cross_validation without using it;
    # that import alone fails on scikit-learn releases that no longer ship the
    # module. A cross-validation pass can be added here when needed.
    X, y = generate_X_y_arrays(f_train_set)

    clf.fit(X, y)
    logger.info('Classifier(%s) fit Done.' % (clf))
    return clf
def test(f_recommend_intersect_set, f_real_buy_intersect_set):
    """
    Score the recommendation result against the real purchases.

    Both files are read as one record per line after a header line; records
    are compared as whole stripped lines.

    Args:
        f_recommend_intersect_set: fin, recommendation result after intersection
        f_real_buy_intersect_set: fin, real purchase result after intersection

    Returns:
        scores: list, [f1_score, precision, recall]; a ratio whose
            denominator would be zero is reported as 0.0
    """
    def _read_records(path):
        # One record per line, header skipped
        with open(path, 'r') as fin:
            fin.readline()
            return set(line.strip() for line in fin)

    real_buy_set = _read_records(f_real_buy_intersect_set)
    recommend_set = _read_records(f_recommend_intersect_set)

    hit_count = float(len(real_buy_set & recommend_set))
    # Guard every division: an empty file or a zero overlap used to raise
    # ZeroDivisionError.
    precision = hit_count / len(recommend_set) if recommend_set else 0.0
    recall = hit_count / len(real_buy_set) if real_buy_set else 0.0
    if precision + recall > 0:
        f1_score = (2.0 * precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    logger.info('[test result] f1_score=%s, precision=%s, recall=%s' % (f1_score, precision, recall))
    return [f1_score, precision, recall]
def cal_positive_userset_vecvalues(fin_path='../data/positive_userset_2015-04-12-14-32-11.csv',
                                   fout_path='../data/popularity_desire_behaviorRate_data.csv'):
    """
    Compute the remaining three vector dimensions (item popularity, user
    purchase desire, behavior rate) and store them in a csv file.

    Output row format: [user_id]_[item_id],popularity,desire,behavior_rate

    :param fin_path: csv file of the positive training samples; every row is
        read as ``user_id,item_id[,item_id...]``
    :param fout_path: path of the result file
    :return: None
    """
    # ``with`` guarantees both files are closed and the output is flushed.
    with open(fin_path, 'r') as fin, open(fout_path, 'w') as fout:
        logger.info('cal_positive_userset_vecvalues start')
        fout.write('user_id_item_id,popularity,desire,behavior_rate\n')
        for line in fin:
            ids = line.replace('\n', '').split(',')
            user_id = ids[0]
            # The desire value depends on the user only: compute it once per row
            desire = cal_user_desire(user_id)
            for item_id in ids[1:]:
                popularity = cal_item_popularity(item_id)
                behavior_rate = cal_useritem_behavior_rate(user_id, item_id)
                fout.write(user_id + '_' + item_id + ',')
                fout.write('%s,%s,%s\n' % (popularity, desire, behavior_rate))
    logger.info('cal_positive_userset_vecvalues done,output path=' + fout_path)
def insert_train_user_2table(connect, fin='tianchi_mobile_recommend_train_user.csv'):
    """
    Insert the content of train_user.csv into the ``train_user_new`` table.

    Args:
        connect: Mysqldb.connect(), database connection handle
        fin: string, csv file of user operations on items; after one header
            line, columns 0-5 are read as user_id, item_id, behavior_type,
            user_geohash, item_category, time ('YYYY-MM-DD HH')

    Returns:
        None
    """
    # Values are bound by the driver instead of being pasted into the SQL
    # text, so a quote inside a field cannot break or alter the statement.
    sql = ("INSERT INTO train_user_new SET user_id=%s, item_id=%s,"
           "behavior_type=%s, user_geohash=%s, item_category=%s,"
           "time=%s;")
    cursor = connect.cursor()
    counter = 0
    try:
        with open(fin, 'rb') as f:
            f.readline()  # skip the header line
            for line in f:
                cols = line.strip().split(',')
                params = (cols[0], cols[1], cols[2], cols[3], cols[4],
                          arrow.get(cols[5], 'YYYY-MM-DD HH').timestamp)
                cursor.execute(sql, params)
                counter += 1
                # Commit in batches of 5000 rows
                if counter % 5000 == 0:
                    connect.commit()
                    logger.debug('Insert counter:%s' % (counter))
        connect.commit()
        logger.info('Done, and insert counter:%s' % (counter))
    finally:
        # Release the cursor even when a row fails to insert
        cursor.close()
def train_classifier(clf, X, y):
    """
    Fit the classifier on the full data and log cross-validation scores.

    Args:
        clf: classifier to train; it is fitted in place on the full (X, y)
            and keeps that fit when the function returns
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]

    Returns:
        None
    """
    from sklearn import cross_validation
    from sklearn.base import clone

    # For grid-search classifiers, clf.best_params_, clf.best_score_ and
    # clf.grid_scores_ can be logged right after this fit.
    clf.fit(X, y)

    # Simple cross-validation
    scores = cross_validation.cross_val_score(clf, X, y, cv=5)
    logger.info('Classifier fit Done. And simple cross-validated scores ars %s' % (scores))

    # 10-fold evaluation. Each fold trains an unfitted copy: refitting ``clf``
    # itself left the caller with a model trained on the last fold only.
    kf = cross_validation.KFold(len(X), n_folds=10)
    for train_index, test_index in kf:
        fold_clf = clone(clf)
        fold_clf.fit(X[train_index], y[train_index])
        score = fold_clf.score(X[test_index], y[test_index])
        logger.info('10 folds cross-validated scores are %s.' % (score))
def train_svm():
    """
    Fit the module-level classifier ``clf`` on
    ``<data_path>/train_combined_vec_data.csv`` and log ``clf.best_params_``
    and ``clf.best_score_``.

    Args:
        None

    Returns:
        None
    """
    train_file = '%s/train_combined_vec_data.csv' % (data_path)
    samples, labels = generate_X_y_arrays(train_file)

    clf.fit(samples, labels)

    summary = 'Classifier fit Done. Best params are %s with a best score of %0.2f' % (
        clf.best_params_, clf.best_score_)
    logger.info(summary)
def generate_positive_userset(foutpath='../data/positive_userset.json'):
    """
    Write, for every user with a purchase record (behavior_type "4") between
    2014-12-18 00h and 2014-12-19 00h, one line ``user_id,item_id[,item_id...]``.

    NOTE(review): the default output name ends in ``.json`` but the content
    written is comma-separated text.

    Args:
        foutpath: string, path of the output file

    Returns:
        None
    """
    # Imported here on purpose: some package dependencies are incomplete on
    # the server, so the import must not run at module load time.
    from data_preprocess.MongoDB_Utils import MongodbUtils
    with open('../conf/DB_Address.conf', 'r') as conf_file:
        db_address = json.loads(conf_file.read())['MongoDB_Address']

    logger.info('start generate_positive_userset')
    mongodb = MongodbUtils(db_address, 27017)
    train_user = mongodb.get_db().train_user

    startTime = datetime.strptime('2014-12-18 00', '%Y-%m-%d %H')
    stopTime = datetime.strptime('2014-12-19 00', '%Y-%m-%d %H')
    bought_filter = {"behavior_type": "4",
                     "time": {"$gt": startTime, "$lt": stopTime}}
    user_ids = train_user.find(bought_filter).distinct("user_id")

    # ``with`` guarantees the output is flushed and closed.
    with open(foutpath, 'w') as fout:
        for userid in user_ids:
            user_filter = dict(bought_filter, user_id=userid)
            bought_item_ids = train_user.find(
                user_filter, {'item_id': 1, '_id': 0}).distinct("item_id")
            fout.write(userid)
            for itemid in bought_item_ids:
                fout.write(',' + itemid)
            fout.write('\n')
    logger.info('generate_positive_userset done,output path = ' + foutpath)
def train_svm():
    """
    Fit the module-level classifier ``clf`` on
    ``<data_path>/train_combined_vec_data.csv`` and log ``clf.best_params_``
    and ``clf.best_score_``.

    Args:
        None

    Returns:
        None
    """
    train_file = '%s/train_combined_vec_data.csv' % (data_path)
    samples, labels = generate_X_y_arrays(train_file)

    clf.fit(samples, labels)

    summary = 'Classifier fit Done. Best params are %s with a best score of %0.2f' % (
        clf.best_params_, clf.best_score_)
    logger.info(summary)
def intersect_twofiles(fin1_path, fin2_path, fout_path):
    """
    Keep the rows of the second file whose first two columns also appear as
    the first two columns of a row in the first file.

    The header line of the first file is skipped; the header line of the
    second file is copied to the output unchanged. Matching rows are written
    as ``col0,col1`` in the order (and with the repetitions) of the second
    file. Rows with fewer than two columns, e.g. blank lines, are ignored.

    Args:
        fin1_path: string, csv file providing the reference pairs
        fin2_path: string, csv file to filter
        fout_path: string, csv file receiving the filtered rows

    Returns:
        None
    """
    with open(fin1_path) as fin1, open(fin2_path) as fin2, open(fout_path, 'w') as fout:
        fin1.readline()  # skip the header line
        # A set gives constant-time membership tests; the former list made
        # the whole filter quadratic.
        known_pairs = set()
        for line in fin1:
            cols = line.strip().split(',')
            if len(cols) < 2:
                continue
            known_pairs.add((cols[0], cols[1]))

        fout.write(fin2.readline())
        for line in fin2:
            cols = line.strip().split(',')
            if len(cols) < 2:
                continue
            pair = (cols[0], cols[1])
            if pair in known_pairs:
                fout.write('%s,%s\n' % pair)
    logger.info('intersect %s and %s done, output=%s' % (fin1_path, fin2_path, fout_path))
def classifier_comparison(X, y):
    """
    Compare classifiers by running ``train_classifier`` on each of them.

    Args:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]

    Returns:
        None
    """
    from sklearn import grid_search
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.lda import LDA
    from sklearn.qda import QDA
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded.
    import scipy.stats

    # Exhaustive Grid Search
    exhaustive_parameters = {'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [1e-3, 1e-4]}
    clf_SVC_exhaustive = grid_search.GridSearchCV(SVC(), exhaustive_parameters)

    # Randomized Parameter Optimization
    randomized_parameter = {'kernel': ['rbf'],
                            'C': scipy.stats.expon(scale=100),
                            'gamma': scipy.stats.expon(scale=.1)}
    clf_SVC_randomized = grid_search.RandomizedSearchCV(SVC(), randomized_parameter)

    # Each name sits next to its classifier so the two cannot drift apart.
    candidates = [
        ("Linear SVM", SVC(kernel="linear", C=0.025)),
        ("RBF SVM", SVC(gamma=2, C=1)),
        ("RBF SVM with Grid Search", clf_SVC_exhaustive),
        ("RBF SVM with Random Grid Search", clf_SVC_randomized),
        ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
        ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Naive Bayes", GaussianNB()),
        ("LDA", LDA()),
        ("QDA", QDA()),
    ]

    for name, clf in candidates:
        logger.info('Use %s:' % (name))
        train_classifier(clf, X, y)
def train_classifier(clf, X, y):
    """
    Fit the classifier on the full data and log several evaluation scores.

    Args:
        clf: classifier to train
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]

    Returns:
        clf: classifier, fitted on the full (X, y)
    """
    from sklearn import cross_validation
    from sklearn.base import clone
    import time

    def _log_shuffle_split_scores(test_size):
        # Train unfitted copies on a (1 - test_size) share of the data and
        # log the score obtained on the remaining samples.
        rs = cross_validation.ShuffleSplit(len(X), test_size=test_size,
                                           random_state=int(time.time()))
        for train_index, test_index in rs:
            split_clf = clone(clf)
            split_clf.fit(X[train_index], y[train_index])
            score = split_clf.score(X[test_index], y[test_index])
            logger.info('%s作为训练集输入, cross-validated scores is %s.' % (1-test_size, score))

    # For grid-search classifiers, clf.best_params_, clf.best_score_ and
    # clf.grid_scores_ can be logged right after this fit.
    clf.fit(X, y)

    # Simple cross-validation
    scores = cross_validation.cross_val_score(clf, X, y, cv=5)
    logger.info('Classifier fit Done. And simple cross-validated scores ars %s' % (scores))

    # 10-fold evaluation. Every evaluation below trains an unfitted copy:
    # refitting ``clf`` itself made the function return a model trained on
    # the last, smallest subset instead of the full data.
    kf = cross_validation.KFold(len(X), n_folds=10)
    for train_index, test_index in kf:
        fold_clf = clone(clf)
        fold_clf.fit(X[train_index], y[train_index])
        score = fold_clf.score(X[test_index], y[test_index])
        logger.info('10 folds cross-validated scores is %s.' % (score))

    # Use 1/10 of the training set as training input and score it
    _log_shuffle_split_scores(0.9)

    # Use 1/100 of the training set as training input and score it
    _log_shuffle_split_scores(0.99)

    return clf
def cal_vecvalues_tail(mongo_train_user_collection,
                       fin_path='../data/train_set.csv',
                       fout_path='../data/vecvalues_tail.csv',
                       timerange=('2014-12-12', '2014-12-19')):
    """
    Compute the last three vector dimensions; requires MongoDB.

    Every input row (after one header line) is read as ``user_id,item_id,tag``
    and produces one output row ``tag,popularity,desire,behavior_rate``.

    :param mongo_train_user_collection: collection handle passed through to
        cal_item_popularity, cal_user_desire and cal_useritem_behavior_rate
    :param fin_path: path of the sample set csv
    :param fout_path: path of the result file
    :param timerange: (start, stop) dates such as '2014-12-19', passed through
        to the three helpers
    :return: None
    """
    logger.info('cal_vecvalues_tail start')
    count = 0
    # ``with`` guarantees both files are closed and the output is flushed.
    with open(fin_path, 'r') as fin, open(fout_path, 'w') as fout:
        fin.readline()  # skip the title row
        fout.write('tag,popularity,desire,behavior_rate\n')
        for line in fin:
            data = line.replace('\n', '').split(',')
            user_id, item_id, tag = data[0], data[1], data[2]
            popularity = cal_item_popularity(mongo_train_user_collection, item_id,
                                             timerange=timerange)
            desire = cal_user_desire(mongo_train_user_collection, user_id,
                                     timerange=timerange)
            behavior_rate = cal_useritem_behavior_rate(mongo_train_user_collection,
                                                       user_id, item_id,
                                                       timerange=timerange)
            fout.write('%s,%s,%s,%s\n' % (tag, popularity, desire, behavior_rate))
            count += 1
            if count % 5000 == 0:
                logger.info('calculated count:\t%s' % count)
    logger.info('cal_vecvalues_tail done, result path=' + fout_path)
def cal_vecvalues_tail(fin_path='../data/train_set.csv', fout_path='../data/vecvalues_tail.csv'):
    """
    Compute the last three vector dimensions; requires MongoDB.

    Every input row (after one header line) is read as ``user_id,item_id,tag``
    and produces one output row ``tag,popularity,desire,behavior_rate``.

    :param fin_path: path of the sample set csv
    :param fout_path: path of the result file
    :return: None
    """
    logger.info('cal_vecvalues_tail start')
    count = 0
    # ``with`` guarantees both files are closed and the output is flushed.
    with open(fin_path, 'r') as fin, open(fout_path, 'w') as fout:
        fin.readline()  # skip the title row
        fout.write('tag,popularity,desire,behavior_rate\n')
        for line in fin:
            data = line.replace('\n', '').split(',')
            user_id, item_id, tag = data[0], data[1], data[2]
            popularity = cal_item_popularity(item_id)
            desire = cal_user_desire(user_id)
            behavior_rate = cal_useritem_behavior_rate(user_id, item_id)
            fout.write('%s,%s,%s,%s\n' % (tag, popularity, desire, behavior_rate))
            count += 1
            if count % 2000 == 0:
                logger.info('calculated count:\t%s' % count)
    logger.info('cal_vecvalues_tail done, result path=' + fout_path)
def get_predict_vecdata(
        set_timerange=('2014-12-18', '2014-12-19'),
        behavior_timerange=('2014-12-12', '2014-12-19'),
        predict_set_path='%s/predict/predict_set.csv' % (data_path),
        predict_vectail_path='%s/predict/predict_vectail.csv' % (data_path),
        csv_output_path='%s/predict/combined_vec_data.csv' % (data_path),
        svm_output_path='%s/predict/svmdata.dat' % (data_path)):
    """
    Generate the prediction set; the time ranges and every output path must
    be specified.

    :param set_timerange: time range of the potentially purchased items
    :param behavior_timerange: time range used when computing the vector values
    """
    from data_preprocess import generate_userset
    import MySQLdb
    from data_preprocess.MongoDB_Utils import MongodbUtils

    start_message = 'start get_predict_vecdata, set_timerange=%s to %s, behavior_timerange = %s to %s' % (
        set_timerange[0], set_timerange[1], behavior_timerange[0], behavior_timerange[1])
    logger.info(start_message)

    mysql_conn = MySQLdb.connect(host='10.108.192.119',
                                 user='******',
                                 passwd='tianchi_data',
                                 db='tianchi')
    user_collection = MongodbUtils(db_address, 27017).get_db().train_user

    # Step 1: build the candidate (user, item) set
    generate_userset.generate_predict_set(mysql_conn, set_timerange, predict_set_path)

    # Step 2: last three vector dimensions
    cal_vecvalues_tail(user_collection, predict_set_path, predict_vectail_path, behavior_timerange)

    # Step 3: behavior counts, then merge both parts
    behavior_path = predict_set_path.replace('.csv', '_calUserBehavior.csv')
    cal_user_behavior(mysql_conn, behavior_timerange, predict_set_path)
    combine_data(behavior_path, predict_vectail_path, csv_output_path, svm_output_path)
def train_svm(clf, f_train_set='%s/train_combined_vec_data.csv' % (data_path)):
    """
    Cross-validate and then fit an SVM classifier.

    The final log line reads ``clf.best_params_`` and ``clf.best_score_``.

    Args:
        clf: classifier
        f_train_set: string, training set file

    Returns:
        clf: the classifier, fitted in place
    """
    from sklearn import cross_validation

    samples, labels = generate_X_y_arrays(f_train_set)

    # Quick 5-fold validation before the real fit
    cv_scores = cross_validation.cross_val_score(clf, samples, labels, cv=5)
    logger.info('SVM classifier simple cross-validated scores ars %s' % (cv_scores))

    # Training
    clf.fit(samples, labels)
    summary = 'SVM classifier(%s) fit Done. Best params are %s with a best score of %0.2f' % (
        clf, clf.best_params_, clf.best_score_)
    logger.info(summary)
    return clf
def intersect(f_result='%s/UserCF_recommend_3.csv' % (data_path),
              f_item_set='%s/tianchi_mobile_recommend_train_item.csv' % (data_path)):
    """
    Intersect the result with the given item set: rows of the result whose
    item does not belong to the item subset are dropped.

    Args:
        f_result: string, the original result file (column 1 is the item id)
            ---- content ----
            | user_id,item_id |
            -----------------
        f_item_set: string, the item set file provided by Ali (column 0 is
            the item id)
            -------------- content -------------
            | item_id,item_geohash,item_category |
            ------------------------------------

    Returns:
        fout_name: string, name of the file holding the intersection;
            ``f_result`` with '.csv' replaced by '_intersect.csv', or
            ``f_result + '_intersect.csv'`` when the name contains no '.csv'
    """
    item_id_set = set()
    with open(f_item_set) as fin:
        fin.readline()  # skip the header line
        for line in fin:
            cols = line.strip().split(',')
            item_id_set.add(cols[0])

    fout_name = f_result.replace('.csv', '_intersect.csv')
    if fout_name == f_result:
        # No '.csv' in the name: opening the same path for writing would
        # truncate the input before it is read.
        fout_name = f_result + '_intersect.csv'

    counter = 0
    with open(f_result) as fin, open(fout_name, 'w') as fout:
        fout.write(fin.readline())  # the header line is copied as is
        for line in fin:
            cols = line.strip().split(',')
            # Rows without an item column (e.g. blank lines) are dropped
            if len(cols) > 1 and cols[1] in item_id_set:
                counter += 1
                fout.write(line)
    logger.info(
        'intersect success, intersect size =%s and generate final result in %s'
        % (counter, fout_name))
    return fout_name
def generate_recommend_result(f_predict_set, f_recommend_set):
    """
    Build the recommendation file from the prediction result: every row whose
    last column equals '1' is kept, reduced to its first two columns.

    Args:
        f_predict_set: string, file holding the prediction result
        f_recommend_set: string, file receiving the recommendation result

    Returns:
        None
    """
    recommended = 0
    with open(f_predict_set, 'r') as predict_file, open(f_recommend_set, 'w') as recommend_file:
        predict_file.readline()  # skip the header line
        recommend_file.write('user_id,item_id\n')
        for row in predict_file:
            fields = row.strip().split(',')
            if fields[-1] != '1':
                continue
            recommended += 1
            recommend_file.write('%s,%s\n' % (fields[0], fields[1]))
    logger.info('generate recommend result Done. Total: %s' % (recommended))
def generate_test_set(connect, timerange, f_train_set='%s/test_set.csv' % (data_path)):
    """
    Build the test set, i.e. the file used to verify the result: the distinct
    (user_id, item_id) pairs with behavior_type=4 inside the time range.

    Args:
        connect: Mysqldb.connect(), database connection handle
        timerange: tuple, time filter of the test set, (start, end); start is
            exclusive and end inclusive in the query
        f_train_set: string, test set result file
            ---- content ----
            | user_id,item_id |
            -----------------

    Returns:
        None
    """
    import arrow

    timerange_start, timerange_end = [arrow.get(elem).timestamp for elem in timerange]
    # The bounds are bound by the driver rather than pasted into the SQL text.
    sql = ('select distinct user_id, item_id from train_user '
           'where behavior_type=4 and time>%s and time<=%s;')
    params = (timerange_start, timerange_end)

    counter = 0
    cursor = connect.cursor()
    try:
        with open(f_train_set, 'w') as fout:
            fout.write('user_id,item_id\n')
            logger.debug('sql: %s' % (sql % params))
            cursor.execute(sql, params)
            result = cursor.fetchall()
            logger.debug('start generate test set')
            for user_id, item_id in result:
                fout.write('%s,%s\n' % (user_id, item_id))
                counter += 1
    finally:
        # Release the cursor even when the query or the write fails
        cursor.close()
    logger.debug('success generate test set, and size=%s.' % (counter))
    logger.info('Result store in: %s' % (f_train_set))
def train_classifier(clf, X, y):
    """
    Fit the classifier on the full data and log several evaluation scores.

    Args:
        clf: classifier to train
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]

    Returns:
        clf: classifier, fitted on the full (X, y)
    """
    from sklearn import cross_validation
    from sklearn.base import clone
    import time

    # For grid-search classifiers, clf.best_params_, clf.best_score_ and
    # clf.grid_scores_ can be logged right after this fit.
    clf.fit(X, y)

    # Simple cross-validation
    scores = cross_validation.cross_val_score(clf, X, y, cv=5)
    logger.info(
        'Classifier fit Done. And simple cross-validated scores ars %s' %
        (scores))

    # 10-fold evaluation. Every evaluation below trains an unfitted copy:
    # refitting ``clf`` itself made the function return a model trained on
    # the last, small subset instead of the full data.
    kf = cross_validation.KFold(len(X), n_folds=10)
    for train_index, test_index in kf:
        fold_clf = clone(clf)
        fold_clf.fit(X[train_index], y[train_index])
        score = fold_clf.score(X[test_index], y[test_index])
        logger.info('10 folds cross-validated scores is %s.' % (score))

    # Use 1/10 of the training set as training input and score it.
    # (The same evaluation with test_size = 0.99, i.e. 1/100 of the data as
    # training input, is currently disabled.)
    test_size = 0.9
    rs = cross_validation.ShuffleSplit(len(X),
                                       test_size=test_size,
                                       random_state=int(time.time()))
    for train_index, test_index in rs:
        split_clf = clone(clf)
        split_clf.fit(X[train_index], y[train_index])
        score = split_clf.score(X[test_index], y[test_index])
        logger.info('%s作为训练集输入, cross-validated scores is %s.' %
                    (1 - test_size, score))

    return clf
def dump_train_item(csv_path, db_address):
    """
    Store the rows of train_item.csv in the MongoDB collection
    ``TianchiData.train_item_new``; every value is stored as a string under
    the column name taken from the header line.

    :param csv_path: path of the csv file
    :param db_address: address of the MongoDB server (port 27017 is used)
    :return: None
    """
    logger.info('dump_train_item start')
    # ``with`` closes the csv file; try/finally releases the connection even
    # when a row is malformed.
    with open(csv_path) as csvfile:
        title = csvfile.readline().replace('\n', '').split(',')
        print(title)

        conn = pymongo.Connection(db_address, 27017)
        try:
            train_item_db = conn.TianchiData.train_item_new
            count = 0
            for line in csvfile:
                temp = line.replace('\n', '').split(',')
                # Indexing (not zip) keeps the IndexError for rows shorter
                # than the header.
                data = dict((name, temp[i]) for i, name in enumerate(title))
                train_item_db.insert(data)
                count += 1
                if count % 10000 == 0:
                    logger.debug('%s inserted' % count)
        finally:
            conn.disconnect()
    logger.info('dump_train_item done')
    print('处理完毕')
def train_clf(clf, f_train_set='%s/train_combined_vec_data.csv' % (data_path)):
    """
    Cross-validate and then fit a classifier.

    Args:
        clf: classifier
        f_train_set: string, training set file

    Returns:
        clf: the classifier, fitted in place
    """
    from sklearn import cross_validation

    samples, labels = generate_X_y_arrays(f_train_set)

    # Quick 5-fold validation before the real fit
    cv_scores = cross_validation.cross_val_score(clf, samples, labels, cv=5)
    logger.info('Classifier simple cross-validated scores ars %s' % (cv_scores))

    # Training
    clf.fit(samples, labels)
    fit_message = 'Classifier(%s) fit Done.' % (clf)
    logger.info(fit_message)
    return clf
def predict(clf, f_predict_vect, f_predict_id_set, f_predict_out):
    """Predict a tag for every vector and store it next to its ids.

    Args:
        clf: classifier; its ``predict`` method is called on the vectors
        f_predict_vect: fin, file with the vectors to predict (read through
            ``generate_X_y_arrays``)
        f_predict_id_set: fin, file holding the user_id, item_id rows that
            correspond to the vectors; its first line is skipped as a header
        f_predict_out: fout, file that receives the prediction result

    Returns:
        f_predict_out: fout, file that holds the prediction result

    Raises:
        AssertionError: if the number of id rows differs from the number of
            predictions (the mismatch is logged first).
    """
    # Only the feature matrix is needed; the labels from the file are unused.
    predict_X, _ = generate_X_y_arrays(f_predict_vect)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    n_predictions = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (n_predictions))

    with open(f_predict_id_set, 'r') as fin, open(f_predict_out, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header line
        # The newline keeps the first data row off the header line.
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            # Keep counting surplus id rows instead of failing with an
            # IndexError, so the mismatch is reported below.
            if counter < n_predictions:
                fout.write('%s,%s\n' % (line.strip(), predict_y[counter]))
            counter += 1

    if counter != n_predictions:
        message = 'predict result size:%s, but uid_iid_set size:%s' % (
            n_predictions, counter)
        # Log before raising so the error message is actually emitted.
        logger.error(message)
        raise AssertionError(message)

    logger.info('predict success, generate predict result in %s' %
                (f_predict_out))
    return f_predict_out
def get_realbuy(train_user_connect,
                timerange=('2014-12-05', '2014-12-06'),
                fresultpath='%s/result/realbuy.csv' % data_path):
    """Write the distinct (user_id, item_id) pairs bought in a time range.

    Queries ``train_user_connect`` for records whose ``behavior_type`` is
    ``'4'`` and whose ``time`` lies strictly between the two dates, then
    writes every distinct pair once, in the order it is first returned.

    Args:
        train_user_connect: collection handle exposing ``find``
        timerange: tuple (start, end); each item is parsed with the format
            '%Y-%m-%d'
        fresultpath: string, path of the csv file to write

    Returns:
        None
    """
    from datetime import datetime

    logger.info('get_realbuy start')
    starttime = datetime.strptime(str(timerange[0]), '%Y-%m-%d')
    stoptime = datetime.strptime(str(timerange[1]), '%Y-%m-%d')
    buys = train_user_connect.find({
        'behavior_type': '4',
        'time': {'$gt': starttime, '$lt': stoptime}
    })

    # A set gives constant-time duplicate checks; the file is closed (and
    # therefore flushed) by the context manager.
    seen_pairs = set()
    with open(fresultpath, 'w') as fresult:
        fresult.write('user_id,item_id\n')
        for buy in buys:
            pair = (buy['user_id'], buy['item_id'])
            if pair not in seen_pairs:
                seen_pairs.add(pair)
                fresult.write('%s,%s\n' % pair)
    logger.info('get_realbuy done')
def intersect(f_result='%s/UserCF_recommend_3.csv' % (data_path),
              f_item_set='%s/tianchi_mobile_recommend_train_item.csv' % (data_path)):
    """Drop result rows whose item is not part of the given item set.

    Args:
        f_result: string, the raw result file. The item id is read from the
            SECOND column of each row; the first line is copied through as
            the header.
            ---- content ----
            | user_id,item_id |
            -----------------
        f_item_set: string, the item-set file. The item id is read from the
            FIRST column of each row; the first line is skipped.
            -------------- content -------------
            | item_id,item_geohash,item_category |
            ------------------------------------

    Returns:
        fout_name: string, name of the file holding the intersection
    """
    item_id_set = set()
    with open(f_item_set) as fin:
        fin.readline()  # skip the header line
        for line in fin:
            cols = line.strip().split(',')
            item_id_set.add(cols[0])

    fout_name = f_result.replace('.csv', '_intersect.csv')
    if fout_name == f_result:
        # No '.csv' in the path: writing to the same name would truncate the
        # input while it is being read, so derive a distinct name instead.
        fout_name = f_result + '_intersect'

    counter = 0
    with open(f_result) as fin, open(fout_name, 'w') as fout:
        fout.write(fin.readline())  # copy the header line as-is
        for line in fin:
            cols = line.strip().split(',')
            # Rows without a second column (e.g. a trailing blank line)
            # cannot match and are skipped.
            if len(cols) > 1 and cols[1] in item_id_set:
                counter += 1
                fout.write(line)

    logger.info(
        'intersect success, intersect size =%s and generate final result in %s'
        % (counter, fout_name))
    return fout_name
def generate_predict_result(
        f_predict='%s/predict_set/predict_result.csv' % (data_path),
        f_vec_set='%s/predict_set/predict_combined_vec_data.csv' % (data_path),
        f_uid_iid_set='%s/predict_set/predict_set.csv' % (data_path)):
    """Generate the prediction result file.

    NOTE(review): ``clf`` is not a parameter; it is resolved as a global name
    at call time and must be defined by the surrounding module.

    Args:
        f_predict: string, file that receives the prediction result
        f_vec_set: string, name of the file holding the vectors to predict
        f_uid_iid_set: string, file holding the user_id, item_id rows that
            correspond to the vectors; its first line is skipped as a header

    Returns:
        None. A size mismatch between id rows and predictions is logged as
        an error, not raised.
    """
    # Only the feature matrix is needed; the labels from the file are unused.
    predict_X, _ = generate_X_y_arrays(f_vec_set)
    logger.debug('predict start.')
    predict_y = clf.predict(predict_X)
    n_predictions = len(predict_y)
    logger.debug('predict done, predict result size=%s' % (n_predictions))

    with open(f_uid_iid_set, 'r') as fin, open(f_predict, 'w') as fout:
        counter = 0
        fin.readline()  # skip the header line
        # The newline keeps the first data row off the header line.
        fout.write('user_id,item_id,tag\n')
        logger.debug('start store predict result')
        for line in fin:
            # Keep counting surplus id rows instead of failing with an
            # IndexError, so the mismatch is reported below.
            if counter < n_predictions:
                fout.write('%s,%s\n' % (line.strip(), predict_y[counter]))
            counter += 1

    if counter != n_predictions:
        logger.error('predict result size:%s, but uid_iid_set size:%s' %
                     (n_predictions, counter))
    else:
        logger.info('predict success, generate predict result in %s' %
                    (f_predict))
def train(tokenizer, model, optimizer, train_features, dev_features, test_features, config, \
          device, n_gpu, label_list, num_train_steps):
    """Run the training loop and evaluate on dev and test after every epoch.

    For each of ``config.epochs`` epochs the learning rate is decayed through
    ``adjust_learning_rate``, the training features are visited in a random
    order in mini-batches of ``config.train_batch_size``, and the model is
    evaluated with ``eval_checkpoint`` on the dev and test features. The
    model's state dict is saved to ``<config.output_dir>/bert_model.bin`` when
    ``config.export_model`` equals the string "True".

    Args:
        tokenizer: passed through to the test-set ``eval_checkpoint`` call
        model: called with the batch tensors; its return value is used as
            the loss
        optimizer: optimizer whose ``param_groups`` learning rates are
            overwritten on each optimizer step
        train_features: training features; ``len`` and mini-batch index
            selection are applied to it
        dev_features: features evaluated with eval_sign="dev"
        test_features: features evaluated with "test"
        config: object providing epochs, train_batch_size, use_cuda,
            gradient_accumulation_steps, learning_rate, warmup_proportion,
            output_dir and export_model
        device: target of the ``.to(device)`` calls when config.use_cuda is set
        n_gpu: when greater than 1 the loss is reduced with ``.mean()``
        label_list: passed through to ``eval_checkpoint``
        num_train_steps: denominator of the warm-up progress ratio

    Returns:
        None
    """
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    # NOTE(review): the dev_best_* values are assigned here but never read or
    # updated anywhere in this function.
    dev_best_ent_acc = 0
    dev_best_rel_acc = 0
    dev_best_ent_precision = 0
    dev_best_ent_recall = 0
    dev_best_ent_f1 = 0
    dev_best_rel_precision = 0
    dev_best_rel_recall = 0
    dev_best_rel_f1 = 0
    dev_best_loss = 1000000000000000

    # NOTE(review): the test_best_* values are never reassigned in this
    # function, so the "current best" log lines at the end of each epoch
    # always print these initial values.
    test_best_ent_acc = 0
    test_best_rel_acc = 0
    test_best_ent_precision = 0
    test_best_ent_recall = 0
    test_best_ent_f1 = 0
    test_best_rel_precision = 0
    test_best_rel_recall = 0
    test_best_rel_f1 = 0
    test_best_loss = 1000000000000000

    model.train()
    step = 0
    for idx in range(int(config.epochs)):
        # NOTE(review): logs the bare epoch index for epoch 4 only; looks like
        # a leftover debugging hook — confirm before removing.
        if idx == 4:
            logger.info(idx)
        # Per-epoch running totals.
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        logger.info("#######" * 10)
        logger.info("EPOCH: " + str(idx))
        adjust_learning_rate(optimizer)
        num_example = len(train_features)
        # Integer number of full batches; a trailing partial batch is dropped.
        num_batches = int(num_example / config.train_batch_size)
        # Fresh random visiting order for this epoch.
        train_indecies = np.random.permutation(num_example)
        for batch_i in tqdm(range(num_batches)):
            step += 1
            start_idx = batch_i * config.train_batch_size
            end_idx = min((batch_i + 1) * config.train_batch_size, num_example)
            mini_batch_idx = train_indecies[start_idx:end_idx]
            input_ids, input_mask, segment_ids, label_ids, valid_ids, label_mask, input_types, entity_types, relations, doc_tokens = \
                generate_mini_batch_input(train_features, mini_batch_idx, config)

            if config.use_cuda:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                valid_ids = valid_ids.to(device)
                label_mask = label_mask.to(device)

            loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, label_mask)
            if n_gpu > 1:
                loss = loss.mean()

            # NOTE(review): gradients are cleared before every backward pass,
            # so nothing is accumulated across batches even when
            # gradient_accumulation_steps is greater than 1 — confirm intent.
            model.zero_grad()
            loss.backward()
            # nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config.clip_grad)

            tr_loss += loss.item()

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            if (batch_i + 1) % config.gradient_accumulation_steps == 0:
                # Overwrite the learning rate of every group with the
                # warm-up-scaled base rate before stepping.
                lr_this_step = config.learning_rate * warmup_linear(
                    global_step / num_train_steps, config.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        # End of epoch: report the loss of the last batch, then evaluate.
        logger.info("")
        logger.info("current training loss is : " + str(loss.item()))

        # The dev evaluation also yields the weights that are handed to the
        # test evaluation below.
        tmp_dev_loss, tmp_dev_entity, tmp_dev_relation, ent_weight, rel_weight = eval_checkpoint(
            model, dev_features, config, device, n_gpu, label_list, eval_sign="dev")

        logger.info("ent_weight: {}".format(ent_weight))
        logger.info("rel_weight: {}".format(rel_weight))

        _, tmp_test_entity, tmp_test_relation = eval_checkpoint(
            model, test_features, config, device, n_gpu, label_list, "test", tokenizer, ent_weight, rel_weight)

        logger.info("......" * 10)
        logger.info("TEST:")

        test_ent_acc, test_ent_pcs, test_ent_recall, test_ent_f1 = tmp_test_entity["accuracy"], tmp_test_entity["precision"], \
            tmp_test_entity["recall"], tmp_test_entity["f1"]
        test_rel_acc, test_rel_pcs, test_rel_recall, test_rel_f1 = tmp_test_relation["accuracy"], tmp_test_relation["precision"], \
            tmp_test_relation["recall"], tmp_test_relation["f1"]
        logger.info("question:")
        logger.info("entity : acc={}, precision={}, recall={}, f1={}".format(
            test_ent_acc, test_ent_pcs, test_ent_recall, test_ent_f1))
        logger.info("relation: acc={}, precision={}, recall={}, f1={}".format(
            test_rel_acc, test_rel_pcs, test_rel_recall, test_rel_f1))
        logger.info("")

        # Export the trained model. The flag is compared against the string
        # "True", not the boolean.
        model_to_save = model
        output_model_file = os.path.join(config.output_dir, "bert_model.bin")
        if config.export_model == "True":
            torch.save(model_to_save.state_dict(), output_model_file)

        logger.info("TEST: loss={}".format(test_best_loss))
        logger.info("current best precision, recall, f1, acc :")
        logger.info("entity : {}, {}, {}, {}".format(test_best_ent_precision, test_best_ent_recall, test_best_ent_f1,
                                                     test_best_ent_acc))
        logger.info("relation: {}, {}, {}, {}".format(test_best_rel_precision, test_best_rel_recall, test_best_rel_f1,
                                                      test_best_rel_acc))
        logger.info("=&=" * 15)
def filter_with_category_popularity(connect, train_user_connect, f_recommend,
                                    f_category_relationship, stoptime_str):
    """Filter recommendations by category succession and in-category popularity.

    A recommended (user, item) pair is kept when the pair
    (category of the user's latest record before the stop time, category of
    the item) appears in the category-relationship file. Any other pair is
    kept at random: it survives when ``random.random()`` is not greater than
    ``cal_popularity_in_category(item_id, stoptime_str, train_user_connect)``.

    Args:
        connect: MySQLdb.connect(), database connection handle
        train_user_connect: MongoDB train_user collection handle, passed on
            to ``cal_popularity_in_category``
        f_recommend: fin, recommendation file; column 0 is the user id and
            column 1 the item id, the first line is skipped
        f_category_relationship: fin, category succession file; columns 0
            and 1 form one relationship pair, the first line is skipped
        stoptime_str: string, cut-off date

    Returns:
        f_output: fout, path of the filtered result
            (``f_recommend`` with '.csv' replaced by '_filter.csv')

    Raises:
        IndexError: if a user or an item has no matching database row.
    """
    import arrow
    import random

    f_output = f_recommend.replace('.csv', '_filter.csv')
    logger.debug('Start filter recommend result..')

    stoptime_timestamp = arrow.get(stoptime_str).timestamp

    # Step 1: group the recommended items by user,
    # {u_id1: [i_id1, i_id2], u_id2: [i_id3, i_id4]}.
    recommend_dict = {}
    with open(f_recommend, 'r') as fin:
        fin.readline()
        for line in fin:
            cols = line.strip().split(',')
            recommend_dict.setdefault(cols[0], []).append(cols[1])
    logger.debug('完成根据推荐文件生成第一步, len:%s' % (len(recommend_dict)))

    # Step 2: attach categories. Building the dict in two steps means the
    # user's last category is queried once per user, not once per pair.
    # {(u_id1, i_id1): (u_last_category, id1_category), ...}
    recommend_tuple_dict = {}
    item_category_cache = {}  # the same item can be recommended to many users
    user_counter = 0
    sql_user = ('SELECT item_category FROM train_user WHERE user_id=%s '
                'and time<%s ORDER BY time DESC limit 1;')
    sql_item = 'SELECT item_category FROM train_item WHERE item_id=%s;'
    cursor = connect.cursor()
    try:
        for (u_id, i_ids) in recommend_dict.items():
            # Ids come from a file, so they are bound as query parameters
            # rather than interpolated into the SQL text.
            cursor.execute(sql_user, (u_id, stoptime_timestamp))
            result = cursor.fetchall()
            user_last_category = result[0][0]
            user_counter += 1
            if user_counter % 200 == 0:
                logger.debug('No.%s user, user_id=%s, last_item_category=%s' %
                             (user_counter, u_id, user_last_category))
            for i_id in i_ids:
                if i_id not in item_category_cache:
                    cursor.execute(sql_item, (i_id,))
                    result = cursor.fetchall()
                    item_category_cache[i_id] = result[0][0]
                recommend_tuple_dict[(u_id, i_id)] = (
                    user_last_category, item_category_cache[i_id])
    finally:
        cursor.close()
    logger.debug('原推荐结果长度:%s' % (len(recommend_tuple_dict)))

    # Load the category succession pairs.
    relationship_set = set()
    with open(f_category_relationship, 'r') as fin:
        fin.readline()
        for line in fin:
            cols = line.strip().split(',')
            relationship_set.add((cols[0], cols[1]))
    logger.debug('承接关系结果长度:%s' % (len(relationship_set)))

    # Write the filtered result.
    with open(f_output, 'w') as fout:
        in_counter = 0
        random_counter = 0
        fout.write('user_id,item_id\n')
        for ((user_id, item_id), category_tuple) in recommend_tuple_dict.items():
            if category_tuple in relationship_set:
                in_counter += 1
                fout.write('%s,%s\n' % (user_id, item_id))
            elif random.random() <= cal_popularity_in_category(
                    item_id, stoptime_str, train_user_connect):
                random_counter += 1
                fout.write('%s,%s\n' % (user_id, item_id))
                logger.debug('NO.%s random pick, [%s,%s]' %
                             (random_counter, user_id, item_id))

    # The original formatted this message with an empty tuple, which raised
    # TypeError; report the output path as the message intends.
    logger.info('对推荐结果的筛选完成,结果路径:%s' % (f_output))
    logger.info('筛选前%s, 筛选后%s. 其中在承接关系的有%s, 随机挑选的有%s' %
                (len(recommend_tuple_dict), in_counter + random_counter,
                 in_counter, random_counter))
    return f_output
def filter_with_mongodb(train_user_connect,
                        f_recommend_path,
                        f_category_relation_path,
                        fout_path='%s/result/filter_output.csv' % data_path,
                        time_range=('2014-12-04', '2014-12-05')):
    """Filter recommendations, MongoDB version.

    A recommended (user, item) pair is kept when the item's category is a
    listed successor of the category of the user's most recent purchase
    (behavior_type '4') inside ``time_range``. Any other pair is kept at
    random with probability ``1 / (1 + e ** (popularity + 1))``, where
    ``popularity`` comes from ``PopularityInCategory``.

    :param train_user_connect: MongoDB train_user collection handle
    :param f_recommend_path: path of the recommendation csv; column 0 is the
        user id, column 1 the item id, the first line is skipped
    :param f_category_relation_path: path of the category relation csv;
        column 0 is the source category, column 1 the target category, the
        first line is skipped
    :param fout_path: output path of the filtered result
    :param time_range: (start, end) bounds of the "recent purchase" query;
        the values are passed to the query as given
    :return: list of the kept (user_id, item_id) tuples, in write order
    """
    import pymongo
    import random
    import math

    logger.info('start filter_with_mongoDB')
    popularity_calculator = PopularityInCategory(train_user_connect,
                                                 time_range[1])

    # Group the recommended items by user.
    recommend_dict = {}
    with open(f_recommend_path) as f_recommend:
        f_recommend.readline()  # skip the header line
        for line in f_recommend:
            cols = line.strip().split(',')
            recommend_dict.setdefault(cols[0], []).append(cols[1])

    # Map every source category to the set of its successor categories.
    category_dict = {}
    with open(f_category_relation_path) as f_category_relation:
        f_category_relation.readline()
        for line in f_category_relation:
            cols = line.strip().split(',')
            category_dict.setdefault(cols[0], set()).add(cols[1])

    logger.debug('start filtering!')
    result = []
    old_count = 0
    new_count = 0
    usercount = 0
    # Neither lookup depends on the user, so the caches are shared across
    # users and each item is queried at most once.
    item_category_dict = {}
    item_popularity_in_category_dict = {}
    with open(fout_path, 'w') as fout:
        fout.write('user_id,item_id\n')
        for u_id, i_ids in recommend_dict.items():
            usercount += 1
            logger.debug(usercount)
            last_buy_cursor = train_user_connect.find({
                'user_id': u_id,
                'behavior_type': '4',
                'time': {
                    '$gt': time_range[0],
                    '$lt': time_range[1]
                }
            }).sort('time', pymongo.DESCENDING)
            last_buy = next(last_buy_cursor, None)
            # Successors of the last purchased category. A category without
            # an entry in the relation file has none; indexing the dict
            # directly would raise KeyError for it.
            if last_buy:
                related_categories = category_dict.get(
                    last_buy['item_category'], ())
            else:
                related_categories = ()

            for i_id in i_ids:
                old_count += 1
                if i_id in item_category_dict:
                    temp_category = item_category_dict[i_id]
                else:
                    temp = train_user_connect.find_one({'item_id': i_id})
                    temp_category = temp['item_category']
                    item_category_dict[i_id] = temp_category

                if temp_category in related_categories:
                    # The item's category follows the user's last purchase.
                    logger.debug('有承接关系 u_id=%s,i_id=%s' % (u_id, i_id))
                    fout.write('%s,%s\n' % (u_id, i_id))
                    result.append((u_id, i_id))
                    new_count += 1
                else:
                    # No succession relation: keep the pair at random.
                    if i_id in item_popularity_in_category_dict:
                        item_popularity = item_popularity_in_category_dict[i_id]
                    else:
                        item_popularity = popularity_calculator.get_popularity_in_category(
                            i_id)
                        item_popularity_in_category_dict[i_id] = item_popularity
                    item_popularity = 1 / (1 + math.e**(item_popularity + 1))
                    logger.debug('random prob = %s' % item_popularity)
                    if random.random() <= item_popularity:
                        fout.write('%s,%s\n' % (u_id, i_id))
                        result.append((u_id, i_id))
                        new_count += 1
                if old_count % 10 == 0:  # progress log
                    logger.debug('No.%s origin recommend filtered' % old_count)

    logger.info(
        'done! origin recommend num is %s, current recommend num is %s\nresult output path: %s'
        % (old_count, new_count, fout_path))
    return result
def filter_with_mongodb(train_user_connect,
                        f_recommend_path,
                        f_category_relation_path,
                        fout_path='%s/result/filter_output.csv' % data_path,
                        time_range=('2014-12-04', '2014-12-05')):
    """Filter recommendations, MongoDB version.

    A recommended (user, item) pair is kept when the item's category is a
    listed successor of the category of the user's most recent purchase
    (behavior_type '4') inside ``time_range``. Any other pair is kept at
    random with probability ``1 / (1 + e ** (popularity + 1))``, where
    ``popularity`` comes from ``PopularityInCategory``.

    :param train_user_connect: MongoDB train_user collection handle
    :param f_recommend_path: path of the recommendation csv; column 0 is the
        user id, column 1 the item id, the first line is skipped
    :param f_category_relation_path: path of the category relation csv;
        column 0 is the source category, column 1 the target category, the
        first line is skipped
    :param fout_path: output path of the filtered result
    :param time_range: (start, end) bounds of the "recent purchase" query;
        the values are passed to the query as given
    :return: list of the kept (user_id, item_id) tuples, in write order
    """
    import pymongo
    import random
    import math

    logger.info('start filter_with_mongoDB')
    popularity_calculator = PopularityInCategory(train_user_connect,
                                                 time_range[1])

    # Group the recommended items by user.
    recommend_dict = {}
    with open(f_recommend_path) as f_recommend:
        f_recommend.readline()  # skip the header line
        for line in f_recommend:
            cols = line.strip().split(',')
            recommend_dict.setdefault(cols[0], []).append(cols[1])

    # Map every source category to the set of its successor categories.
    category_dict = {}
    with open(f_category_relation_path) as f_category_relation:
        f_category_relation.readline()
        for line in f_category_relation:
            cols = line.strip().split(',')
            category_dict.setdefault(cols[0], set()).add(cols[1])

    logger.debug('start filtering!')
    result = []
    old_count = 0
    new_count = 0
    usercount = 0
    # Neither lookup depends on the user, so the caches are shared across
    # users and each item is queried at most once.
    item_category_dict = {}
    item_popularity_in_category_dict = {}
    with open(fout_path, 'w') as fout:
        fout.write('user_id,item_id\n')
        for u_id, i_ids in recommend_dict.items():
            usercount += 1
            logger.debug(usercount)
            last_buy_cursor = train_user_connect.find({
                'user_id': u_id,
                'behavior_type': '4',
                'time': {
                    '$gt': time_range[0],
                    '$lt': time_range[1]
                }
            }).sort('time', pymongo.DESCENDING)
            last_buy = next(last_buy_cursor, None)
            # Successors of the last purchased category. A category without
            # an entry in the relation file has none; indexing the dict
            # directly would raise KeyError for it.
            if last_buy:
                related_categories = category_dict.get(
                    last_buy['item_category'], ())
            else:
                related_categories = ()

            for i_id in i_ids:
                old_count += 1
                if i_id in item_category_dict:
                    temp_category = item_category_dict[i_id]
                else:
                    temp = train_user_connect.find_one({'item_id': i_id})
                    temp_category = temp['item_category']
                    item_category_dict[i_id] = temp_category

                if temp_category in related_categories:
                    # The item's category follows the user's last purchase.
                    logger.debug('有承接关系 u_id=%s,i_id=%s' % (u_id, i_id))
                    fout.write('%s,%s\n' % (u_id, i_id))
                    result.append((u_id, i_id))
                    new_count += 1
                else:
                    # No succession relation: keep the pair at random.
                    if i_id in item_popularity_in_category_dict:
                        item_popularity = item_popularity_in_category_dict[i_id]
                    else:
                        item_popularity = popularity_calculator.get_popularity_in_category(
                            i_id)
                        item_popularity_in_category_dict[i_id] = item_popularity
                    item_popularity = 1 / (1 + math.e**(item_popularity + 1))
                    logger.debug('random prob = %s' % item_popularity)
                    if random.random() <= item_popularity:
                        fout.write('%s,%s\n' % (u_id, i_id))
                        result.append((u_id, i_id))
                        new_count += 1
                if old_count % 10 == 0:  # progress log
                    logger.debug('No.%s origin recommend filtered' % old_count)

    logger.info(
        'done! origin recommend num is %s, current recommend num is %s\nresult output path: %s'
        % (old_count, new_count, fout_path))
    return result
def generate_train_set(connect,
                       positive_set_timerange,
                       negative_set_timerange,
                       f_train_set='%s/train_set.csv' % (data_path)):
    """Build the training set.

    Positive samples are the distinct (user_id, item_id) pairs with
    behavior_type=4 inside the positive time range. Negative samples are
    drawn by picking random ``record_id`` values between the ids of the
    earliest and the latest record of the negative time range, until as many
    distinct non-positive pairs as positive samples have been written.

    Args:
        connect: Mysqldb.connect(), database connection handle
        positive_set_timerange: tuple, time filter of the positive samples,
            (start, end) e.g. ('2014-12-17', '2014-12-18')
        negative_set_timerange: tuple, time filter of the negative samples,
            (start, end) e.g. ('2014-11-17', '2014-12-17')
        f_train_set: string, training-set result file.
            tag=1, positive sample; tag=-1, negative sample
            ------ content ------
            | user_id,item_id,tag |
            ---------------------

    Returns:
        None

    Raises:
        IndexError: if the negative time range contains no record.
    """
    import arrow
    from random import randint

    time2timestamp = lambda elem: arrow.get(elem).timestamp
    (positive_timestamp_start,
     positive_timestamp_end) = map(time2timestamp, positive_set_timerange)
    (negative_timestamp_start,
     negative_timestamp_end) = map(time2timestamp, negative_set_timerange)

    cursor = connect.cursor()
    try:
        with open(f_train_set, 'w') as fout:
            fout.write('user_id,item_id,tag\n')
            set_counter = 0

            # Positive samples.
            tag = 1
            sql = 'select distinct user_id, item_id from train_user where behavior_type=4 and time>%s and time <=%s;' % (
                positive_timestamp_start, positive_timestamp_end)
            logger.debug('positive sql: %s' % (sql))
            cursor.execute(sql)
            result = cursor.fetchall()
            # Remember the positives so no negative sample duplicates one.
            positive_set = set()
            logger.debug('start store positive set')
            for [user_id, item_id] in result:
                set_counter += 1
                positive_set.add((user_id, item_id))
                fout.write('%s,%s,%s\n' % (user_id, item_id, tag))
                if set_counter % 300 == 0:
                    logger.debug('[train set] positive No.%s' % (set_counter))
            logger.info('[train set] positive set DONE, num of set = %s' %
                        (set_counter))

            # Negative samples. `order by rand()` is too slow, so a two-step
            # approach is used instead.
            tag = -1
            log_counter = 0

            # Step 1: ids of the earliest and the latest record in the range.
            sql_PK_min = 'select record_id from train_user where time>%s and time <=%s order by time limit 1;' % (
                negative_timestamp_start, negative_timestamp_end)
            cursor.execute(sql_PK_min)
            result = cursor.fetchall()
            PK_min = int(result[0][0])
            logger.debug('min Primary Key = %s' % (PK_min))
            sql_PK_max = 'select record_id from train_user where time>%s and time <=%s order by time DESC limit 1;' % (
                negative_timestamp_start, negative_timestamp_end)
            cursor.execute(sql_PK_max)
            result = cursor.fetchall()
            PK_max = int(result[0][0])
            logger.debug('max Primary Key = %s' % (PK_max))

            # The ids were selected by time, so they are not guaranteed to be
            # ordered; randint needs low <= high.
            pk_low, pk_high = min(PK_min, PK_max), max(PK_min, PK_max)
            n_candidates = pk_high - pk_low + 1

            # Step 2: draw random ids until enough negatives are collected.
            negative_set = set()  # pairs already written as negatives
            tried_ids = set()  # record ids already queried
            logger.debug('start store negtive set')
            while log_counter < set_counter:
                if len(tried_ids) >= n_candidates:
                    # Every id in the range has been tried; without this
                    # guard the loop would never terminate.
                    logger.warning(
                        '[train set] only %s negative samples available, %s requested'
                        % (log_counter, set_counter))
                    break
                record_id = randint(pk_low, pk_high)
                if record_id in tried_ids:
                    continue
                tried_ids.add(record_id)
                sql = 'select user_id, item_id from train_user where record_id=%s' % (
                    record_id)
                cursor.execute(sql)
                result = cursor.fetchall()
                for [user_id, item_id] in result:
                    pair = (user_id, item_id)
                    if pair in positive_set or pair in negative_set:
                        continue
                    negative_set.add(pair)
                    log_counter += 1
                    fout.write('%s,%s,%s\n' % (user_id, item_id, tag))
                    if log_counter % 300 == 0:
                        logger.debug('[train set] negtive No.%s' %
                                     (log_counter))
            logger.info('[train set] negtive set DONE, num of set = %s' %
                        (log_counter))
    finally:
        # Close the cursor on failure as well as on success.
        cursor.close()
def filter_with_category_popularity(connect, train_user_connect, f_recommend,
                                    f_category_relationship, stoptime_str):
    """Filter recommendations by category succession and in-category popularity.

    A recommended (user, item) pair is kept when the pair
    (category of the user's latest record before the stop time, category of
    the item) appears in the category-relationship file. Any other pair is
    kept at random: it survives when ``random.random()`` is not greater than
    ``cal_popularity_in_category(item_id, stoptime_str, train_user_connect)``.

    Args:
        connect: MySQLdb.connect(), database connection handle
        train_user_connect: MongoDB train_user collection handle, passed on
            to ``cal_popularity_in_category``
        f_recommend: fin, recommendation file; column 0 is the user id and
            column 1 the item id, the first line is skipped
        f_category_relationship: fin, category succession file; columns 0
            and 1 form one relationship pair, the first line is skipped
        stoptime_str: string, cut-off date

    Returns:
        f_output: fout, path of the filtered result
            (``f_recommend`` with '.csv' replaced by '_filter.csv')

    Raises:
        IndexError: if a user or an item has no matching database row.
    """
    import arrow
    import random

    f_output = f_recommend.replace('.csv', '_filter.csv')
    logger.debug('Start filter recommend result..')

    stoptime_timestamp = arrow.get(stoptime_str).timestamp

    # Step 1: group the recommended items by user,
    # {u_id1: [i_id1, i_id2], u_id2: [i_id3, i_id4]}.
    recommend_dict = {}
    with open(f_recommend, 'r') as fin:
        fin.readline()
        for line in fin:
            cols = line.strip().split(',')
            recommend_dict.setdefault(cols[0], []).append(cols[1])
    logger.debug('完成根据推荐文件生成第一步, len:%s' % (len(recommend_dict)))

    # Step 2: attach categories. Building the dict in two steps means the
    # user's last category is queried once per user, not once per pair.
    # {(u_id1, i_id1): (u_last_category, id1_category), ...}
    recommend_tuple_dict = {}
    item_category_cache = {}  # the same item can be recommended to many users
    user_counter = 0
    sql_user = ('SELECT item_category FROM train_user WHERE user_id=%s '
                'and time<%s ORDER BY time DESC limit 1;')
    sql_item = 'SELECT item_category FROM train_item WHERE item_id=%s;'
    cursor = connect.cursor()
    try:
        for (u_id, i_ids) in recommend_dict.items():
            # Ids come from a file, so they are bound as query parameters
            # rather than interpolated into the SQL text.
            cursor.execute(sql_user, (u_id, stoptime_timestamp))
            result = cursor.fetchall()
            user_last_category = result[0][0]
            user_counter += 1
            if user_counter % 200 == 0:
                logger.debug('No.%s user, user_id=%s, last_item_category=%s' %
                             (user_counter, u_id, user_last_category))
            for i_id in i_ids:
                if i_id not in item_category_cache:
                    cursor.execute(sql_item, (i_id,))
                    result = cursor.fetchall()
                    item_category_cache[i_id] = result[0][0]
                recommend_tuple_dict[(u_id, i_id)] = (
                    user_last_category, item_category_cache[i_id])
    finally:
        cursor.close()
    logger.debug('原推荐结果长度:%s' % (len(recommend_tuple_dict)))

    # Load the category succession pairs.
    relationship_set = set()
    with open(f_category_relationship, 'r') as fin:
        fin.readline()
        for line in fin:
            cols = line.strip().split(',')
            relationship_set.add((cols[0], cols[1]))
    logger.debug('承接关系结果长度:%s' % (len(relationship_set)))

    # Write the filtered result.
    with open(f_output, 'w') as fout:
        in_counter = 0
        random_counter = 0
        fout.write('user_id,item_id\n')
        for ((user_id, item_id), category_tuple) in recommend_tuple_dict.items():
            if category_tuple in relationship_set:
                in_counter += 1
                fout.write('%s,%s\n' % (user_id, item_id))
            elif random.random() <= cal_popularity_in_category(
                    item_id, stoptime_str, train_user_connect):
                random_counter += 1
                fout.write('%s,%s\n' % (user_id, item_id))
                logger.debug('NO.%s random pick, [%s,%s]' %
                             (random_counter, user_id, item_id))

    # The original formatted this message with an empty tuple, which raised
    # TypeError; report the output path as the message intends.
    logger.info('对推荐结果的筛选完成,结果路径:%s' % (f_output))
    logger.info('筛选前%s, 筛选后%s. 其中在承接关系的有%s, 随机挑选的有%s' %
                (len(recommend_tuple_dict), in_counter + random_counter,
                 in_counter, random_counter))
    return f_output
def find_category_relationship(train_user_connect,
                               train_item_connect,
                               json_output_path=r'%s/relationData_new.json' % data_path,
                               csv_output_path=r'%s/relationData_new.csv' % data_path,
                               time_window=1):
    """Count succession relationships between purchased categories.

    For every user the purchases (behavior_type '4') are read in ascending
    time order. For each purchase, every later purchase of a DIFFERENT
    category whose time difference has ``.days <= time_window`` counts as one
    link from the earlier category to the later one. The counts are written
    as JSON ({source: {target: count}}) and as CSV rows.

    :param train_user_connect: MongoDB train_user collection handle; the
        ``time`` values of its records must support subtraction yielding an
        object with a ``days`` attribute
    :param train_item_connect: not used by this function
    :param json_output_path: path of the JSON output file
    :param csv_output_path: path of the CSV output file
    :param time_window: largest day difference that still counts as a link
    :return: None
    """
    import pymongo
    import json

    logger.info('find_category_relationship start!')
    userids = train_user_connect.distinct('user_id')
    logger.debug('userids loaded!')

    relationDict = {}
    usercount = 0
    for user_id in userids:
        usercount += 1
        # All purchases of this user, oldest first.
        user_buy_behaviors = train_user_connect.find({
            'user_id': user_id,
            'behavior_type': '4'
        }).sort('time', pymongo.ASCENDING)
        # (category id, behavior time) tuples.
        categoryList = [(buy_behavior['item_category'], buy_behavior['time'])
                        for buy_behavior in user_buy_behaviors]

        len_category = len(categoryList)
        for i, (current_category, current_time) in enumerate(categoryList):
            target_counts = relationDict.get(current_category, {})
            for j in range(i, len_category):
                later_category, later_time = categoryList[j]
                if (later_time - current_time).days > time_window:
                    # Purchases are time-ordered, so every following one is
                    # outside the window as well.
                    break
                if later_category != current_category:
                    target_counts[later_category] = target_counts.get(
                        later_category, 0) + 1
            # A source category is only registered once it has a target.
            if target_counts:
                relationDict[current_category] = target_counts

        if usercount % 1000 == 0:
            logger.debug('No.%s user done, user_index:%s\tlen_category = %s' %
                         (usercount, usercount, len_category))

    # The context managers close (and flush) both output files.
    with open(json_output_path, 'w') as output:
        output.write(json.dumps(relationDict))

    # Flatten the nested dict into csv rows.
    with open(csv_output_path, 'w') as csvout:
        csvout.write('source_category,target_category,link_count\n')
        for source, target_counts in relationDict.items():
            for target, link_count in target_counts.items():
                csvout.write('%s,%s,%s\n' % (source, target, link_count))

    logger.info(
        'find_category_relationship done, json_output_path=%s\tcsv_output_path=%s'
        % (json_output_path, csv_output_path))
def generate_train_set(connect,
                       positive_set_timerange,
                       negative_set_timerange,
                       f_train_set='%s/train_set.csv' % (data_path)):
    """Build the training set.

    Positive samples are the distinct (user_id, item_id) pairs with
    behavior_type=4 inside the positive time range. Negative samples are
    drawn by picking random ``record_id`` values between the ids of the
    earliest and the latest record of the negative time range, until as many
    distinct non-positive pairs as positive samples have been written.

    Args:
        connect: Mysqldb.connect(), database connection handle
        positive_set_timerange: tuple, time filter of the positive samples,
            (start, end) e.g. ('2014-12-17', '2014-12-18')
        negative_set_timerange: tuple, time filter of the negative samples,
            (start, end) e.g. ('2014-11-17', '2014-12-17')
        f_train_set: string, training-set result file.
            tag=1, positive sample; tag=-1, negative sample
            ------ content ------
            | user_id,item_id,tag |
            ---------------------

    Returns:
        None

    Raises:
        IndexError: if the negative time range contains no record.
    """
    import arrow
    from random import randint

    time2timestamp = lambda elem: arrow.get(elem).timestamp
    (positive_timestamp_start,
     positive_timestamp_end) = map(time2timestamp, positive_set_timerange)
    (negative_timestamp_start,
     negative_timestamp_end) = map(time2timestamp, negative_set_timerange)

    cursor = connect.cursor()
    try:
        with open(f_train_set, 'w') as fout:
            fout.write('user_id,item_id,tag\n')
            set_counter = 0

            # Positive samples.
            tag = 1
            sql = 'select distinct user_id, item_id from train_user where behavior_type=4 and time>%s and time <=%s;' % (
                positive_timestamp_start, positive_timestamp_end)
            logger.debug('positive sql: %s' % (sql))
            cursor.execute(sql)
            result = cursor.fetchall()
            # Remember the positives so no negative sample duplicates one.
            positive_set = set()
            logger.debug('start store positive set')
            for [user_id, item_id] in result:
                set_counter += 1
                positive_set.add((user_id, item_id))
                fout.write('%s,%s,%s\n' % (user_id, item_id, tag))
                if set_counter % 300 == 0:
                    logger.debug('[train set] positive No.%s' % (set_counter))
            logger.info('[train set] positive set DONE, num of set = %s' %
                        (set_counter))

            # Negative samples. `order by rand()` is too slow, so a two-step
            # approach is used instead.
            tag = -1
            log_counter = 0

            # Step 1: ids of the earliest and the latest record in the range.
            sql_PK_min = 'select record_id from train_user where time>%s and time <=%s order by time limit 1;' % (
                negative_timestamp_start, negative_timestamp_end)
            cursor.execute(sql_PK_min)
            result = cursor.fetchall()
            PK_min = int(result[0][0])
            logger.debug('min Primary Key = %s' % (PK_min))
            sql_PK_max = 'select record_id from train_user where time>%s and time <=%s order by time DESC limit 1;' % (
                negative_timestamp_start, negative_timestamp_end)
            cursor.execute(sql_PK_max)
            result = cursor.fetchall()
            PK_max = int(result[0][0])
            logger.debug('max Primary Key = %s' % (PK_max))

            # The ids were selected by time, so they are not guaranteed to be
            # ordered; randint needs low <= high.
            pk_low, pk_high = min(PK_min, PK_max), max(PK_min, PK_max)
            n_candidates = pk_high - pk_low + 1

            # Step 2: draw random ids until enough negatives are collected.
            negative_set = set()  # pairs already written as negatives
            tried_ids = set()  # record ids already queried
            logger.debug('start store negtive set')
            while log_counter < set_counter:
                if len(tried_ids) >= n_candidates:
                    # Every id in the range has been tried; without this
                    # guard the loop would never terminate.
                    logger.warning(
                        '[train set] only %s negative samples available, %s requested'
                        % (log_counter, set_counter))
                    break
                record_id = randint(pk_low, pk_high)
                if record_id in tried_ids:
                    continue
                tried_ids.add(record_id)
                sql = 'select user_id, item_id from train_user where record_id=%s' % (
                    record_id)
                cursor.execute(sql)
                result = cursor.fetchall()
                for [user_id, item_id] in result:
                    pair = (user_id, item_id)
                    if pair in positive_set or pair in negative_set:
                        continue
                    negative_set.add(pair)
                    log_counter += 1
                    fout.write('%s,%s,%s\n' % (user_id, item_id, tag))
                    if log_counter % 300 == 0:
                        logger.debug('[train set] negtive No.%s' %
                                     (log_counter))
            logger.info('[train set] negtive set DONE, num of set = %s' %
                        (log_counter))
    finally:
        # Close the cursor on failure as well as on success.
        cursor.close()
def adjust_learning_rate(optimizer, decay_rate=0.95):
    """Scale the learning rate of every parameter group by ``decay_rate``.

    Each entry of ``optimizer.param_groups`` has its ``'lr'`` value replaced
    by ``lr * decay_rate``; the new value is logged per group.
    """
    for group in optimizer.param_groups:
        decayed_lr = group['lr'] * decay_rate
        group['lr'] = decayed_lr
        logger.info("current learning rate" + str(decayed_lr))
def load_data(config):
    """Load the three data splits and turn them into model features.

    The train file is read with ``get_train_examples``; the dev and test
    files are both read with ``get_test_examples``. The label list is built
    from all three example sets, and the tokenizer is loaded from
    ``config.bert_model`` with lower-casing enabled.

    Returns:
        tuple: (tokenizer, train_features, dev_features, test_features,
        num_train_steps, label_list), where ``num_train_steps`` is
        ``int(len(train_examples) / config.train_batch_size * config.epochs)``.
    """
    processor = MRCProcessor()

    # Read the raw examples of every split.
    logger.info("loading {} ...".format(config.train_file))
    train_set = processor.get_train_examples(config.train_file)
    logger.info("{} train examples load sucessful.".format(len(train_set)))

    logger.info("loading {} ...".format(config.dev_file))
    dev_set = processor.get_test_examples(config.dev_file)
    logger.info("{} dev examples load sucessful.".format(len(dev_set)))

    logger.info("loading {} ...".format(config.test_file))
    test_set = processor.get_test_examples(config.test_file)
    logger.info("{} test examples load sucessful.".format(len(test_set)))

    label_list = processor.get_labels([train_set, dev_set, test_set])
    tokenizer = BertTokenizer.from_pretrained(config.bert_model,
                                              do_lower_case=True)

    def _to_features(examples):
        # All splits share the same tokenizer, labels and length limits.
        return convert_examples_to_features(examples, tokenizer, label_list,
                                            config.max_seq_length,
                                            config.max_query_length,
                                            config.doc_stride)

    # Convert the examples of every split into features.
    train_features = _to_features(train_set)
    dev_features = _to_features(dev_set)
    test_features = _to_features(test_set)

    steps_per_run = len(train_set) / config.train_batch_size * config.epochs
    num_train_steps = int(steps_per_run)

    return (tokenizer, train_features, dev_features, test_features,
            num_train_steps, label_list)