def sampling():
    cutoffLine('*')
    print 'Sampling using EasyEnsemble method'
    start_time = time.time()
    TRAIN_SET = 'training_set'
    if not os.path.exists(TRAIN_SET):
        os.mkdir(TRAIN_SET)
    proportion = 10
    negative_size = POSITIVE * proportion
    r_file = open(PRE_DIR + '/negative_set.csv', 'r')
    reader = csv.reader(r_file)
    positive_set = readCSV(PRE_DIR + '/positive_set.csv', int)
    negative_set = []
    set_count = 0
    for line in reader:
        progressBar(reader.line_num, NEGATIVE)
        line = map(int, line)
        if line[-1] == 1:
            positive_set.append(line)
        if line[-1] == 0:
            negative_set.append(line)
        # flush one subset every time enough negatives have been collected
        if len(negative_set) == negative_size or reader.line_num == NEGATIVE:
            set_count += 1
            training_set = positive_set + negative_set
            random.shuffle(training_set)
            file_name = TRAIN_SET + '/' + '%d.csv' % set_count
            writeCSV(training_set, file_name)
            negative_set = []
    r_file.close()
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes %s to sample' % duration
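# --- Hedged sketch, not part of the original scripts ---
# One way the per-subset files written by sampling() above might be consumed:
# train one classifier per subset and let the sub-models vote, which is the
# usual EasyEnsemble recipe.  It assumes the RF() helper defined later in this
# repo, the 'training_set/' directory created above, and rows laid out as
# [user_id, item_id, features..., label].
def train_easy_ensemble(train_dir='training_set'):
    models = []
    for name in sorted(os.listdir(train_dir)):
        X, y = [], []
        for row in csv.reader(open(train_dir + '/' + name)):
            row = map(int, row)
            X.append(row[2:-1])
            y.append(row[-1])
        models.append(RF(X, y))
    return models

def ensemble_vote(models, x):
    # positive only when more than half of the sub-models predict 1
    votes = sum(m.predict([x])[0] for m in models)
    return 1 if 2 * votes > len(models) else 0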
def drop_no_buy_user():
    cutoffLine('-')
    rfile = open('data/nuser.csv', 'r')
    reader = csv.reader(rfile)
    buyed_user = set()
    print 'user behavior stat'
    for line in reader:
        doneCount(reader.line_num)
        if int(line[2]) == 4:
            buyed_user.add(int(line[0]))
    rfile.close()
    print '\ndrop...'
    rfile = open('data/nuser.csv', 'r')
    wfile = open('data/nuser_cleaned', 'w')
    reader = csv.reader(rfile)
    writer = csv.writer(wfile)
    count = 0
    for line in reader:
        doneCount(reader.line_num)
        if int(line[0]) in buyed_user:
            writer.writerow(line)
            count += 1
    cutoffLine('-')
    print count
    rfile.close()
    wfile.close()
def global_feature():
    cutoffLine('-')
    print 'Generate global feature'
    # Count each item's daily sales; used later to rank items within their
    # category while avoiding the use of future information.
    global ci_sale
    if os.path.exists('data/ci_sale.pkl'):
        ci_sale_file = open('data/ci_sale.pkl', 'rb')
        ci_sale = pickle.load(ci_sale_file)
        ci_sale_file.close()
    else:
        u_file = open('data/nuser.csv', 'r')
        u_reader = csv.reader(u_file)
        ci_sale = {}
        for line in u_reader:
            doneCount(u_reader.line_num)
            item = int(line[1])
            behavior = int(line[2])
            category = int(line[4])
            date = int(line[5])
            if category not in ci_sale:
                ci_sale[category] = {}
            if behavior == 4:
                if item not in ci_sale[category]:
                    ci_sale[category][item] = [0] * (TOTAL_DAY + 1)
                ci_sale[category][item][date] += 1
        ci_sale_file = open('data/ci_sale.pkl', 'wb')
        pickle.dump(ci_sale, ci_sale_file)
        ci_sale_file.close()
        u_file.close()
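# --- Hedged sketch, an assumption about how ci_sale might be used ---
# Rank an item by cumulative sales inside its category over days [1, day),
# so a feature computed for day `day` never sees sales from that day or later.
def item_rank_in_category(category, item, day):
    totals = {}
    for it, daily in ci_sale.get(category, {}).items():
        totals[it] = sum(daily[1:day])
    ordered = sorted(totals, key=totals.get, reverse=True)
    return ordered.index(item) + 1 if item in totals else len(ordered) + 1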
def LR(X, y):
    cutoffLine('-')
    print 'Training...'
    X = preprocessing.scale(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    c_set = [0.01, 0.5, 0.1] + map(lambda x: x / 100.0, range(50, 1001, 50))
    # c_set = [1]
    min_error = 100000
    best_model = None
    best_c = -1
    for c in c_set:
        LR_model = LogisticRegression(C=c, penalty='l1', tol=0.001, max_iter=20000)
        # fit on the training split so the held-out error measures generalisation
        LR_model.fit(X_train, y_train)
        y_pred = LR_model.predict(X_test)
        error = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
        if error < min_error:
            min_error = error
            best_model = LR_model
            best_c = c
    print "best C is %f, error is %f" % (best_c, min_error)
    print 'coefs below:'
    print best_model.coef_[0]
    return best_model
def SVM(X, y):
    cutoffLine('-')
    print 'Training...'
    X = preprocessing.scale(X)
    # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    SVM_model = SVC()
    SVM_model.fit(X, y)
    return SVM_model
def predict(window, model, item_subset, proportion, algo, confidence):
    cutoffLine('-')
    print 'Generate result set with confidence %f' % confidence
    feature_file = open('splited_data_%d/set_for_prediction.csv' % window, 'r')
    result_file = open('data/tianchi_mobile_recommendation_predict_%d_%s_%d_%s.csv' %
                       (window, algo, proportion, str(confidence)), 'w')
    f_reader = csv.reader(feature_file)
    r_writer = csv.writer(result_file)
    r_writer.writerow(['user_id', 'item_id'])
    predict_set = set()
    UI = []
    X = []
    each_time = 500000
    for line in f_reader:
        doneCount(f_reader.line_num)
        line = map(int, line)
        UI.append(tuple(line[0:2]))
        X.append(line[3:])
        # predict in chunks of each_time rows to keep memory bounded
        if f_reader.line_num % each_time == 0:
            if algo == 'lr' or algo == 'svm':
                X = preprocessing.scale(X)
            if algo == 'lr' or algo == 'rf':
                y_pred = model.predict_proba(X)
                print y_pred
                for index, y in enumerate(y_pred):
                    if y[1] > confidence:
                        predict_set.add(UI[index])
            if algo == 'svm':
                y_pred = model.predict(X)
                for index, y in enumerate(y_pred):
                    if y == 1:
                        predict_set.add(UI[index])
            UI = []
            X = []
    # handle the last partial chunk
    if len(UI) > 0:
        if algo == 'lr' or algo == 'svm':
            X = preprocessing.scale(X)
        if algo == 'lr' or algo == 'rf':
            y_pred = model.predict_proba(X)
            for index, y in enumerate(y_pred):
                if y[1] > confidence:
                    predict_set.add(UI[index])
        if algo == 'svm':
            y_pred = model.predict(X)
            for index, y in enumerate(y_pred):
                if y == 1:
                    predict_set.add(UI[index])
        UI = []
        X = []
    cutoffLine('-')
    print "Prediction set size before drop: %d" % len(predict_set)
    predict_set = dropItemsNotInSet(predict_set, item_subset)
    r_writer.writerows(predict_set)
    print "Prediction set size after drop: %d" % len(predict_set)
    feature_file.close()
    result_file.close()
    return len(predict_set)
def predict(model, index):
    cutoffLine('-')
    print 'Generate result set %d' % index
    feature_file = open('splited_data/set_for_prediction.csv', 'r')
    result_file = open(TRAIN_SET_DIR + '/' + 'lr_result_%d.csv' % index, 'w')
    f_reader = csv.reader(feature_file)
    r_writer = csv.writer(result_file)
    r_writer.writerow(['user_id', 'item_id'])
    for line in f_reader:
        doneCount(f_reader.line_num)
        line = map(int, line)
        if model.predict([line[2:]])[0] == 1:
            r_writer.writerow(line[0:2])
    feature_file.close()
    result_file.close()
def splitData():
    cutoffLine('*')
    print 'Start split data with window %d' % WINDOW
    start_time = time.time()
    stat_file = open(PRE_DIR + '/stat.csv', 'w')
    stat_writer = csv.writer(stat_file)
    for i in range(1, FILES + 1):
        cutoffLine('-')
        print 'Split dataset %d/%d: ' % (i, FILES)
        rfile = open(DATA_SET, 'r')
        reader = csv.reader(rfile)
        j = i + WINDOW
        if j != TOTAL_DAY + 1:
            if j == TOTAL_DAY:
                train_file_name = 'test.csv'
                result_file_name = 'result_test.csv'
            else:
                train_file_name = '%d.csv' % i
                result_file_name = '%s_%d.csv' % ('result', i)
            train_file = open(PRE_DIR + '/' + train_file_name, 'w')
            result_file = open(PRE_DIR + '/' + result_file_name, 'w')
            train_writer = csv.writer(train_file)
            result_writer = csv.writer(result_file)
            train_count = 0
            result_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
                if int(line[5]) == j and int(line[2]) == 4:
                    result_writer.writerow([line[0], line[1]])
                    result_count += 1
            stat_writer.writerow([train_file_name, train_count])
            stat_writer.writerow([result_file_name, result_count])
            train_file.close()
            result_file.close()
        else:
            forpredict_file_name = 'for_prediction.csv'
            train_file = open(PRE_DIR + '/' + forpredict_file_name, 'w')
            train_writer = csv.writer(train_file)
            train_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
            stat_writer.writerow([forpredict_file_name, train_count])
            train_file.close()
        rfile.close()
    stat_file.close()
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('-')
    print 'It takes ' + duration + ' to split dataset.'
    cutoffLine('*')
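# Hedged illustration of the windowing above (TOTAL_DAY = 31 and WINDOW = 10
# are assumed here purely as an example): FILES = 22, so i.csv holds behaviour
# on days [i, i+10) and result_i.csv holds the purchases made on day i+10;
# i = 21 (j = 31) becomes test.csv / result_test.csv, and i = 22 (j = 32) has
# no label day left, so it is written out as for_prediction.csv.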
def stat():
    cutoffLine('-')
    print 'stat some information...'
    user_file = open('data/nuser.csv', 'r')
    item_file = open('data/item.csv', 'r')
    stat_file = open('data/stat.txt', 'w')
    row_count = 0
    user_set = set()
    sub_item_set = set()
    all_item_set = set()
    category_set = set()
    user_geo_count = 0
    item_geo_count = 0
    reader = csv.reader(item_file)
    for line in reader:
        doneCount(reader.line_num)
        if reader.line_num == 1:
            continue
        if line[1]:
            item_geo_count += 1
        category_set.add(line[2])
        sub_item_set.add(line[0])
    reader = csv.reader(user_file)
    for line in reader:
        doneCount(reader.line_num)
        row_count += 1
        user_set.add(line[0])
        all_item_set.add(line[1])
        if line[3]:
            user_geo_count += 1
    interact_item_set = all_item_set & sub_item_set
    stat_file.write('%s : %s\n' % (u'Total Count', row_count))
    stat_file.write('%s : %s\n' % (u'User Count', len(user_set)))
    stat_file.write('%s : %s\n' % (u'All Item Count', len(all_item_set)))
    stat_file.write('%s : %s\n' % (u'Sub Item Count', len(sub_item_set)))
    stat_file.write('%s : %s %f\n' % (u'Interact Item Count', len(interact_item_set),
                                      float(len(interact_item_set)) / len(sub_item_set)))
    stat_file.write('%s : %s\n' % (u'Category Count', len(category_set)))
    stat_file.write('%s : %s\n' % (u'User Geo Count', user_geo_count))
    stat_file.write('%s : %s\n' % (u'Item Geo Count', item_geo_count))
    stat_file.close()
    user_file.close()
    item_file.close()
def evaluate_model(model, index):
    cutoffLine('-')
    print 'offline evaluate RF model %d' % index
    test_file = open('splited_data/set_test.csv', 'r')
    test_reader = csv.reader(test_file)
    predict_set = set()
    real_set = set()
    for line in test_reader:
        doneCount(test_reader.line_num)  # line_num lives on the reader, not the file object
        line = map(int, line)
        if line[-1] == 1:
            real_set.add((line[0], line[1]))
        if model.predict([line[2:-1]])[0] == 1:
            predict_set.add((line[0], line[1]))
    import evaluate
    P, R, F = evaluate.evaluate(predict_set, real_set)
    test_file.close()
    return P, R, F
def evaluate_model(algo, window, model, item_subset, confidence):
    cutoffLine('-')
    print 'offline evaluate model with confidence %f' % confidence
    test_file = open('splited_data_%d/set_test.csv' % window, 'r')
    test_reader = csv.reader(test_file)
    predict_set = set()
    real_set = set()
    UI = []
    X = []
    each_time = 500000
    for line in test_reader:
        doneCount(test_reader.line_num)
        line = map(int, line)
        UI.append(tuple(line[0:2]))
        X.append(line[3:-1])
        if line[-1] == 1:
            real_set.add((line[0], line[1]))
        if test_reader.line_num % each_time == 0:
            if algo == 'lr' or algo == 'svm':
                X = preprocessing.scale(X)
            if algo == 'lr' or algo == 'rf':
                y_pred = model.predict_proba(X)
                for index, y in enumerate(y_pred):
                    if y[1] > confidence:
                        predict_set.add(UI[index])
            if algo == 'svm':
                y_pred = model.predict(X)
                for index, y in enumerate(y_pred):
                    if y == 1:
                        predict_set.add(UI[index])
            UI = []
            X = []
    if len(UI) > 0:
        if algo == 'lr' or algo == 'svm':
            X = preprocessing.scale(X)
        if algo == 'lr' or algo == 'rf':
            y_pred = model.predict_proba(X)
            for index, y in enumerate(y_pred):
                if y[1] > confidence:
                    predict_set.add(UI[index])
        if algo == 'svm':
            y_pred = model.predict(X)
            for index, y in enumerate(y_pred):
                if y == 1:
                    predict_set.add(UI[index])
        UI = []
        X = []
    predict_set = dropItemsNotInSet(predict_set, item_subset)
    real_set = dropItemsNotInSet(real_set, item_subset)
    import evaluate
    P, R, F = evaluate.evaluate(predict_set, real_set)
    test_file.close()
    return P, R, F
def splitData():
    stat_file = open('splited_data/stat.csv', 'w')
    stat_writer = csv.writer(stat_file)
    for i in range(1, FILES + 1):
        cutoffLine('-')
        print 'Split dataset %d: ' % i
        rfile = open(DATA_SET, 'r')
        reader = csv.reader(rfile)
        j = i + 10
        if j != TOTAL_DAY + 1:
            if j == TOTAL_DAY:
                train_file_name = 'test.csv'
                result_file_name = 'result_test.csv'
            else:
                train_file_name = '%d.csv' % i
                result_file_name = '%s_%d.csv' % ('result', i)
            train_file = open(PRE_DIR + '/' + train_file_name, 'w')
            result_file = open(PRE_DIR + '/' + result_file_name, 'w')
            train_writer = csv.writer(train_file)
            result_writer = csv.writer(result_file)
            train_count = 0
            result_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
                if int(line[5]) == j and int(line[2]) == 4:
                    result_writer.writerow([line[0], line[1]])
                    result_count += 1
            stat_writer.writerow([train_file_name, train_count])
            stat_writer.writerow([result_file_name, result_count])
            train_file.close()
            result_file.close()
        else:
            forpredict_file_name = 'for_prediction.csv'
            train_file = open(PRE_DIR + '/' + forpredict_file_name, 'w')
            train_writer = csv.writer(train_file)
            train_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
            stat_writer.writerow([forpredict_file_name, train_count])
            train_file.close()
        rfile.close()
    stat_file.close()
def evaluate(prediction, result):
    cutoffLine('-')
    print 'Prediction set size: %d' % len(prediction)
    print 'Result set size: %d' % len(result)
    prediction = set(prediction)
    result = set(result)
    intersection = prediction & result
    precision = float(len(intersection)) / len(prediction) * 100
    recall = float(len(intersection)) / len(result) * 100
    F1 = 2 * precision * recall / (precision + recall)
    print 'P : %.2f' % precision
    print 'R : %.2f' % recall
    print 'F1: %.2f' % F1
    return precision, recall, F1
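# Hedged worked example (toy numbers, not repo output): with 3 predicted pairs,
# 4 real pairs and 2 pairs in common, evaluate() reports
#   P = 2/3 * 100 = 66.67, R = 2/4 * 100 = 50.00, F1 = 2*P*R/(P+R) = 57.14
def evaluate_example():
    prediction = set([(1, 10), (1, 11), (2, 20)])
    result = set([(1, 10), (2, 20), (3, 30), (4, 40)])
    return evaluate(prediction, result)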
def merge_training_set():
    cutoffLine('*')
    print 'Merge training set'
    start_time = time.time()
    positive_count = 0
    negative_count = 0
    total_count = 0
    total_file = open(PRE_DIR + '/' + 'train_set.csv', 'w')
    pos_file = open(PRE_DIR + '/' + 'positive_set.csv', 'w')
    neg_file = open(PRE_DIR + '/' + 'negative_set.csv', 'w')
    total_writer = csv.writer(total_file)
    pos_writer = csv.writer(pos_file)
    neg_writer = csv.writer(neg_file)
    for i in range(1, FILES - 1):
        cutoffLine('-')
        print 'load train set %d' % i
        r_file = open(PRE_DIR + '/' + 'set_%d.csv' % i)
        reader = csv.reader(r_file)
        for line in reader:
            doneCount(reader.line_num)
            line = map(int, line)
            if line[-1] == 1:
                positive_count += 1
                pos_writer.writerow(line)
            if line[-1] == 0:
                negative_count += 1
                neg_writer.writerow(line)
            total_count += 1
            total_writer.writerow(line)
        r_file.close()
    total_file.close()
    pos_file.close()
    neg_file.close()
    cutoffLine('-')
    print 'Positive Example: %d' % positive_count                    # 44114
    print 'Negative Example: %d' % (total_count - positive_count)    # 59373295
    print 'Total Example: %d' % total_count                          # 59417409
    # consistency check
    print 'Is right? %s' % ('Yes' if positive_count + negative_count == total_count else 'No')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes %s to merge training set and backup negative and positive set' % duration
def generate_training_set():
    start_time = time.time()
    ## load the information of data set
    line_count = {}
    rfile = open(PRE_DIR + '/stat.csv', 'r')
    reader = csv.reader(rfile)
    for line in reader:
        line_count[line[0]] = int(line[1])
    rfile.close()
    cutoffLine('*')
    print 'Generate training set'
    for i in range(1, FILES + 1):
        cutoffLine('-')
        if i == FILES:
            file_name = 'for_prediction.csv'
            print 'Extract feature from %s' % file_name
            extract_feature(file_name, line_count[file_name], i)
        elif i == FILES - 1:
            file_name = 'test.csv'
            print 'Extract feature from %s' % file_name
            result_name = 'result_%s' % file_name
            extract_feature(file_name, line_count[file_name], i, result_name)
        else:
            file_name = '%d.csv' % i
            print 'Extract feature from %s and tag it' % file_name
            result_name = 'result_%d.csv' % i
            extract_feature(file_name, line_count[file_name], i, result_name)
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes %s to generate training set' % duration
def train_RF():
    start_time = time.time()
    cutoffLine('*')
    print 'Use RF model to train %d models' % TRAIN_SET_FILES
    # for i in range(1, TRAIN_SET_FILES + 1):
    for i in range(1, 1 + 1):
        cutoffLine('-')
        print 'model %d' % i
        t_file = open(TRAIN_SET_DIR + '/%d.csv' % i, 'r')
        t_reader = csv.reader(t_file)
        X = []
        y = []
        for line in t_reader:
            line = map(int, line)
            X.append(line[2:-1])
            y.append(line[-1])
        model = RF(X, y)
        P, R, F = evaluate_model(model, i)
        predict(model, i)
        models.append(model)
        t_file.close()
    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'It takes %s to train, evaluate model and generate result' % duration
def predict(model, item_subset):
    cutoffLine('-')
    print 'Generate result set'
    feature_file = open('splited_data/set_for_prediction.csv', 'r')
    result_file = open('data/prediction_lr.csv', 'w')
    f_reader = csv.reader(feature_file)
    r_writer = csv.writer(result_file)
    r_writer.writerow(['user_id', 'item_id'])
    predict_set = set()
    for line in f_reader:
        doneCount(f_reader.line_num)
        line = map(int, line)
        if model.predict([line[2:]])[0] == 1:
            predict_set.add((line[0], line[1]))
    cutoffLine('-')
    print "Prediction set size before drop: %d" % len(predict_set)
    predict_set = dropItemsNotInSet(predict_set, item_subset)
    r_writer.writerows(predict_set)
    print "Prediction set size after drop: %d" % len(predict_set)
    feature_file.close()
    result_file.close()
def train(window, proportion, algo, confidence):
    start_time = time.time()
    cutoffLine('*')
    print '%s model training with sample proportion 1:%d...' % (algo, proportion)
    t_file = open('data/training_set_%d_%d.csv' % (window, proportion), 'r')
    t_reader = csv.reader(t_file)
    X = []
    y = []
    for line in t_reader:
        doneCount(t_reader.line_num)
        line = map(int, line)
        X.append(line[3:-1])
        y.append(line[-1])
    model_name = 'data/model/%s_%d_%d.model' % (algo, window, proportion)
    if os.path.exists(model_name):
        model = joblib.load(model_name)
    else:
        if algo == 'lr':
            model = LR(X, y)
        if algo == 'rf':
            model = RF(X, y)
        if algo == 'svm':
            model = SVM(X, y)
        joblib.dump(model, model_name)
    cutoffLine('-')
    print model.classes_
    item_subset = loadItemSubset()
    record_file = open('data/model_evaluate_record.txt', 'a')
    P, R, F = evaluate_model(algo, window, model, item_subset, confidence)
    predict_set_size = predict(window, model, item_subset, proportion, algo, confidence)
    record_file.write('window %d ' % window + algo + ' %d' % proportion + ' %.2f\n' % confidence)
    record_file.write('\tP: %f\n' % P)
    record_file.write('\tR: %f\n' % R)
    record_file.write('\tF1: %f\n' % F)
    record_file.write('Predict Set Size: %d\n' % predict_set_size)
    record_file.write('-' * 30 + '\n')
    record_file.close()
    t_file.close()
    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'It takes %s to train, evaluate model and generate result' % duration
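# --- Hedged driver sketch; the window/proportion/confidence values below are
# assumptions, not settings taken from this repo.  It only illustrates how the
# knobs exposed by train() could be swept, assuming sampling(window, proportion)
# has already produced the matching data/training_set_<window>_<proportion>.csv.
def run_experiments():
    for window in [3, 5, 10]:
        for proportion in [5, 10, 20]:
            for confidence in [0.5, 0.6, 0.7]:
                train(window, proportion, 'lr', confidence)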
def generate_training_set(window):
    start_time = time.time()
    global PRE_DIR, FILES
    PRE_DIR = 'splited_data_%d' % window
    FILES = TOTAL_DAY - window + 1
    ## load the information of data set
    line_count = {}
    rfile = open(PRE_DIR + '/stat.csv', 'r')
    reader = csv.reader(rfile)
    for line in reader:
        line_count[line[0]] = int(line[1])
    rfile.close()
    cutoffLine('*')
    print 'Generate training set with window %d' % window
    for i in range(1, FILES + 1):
        cutoffLine('-')
        if i == FILES:
            file_name = 'for_prediction.csv'
            print 'Extract feature from %s' % file_name
            extract_feature(window, i + window, file_name, line_count[file_name], i)
        elif i == FILES - 1:
            file_name = 'test.csv'
            print 'Extract feature from %s' % file_name
            result_name = 'result_%s' % file_name
            extract_feature(window, i + window, file_name, line_count[file_name], i, result_name)
        else:
            file_name = '%d.csv' % i
            print 'Extract feature from %s and tag it' % file_name
            result_name = 'result_%d.csv' % i
            extract_feature(window, i + window, file_name, line_count[file_name], i, result_name)
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes %s to generate training set' % duration
def train_LR():
    start_time = time.time()
    cutoffLine('*')
    print 'LR model training...'
    cutoffLine('-')
    t_file = open('data/training_set_10.csv', 'r')
    t_reader = csv.reader(t_file)
    X = []
    y = []
    for line in t_reader:
        line = map(int, line)
        X.append(line[2:-1])
        y.append(line[-1])
    model = logRes(X, y)
    item_subset = loadItemSubset()
    evaluate_model(model, item_subset)
    predict(model, item_subset)
    t_file.close()
    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'It takes %s to train, evaluate model and generate result' % duration
def sampling(proportion):
    cutoffLine('*')
    start_time = time.time()
    print 'sampling with proportion %d...' % proportion
    negative_needed = POSITIVE * proportion
    sample_times = 10
    mod = NEGATIVE / sample_times
    negative_eachtime = negative_needed / sample_times
    training_set = readCSV(PRE_DIR + '/positive_set.csv', int)
    ## sampling negative example
    rfile = open(PRE_DIR + '/' + 'negative_set.csv', 'r')
    reader = csv.reader(rfile)
    negative_tmp = []
    for line in reader:
        progressBar(reader.line_num, NEGATIVE)
        negative_tmp.append(map(int, line))
        if reader.line_num % mod == 0:
            random.shuffle(negative_tmp)
            training_set = training_set + negative_tmp[0:negative_eachtime]
            negative_tmp = []
    rfile.close()
    wfile = open('data/training_set_%d.csv' % proportion, 'w')
    writer = csv.writer(wfile)
    random.shuffle(training_set)
    writer.writerows(training_set)
    wfile.close()
    cutoffLine('-')
    print "Real proportion: %f" % ((len(training_set) - POSITIVE) / float(POSITIVE))
    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'It takes %s to sample with proportion %d' % (duration, proportion)
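# Hedged note with illustrative numbers (not measured from the repo): both
# divisions above are Python 2 integer divisions, so the achieved ratio can
# fall slightly short of the target.  For example, with POSITIVE = 44114 and
# proportion = 3, negative_needed = 132342 and negative_eachtime = 13234, so
# 10 * 13234 = 132340 negatives are drawn and "Real proportion" prints about
# 2.99995 instead of 3.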
def sampling(window, proportion):
    cutoffLine('*')
    start_time = time.time()
    print 'sampling with proportion %d...' % proportion
    # the per-window constants POSITIVE_<window> / NEGATIVE_<window> are module
    # globals, looked up here by name instead of via exec
    positive_total = globals()['POSITIVE_%d' % window]
    negative_total = globals()['NEGATIVE_%d' % window]
    negative_needed = positive_total * proportion
    sample_times = 20
    mod = negative_total / sample_times
    negative_eachtime = negative_needed / sample_times
    training_set = readCSV(PRE_DIR + '/positive_set.csv', int)
    ## sampling negative example
    rfile = open(PRE_DIR + '/' + 'negative_set.csv', 'r')
    reader = csv.reader(rfile)
    negative_tmp = []
    for line in reader:
        progressBar(reader.line_num, negative_total)
        negative_tmp.append(map(int, line))
        if reader.line_num % mod == 0:
            random.shuffle(negative_tmp)
            training_set.extend(negative_tmp[0:negative_eachtime])
            negative_tmp = []
    rfile.close()
    wfile = open('data/training_set_%d_%d.csv' % (window, proportion), 'w')
    writer = csv.writer(wfile)
    random.shuffle(training_set)
    writer.writerows(training_set)
    wfile.close()
    cutoffLine('-')
    real_proportion = (len(training_set) - positive_total) / float(positive_total)
    print "Real proportion: %f" % real_proportion
    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'It takes %s to sample with proportion %d' % (duration, proportion)
if __name__ == '__main__':
    print 'Start split data'
    cutoffLine('*')
    start_time = time.time()
    splitData()
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes ' + duration + ' to split dataset.'
def RF(X, y):
    cutoffLine('-')
    print 'Training...'
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X, y)
    return model
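# --- Hedged usage sketch, not original code ---
# scikit-learn's RandomForestClassifier exposes feature_importances_ after
# fitting; printing it can help judge which of the hand-built features matter.
def print_feature_importance(model):
    for idx, score in enumerate(model.feature_importances_):
        print 'feature %d: %.4f' % (idx, score)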