def prediction(train_dataSet, train_labels, test_dataSet, test_labels, alpha,
               b, option, cat):
    """Classify the training and test sets with a trained kernel SVM.

    For every sample ``x`` the decision value
    ``sum_j alpha[j] * train_labels[j] * Kernel(train_dataSet[j], x) + b``
    is computed; a negative value is labelled -1, anything else +1.  The
    accuracy returned by ``evaluation.evaluation`` is printed for both sets.

    Args:
        train_dataSet: training samples (the support-vector candidates).
        train_labels: labels of the training samples.
        test_dataSet: samples to classify.
        test_labels: ground-truth labels of the test samples; used only to
            size the prediction array and to score the predictions.
        alpha: one multiplier per training sample.
        b: bias term added to the decision sum.
        option, cat: forwarded unchanged to ``Kernel``.

    Returns:
        None.  Results are only printed.
    """
    def classify(samples, count):
        # The decision function always sums over the *training* samples and
        # their labels; the original test loop used test_labels[j] here,
        # which leaked the ground truth into the prediction.
        predicted = numpy.zeros(count)
        for i in range(count):
            temp = 0
            for j in range(len(train_labels)):
                temp += alpha[j] * train_labels[j] * Kernel(
                    train_dataSet[j], samples[i], option, cat)
            predicted[i] = -1 if (temp + b) < 0 else 1
        return predicted

    train_predict_cat = classify(train_dataSet, len(train_labels))
    # Single-argument print() behaves the same on Python 2 and 3; the double
    # space reproduces the output of the original print statement.
    print('Accuracy for training dataset:  %s' %
          (evaluation.evaluation(train_predict_cat, train_labels),))

    test_predict_cat = classify(test_dataSet, len(test_labels))
    print('Accuracy for test dataset:  %s' %
          (evaluation.evaluation(test_predict_cat, test_labels),))
def train():
    """Train ``CNNModel`` on the ECG training set, then evaluate it.

    Hyper-parameters ``lr`` and ``num_epochs`` are read from
    ``parameters.json`` in the current working directory.  The last batch
    loss of every epoch is printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Context manager so the file is closed even if parsing fails.
    with open("parameters.json") as json_file:
        parameters = json.load(json_file)

    net = CNNModel(1, 10)
    optimizer = torch.optim.Adam(net.parameters(), lr=parameters["lr"])
    criterion = nn.BCELoss()

    if torch.cuda.is_available():
        net = torch.nn.DataParallel(
            net, device_ids=range(torch.cuda.device_count())).cuda()
        cudnn.benchmark = True

    ecg_dataset = EcgDataset(is_train=True)
    train_loader = torch.utils.data.DataLoader(dataset=ecg_dataset,
                                               batch_size=10)

    for epoch in range(parameters["num_epochs"]):
        net.train()
        for i, (data, label) in enumerate(train_loader):
            data, label = data.to(device), label.to(device)
            output = net(data)
            optimizer.zero_grad()
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
        # NOTE(review): the original nesting was lost; the loss report is
        # assumed to run once per epoch -- confirm.
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1,
                                                   parameters["num_epochs"],
                                                   loss.item()))
    # NOTE(review): evaluation is assumed to run once after training -- confirm.
    evaluation(net)
def predict(train_dataSet, train_labels, test_dataSet, test_labels, w, b):
    """Predict the training and test sets with a linear decision function.

    A sample ``x`` is labelled -1 when ``numpy.inner(x, w) + b <= 0`` and +1
    otherwise.  The accuracy returned by ``evaluation.evaluation`` is printed
    for both sets.

    Args:
        train_dataSet, test_dataSet: indexable collections of feature vectors.
        train_labels, test_labels: ground-truth labels; their lengths define
            how many samples are classified.
        w: weight vector.
        b: bias term.

    Returns:
        ``(train_predict_cat, test_predict_cat)`` as NumPy arrays of -1/+1.
    """
    def classify(samples, count):
        predicted = numpy.zeros(count)
        for i in range(count):
            predicted[i] = -1 if (numpy.inner(samples[i], w) + b) <= 0 else 1
        return predicted

    # The unused feature-count lookups (len(dataSet[0])) were dropped: they
    # raised IndexError on an empty set without contributing anything.
    train_predict_cat = classify(train_dataSet, len(train_labels))
    test_predict_cat = numpy.zeros(len(test_labels))

    # Single-argument print() works on Python 2 and 3; the double space
    # matches what the original print statement emitted.
    print('Accuracy for training dataset:  %s' %
          (evaluation.evaluation(train_predict_cat, train_labels),))

    test_predict_cat = classify(test_dataSet, len(test_labels))
    print('Accuracy for test dataset:  %s' %
          (evaluation.evaluation(test_predict_cat, test_labels),))
    return train_predict_cat, test_predict_cat
def main():
    """Run the split phase and the merge phase, printing the scores of each.

    ``evaluation`` is called on the raw split-phase predictions and again on
    the predictions returned by ``merge_phases``; each call yields three
    values which are printed on one line.
    """
    Cluster.GAMMA = 0.95
    merge_alpha = 0.1

    truths, predicts = split_phase(flattened_train_data_path)
    split_pre, split_coll, split_f1 = evaluation(truths, predicts)
    print(split_pre, split_coll, split_f1)

    merged_predicts = merge_phases(predicts, merge_alpha)
    merged_pre, merged_coll, merged_f1 = evaluation(truths, merged_predicts)
    print(merged_pre, merged_coll, merged_f1)
def train_model(option):
    """Train a ``biLSTM_CRF`` model and report train/test metrics per epoch.

    Args:
        option: configuration object; this function reads ``batch_size``,
            ``embedding_size``, ``hidden_size``, ``dict_number``,
            ``num_labels``, ``lr``, ``use_gpu``, ``pre_trained``, ``epochs``
            and ``label_dict`` from it.
    """
    train_loader = DataLoader(dataset=build_dataset('./data/train_x.pkl',
                                                    './data/train_y.pkl'),
                              batch_size=option.batch_size,
                              shuffle=True)
    test_loader = DataLoader(dataset=build_dataset('./data/test_x.pkl',
                                                   './data/test_y.pkl'),
                             batch_size=option.batch_size,
                             shuffle=True)
    model = biLSTM_CRF(option.embedding_size, option.hidden_size,
                       option.dict_number, option.num_labels)
    optimizer = torch.optim.Adam(model.parameters(), lr=option.lr)
    if option.use_gpu:
        model.cuda()
    if option.pre_trained:
        model.load_state_dict(torch.load(option.pre_trained))

    for epoch in range(option.epochs):
        print(epoch)
        # Fresh metric accumulators for this epoch.
        train_eva = evaluation(option.label_dict)
        test_eva = evaluation(option.label_dict)

        # Training pass.
        model.train()
        for step, (batch_x, batch_y, batch_masks) in enumerate(train_loader):
            optimizer.zero_grad()
            if option.use_gpu:
                batch_x = batch_x.cuda()
                batch_y = batch_y.cuda()
                # The original assigned to an undefined name ``masks`` here.
                batch_masks = batch_masks.cuda()
            y_pred, loss = model(batch_x, batch_y, batch_masks)
            train_eva.add(y_pred, batch_y)
            loss.backward()
            optimizer.step()

        # Validation pass on the test set.  Both loaders come from
        # build_dataset, so the test batches are unpacked into the same three
        # items instead of reusing a stale mask from the training loop.
        model.eval()
        with torch.no_grad():
            for step, (batch_x, batch_y,
                       batch_masks) in enumerate(test_loader):
                if option.use_gpu:
                    batch_x = batch_x.cuda()
                    batch_y = batch_y.cuda()
                    batch_masks = batch_masks.cuda()
                y_pred, loss = model(batch_x, batch_y, batch_masks)
                test_eva.add(y_pred, batch_y)

        print("train:")
        print(train_eva.evaluate())
        print("test:")
        print(test_eva.evaluate())
def main(args):
    """Build the data loaders and the model graph, then train or evaluate.

    ``args.mode == 'train'`` runs ``train``; any other mode restores the
    latest checkpoint from ``args.save_dir`` and runs ``evaluation``.
    ``args.feat_dim`` and ``args.vocab_size`` are filled in from the
    training loader before the graph is built.
    """
    training = args.mode == 'train'

    # Load data.  The training loader is built with the same arguments in
    # both modes; the test loader is only needed when evaluating.
    data_loader_train = data_load.data_loader(args.train_feat,
                                              args.train_phn,
                                              args.batch_size,
                                              meta_path=args.meta,
                                              max_length=args.max_length,
                                              is_training=True)
    if not training:
        data_loader_test = data_load.data_loader(args.test_feat,
                                                 args.test_phn,
                                                 args.batch_size,
                                                 max_length=args.max_length,
                                                 is_training=False)

    # Expose data-derived dimensions to the model through args.
    args.feat_dim = data_loader_train.feat_dim
    args.vocab_size = data_loader_train.vocab_size

    # Build the model graph.
    g = model(args) if training else model(args, is_training=False)
    print("Graph loaded")

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    with tf.Session(graph=g.graph) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=3)

        if (not training) or (args.load == 'load'):
            print('load_model')
            saver.restore(sess, tf.train.latest_checkpoint(args.save_dir))

        if training:
            print('training')
            train(sess, g, args, saver, data_loader_train)
        else:
            print('evaluating')
            evaluation(sess, g, args, data_loader_train, data_loader_test)
def test():
    """Run the outlier-tree detector ten times on every matching CSV file.

    Walks ``../data/reason`` and processes each file whose name starts with
    ``c`` and ends with ``.csv``.  The ``label`` and ``reason`` columns are
    removed before detection; each run is scored with ``evaluation`` against
    the full frame and written under ``../out/outlierTree``.
    """
    path = '../data/reason'
    # os.walk yields (root, dirs, files); root is the directory that actually
    # contains the files, which matters for files found in sub-directories
    # (the original always joined with the top-level path).
    for root, _, files in os.walk(path):
        for filename in files:
            name = str(filename)
            if not (name.endswith('.csv') and name.startswith('c')):
                continue
            filepath = os.path.join(root, name)
            df_true = pd.read_csv(filepath)
            df = df_true.drop(['label', 'reason'], axis=1)
            for i in range(10):
                # df_pre holds only the columns that were judged.
                df_pre = run_outlierTree(df)
                evaluation(df_true,
                           df_pre,
                           outpath='../out/outlierTree',
                           name=filename[:-4])
def main(model_type):
    """Train a RealNVP flow on the double-moon data and evaluate it.

    Args:
        model_type: ``macro._JOINTREALNVP`` or ``macro._CONDITIONALREALNVP``.

    Raises:
        ValueError: if ``model_type`` is neither of the two supported values
            (previously this surfaced later as an unbound-variable error).
    """
    if model_type == macro._JOINTREALNVP:
        prior_z = dataset.gauss_sample(n_sample=10000, dim=2 + 2)
        mask = torch.from_numpy(np.array([0, 1, 0, 1]).astype(np.float32))
        model = flow.JointRealNVP(input_dim=2 + 2,
                                  output_dim=2 + 2,
                                  hid_dim=512,
                                  mask=mask,
                                  n_layers=8)
    elif model_type == macro._CONDITIONALREALNVP:
        prior_z = dataset.gauss_sample(n_sample=10000, dim=2)
        mask = torch.from_numpy(np.array([0, 1]).astype(np.float32))
        model = flow.ConditionalRealNVP(input_dim=2 + 2,
                                        output_dim=2,
                                        hid_dim=512,
                                        mask=mask,
                                        n_layers=8)
    else:
        raise ValueError('unsupported model_type: {!r}'.format(model_type))
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    sampled_x, sampled_labels = dataset.doublemoon_sample(n_sample=10000)
    train_loader = DataLoader(TensorDataset(sampled_x, sampled_labels),
                              batch_size=64,
                              shuffle=True)

    model.train()
    # Accumulated over the whole run; it is never reset between epochs.
    train_loss = 0
    for epoch in range(macro._EPOCH):
        for batch_idx, (data, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            z, log_det_j_sum = model(data, labels)
            if model_type == macro._JOINTREALNVP:
                # The trailing len(labels[0]) columns of z are compared with
                # the labels through an additional MSE term.
                y = z[:, len(z[0]) - len(labels[0]):]
                log_prob_loss = -(prior_z.log_prob(z) + log_det_j_sum).mean()
                mse_loss = F.mse_loss(y, labels)
                loss = log_prob_loss + macro._LAMBDA * mse_loss
            elif model_type == macro._CONDITIONALREALNVP:
                loss = -(prior_z.log_prob(z) + log_det_j_sum).mean()
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        # NOTE(review): original nesting was lost; the report is assumed to
        # run once per epoch -- confirm.
        print('Epoch: {} Average loss: {:.4f}'.format(
            epoch, train_loss / (len(train_loader.dataset) * len(z[0]))))
    eval.evaluation(model, model_type)
def annealing(n,cnt):
    """Search for a low-energy n x n jump maze with an annealing schedule.

    Each iteration changes the jump value of one random non-goal cell.  The
    move is kept when it does not increase the energy, or otherwise with
    probability ``temperature * 0.5``; a rejected move is undone.  The
    temperature starts at 2.5 and is multiplied by 0.99999 per iteration.

    Args:
        n: side length of the maze passed to ``generateMatrix``.
        cnt: number of iterations.

    Returns:
        ``(minima, maze)``: the lowest energy seen and the matrix that
        produced it.
    """
    import copy

    maze = generateMatrix(n)
    first = evaluation(maze,n)
    minima = first
    print("Init energy: %d" %(first))
    # A real copy is required: ``backup = maze`` only aliases the same
    # object, so later in-place edits would silently change the "backup".
    backup = copy.deepcopy(maze)
    temperature = 2.5
    decay = 0.99999
    goal = [n-1,n-1]
    for i in range(cnt):
        # Randomly choose a cell to change; never the goal cell.
        new_cell = goal
        while new_cell == goal:
            row = rd.randint(0,n-1)
            col = rd.randint(0,n-1)
            new_cell = [row,col]
        # Draw a jump number different from the current one.
        old_step = maze[row,col]
        new_step = old_step
        maxrnd = max(n-row-1,n-col-1,row,col)
        while new_step == old_step:
            new_step = rd.randint(1,maxrnd)
        maze[row,col] = new_step
        # Energy of the modified maze.
        nexte = evaluation(maze,n)
        if i % 1000 == 0:
            print("Current energy: %d" %(minima))
            print("temp diff %d" %(first-nexte))
            print("temp %f" %(temperature*0.5))
        if nexte <= first or rd.random()< temperature*0.5:
            first = nexte
            if nexte < minima:
                # New best: snapshot the matrix so later moves cannot alter it.
                minima = nexte
                backup = copy.deepcopy(maze)
        else:
            # Rejected: restore the cell so ``first`` keeps describing ``maze``.
            maze[row,col] = old_step
        temperature = temperature*decay
    return (minima,backup)
def main():
    """Train for ``args.epochs_number`` epochs, checkpointing on improvement.

    Seeds PyTorch, logs the configuration and model to Weights & Biases, and
    saves a checkpoint whenever the validation accuracy returned by
    ``evaluation`` exceeds the best value seen so far.
    """
    args = get_args()
    wandb.init()
    wandb.config.update(args)

    seed = 42
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # The flag lives on torch.backends.cudnn; assigning it on torch.backends
    # (as the original did) only creates an unused attribute.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    loaded_model = False
    [train_loader, valid_loader, model, optimizer] = initialize(args,
                                                                loaded_model)
    scaler = torch.cuda.amp.GradScaler()
    wandb.watch(model)

    best_acc = 0
    run_avg = RunningAverage()
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.001, max_lr=0.1, cycle_momentum=False)

    for epoch in range(1, args.epochs_number + 1):
        run_avg.reset_train()
        run_avg.reset_val()
        train(args, model, train_loader, epoch, optimizer, scaler, run_avg)
        val_acc = evaluation(args, model, valid_loader, epoch, run_avg)
        # scheduler.step()
        if best_acc < val_acc:
            best_acc = val_acc
            save_checkpoint(model, optimizer, args, epoch)
def evaluate_model(self, test_file, output_dict, output_file):
    """Predict negation cues for a test file and write the tagged result.

    Args:
        test_file: path of the UTF-8 input file to predict on.
        output_dict: mapping with the keys ``"index_dict"``,
            ``"token_dict"`` and ``"model"``.
        output_file: destination passed to ``print_to_file``.
    """
    index_dict = output_dict["index_dict"]
    token_dict = output_dict["token_dict"]
    model = output_dict["model"]

    # The handle stays open for the whole pipeline and is closed on exit,
    # including on errors (the original never closed it).
    with open(test_file, "r", encoding="utf8") as test_handle:
        te_refined_data, test_data_objs = \
            self.cue_tr_obj.get_data_for_validation(
                test_handle, self.max_len, index_dict, token_dict)

        # Prediction
        index2tag = index_dict["index2tag"]
        test_x, _, _ = self.cue_tr_obj.prepare_training_data(
            te_refined_data, self.features_dict, index_dict, self.embed_size)
        eval_obj = evaluation.evaluation()
        test_pred = eval_obj.predict_test(model, test_x, index2tag)

        # Create new file
        negation_dict = eval_obj.tag_negation_cues(test_data_objs, test_pred)
        dp_obj = data_prep.data_preparation()
        new_obj_list = dp_obj.create_new_obj_list(test_data_objs,
                                                  negation_dict)
        dp_obj.print_to_file(new_obj_list, output_file)
    # test_gold_pred_cue3 is the best
def __init__(self, SIZE):
    """Create an empty SIZE x SIZE board together with its evaluator."""
    self.evaluator = evaluation(SIZE)
    # Every cell starts at 0; each row is its own list.
    self.board = [[0] * SIZE for _ in range(SIZE)]
    self.gameover = 0
    self.overvalue = 0
    self.maxdepth = 3
    self.SIZE = SIZE
def __init__(self, client_name):
    """Load the client's configuration and set up paths and helpers.

    ``client_name`` is concatenated directly into the dataset paths; the
    evaluation weights are read from the ``EVALUATION`` section of the
    client's ``config.ini``.
    """
    # Client name that identifies which files the processes will run on.
    self.client_name = client_name

    self.config = configparser.ConfigParser()
    config_path = '../Datasets/' + self.client_name + 'config.ini'
    if os.path.isfile(config_path):
        with open(config_path) as config_parser_fp:
            self.config.read_file(config_parser_fp)

    client_root = "../Datasets/" + str(self.client_name)
    self.database_path = client_root + "database/input_data/"
    self.sql_db = sql.create_engine('sqlite:///' + self.database_path +
                                    "db.sql")
    self.models_path = client_root + "database/models/"

    # Weights are stored as strings in the ini file.
    weights = self.config['EVALUATION']
    self.precision_weight = float(weights['precision_weight'])
    self.recall_weight = float(weights['recall_weight'])
    self.time_weight = float(weights['time_weight'])
    self.rmse_weight = float(weights['rmse_weight'])
    self.mae_weight = float(weights['mae_weight'])

    self.common_functions = common(self.client_name)
    self.evaluation = evaluation(self.client_name)
def entity_predict(dataset_iter):
    """Tag every batch of ``dataset_iter`` and print precision/recall/F1.

    Returns:
        ``(dete_result, question_list)``: the predicted tag-index rows and
        the corresponding rows of words looked up in ``index2word``.
    """
    model.eval()
    dataset_iter.init_epoch()

    gold_batches, pred_batches = [], []
    detections, questions = [], []

    for data_batch in dataset_iter:
        best_tag = torch.max(model(data_batch), 1)[1]
        answer = best_tag.view(data_batch.ed.size())
        # Positions whose token index is 1 are forced to tag index 1
        # (presumably padding -- verify against the vocabulary).
        answer[(data_batch.text.data == 1)] = 1
        answer = np.transpose(answer.cpu().data.numpy())

        gold_batches.append(np.transpose(data_batch.ed.cpu().data.numpy()))

        token_ids = np.transpose(data_batch.text.cpu().data.numpy())
        detections.extend(answer)
        questions.extend(index2word[token_ids])
        pred_batches.append(answer)

    P, R, F = evaluation(gold_batches, pred_batches, index2tag, type=False)
    print("{} Precision: {:10.6f}% Recall: {:10.6f}% F1 Score: {:10.6f}%".format("Dev", 100. * P, 100. * R, 100. * F))
    return detections, questions
def test_cross_dataset(config_file,test_dataset, **kwargs):
    """Evaluate a model trained on ``cfg.DATASETS.NAMES`` on another dataset.

    Args:
        config_file: configuration file merged into ``cfg``.
        test_dataset: name of the dataset used for evaluation.
        **kwargs: extra config overrides, merged as alternating key/value
            entries.
    """
    cfg.merge_from_file(config_file)
    if kwargs:
        # Flatten {k: v, ...} into [k, v, ...].
        cfg.merge_from_list(
            [item for pair in kwargs.items() for item in pair])
    cfg.freeze()

    # The source dataset only supplies the class count used to build the
    # model; validation data comes from the target dataset.
    PersonReID_Dataset_Downloader('./datasets',cfg.DATASETS.NAMES)
    _, _, _, num_classes = data_loader(cfg,cfg.DATASETS.NAMES)
    PersonReID_Dataset_Downloader('./datasets',test_dataset)
    _, val_loader, num_query, _ = data_loader(cfg,test_dataset)

    re_ranking = cfg.RE_RANKING
    run_name = cfg.DATASETS.NAMES+'->'+test_dataset
    if re_ranking:
        logger = make_logger("Reid_Baseline", cfg.OUTPUT_DIR,
                             run_name+'_re-ranking')
        logger.info("Re-Ranking Test Results:")
    else:
        logger = make_logger("Reid_Baseline", cfg.OUTPUT_DIR, run_name)
        logger.info("Test Results:")

    device = torch.device(cfg.DEVICE)
    model = getattr(models, cfg.MODEL.NAME)(num_classes)
    model.load(cfg.OUTPUT_DIR,cfg.TEST.LOAD_EPOCH)
    model = model.eval()

    all_feats, all_pids, all_camids = [], [], []
    since = time.time()
    for batch in tqdm(val_loader, desc='Feature Extraction', leave=False):
        model.eval()
        with torch.no_grad():
            images, pids, camids = batch
            if device:
                model.to(device)
                images = images.to(device)
            feats = model(images)
            all_feats.append(feats)
            all_pids.extend(np.asarray(pids))
            all_camids.extend(np.asarray(camids))

    cmc, mAP = evaluation(all_feats,all_pids,all_camids,num_query,re_ranking)
    logger.info("mAP: {:.1%}".format(mAP))
    for rank in (1, 5, 10):
        logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(rank, cmc[rank - 1]))

    elapsed = time.time() - since
    logger.info('Testing complete in {:.0f}m {:.0f}s'.format(elapsed // 60, elapsed % 60))
def main():
    """Fit the v3 Bayes net on the first 80% of the data and score the rest.

    The results collected by the evaluator are written as CSV to
    ``../results/v3_<k_clusters>``.
    """
    X, Y, ids, Amounts = preprocess()
    # First index of the held-out part: ceil(80%) of the samples train.
    split = int(math.ceil(.8 * len(X)))
    x_train, x_test = X[:split], X[split:]
    y_train, y_test = Y[:split], Y[split:]
    ids_train, ids_test = ids[:split], ids[split:]

    model = v3.bayesnet()
    evaluator = evaluation(Amounts[split:])

    for rounds in range(1):
        # Parenthesised single expression prints the same on Python 2 and 3.
        print('round: ' + str(rounds))
        model.fit(x_train,
                  y_train,
                  ids_train,
                  k_clusters=FLAGS.k_clusters,
                  epochs=40,
                  verbose=1)
        predictions = model.predict(x_test, ids_test)
        evaluator.evaluate(predictions, y_test)

    evaluator.get_results().to_csv("../results/" + "v3_" +
                                   str(FLAGS.k_clusters),
                                   index=False)
def trainStep(model, input_tensor, target_tensor, testLoader, optimizer, device, criterion=nn.CrossEntropyLoss()):
    '''
    Run one optimisation step and evaluate the model on ``testLoader``.

    :param model: network called as ``model(inputs, segment_id, mask)``
    :param input_tensor: triple ``(inputs, mask, segment_id)``
    :param target_tensor: targets; squeezed before the loss is computed
    :param testLoader: loader handed to ``evaluation``
    :param optimizer: optimizer stepped once, then zeroed
    :param device: device handed to ``evaluation``
    :param criterion: loss function
    :return: ``(model, loss, p, r, f1)``
    '''
    tokens, attention_mask, segments = input_tensor
    logits = model(tokens, segments, attention_mask)

    # Metrics come from the test loader, not from this batch's output.
    p, r, f1 = evaluation(model, testLoader, device)

    loss = criterion(logits, target_tensor.squeeze())
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return model, loss, p, r, f1
def test(in_dir, class_dir, feat_dir):
    """Run the trained classifier on one batch and dump its outputs.

    Loads ``classifier.pth`` from ``class_dir``, feeds the first batch (up to
    3000 samples) of the dataset in ``in_dir`` through the encoder, writes
    ``"<label> <output>"`` lines to ``class_dir + 'result.txt'`` and saves
    each feature vector under ``feat_dir``.  Finally calls
    ``evaluation(class_dir)``.

    ``class_dir`` and ``feat_dir`` are used as string prefixes, so they are
    expected to end with a path separator.
    """
    if not os.path.exists(feat_dir):
        os.mkdir(feat_dir)

    cuda = torch.cuda.is_available()
    if cuda:
        print('cuda is available!')

    test_dataset = LD(in_dir, lmark_num=LMARK_NUM, color=1)
    test_loader = DataLoader(test_dataset, batch_size=3000)

    model = Encoder()
    if cuda:
        model.cuda()
    model.load_state_dict(torch.load(class_dir + 'classifier.pth'))
    # NOTE(review): the model is not switched to eval() mode -- confirm that
    # this is intended.

    # next(iter(...)) works on every Python/PyTorch version; the iterator's
    # .next() method only existed as a Python 2 alias.
    lmark, reye, leye, mouth, label = next(iter(test_loader))
    lmark = lmark.view(lmark.size(0), -1)
    label = label.numpy()

    def prepare(tensor):
        # Move to the GPU when available and cast to float32.
        return (tensor.cuda() if cuda else tensor).float()

    with torch.no_grad():
        out, feat = model(prepare(lmark), prepare(reye), prepare(leye),
                          prepare(mouth))
    feat = feat.cpu().data.numpy()
    out = out.view(out.size(0)).cpu().data.numpy()

    # Context manager closes the result file even if a save fails.
    with open(class_dir + 'result.txt', 'w') as f:
        for i in range(len(label)):
            f.write(str(label[i]) + ' ' + str(out[i]) + '\n')
            np.save(feat_dir + str(i).zfill(4) + '_' + str(int(label[i])),
                    feat[i])

    evaluation(class_dir)
def notificationTourJoueur(self):
    """Announce whose turn it is and print the evaluation of the position."""
    # A player picks an action: an even turn id means white (player1),
    # otherwise black (player2).
    joueur = self.player1 if self.turnId % 2 == 0 else self.player2
    print("C'est au tour de", joueur)
    print(evaluation(self.board.fen()))
def emsemble(cross_epoch = 0,data_index=None,cut_shape=None,data_type=['MCIc','MCInc'],pre_dir='/home/anzeng/rhb/fmri_data', num_batches = 256*5,voxnet_point=None,test_size = 6,brain_map=[217],f_handle = None):
    """Evaluate a saved Keras model per sample and as a per-subject ensemble.

    Loads the model stored at ``voxnet_point``, runs it on 100 items from the
    ``'train'`` split and ``test_size`` items from the ``'test'`` split of
    ``fMRI_data``, and accumulates two ``evaluation`` objects per split: one
    over the individual predictions returned by ``get_label`` and one over
    the single label produced by ``ensemble_label``.

    ``cross_epoch``, ``num_batches`` and ``brain_map`` are not used in this
    function.  ``cut_shape`` is read as consecutive (start, end) pairs.

    Returns:
        The accumulated ensemble evaluation of the test split.
    """
    # tf.reset_default_graph()
    keras.backend.clear_session()
    dataset = fMRI_data(data_type, data_index=data_index, varbass=False, dir=pre_dir)
    # xyz = 32
    # input_shape = [None, xyz, xyz, xyz, 1]
    # voxnet = VoxNet(input_shape=input_shape, voxnet_type='cut')
    # Inclusive extent of each (start, end) pair in cut_shape.
    true_shape = []
    for x in range(0, len(cut_shape), 2):
        true_shape.append(cut_shape[x + 1] - cut_shape[x] + 1)
    # with tf.Session() as sess:
    # sess.run(tf.global_variables_initializer())
    # voxnet.npz_saver.restore(sess,voxnet_point)
    # Load the model.
    model = keras.models.load_model(voxnet_point)
    print('train_acc')
    train_fmri_evaluation = evaluation()
    train_smri_evaluation = evaluation()
    train_iter = iter(dataset.get_fmri('train')).__next__
    for i in range(100):
        img,label,_ = train_iter()
        predict,y_true = get_label(model,img,label,cut_shape,true_shape)
        predict = np.argmax(predict,axis=1)
        train_smri_evaluation += evaluation(y_predict=predict,y_true=y_true)
        if i %10 == 0 and i > 0:
            print(train_smri_evaluation)
        # Collapse the individual predictions into one label for this item.
        y_predict = ensemble_label(predict,2)
        train_fmri_evaluation += evaluation(y_predict = [y_predict],y_true=[label])
    # NOTE(review): original nesting was lost; this summary print is assumed
    # to run once after the training loop -- confirm.
    print(train_fmri_evaluation)
    print('test_acc')
    test_fmri_evaluation = evaluation()
    test_smri_evaluation = evaluation()
    test_iter = iter(dataset.get_fmri('test')).__next__
    for i in range(test_size):
        img, label,filename = test_iter()
        predict, y_true = get_label(model, img, label, cut_shape, true_shape)
        predict = np.argmax(predict,axis=1)
        test_smri_evaluation_one = evaluation(y_predict=predict, y_true=y_true)
        test_smri_evaluation += test_smri_evaluation_one
        print(test_smri_evaluation_one)
        print(test_smri_evaluation)
        y_predict = ensemble_label(predict,2)
        test_fmri_evaluation += evaluation(y_predict=[y_predict], y_true=[label])
        print(y_predict,label,test_fmri_evaluation)
        # if y_predict != label:
        #     print(filename)
        #     f_handle.write(filename+'\n')
    # Optionally append the ensemble summaries to the caller's open file.
    if f_handle:
        f_handle.write('ensemble train:\n')
        f_handle.write(str(train_fmri_evaluation) + '\n')
        f_handle.write('ensemble test:\n')
        f_handle.write(str(test_fmri_evaluation) + '\n')
    return test_fmri_evaluation
def ajax_bgm_recommendation():
    """Store the GAN evaluation result and redirect to ``mix_video_audio``.

    The text returned by ``gan_evaluation()`` is written to
    ``./recommend_result.txt``, replacing any previous content.
    """
    evaluator = evaluation.evaluation()
    # Compute first so a failing evaluation leaves the old file untouched.
    recommendation = evaluator.gan_evaluation()
    with open('./recommend_result.txt', 'w') as result_file:
        result_file.write(recommendation)
    return redirect(url_for('mix_video_audio'))
def final_evaluation(self):
    """Evaluate the final predictions against the real labels.

    Prints and returns ``(eval_measures, overall_accuracy)`` as produced by
    ``eval.evaluation`` on the combined prediction/label pairs.
    """
    paired_labels = eval.combine_pred_real_labels(self.all_final_pred_label,
                                                  self.all_real_label)
    measures, accuracy = eval.evaluation(paired_labels)
    print('evaluation for all model')
    print(measures)
    print('accuracy')
    print(accuracy)
    return measures, accuracy
def __init__(self, path, test_size, random_state):
    """Load the image data from ``path`` and split it into train/test sets.

    ``test_size`` and ``random_state`` are forwarded to ``train_test_split``.
    """
    self.process = Process(path)
    self.evaluation = evaluation()
    self.result = self.process.load_image_files_modified()

    split = train_test_split(self.result.data,
                             self.result.target,
                             test_size=test_size,
                             random_state=random_state)
    self.X_train, self.X_test, self.y_train, self.y_test = split
def lgb_cv(cv_train, cv_test, params, low_bound, topk, idx):
    """Train LightGBM on one CV fold and return its mean F1 score.

    Args:
        cv_train, cv_test: fold data passed to ``utils.preprocess_xgb``.
        params: LightGBM parameters; ``gpu_device_id`` is set to ``idx - 1``
            on a private copy, so the caller's dict is left untouched.
        low_bound, topk: forwarded to ``utils.faron_f1_optim``.
        idx: 1-based fold number.

    Returns:
        Mean of the ``f1score`` column returned by ``evaluation.evaluation``.

    Side effects:
        Pickles the trained booster into ``constants.LGB_DIR``.
    """
    print('CV Fold {}/5'.format(idx))
    # Copy before editing: the same dict may be shared across folds.
    params = dict(params)
    params['gpu_device_id'] = idx - 1

    train_gid, train_feat, train_label = utils.preprocess_xgb(cv_train)
    del cv_train
    test_gid, test_feat, test_label = utils.preprocess_xgb(cv_test)
    del cv_test

    print('| Construct lgb Dataset ...')
    lgb_train = lgb.Dataset(train_feat, train_label, free_raw_data=True)
    del train_feat, train_label

    print('| Training ...')
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=num_rounds,
                    valid_sets=lgb_train)
    del lgb_train

    y_scores = gbm.predict(test_feat, num_iteration=gbm.best_iteration)
    del test_feat
    test_auc_score = roc_auc_score(test_label, y_scores)
    print('| test auc: %s' % test_auc_score)
    gc.collect()

    # Explicit copy so the column assignments below act on an independent
    # frame rather than on a slice of test_gid.
    user_product = test_gid[['user_id', 'product_id', 'order_id']].copy()
    user_product['label'] = test_label
    user_product['score'] = y_scores
    user_product = user_product.sort_values(['user_id', 'order_id', 'score'],
                                            ascending=False)

    gold = evaluation.get_gold(user_product)
    op = user_product.copy()
    # op = utils.shing_f1_optim(op, low_bound, int(topk))
    op = utils.faron_f1_optim(op, low_bound, int(topk))
    # 'products' is a space-separated string; the literal 'None' is kept as is.
    op['products'] = op['products'].apply(
        lambda x: [int(i) if i != 'None' else i for i in x.split()])
    op = pd.merge(pd.DataFrame({'order_id': user_product.order_id.unique()}),
                  op,
                  on=['order_id'],
                  how='left')

    res = evaluation.evaluation(gold, op[['order_id', 'products']])
    mf1 = res.f1score.mean()

    with open(
            constants.LGB_DIR + 'lgb_{}_{:.6f}_{:.6f}'.format(
                params['boosting_type'], test_auc_score, mf1), 'wb') as f:
        pickle.dump(gbm, f, pickle.HIGHEST_PROTOCOL)
    del gbm

    print('F1 Optimization Result: mean-f1-score {}'.format(mf1))
    del user_product, op, gold, res
    gc.collect()
    return mf1
def minimax(position, stack, depth, alpha, beta, maximizingPlayer,
            calculations):
    """Alpha-beta minimax over column drops.

    Args:
        position: board indexed as ``position[x][y]``; 1 and -1 mark the two
            players, 0 an empty cell.  Modified during the search and
            restored before returning.
        stack: per-column fill height; also modified and restored.
        depth: remaining search depth.
        alpha, beta: current pruning window.
        maximizingPlayer: truthy when the side placing 1 is to move.
        calculations: one-element list used as a visited-node counter.

    Returns:
        ``isGameOver(position) * 10000`` for finished games, the static
        ``evaluation`` at depth 0, otherwise the best child score.
    """
    calculations[0] += 1

    gameOver = isGameOver(position)
    if gameOver != 0:
        return gameOver * 10000
    if depth == 0:
        return evaluation(position)

    piece = 1 if maximizingPlayer else -1
    best = -99999 if maximizingPlayer else 99999

    # Columns are tried from the centre outwards.
    for x in (3, 2, 4, 1, 5, 0, 6):
        y = stack[x]
        if y >= BOARD_HEIGHT:
            continue  # column is full

        # Play the move, search, then undo it.
        position[x][y] = piece
        stack[x] += 1
        score = minimax(position, stack, depth - 1, alpha, beta,
                        not maximizingPlayer, calculations)
        position[x][y] = 0
        stack[x] -= 1

        if maximizingPlayer:
            best = max(best, score)
            alpha = max(alpha, score)
        else:
            best = min(best, score)
            beta = min(beta, score)
        if beta <= alpha:
            return best
    return best
def final_evaluation(self):
    """
    Give the overall evaluation of the 12 models.

    @return: first: measures for each class, second: average accuracy
    """
    paired_labels = eval.combine_pred_real_labels(
        self.all_models_pred_labels, self.all_models_real_labels)
    measures, accuracy = eval.evaluation(paired_labels)
    print(measures)
    print(accuracy)
    return measures, accuracy
def model_eval(self):
    """
    Evaluate this model on the test set once it has been fully trained.

    @return: the two values produced by ``eval.evaluation``: the evaluation
        measures and the accuracy (both are also printed)
    """
    # Only the outputs are needed; the loss is discarded.
    test_output, _ = self.get_test_output_loss()
    predicted = eval.predict_labels(test_output)
    paired_labels = eval.combine_pred_real_labels(predicted, self.test_y)
    measures, accuracy = eval.evaluation(paired_labels)
    print(measures)
    print(accuracy)
    return measures, accuracy
def random_walk(n, cnt, p):
    """Search for a low-energy n x n jump maze with a random walk.

    Each iteration changes the jump value of one random non-goal cell.  The
    move is kept when it lowers the energy, or otherwise with probability
    ``p``; a rejected move is undone.

    Args:
        n: side length of the maze passed to ``generateMatrix``.
        cnt: number of iterations.
        p: probability of accepting a non-improving move.

    Returns:
        ``(minima, maze)``: the lowest energy seen and the matrix that
        produced it.
    """
    import copy

    maze = generateMatrix(n)
    first = evaluation(maze, n)
    minima = first
    print("Init energy: %d" % (first))
    # A real copy is required: ``backup = maze`` only aliases the same
    # object, so later in-place edits would silently change the "backup".
    backup = copy.deepcopy(maze)
    goal = [n - 1, n - 1]
    for i in range(cnt):
        if i % 100 == 0:
            print("Current energy: %d" % (minima))
        # Randomly choose a cell to change; never the goal cell.
        new_cell = goal
        while new_cell == goal:
            row = rd.randint(0, n - 1)
            col = rd.randint(0, n - 1)
            new_cell = [row, col]
        # Draw a jump number different from the current one.
        old_step = maze[row, col]
        new_step = old_step
        maxrnd = max(n - row - 1, n - col - 1, row, col)
        while new_step == old_step:
            new_step = rd.randint(1, maxrnd)
        maze[row, col] = new_step
        # Energy of the modified maze.
        nexte = evaluation(maze, n)
        if nexte < first or rd.random() < p:
            first = nexte
            if nexte < minima:
                # New best: snapshot the matrix so later moves cannot alter it.
                minima = nexte
                backup = copy.deepcopy(maze)
        else:
            # Rejected: restore the cell so ``first`` keeps describing ``maze``.
            maze[row, col] = old_step
    return (minima, backup)
def predict(dataset_iter=test_iter, dataset=test, data_name="test"):
    """Tag a dataset, print precision/recall/F1 and write the query file.

    Predictions are first written to a temporary ``tmp<data_name>.txt`` file
    (question, predicted tags and gold tags separated by tabs), which is then
    passed to ``convert`` and removed.  Only ``args.dataset ==
    'EntityDetection'`` is supported; anything else prints a message and
    exits.  ``dataset`` is not used inside this function.
    """
    print("Dataset: {}".format(data_name))
    model.eval()
    dataset_iter.init_epoch()

    # Accumulated per batch but not read anywhere in this function.
    n_correct = 0
    fname = "{}.txt".format(data_name)
    temp_file = 'tmp' + fname
    gold_list = []
    pred_list = []

    # Context manager: the file is flushed and closed on every exit path,
    # including exit() and exceptions raised by the model or the evaluation.
    with open(temp_file, 'w') as results_file:
        for data_batch_idx, data_batch in enumerate(dataset_iter):
            scores = model(data_batch)
            if args.dataset == 'EntityDetection':
                n_correct += torch.sum(torch.sum(torch.max(scores, 1)[1].view(data_batch.ed.size()).data == data_batch.ed.data, dim=1) \
                    == data_batch.ed.size()[0]).item()
                index_tag = np.transpose(
                    torch.max(scores, 1)[1].view(
                        data_batch.ed.size()).cpu().data.numpy())
                tag_array = index2tag[index_tag]
                index_question = np.transpose(
                    data_batch.text.cpu().data.numpy())
                question_array = index2word[index_question]
                gold_list.append(
                    np.transpose(data_batch.ed.cpu().data.numpy()))
                gold_array = index2tag[np.transpose(
                    data_batch.ed.cpu().data.numpy())]
                pred_list.append(index_tag)
                for question, label, gold in zip(question_array, tag_array,
                                                 gold_array):
                    results_file.write("{}\t{}\t{}\n".format(
                        " ".join(question), " ".join(label), " ".join(gold)))
            else:
                print("Wrong Dataset")
                exit()

        if args.dataset == 'EntityDetection':
            P, R, F = evaluation(gold_list, pred_list, index2tag, type=False)
            print("{} Precision: {:10.6f}% Recall: {:10.6f}% F1 Score: {:10.6f}%".
                  format("Dev", 100. * P, 100. * R, 100. * F))
        else:
            print("Wrong dataset")
            exit()

    convert(temp_file,
            os.path.join(args.data_dir, "lineids_{}.txt".format(data_name)),
            os.path.join(results_path, "query.{}".format(data_name)))
    os.remove(temp_file)
def train_neural_network(x_train, train_labels, x_test, orig_test):
    """
    Train the neural network on ready-to-use dataframes and write a
    submission file.

    Args:
        x_train: train dataset
        train_labels: train labels; the ``'Col2'`` column is used as target
        x_test: test dataset
        orig_test: original test dataframe, forwarded to the submission step
    """
    submission_tag = 'nn_submission03_s_1_m1_f_2165'
    epochs = 6
    batch_size = 32

    train_features = np.array(x_train)
    test_features = np.array(x_test)
    train_labels = np.array(train_labels['Col2'])

    model = models.make_model(params=train_features,
                              model_name='neural_network_1')
    checkpoint_cb, tensorboard_cb = models.callbacks(
        model_name=submission_tag + '.ckpt')

    # No validation data is passed to fit().
    history = model.fit(train_features,
                        train_labels,
                        batch_size=batch_size,
                        epochs=epochs,
                        callbacks=[checkpoint_cb, tensorboard_cb])

    # All reports below are computed on the training data.
    evaluation.evaluation(model, train_features, train_labels)
    evaluation.plot_metrices(epochs, history, if_val=False)
    evaluation.plot_confusion_matrix(model, train_features, train_labels)
    evaluation.submission_nn(model=model,
                             test_features=test_features,
                             orig_test_df=orig_test,
                             submission_name=submission_tag + '.csv')
def validate(test_loader, model, args):
    """Compute the model outputs for the whole loader and score them.

    Args:
        test_loader: iterable of ``(input, target)`` batches.
        model: network to evaluate; switched to eval mode.
        args: object whose ``gpu`` attribute selects the CUDA device
            (``None`` keeps the inputs where they are).

    Returns:
        ``(nmi, recall)`` from ``eva.evaluation`` with ranks ``[1, 2, 4, 8]``.
    """
    # switch to evaluation mode
    model.eval()

    # Collect per-batch results and concatenate once at the end; growing the
    # tensors with torch.cat inside the loop re-copies all earlier batches on
    # every iteration.  The leading empty tensors keep the result identical
    # to the incremental version, including for an empty loader.
    outputs = [torch.Tensor()]
    labels = [torch.LongTensor()]
    with torch.no_grad():
        for batch, target in test_loader:
            if args.gpu is not None:
                batch = batch.cuda(args.gpu, non_blocking=True)
            outputs.append(model(batch).cpu())
            labels.append(target)

    testdata = torch.cat(outputs, 0)
    testlabel = torch.cat(labels)

    nmi, recall = eva.evaluation(testdata.numpy(), testlabel.numpy(),
                                 [1, 2, 4, 8])
    return nmi, recall
def minimax(position, depth, alpha, beta, maximizingPlayer):
    """Alpha-beta minimax over the empty cells listed in ``child_order``.

    Args:
        position: board indexed as ``position[x][y]``; 1 and -1 mark the two
            players, 0 an empty cell.  Modified during the search and
            restored before returning.
        depth: remaining search depth.
        alpha, beta: current pruning window.
        maximizingPlayer: truthy when the side placing 1 is to move.

    Returns:
        ``isGameOver(position) * 10000`` for finished games, the static
        ``evaluation`` at depth 0, otherwise the best child score.
    """
    gameOver = isGameOver(position)
    if gameOver != 0:
        return gameOver * 10000
    if depth == 0:
        return evaluation(position)

    piece = 1 if maximizingPlayer else -1
    best = -999999 if maximizingPlayer else 999999

    for child in child_order:
        x, y = child[0], child[1]
        if position[x][y] != 0:
            continue  # cell already taken

        # Play the move, search, then undo it.
        position[x][y] = piece
        score = minimax(position, depth - 1, alpha, beta,
                        not maximizingPlayer)
        position[x][y] = 0

        if maximizingPlayer:
            best = max(best, score)
            alpha = max(alpha, score)
        else:
            best = min(best, score)
            beta = min(beta, score)
        if beta <= alpha:
            return best
    return best
def svr_timeseries(self, x_train, y_train, x_test, y_real, kernel):
    '''
    Grid-search SVR hyper-parameters, refit the best model and plot the outcome.

    Every (C, gamma, epsilon) triple drawn from three grids produced by
    ``numberGenerate(-5, 5, 10)`` is fitted on the training data and scored
    against ``y_real`` on ``x_test``.  The triple with the highest R^2 is
    refitted, its DS, NMSE, profit and R^2 are printed, and a 2x2 figure
    is shown.

    NOTE(review): model selection and the final report use the same
    ``x_test`` / ``y_real`` data - confirm this is intended.

    x_train, y_train -- data passed to ``SVR.fit``
    x_test           -- inputs passed to ``SVR.predict``
    y_real           -- reference targets the predictions are scored against
    kernel           -- forwarded to ``SVR(kernel=...)``
    '''
    grid_start = -5
    grid_stop = 5
    grid_count = 10
    c_set = svm.svmCal.svr.numberGenerate(self, grid_start, grid_stop, grid_count)
    gamma_set = svm.svmCal.svr.numberGenerate(self, grid_start, grid_stop, grid_count)
    epsilon_set = svm.svmCal.svr.numberGenerate(self, grid_start, grid_stop, grid_count)

    # Best triple seen so far, ranked by R^2.
    best_c = 0
    best_gamma = 0
    best_epsilon = 0
    best_r2 = -1000

    nmse_result = []
    ds_result = []
    doc_result = []

    # Progress bookkeeping; presumably each grid holds grid_count + 1 values.
    total_steps = (1 + grid_count) ** 3
    steps_done = 0
    next_report = 0.05
    started = time.time()

    for C in c_set:
        for gamma in gamma_set:
            for epsilon in epsilon_set:
                steps_done += 1
                if C * gamma * epsilon != 0:
                    candidate = SVR(kernel=kernel, C=C, gamma=gamma, epsilon=epsilon)
                    y_pred = candidate.fit(x_train, y_train).predict(x_test)
                    nmse = evaluation.evaluation(y_real, y_pred).NMSE()
                    ds = evaluation.evaluation(y_real, y_pred).DS()
                    # Computed as in the original although the value is not used here.
                    profit = ljCao.profit.profitLjCao(y_real, y_pred).Profit()
                    doc = r2_score(y_real, y_pred)
                    nmse_result.append(nmse)
                    ds_result.append(ds)
                    doc_result.append(doc)
                    if doc > best_r2:
                        best_c = C
                        best_gamma = gamma
                        best_epsilon = epsilon
                        best_r2 = doc
                fraction_done = float(steps_done) / float(total_steps)
                now = time.time()
                if fraction_done > next_report:
                    minutes_left = ((now - started) * (1.0 - fraction_done) / fraction_done) / 60
                    print("%d%% %f minutes left" % (next_report * 100, minutes_left))
                    next_report += 0.05

    # Refit with the winning triple and report its scores.
    best_model = SVR(kernel=kernel, C=best_c, gamma=best_gamma, epsilon=best_epsilon)
    y_pred = best_model.fit(x_train, y_train).predict(x_test)
    nmse = evaluation.evaluation(y_real, y_pred).NMSE()
    ds = evaluation.evaluation(y_real, y_pred).DS()
    profit = ljCao.profit.profitLjCao(y_real, y_pred).Profit()
    profit_time = ljCao.profit.profitLjCao(y_real, y_pred).ProfitTimeSeries()
    doc = r2_score(y_real, y_pred)

    print('MAX DS = %f' % ds)
    print('Hit rate = %f' % (float(ds) / float(len(y_pred))))
    print('NMSE = %f' % nmse)
    print('Profit = %f' % profit)
    print('DOC = %f' % doc)
    print("C = %f, gamma = %f, epsilon = %f" % (best_c, best_gamma, best_epsilon))

    sample_axis = range(len(y_real))
    plt.figure()

    # Top-left: real targets against predictions.
    plt.subplot2grid((2, 2), (0, 0))
    plt.scatter(sample_axis, y_real, c='k', label='data')
    plt.scatter(sample_axis, y_pred, c='r', label='RBF model')
    plt.xlabel('data')
    plt.ylabel('target')
    plt.title('Support Vector Regression')

    # Top-right: profit time series of the refitted model.
    plt.subplot2grid((2, 2), (0, 1))
    profit_axis = range(len(profit_time))
    plt.plot(profit_axis, profit_time, c='g', label='profit')
    plt.xlabel('day')
    plt.ylabel('profit (times)')

    # Bottom-left: NMSE of every evaluated grid point, in evaluation order.
    plt.subplot2grid((2, 2), (1, 0))
    trial_axis = range(len(nmse_result))
    plt.plot(trial_axis, nmse_result, c='g', label='NMSE')
    plt.xlabel('Times')
    plt.ylabel('NMSE')

    # Bottom-right panel is created but left empty.
    plt.subplot2grid((2, 2), (1, 1))
    plt.show()
# Candidate generation (user-item, then item-item), logistic regression,
# evaluation and cleanup of the intermediate files.
user_item.generateCandidatesWithWeights(sc)
gc.collect()
# Rename the part file written by the user-item step.
os.rename("subalg/user_item/output/part-00000",
          "subalg/user_item/output/user_item_results.txt")

print("\nStarting item-item logic")
item_item.generateCandidatesWithWeights(sc)  # pass in sc, expects a file to have been written
gc.collect()

print("\n\nDone processing data, begin Logistic Regression...")
lr.runLogisticRegression(sc)

print("Starting evaluation...")
resultLoc = 'logistic_regression/output/final_output.txt'
solLoc = 'data/solution.csv'
print(evaluation.evaluation(resultLoc, solLoc))

### Postprocessing - Mainly should be to delete files created on disk ###
print("Postprocessing - Cleaning up")
# NOTE(review): the third path lives under subalg/item_item/output, while the
# rename above created user_item_results.txt under subalg/user_item/output.
# It looks like a copy-paste slip - confirm before changing it.
temp_files = [
    'subalg/item_item/output/item_item_results.txt',
    'subalg/user_user/output/user_user_results.txt',
    'subalg/item_item/output/user_item_results.txt',
    'logistic_regression/output/input_for_lr.txt',
]
# Remove each file on its own so that one missing file no longer prevents the
# remaining ones from being deleted; only file-system errors are tolerated.
cleanup_failed = False
for temp_file in temp_files:
    try:
        os.remove(temp_file)
    except OSError:
        cleanup_failed = True
if cleanup_failed:
    print("Something went wrong with removing temporary files, you may need to manually delete them.")

print("Exiting spark...")
if len(resultmerchant) != 0: str = '' for mer in resultmerchant: str = str + mer + ':' str = str[0:len(str)-1] result.append(str) allresult.append(result) outfile = open('/home/wanghao/Document/tianchi/trainset/trainresult.csv','wb') import csv writer = csv.writer(outfile) writer.writerows(outfile) outfile.close() # evaluate the result eval = evaluation() truefile = '/home/wanghao/Document/tianchi/dataset/train11' predictfile = '/home/wanghao/Document/tianchi/trainset/trainresult.csv' merchantfile = '/home/wanghao/Document/tianchi/tianchi_dataset/ijcai2016_merchant_info' eval.getS_true(truefile) eval.getS_predict(predictfile) eval.get_MerchantBudget(merchantfile) f1 = eval.comp_f1_score() print "This train F1 score is ", f1
def logistic_reg(train_dataSet, train_labels, test_dataSet, test_labels,lamda): #print '--------------------- Logistic Regression ----------------' #print 'Loading data...' #load data #[x, y, train_size,x_test,y_test,test_size] = document_vectorize.createDataSet(train_path, test_path, category, k) MaxIteration = 100 train_sample_num = len(train_labels) test_sample_num = len(test_labels) feature_num = 2 #lamdas = [0.0001,0.001, 0.01, 0.1, 1, 2, 5, 6,10,100,1000] #lamdas = [100] #for lamda in lamdas: update_loss = 0 min_loss = 'Inf' train_predict_cat = numpy.zeros(train_sample_num) test_predict_cat = numpy.zeros(test_sample_num) w = numpy.zeros(feature_num) b = 0 min_w = numpy.zeros(feature_num) min_b = 0 #print '------------------------------------------------------------' #print 'lamda = ', lamda shuffle_order = range(train_sample_num) #start traing start_time = time.time() for iteration in range(MaxIteration): learn_rate = 1/(iteration+1) shuffle(shuffle_order) #stochastic gradient descent for t in shuffle_order: temp1 = numpy.add(numpy.inner(w, train_dataSet[t]), b) temp2 = numpy.exp(numpy.multiply(temp1, train_labels[t])) temp3 = learn_rate / (1 + temp2) w = numpy.add( (1 - lamda * learn_rate) * w, numpy.multiply(train_labels[t] * temp3, train_dataSet[t])) b += learn_rate * train_labels[t] * temp3 #print "iteration = "+str(iteration) #print "update_loss ="+str(update_loss) #print "min_loss =" +str(min_loss) #calculate loss temp_loss = 0 for i in range (train_sample_num): temp1 = numpy.add(numpy.inner(w, train_dataSet[t]), b) temp2 = numpy.exp(numpy.multiply(-temp1, train_labels[t])) temp3 = 1 / (1 + temp2) temp_loss += numpy.log(1 + temp3) square = lamda * numpy.sum(numpy.square(w)) / 2.0 update_loss = temp_loss/train_sample_num + square #if min_loss == 0: # break #record minimum loss if min_loss > update_loss: min_loss = update_loss min_w = w min_b = b #print "min_loss = "+str(min_loss) #print "iteration = "+str(iteration) #if abs(min_loss - update_loss) < 
0.000001: # break print 'min loss', min_loss #print 'Training time: ', time.time()-start_time print "RESULT: w: " + str(min_w) + " b: " + str(min_b) #predict training set for i in range(train_sample_num): if ((numpy.inner(train_dataSet[i],min_w)+min_b)) < 0: train_predict_cat[i] = -1 else: train_predict_cat[i] = 1 #print "# train_size = " + str(train_size) print 'Accuracy for training dataset: ',evaluation.evaluation(train_predict_cat, train_labels) #predict test set for i in range(test_sample_num): if ((numpy.inner(test_dataSet[i],min_w)+min_b)) < 0: test_predict_cat[i] = -1 else: test_predict_cat[i] = 1 #print "# test_size = " + str(test_size) print 'Accuracy for test dataset: ',evaluation.evaluation(test_predict_cat, test_labels)
def svr_timeseries(self, x_train, y_train, x_test, y_real, kernel):
    '''
    Grid-search SVR hyper-parameters, refit the best model and plot the outcome.

    Every (C, gamma, epsilon) triple drawn from three grids produced by
    ``self.numberGenerate(-4, 4, 10.0)`` is fitted on the training data and
    scored against ``y_real`` on ``x_test``.  The triple with the highest R^2
    is refitted, its DS, NMSE, R^2 and MAE are printed, and two figures are
    shown: a 2x2 overview and a residual scatter plot.

    NOTE(review): model selection and the final report use the same
    ``x_test`` / ``y_real`` data - confirm this is intended.

    x_train, y_train -- data passed to ``SVR.fit``
    x_test           -- inputs passed to ``SVR.predict``
    y_real           -- reference targets the predictions are scored against
    kernel           -- forwarded to ``SVR(kernel=...)``
    '''
    grid_start = -4
    grid_stop = 4
    grid_count = 10.0
    c_set = self.numberGenerate(grid_start, grid_stop, grid_count)
    gamma_set = self.numberGenerate(grid_start, grid_stop, grid_count)
    epsilon_set = self.numberGenerate(grid_start, grid_stop, grid_count)
    print(c_set)

    # Best triple seen so far, ranked by R^2.
    best_c = 0
    best_gamma = 0
    best_epsilon = 0
    best_r2 = -1000

    nmse_result = []
    ds_result = []
    mae_result = []
    doc_result = []

    # Progress bookkeeping; presumably each grid holds grid_count + 1 values.
    total_steps = (1 + grid_count) ** 3
    steps_done = 0
    next_report = 0.05
    started = time.time()

    for C in c_set:
        for gamma in gamma_set:
            for epsilon in epsilon_set:
                steps_done += 1
                if C * gamma * epsilon != 0:
                    candidate = SVR(kernel=kernel, C=C, gamma=gamma, epsilon=epsilon)
                    y_pred = candidate.fit(x_train, y_train).predict(x_test)

                    # Built as in the original; the frame is not read afterwards.
                    result = pd.DataFrame()
                    result["Y_real"] = y_real
                    result["y_pred"] = y_pred

                    nmse = evaluation.evaluation(y_real, y_pred).NMSE()
                    ds = evaluation.evaluation(y_real, y_pred).DS()
                    mae = evaluation.evaluation(y_real, y_pred).MAE()
                    doc = r2_score(y_real, y_pred)
                    nmse_result.append(nmse)
                    ds_result.append(ds)
                    doc_result.append(doc)
                    mae_result.append(mae)
                    if doc > best_r2:
                        best_c = C
                        best_gamma = gamma
                        best_epsilon = epsilon
                        best_r2 = doc
                fraction_done = float(steps_done) / float(total_steps)
                now = time.time()
                if fraction_done > next_report:
                    minutes_left = ((now - started) * (1.0 - fraction_done) / fraction_done) / 60
                    print("%d%% %f minutes left" % (next_report * 100, minutes_left))
                    next_report += 0.05

    # Refit with the winning triple and report its scores.
    best_model = SVR(kernel=kernel, C=best_c, gamma=best_gamma, epsilon=best_epsilon)
    y_pred = best_model.fit(x_train, y_train).predict(x_test)
    nmse = evaluation.evaluation(y_real, y_pred).NMSE()
    ds = evaluation.evaluation(y_real, y_pred).DS()
    mae = evaluation.evaluation(y_real, y_pred).MAE()
    doc = r2_score(y_real, y_pred)

    print('MAX DS = %f' % ds)
    print('Hit rate = %f' % (float(ds) / float(len(y_pred))))
    print('NMSE = %f' % nmse)
    print('DOC = %f' % doc)
    print("MAE = %f" % mae)
    print("C = %f, gamma = %f, epsilon = %f" % (best_c, best_gamma, best_epsilon))

    sample_axis = range(len(y_real))
    plt.figure()

    # Top-left: real targets against predictions.
    plt.subplot2grid((2, 2), (0, 0))
    plt.scatter(sample_axis, y_real, c='k', label='data')
    plt.scatter(sample_axis, y_pred, c='r', label='RBF model')
    plt.xlabel('data')
    plt.ylabel('target')
    plt.title('Support Vector Regression')

    # Bottom-left: NMSE of every evaluated grid point, in evaluation order.
    plt.subplot2grid((2, 2), (1, 0))
    trial_axis = range(len(nmse_result))
    plt.plot(trial_axis, nmse_result, c='g', label='NMSE')
    plt.xlabel('Times')
    plt.ylabel('NMSE')

    # Bottom-right panel is created but left empty.
    plt.subplot2grid((2, 2), (1, 1))
    plt.show()

    # Residuals of the refitted model against its predictions.
    plt.scatter(y_pred, y_real - y_pred)
    plt.xlabel('Prediction lowest price in 2nd day')
    plt.ylabel("(Real - Prediction) lowest price in 2nd day")
    plt.show()
def trainClassifier(conn, cursor, tablename, test_tweet, enable_evaluation):
    """Train the Naive Bayes tweet classifier, save it and classify one tweet.

    Labelled tweets are read from ``tablename``: the first 681 rows (by
    ``tid``) flagged ``ptraffic='y'`` and the first 681 flagged
    ``ntraffic='y'`` form the training set.  When ``enable_evaluation`` is
    ``'test'`` the last 375 rows of each kind form a test set that is passed
    to ``evaluation`` after training.  The trained classifier is dumped to
    ``~/nltk_data/classifiers/naive_bayes.pickle`` and the classification of
    ``test_tweet`` is printed.

    NOTE(review): the ASC and DESC selections overlap when the table holds
    fewer than 681 + 375 rows of a kind - confirm the table size.

    Args:
        conn: database connection; not used by this function.
        cursor: DB-API cursor used for the SELECT statements.
        tablename: table holding the labelled tweets.  It is concatenated
            into the SQL text (identifiers cannot be bound as parameters), so
            it must come from a trusted source.
        test_tweet: raw text of the tweet to classify after training.
        enable_evaluation: pass ``'test'`` to also evaluate the classifier.

    Returns:
        None.  Errors are printed, not raised; if any SELECT fails the
        classifier is not trained.
    """
    # Stop-word filtering is disabled: the list stays empty.
    stop_words = []
    run_evaluation = enable_evaluation == 'test'

    # Fetch the training rows (oldest first).
    ttweets = _fetch_labelled_tweets(cursor, tablename, 'ptraffic', 'ASC', 681)
    nttweets = _fetch_labelled_tweets(cursor, tablename, 'ntraffic', 'ASC', 681)
    fetched = [ttweets, nttweets]

    # If the user chose to evaluate the classifier, fetch more labelled
    # tweets (newest first) for testing.
    ttweets_test = nttweets_test = None
    if run_evaluation:
        ttweets_test = _fetch_labelled_tweets(cursor, tablename, 'ptraffic', 'DESC', 375)
        nttweets_test = _fetch_labelled_tweets(cursor, tablename, 'ntraffic', 'DESC', 375)
        fetched.extend([ttweets_test, nttweets_test])

    # A failed SELECT used to leave its variable unbound and only surfaced
    # later as an unrelated UnboundLocalError; stop explicitly instead.
    if any(rows is None for rows in fetched):
        print("Error -> %s" % "could not load the labelled tweets, classifier not trained")
        return

    try:
        # Train set: preprocess and label both classes, then extract features.
        combined_tweets = (_preprocess_and_label(ttweets, 'traffic', stop_words)
                           + _preprocess_and_label(nttweets, 'nontraffic', stop_words))
        train_set = _extract_feature_set(combined_tweets)

        # Test set, only when evaluation was requested.
        if run_evaluation:
            combined_tweets_test = (
                _preprocess_and_label(ttweets_test, 'traffic', stop_words)
                + _preprocess_and_label(nttweets_test, 'nontraffic', stop_words))
            test_set = _extract_feature_set(combined_tweets_test)

        # Train our classifier using the training set
        classifier = nltk.NaiveBayesClassifier.train(train_set)

        # Save the classifier in a .pickle file
        name = 'naive_bayes.pickle'
        fname = os.path.join(os.path.expanduser('~/nltk_data/classifiers'), name)
        dump_classifier(classifier, fname)

        # Classify the tweet
        test_tweet1 = preprocessor().preprocess(test_tweet, stop_words)
        test = features_extractor(test_tweet1)
        proba = classifier.prob_classify(test)
        print("\nThe tweet '%s' is about: %s with probability: %s\n" % (test_tweet, classifier.classify(test), proba.prob('traffic')))

        # If the user chose to evaluate the classifier apply the evaluation techniques
        if run_evaluation:
            evaluation(test_set, classifier)
    except Exception as exc:
        # Best-effort boundary as in the original, but no longer swallowing
        # KeyboardInterrupt / SystemExit.
        print("Error -> %s" % (exc,))


def _fetch_labelled_tweets(cursor, tablename, flag_column, order, limit):
    """Return the rows of ``tablename`` flagged 'y' in ``flag_column``, or None on error."""
    query = ("SELECT tweet FROM " + tablename + " WHERE " + flag_column
             + "='y' ORDER BY tid " + order + " LIMIT " + str(limit))
    try:
        cursor.execute(query)
        return cursor.fetchall()
    except Exception as exc:
        print("Select Error -> %s" % (exc,))
        return None


def _preprocess_and_label(rows, label, stop_words):
    """Preprocess the first column of every row and attach ``label`` via ``add_label``."""
    data = [preprocessor().preprocess(row[0], stop_words) for row in rows]
    return add_label(data, label)


def _extract_feature_set(labelled_tweets):
    """Turn (text, label) pairs into (features, label) pairs."""
    return [(features_extractor(item[0]), item[1]) for item in labelled_tweets]