def train(net, set_id, train_loader, test_loader, loss_func, optimizer,
          num_epochs, lr, device):
    import time
    for epoch in range(num_epochs):
        start = time.time()
        train_loss, test_loss = 0.0, 0.0
        train_kappa, test_kappa = 0.0, 0.0
        n, m = 0, 0
        for essays, labels, lengths in train_loader:
            essays = essays.to(device)
            labels = labels.to(device)
            lengths = lengths.to(device)
            y = net(essays, lengths)
            loss = loss_func(y, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # accumulate loss and kappa weighted by batch size
            train_loss += loss.item() * labels.shape[0]
            train_kappa += metrics.kappa(y.cpu().detach().numpy(),
                                         labels.cpu().detach().numpy(),
                                         'quadratic') * labels.shape[0]
            n += labels.shape[0]
        train_loss /= n
        train_kappa /= n
        net.eval()
        with torch.no_grad():
            for essays, labels, lengths in test_loader:
                essays = essays.to(device)
                labels = labels.to(device)
                lengths = lengths.to(device)
                y = net(essays, lengths)
                loss = loss_func(y, labels)
                test_loss += loss.item() * labels.shape[0]
                test_kappa += metrics.kappa(y.cpu().detach().numpy(),
                                            labels.cpu().detach().numpy(),
                                            'quadratic') * labels.shape[0]
                m += labels.shape[0]
        net.train()
        test_loss /= m
        test_kappa /= m
        # max_test_kappa is a module-level dict: best dev kappa seen per essay set
        if test_kappa > max_test_kappa[set_id]:
            max_test_kappa[set_id] = test_kappa
            torch.save(net.state_dict(),
                       'HW2/models/model_' + str(set_id) + '.pt')
            print(max_test_kappa)
        end = time.time()
        runtime = end - start
        print('set %d, epoch %d, train loss: %.4f, train kappa: %.4f, '
              'dev loss: %.4f, dev kappa: %.4f, time: %.2f'
              % (set_id, epoch, train_loss, train_kappa,
                 test_loss, test_kappa, runtime))
def predict(set_id, min_score, max_score, net, test_loader, device):
    preds = []
    essays_ids = []
    test_kappa = 0.0
    net.eval()
    with torch.no_grad():
        m = 0
        for essays, lengths, labels, ids, prompts in test_loader:
            essays = essays.to(device)
            prompts = prompts.to(device)
            lengths = lengths.to(device)
            pred = net(essays, prompts, lengths)
            pred = pred * (max_score - min_score) + min_score
            labels = labels * (max_score - min_score) + min_score
            pred = pred.cpu().detach().numpy()
            test_kappa += metrics.kappa(pred, labels, 'quadratic') * len(labels)
            preds.extend(pred.round().tolist())
            essays_ids.extend(ids.tolist())
            m += len(labels)
    test_kappa /= m
    return preds, essays_ids, test_kappa
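# Every snippet in this section leans on a quadratic weighted kappa (QWK)
# helper -- `metrics.kappa` or a bare `kappa` -- whose implementation is not
# shown. Below is a minimal stand-in built on scikit-learn; this is an
# assumption, not the original module. QWK is symmetric in its two arguments,
# so the inconsistent argument order across snippets does not change the value.
import numpy as np
from sklearn.metrics import cohen_kappa_score


def kappa(y_true, y_pred, weights='quadratic'):
    # Round continuous model outputs to integer scores before scoring.
    y_true = np.asarray(y_true, dtype=float).round().astype(int).ravel()
    y_pred = np.asarray(y_pred, dtype=float).round().astype(int).ravel()
    return cohen_kappa_score(y_true, y_pred, weights=weights)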
def train(net, loss_func, optimizer, train_loader, dev_loader):
    # num_epochs, device, and the time module are expected at module level
    for epoch in range(num_epochs):
        start = time.time()
        train_loss, dev_loss = 0.0, 0.0
        train_kappa, dev_kappa = 0.0, 0.0
        n, m = 0, 0
        for seqs, seqs_len, labels in train_loader:
            seqs = seqs.to(device)
            seqs_len = seqs_len.to(device)
            labels = labels.to(device)
            y = net(seqs, seqs_len)
            loss = loss_func(y, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * seqs.shape[0]
            train_kappa += metrics.kappa(y.cpu().detach().numpy(),
                                         labels.cpu().detach().numpy(),
                                         'quadratic') * seqs.shape[0]
            n += seqs.shape[0]
        train_loss /= n
        train_kappa /= n
        net.eval()
        with torch.no_grad():
            for seqs, seqs_len, labels in dev_loader:
                seqs = seqs.to(device)
                seqs_len = seqs_len.to(device)
                labels = labels.to(device)
                y = net(seqs, seqs_len)
                loss = loss_func(y, labels)
                dev_loss += loss.item() * labels.shape[0]
                dev_kappa += metrics.kappa(y.cpu().detach().numpy(),
                                           labels.cpu().detach().numpy(),
                                           'quadratic') * seqs.shape[0]
                m += labels.shape[0]
        net.train()
        dev_loss /= m
        dev_kappa /= m
        end = time.time()
        runtime = end - start
        print('epoch %d, train loss: %.4f, train kappa: %.4f, '
              'dev loss: %.4f, dev kappa: %.4f, time: %.2f'
              % (epoch, train_loss, train_kappa, dev_loss, dev_kappa, runtime))
def begin_testing(filename, classifier):
    print("\nLoading the test data...")
    test_docs = docR.get_list(filename)
    data = []
    target = []
    for doc in test_docs:
        # the last element of each document vector is the gold score
        data.append(doc.vector[:-1])
        target.append(doc.vector[-1])
    np_data = np.array(data)
    np_target = np.array(target)
    results = classifier.predict(np_data)
    # quadratic weighting, matching the metric reported below
    kp = kappa(np_target, results, weights='quadratic')
    print("\nThe Average Quadratic Weighted Kappa obtained is: ", kp, "\n")
    print("=" * 50)
def evaluate_kappa(opts, model, data_iter, batch_size):
    model.eval()
    predicted_labels = []
    true_labels = []
    hidden = model.init_hidden(batch_size)
    for i in range(len(data_iter)):
        vectors, labels = get_batch(data_iter[i])
        # stack the list of tensors into one batch in (seq_len, batch) order
        vectors = torch.stack(vectors).squeeze()
        vectors = vectors.transpose(1, 0)
        if opts.use_cuda:
            vectors = vectors.cuda()
        vectors = Variable(vectors)
        # detach the hidden state so evaluation batches stay independent
        hidden = repackage_hidden(hidden)
        output, hidden = model(vectors, hidden)
        # round the regression outputs to integer score labels
        predicted_labels.extend(
            int(round(float(num))) for num in output.data.cpu().numpy())
        true_labels.extend(int(label[0]) for label in labels)
    return kappa(true_labels, predicted_labels, weights="quadratic")
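# `repackage_hidden` above is not shown; it is assumed to follow the PyTorch
# word-language-model example, where it detaches hidden states from the
# autograd graph so backpropagation cannot reach across batches. A minimal
# version under that assumption:
import torch


def repackage_hidden(h):
    # Detach a tensor, or recurse into the (h, c) tuple of an LSTM.
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)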
def evaluating(self, model, dataset, split):
    """
    input:
        model: (object) pytorch model
        dataset: (object) dataset
        split: (str) split of dataset in ['train', 'val', 'test']
    return [overall_accuracy, precision, recall, f1-score, jaccard, kappa]
    """
    args = self.args
    oa, precision, recall, f1, jac, kappa = 0, 0, 0, 0, 0, 0
    model.eval()
    data_loader = DataLoader(dataset, args.batch_size,
                             num_workers=4, shuffle=False)
    batch_iterator = iter(data_loader)
    steps = len(dataset) // args.batch_size
    start = time.time()
    for step in range(steps):
        x, y = next(batch_iterator)
        x = Variable(x, volatile=True)
        y = Variable(y, volatile=True)
        if args.cuda:
            x = x.cuda()
            y = y.cuda()
        # calculate pixel accuracy of generator
        gen_y = model(x)
        if self.is_multi:
            gen_y = gen_y[0]
        oa += metrics.overall_accuracy(gen_y.data, y.data)
        precision += metrics.precision(gen_y.data, y.data)
        recall += metrics.recall(gen_y.data, y.data)
        f1 += metrics.f1_score(gen_y.data, y.data)
        jac += metrics.jaccard(gen_y.data, y.data)
        kappa += metrics.kappa(gen_y.data, y.data)
    _time = time.time() - start
    if not os.path.exists(os.path.join(Logs_DIR, 'statistic')):
        os.makedirs(os.path.join(Logs_DIR, 'statistic'))
    # recording performance of the model
    nb_samples = steps * args.batch_size
    basic_info = [self.date, self.method, self.epoch, self.iter,
                  nb_samples, _time]
    basic_info_names = ['date', 'method', 'epochs', 'iters',
                        'nb_samples', 'time(sec)']
    perform = [round(v / steps, 3)
               for v in [oa, precision, recall, f1, jac, kappa]]
    perform_names = ["overall_accuracy", "precision", "recall",
                     "f1-score", "jaccard", "kappa"]
    cur_log = pd.DataFrame([basic_info + perform],
                           columns=basic_info_names + perform_names)
    # save performance
    log_path = os.path.join(Logs_DIR, 'statistic', "{}.csv".format(split))
    if os.path.exists(log_path):
        logs = pd.read_csv(log_path)
    else:
        logs = pd.DataFrame([])
    logs = logs.append(cur_log, ignore_index=True)
    logs.to_csv(log_path, index=False, float_format='%.3f')
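# `evaluating` uses DataFrame.append, which was removed in pandas 2.0. On
# newer pandas the same row-append can be written with pd.concat (sketch with
# a hypothetical log row):
import pandas as pd

logs = pd.DataFrame([])
cur_log = pd.DataFrame([[0.912]], columns=['kappa'])  # hypothetical row
logs = pd.concat([logs, cur_log], ignore_index=True)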
def go():
    res = []
    ds = load_dataset()
    pd_train, pd_solut = ds['train'], ds['solut']

    first = False
    if first:
        # traditional featvecs for each text
        pd_train = abstract_featdict(pd_train)
        pd_solut = abstract_featdict(pd_solut)
        # wv for each text
        TEXT_DIM = 300
        wv = wv_pre_model(ds, wv_dim=TEXT_DIM)
        for df in [pd_train, pd_solut]:
            df['wv'] = df.apply(
                lambda e: text_to_fv_avg(wv, e['BOW'], TEXT_DIM), axis=1)
        # save data
        save_dataset(ds)

    # featvec for each text
    FV_NAMES = [
        'wv',  # here!!
        # 'len',
        # 'tok',
        # 'tok_wrong',
        # 'tok_uniq',
        # 'tok_long',
        # 'tok_stop',
        # 'sent',
        # 'sent_complex_max',
        # 'sent_len_mean',
        # 'sent_long',
        # 'noun',
        # 'propn',
        # 'adj',
        # 'pron',
        # 'verb',
        # 'adv',
        # 'cconj',
        # 'det',
        # 'part',
        # 'punct',
        # 'comma',
    ]
    df = pd_train
    for col in FV_NAMES:
        print('%s - mean: %.2f var: %.2f'
              % (col, df[col].mean(), df[col].var()))

    # DNN model
    inshape = (len(FV_NAMES),)
    model = Sequential([
        # LSTM(32, input_shape=(nb_time_steps, nb_input_vector)),
        Dense(32, activation='relu', kernel_initializer='he_normal',
              input_shape=inshape),
        Dropout(0.2),
        # Dense(8, activation='relu', kernel_initializer='he_normal'),
        # Dropout(dropout),
        Dense(1),
    ])
    model.build()
    model.summary()
    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None,
                decay=0.0, amsgrad=False)
    model.compile(optimizer=adam, loss='mse', metrics=['mse', 'mae'])

    for topic, df in pd_train.groupby('topic'):
        print('[Topic] working on topic %r' % topic)
        X, y = df[FV_NAMES], df['score'].astype(np.float64)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42)
        model.fit(X_train, y_train)
        # note: cross_val_score expects a scikit-learn estimator; a bare
        # Keras model needs a KerasRegressor-style wrapper to be used here
        scores = cross_val_score(model, X_test, y_test, cv=3)
        print("Accuracy: %0.2f (+/- %0.2f)"
              % (scores.mean(), scores.std() * 2))
        y_pred = model.predict(X_test)
        kp = kappa(y_pred, y_test, weights='quadratic')
        print('kappa: %r, topic %r' % (kp, topic))

        # applicational solution
        df = pd_solut[pd_solut.topic == topic]
        X = df[FV_NAMES]
        y_pred = model.predict(X)

        def trim_range(topic, score):
            # clamp a rounded prediction to the prompt's valid score range
            RANGE = {
                1: (0, 15), 2: (0, 8), 3: (0, 5), 4: (0, 5),
                5: (0, 5), 6: (0, 5), 7: (0, 15), 8: (0, 70),
            }
            score = int(round(score))
            rng = RANGE[topic]
            if score < rng[0]:
                score = rng[0]
            elif score > rng[1]:
                score = rng[1]
            return score

        ids = list(df['id'])  # reindex column 'id'
        for i in range(len(y_pred)):
            id = ids[i]
            score = trim_range(topic, y_pred[i])
            res.append([id, topic, score])
    return res
def main():
    results = []
    cross_validate_list = []
    for i in range(1, 9):
        _feature = feature
        # dev_x, dev_y = regression.generate_model_input(dev_dataset.data[str(i)][:], _feature)
        # x, y = regression.generate_model_input(train_dataset.data[str(i-1)] + dev_dataset.data[str(i-1)][:-100], _feature)
        # x, y = regression.generate_model_input(train_dataset.data[str(i + 1)], _feature)
        # test_x = regression.generate_model_test_input(test_dataset.data[str(i)], _feature)
        x_s = []
        y_s = []
        print('used_set', used_set[i - 1])
        # pool training data from the essay sets chosen for set i
        for j in used_set[i - 1]:
            x, y = regression.generate_model_input_for_ranges(
                train_dataset.data[str(j)], _feature, score_ranges[j - 1])
            standerizer = StandardScaler().fit(x)
            x = standerizer.transform(x)
            x_s.append(x)
            y_s.append(y)
        x = np.concatenate(x_s, axis=0)
        y = np.concatenate(y_s, axis=0)
        # fit the dev/test scaler on all of set i (train + dev + test)
        train_dev_test_x, _ = regression.generate_model_input_for_ranges(
            train_dataset.data[str(i)] + dev_dataset.data[str(i)]
            + test_dataset.data[str(i)], _feature, score_ranges[i - 1])
        dev_test_standerizer = StandardScaler().fit(train_dev_test_x)
        dev_x, dev_y = regression.generate_model_input_for_ranges(
            dev_dataset.data[str(i)], _feature, score_ranges[i - 1])
        test_x = regression.generate_model_test_input(
            test_dataset.data[str(i)], _feature)
        dev_x = dev_test_standerizer.transform(dev_x)
        test_x = dev_test_standerizer.transform(test_x)

        kf = KFold(n_splits=9)
        dev_predict_list = []
        model_list = []
        test_list = []
        dev_result_list = []
        test_result_list = []
        for train_index, test_index in kf.split(x, y):
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            standerizer = StandardScaler().fit(x_train)
            _x_train = standerizer.transform(x_train)
            _x_test = standerizer.transform(x_test)
            _dev_x = standerizer.transform(dev_x)
            _test_x = standerizer.transform(test_x)
            # regressor = regression.svr_regressor(_x_train, y_train)
            regressor = regression.gradient_boosting_regressor(
                _x_train, y_train,
                num_estimators=int(_x_train.shape[0] / 20) + model_count[i - 1])
            model_list.append(regressor)
            predict_dev_y = regressor.predict(_dev_x)
            # map normalized scores back to the set's score range
            _dev_y = [predict * (score_ranges[i - 1][1] - score_ranges[i - 1][0])
                      + score_ranges[i - 1][0] for predict in dev_y]
            _predict_dev_y = [predict * (score_ranges[i - 1][1] - score_ranges[i - 1][0])
                              + score_ranges[i - 1][0] for predict in predict_dev_y]
            dev_result = metrics.kappa(y_true=_dev_y, y_pred=_predict_dev_y,
                                       weights='quadratic')
            dev_result_list.append(dev_result)
            dev_predict_list.append(_predict_dev_y)
            # y_test_predict = regressor.predict(_x_test)
            # test_result = metrics.kappa(y_true=y_test, y_pred=y_test_predict,
            #                             weights='quadratic')
            # test_result_list.append(test_result)
            # keep each fold's raw test predictions for averaging below
            test_predict_y = regressor.predict(_test_x)
            test_list.append(test_predict_y)

        # average the per-fold dev predictions and score them
        dev_predict = [sum(p) / len(dev_predict_list)
                       for p in zip(*dev_predict_list)]
        # dev_predict = predict_bag(dev_predict_list)
        dev_predict = np.around(dev_predict)
        dev_y = [y * (score_ranges[i - 1][1] - score_ranges[i - 1][0])
                 + score_ranges[i - 1][0] for y in dev_y]
        dev_result = metrics.kappa(y_true=dev_y, y_pred=dev_predict,
                                   weights='quadratic')
        print('dev ', i, ' ', dev_result)
        results.append(dev_result)

        # average the per-fold test predictions, then rescale
        # test_predict_y = predict_bag(test_list)
        test_predict_y = [sum(p) / len(test_list) for p in zip(*test_list)]
        test_predict_y = [predict * (score_ranges[i - 1][1] - score_ranges[i - 1][0])
                          + score_ranges[i - 1][0] for predict in test_predict_y]
        for idx, sample in enumerate(test_dataset.data[str(i)]):
            sample['domain1_score'] = int(test_predict_y[idx])
        all_test_samples.extend(test_dataset.data[str(i)])
        save_to_tsv(test_dataset.data[str(i)], '../' + str(i) + '.tsv')
    print(np.average(results))
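# The rescaling `p * (hi - lo) + lo`, which maps normalized predictions back
# to a prompt's score range, recurs in several snippets above and below. A
# small helper (hypothetical -- not part of the original code) makes the
# intent explicit:
def denormalize(preds, score_range):
    lo, hi = score_range
    return [p * (hi - lo) + lo for p in preds]


# e.g. denormalize([0.0, 0.5, 1.0], (2, 12)) -> [2.0, 7.0, 12.0]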
# fragment: train_data/test_data and the *_label/*_kappa/essay_id lists come
# from the surrounding (unshown) per-set loop
train_x = train_data[[
    'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11',
    'x12', 'x13', 'x14', 'x15', 'x16'
]]
train_y = train_data['score']
test_id = test_data['id'].values.tolist()
test_x = test_data[[
    'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11',
    'x12', 'x13', 'x14', 'x15', 'x16'
]]
test_y = test_data['score']

from sklearn.svm import SVC
svm = SVC(gamma='scale')
svm.fit(train_x.values, train_y.values)
pred_label = svm.predict(test_x.values)
acc = metrics.kappa(test_y.values, pred_label, 'quadratic')
svm_label.append(pred_label.tolist())
svm_kappa.append(acc)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0, solver='lbfgs',
                        multi_class='multinomial', penalty='l2')
lr.fit(train_x.values, train_y.values)
pred_label = lr.predict(test_x.values).round()
acc = metrics.kappa(test_y.values, pred_label, 'quadratic')
lr_label.append(pred_label.tolist())
lr_kappa.append(acc)
essay_id.append(test_id)
def go():
    res = []
    ds = load_dataset()
    pd_train = ds['train']
    pd_solut = ds['solut']
    abstract_featdict(pd_train)
    abstract_featdict(pd_solut)
    # save_dataset(ds)  # SAVE MODEL

    FV_NAMES = [
        'len',
        'tok',
        'tok_wrong',
        'tok_uniq',
        # 'tok_long',
        'tok_stop',
        'sent',
        # 'sent_complex_max',
        'sent_len_mean',
        # 'sent_long',
        'noun',
        # 'propn',
        'adj',
        # 'pron',
        'verb',
        'adv',
        # 'cconj',
        # 'det',
        # 'part',
        'punct',
        'comma',
    ]
    df = ds['train']
    for col in FV_NAMES:
        print('%s - mean: %.2f var: %.2f'
              % (col, df[col].mean(), df[col].var()))

    for topic, df in pd_train.groupby('topic'):
        print('[Topic] working on topic %r' % topic)
        X, y = df[FV_NAMES], df['score'].astype(np.float64)
        # train
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42)
        # note: `iid` and `normalize` target older scikit-learn releases
        # (`iid` was removed in 0.24, LinearRegression's `normalize` in 1.2)
        MODELS = [
            [('linr', LinearRegression(normalize=True, copy_X=False, n_jobs=16))],
            # [('std', StandardScaler()),
            #  ('gs:svr', GridSearchCV(SVR(kernel='linear'), iid=True,
            #                          param_grid={'C': [0.75, 1.5, 3, 5]},
            #                          cv=3, n_jobs=16))],
            # [('std', StandardScaler()),
            #  ('gs:lsvc', GridSearchCV(LinearSVC(), iid=True,
            #                           param_grid={'C': [0.75, 1.5, 3, 5]},
            #                           cv=3, n_jobs=16))],
            [('gs:knn', GridSearchCV(KNeighborsRegressor(weights="distance"),
                                     iid=True,
                                     param_grid={'n_neighbors': [12, 16, 24, 32, 36]},
                                     cv=3, n_jobs=16))],
            [('gs:dt', GridSearchCV(DecisionTreeRegressor(), iid=True,
                                    param_grid={'max_depth': [6, 12, 18, 24, 30]},
                                    cv=3, n_jobs=16))],
            [('gs:et', GridSearchCV(ExtraTreesRegressor(), iid=True,
                                    param_grid={'n_estimators': [32, 64, 96, 128]},
                                    cv=3, n_jobs=16))],
            [('gs:gb', GridSearchCV(GradientBoostingRegressor(), iid=True,
                                    param_grid={'n_estimators': [32, 64, 96, 128]},
                                    cv=3, n_jobs=16))],
            [('gs:rf', GridSearchCV(RandomForestRegressor(), iid=True,
                                    param_grid={'n_estimators': [32, 64, 96, 128]},
                                    cv=3, n_jobs=16))],
            [('gs:etc', GridSearchCV(ExtraTreesClassifier(), iid=True,
                                     param_grid={'n_estimators': [32, 64, 96, 128]},
                                     cv=3, n_jobs=16))],
            [('gs:en', GridSearchCV(ElasticNet(), iid=True,
                                    param_grid={'l1_ratio': [0.01, 0.1, 0.5, 0.9],
                                                'alpha': [0.01, 0.1, 1]},
                                    cv=3, n_jobs=16))],
        ]
        kp_mdl = []
        for pl in MODELS:
            model = Pipeline(pl)
            model.fit(X_train, y_train)
            scores = cross_val_score(model, X_test, y_test, cv=3)
            print("Accuracy: %0.2f (+/- %0.2f)"
                  % (scores.mean(), scores.std() * 2))
            y_pred = model.predict(X_test)
            kp = kappa(y_pred, y_test, weights='quadratic')
            print('kappa: %r, using %r' % (kp, [m[0] for m in pl]))
            kp_mdl.append((kp, model))
        # select the N_MODELS best models by kappa and average their predictions
        N_MODELS = 3
        kp_mdl.sort(key=lambda t: t[0], reverse=True)
        print('kappas: %r' % [k for k, _ in kp_mdl])
        models = [m for _, m in kp_mdl[:N_MODELS]]

        # applicational solution
        df = pd_solut[pd_solut.topic == topic]
        X = df[FV_NAMES]
        y_preds = [model.predict(X) for model in models]
        y_pred = [sum(y_preds[i][j] for i in range(N_MODELS)) / N_MODELS
                  for j in range(len(X))]

        def trim_range(topic, score):
            # clamp a rounded prediction to the prompt's valid score range
            RANGE = {
                1: (0, 15), 2: (0, 8), 3: (0, 5), 4: (0, 5),
                5: (0, 5), 6: (0, 5), 7: (0, 15), 8: (0, 70),
            }
            score = int(round(score))
            rng = RANGE[topic]
            if score < rng[0]:
                score = rng[0]
            elif score > rng[1]:
                score = rng[1]
            return score

        ids = list(df['id'])  # reindex column 'id'
        for i in range(len(y_pred)):
            id = ids[i]
            score = trim_range(topic, y_pred[i])
            res.append([id, topic, score])
    return res
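# The hand-rolled top-N averaging in go() is equivalent to stacking the
# per-model predictions and taking the element-wise mean along axis 0
# (sketch with hypothetical predictions):
import numpy as np

y_preds = [np.array([3.2, 4.1]), np.array([2.8, 4.5]), np.array([3.0, 4.0])]
y_pred = np.mean(np.stack(y_preds), axis=0)
print(y_pred)  # -> [3.  4.2]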
# fragment: evaluates one candidate feature set on the dev split of set i;
# all_selected, current_results, results, i, x, y, and dev_y come from the
# surrounding (unshown) feature-selection loop
dev_x = dev_test_standerizer.transform(dev_x)
test_x = dev_test_standerizer.transform(test_x)
regressor = regression.gradient_boosting_regressor(x, y)
# num_estimators=int(x.shape[0] / 20) + model_count[i - 1])
predict_dev_y = regressor.predict(dev_x)
# map normalized predictions back to the set's score range
predict_dev_y = [predict * (score_ranges[i - 1][1] - score_ranges[i - 1][0])
                 + score_ranges[i - 1][0] for predict in predict_dev_y]
# dev_predict = np.around(dev_predict)
dev_y = [y * (score_ranges[i - 1][1] - score_ranges[i - 1][0])
         + score_ranges[i - 1][0] for y in dev_y]
dev_result = metrics.kappa(y_true=dev_y, y_pred=predict_dev_y,
                           weights='quadratic')
current_results.append(dev_result)

# keep the best-scoring feature set for this essay set
print(max(current_results))
index = current_results.index(max(current_results))
print(all_selected[index])
results.append(max(current_results))

print(np.average(results))