import time

import numpy as np

import predict


def get_prediction(file_path):
    # predictions = np.loadtxt("prob_result.csv")
    predictions = predict.get_predictions(file_path)
    labels = predict.get_labels()
    print(predictions.shape)
    # Append a cache-busting query string so the client reloads the video.
    file_path = file_path + "?cachebuster=%s" % time.time()
    result = {
        "video": {
            "url": "%s" % file_path,
            "framerate": 25
        },
        "frames": []
    }
    for index, row in enumerate(predictions):
        pred_per_label = []
        # Indices of the five highest-probability labels for this frame.
        five_best = np.argpartition(row, -5)[-5:]
        for i in five_best:
            pred_per_label.append({"label": labels[i], "prob": row[i]})
        new_frame = {"frameNumber": index, "predictions": pred_per_label}
        result["frames"].append(new_frame)
    return result
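# A usage sketch for get_prediction above; the video path is illustrative,
# not from the original project. The result maps each frame index to its
# five highest-probability labels.
result = get_prediction("static/videos/clip.mp4")
first_frame = result["frames"][0]
print(first_frame["frameNumber"], first_frame["predictions"])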
import pandas as pd
from dash.exceptions import PreventUpdate

import predict

# `client`, `features`, and `table` are module-level objects in the
# original app.


def update_table(n_clicks):
    if n_clicks == 0:
        raise PreventUpdate
    predictions = predict.get_predictions(client=client, features_df=features)
    results_table = pd.merge(predictions, table, on='author')
    results_table = results_table[[
        'Default Likelihood_x', 'author', 'title', 'full_link', 'created_utc'
    ]]
    columns = [{
        "name": 'Default Likelihood',
        "id": 'Default Likelihood_x'
    }, {
        "name": 'Borrower',
        "id": 'author'
    }, {
        "name": 'Post Title',
        "id": 'title'
    }, {
        "name": 'Link',
        "id": 'full_link',
        'presentation': 'markdown'
    }, {
        "name": 'Create Date UTC',
        "id": 'created_utc'
    }]
    return results_table.to_dict('records'), columns
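# A minimal sketch of how update_table might be wired up as a Dash
# callback; the component ids ('results-table', 'update-button') are
# assumptions, not taken from the original app.
from dash import Dash, Input, Output, dash_table, html

app = Dash(__name__)
app.layout = html.Div([
    html.Button('Update', id='update-button', n_clicks=0),
    dash_table.DataTable(id='results-table'),
])
app.callback(
    Output('results-table', 'data'),
    Output('results-table', 'columns'),
    Input('update-button', 'n_clicks'),
)(update_table)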
import torch

import predict


def eval_epoch(model, data_loader, device):
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for data in data_loader:
            tokens_tensors, segments_tensors, masks_tensors, labels = [
                t.to(device) for t in data
            ]
            # Forward pass; outputs: (loss), (logits), (hidden_states), (attentions)
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors,
                            labels=labels)
            loss = outputs[0]
            val_loss += loss.item()
    # Compute accuracy and average loss
    _, val_acc = predict.get_predictions(model, data_loader, device,
                                         compute_acc=True)
    val_avg_loss = val_loss / len(data_loader)
    return val_acc, val_avg_loss
from torch import nn
from torch.optim import AdamW

import predict


def train_epoch(model, data_loader, device):
    model.train()
    # optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)
    train_loss = 0.0
    # NOTE: the optimizer is re-created on every call, so its internal
    # state (moment estimates) resets at each epoch.
    optimizer = AdamW(model.parameters(), lr=1e-5)
    for data in data_loader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [
            t.to(device) for t in data
        ]
        optimizer.zero_grad()
        # Forward pass; outputs: (loss), (logits), (hidden_states), (attentions)
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)
        loss = outputs[0]
        train_loss += loss.item()
        loss.backward()
        # Clip gradients to stabilize training.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
    # Compute accuracy and average loss
    _, train_acc = predict.get_predictions(model, data_loader, device,
                                           compute_acc=True)
    train_avg_loss = train_loss / len(data_loader)
    return train_acc, train_avg_loss
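# A minimal sketch of driving train_epoch and eval_epoch together;
# NUM_EPOCHS, train_loader, and val_loader are illustrative names, not
# from the original project.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
NUM_EPOCHS = 3  # assumption
for epoch in range(NUM_EPOCHS):
    train_acc, train_loss = train_epoch(model, train_loader, device)
    val_acc, val_loss = eval_epoch(model, val_loader, device)
    print(f"epoch {epoch + 1}: train acc {train_acc:.3f} loss {train_loss:.4f}"
          f" | val acc {val_acc:.3f} loss {val_loss:.4f}")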
import os
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# The load_* helpers and get_predictions are project-local functions.


def main():
    model = load_cnn_imdb()
    labels = load_cnn_imdb_labels()
    word_index = load_imdb_word_index()
    features = load_imdb_keywords()
    m = [model, labels, word_index]
    for filename in os.listdir('reviews/'):
        filename = filename.split('.')[0]
        with open("reviews/" + filename + ".txt", "r",
                  encoding="utf8") as news_file:
            text = news_file.read().replace('\n', '')
        text = re.sub(r'[^\w\s]', '', text)
        tokens = word_tokenize(text)
        filtered_tokens = [
            word.lower() for word in tokens
            if word not in stopwords.words('english')
        ]
        lmtzr = WordNetLemmatizer()
        lems = [lmtzr.lemmatize(t) for t in filtered_tokens]
        # Keep only lemmas that appear in the keyword feature set.
        instance_features = []
        for l in lems:
            for word in features.keys():
                if l == word and l not in instance_features:
                    instance_features.append(word)
        # For each feature, build a perturbed copy of the review with
        # every occurrence of that token removed.
        perturbated = {}
        for f in instance_features:
            p = []
            for t in tokens:
                if t != f:
                    p.append(t)
            perturbated[f] = " ".join(p)
        original_text = " ".join(tokens)
        with open("results/perturbated_" + filename + ".txt", "w") as out_file:
            out_file.write(text + "\n\n")
            for p in perturbated:
                out_file.write(p + "\n" + perturbated[p] + "\n\n")
        get_predictions(original_text, perturbated, filename, m)
from pathlib import Path

from allennlp.common import Params
from allennlp.data import DatasetReader
from allennlp.models import Model

# UniversalDependenciesCharacterLevelPredictor, get_predictions, and
# save_predictions_to_conllu are project-local.


def main():
    # Run each of the ten trained models on its matching test split.
    for i in range(10):
        model_path = f'../models/chukchi/chukchi_{i}'
        test_path = f'../data/Chukchi/flat_test/ckt_hse-ud-test.0{i}.conllu'
        savepath = f'../results/chukchi/test_result_{i}.conllu'
        config = Params.from_file(Path(model_path, 'config.json'))
        dataset_reader = DatasetReader.from_params(config['dataset_reader'])
        model = Model.load(config, serialization_dir=model_path,
                           cuda_device=-1)
        model.eval()
        predictor = UniversalDependenciesCharacterLevelPredictor(
            model=model, dataset_reader=dataset_reader)
        predictions = get_predictions(test_path, dataset_reader, predictor)
        save_predictions_to_conllu(savepath, predictions)
import hashlib
import json
import time

import pandas as pd
from flask import request

import predict

# Flask route handler; `col` (a MongoDB collection) and `model` are
# module-level objects in the original service.


def score():
    jsons = json.dumps(request.json, sort_keys=True, indent=4,
                       separators=(',', ':'))
    # Hash the canonicalized JSON so each datapoint gets a stable id.
    m = hashlib.sha224(jsons.encode('utf-8')).hexdigest()
    if col.find_one({'_id': m}) is None:
        print("New Point")
        timepoint = time.time()
        datapoint_dict = json.loads(jsons)
        df = pd.DataFrame.from_dict(datapoint_dict, orient='index').T
        df = predict.clean_json(df)
        predictions = predict.get_predictions(df, model)
        datapoint_dict['predicts'] = tuple(predictions[0])
        datapoint_dict['time'] = timepoint
        datapoint_json = json.dumps(datapoint_dict)
        col.insert_one({'_id': m, 'data': datapoint_json})
    else:
        print("Seen this point")
    return ""
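# A client-side sketch for exercising score(); the URL and payload fields
# are assumptions. Re-posting an identical payload hashes to the same
# _id, so the route takes the "Seen this point" branch instead of
# re-scoring.
import requests

payload = {"feature_a": 1.0, "feature_b": "north"}  # illustrative features
requests.post("http://localhost:5000/score", json=payload)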
from collections import defaultdict

import pandas as pd
import torch
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from torch import nn
from transformers import AdamW, get_linear_schedule_with_warmup

import config
import data_augmentation
import dataset
import engine
import predict

# roberta_model is a project-local wrapper around the pretrained RoBERTa
# classifier.


def run():
    print("Importing the datasets...\n")
    df = pd.read_csv(config.Training_file)
    # Multi-Genre NLI Corpus
    df_mnli = data_augmentation.load_mnli()
    # Cross-Lingual NLI Corpus
    df_xnli = data_augmentation.load_xnli()
    df.drop(columns=['id'], inplace=True)
    print("Imported...\n")

    print("Processing and doing data augmentation...\n")
    print("Back translation started...\n")
    df = data_augmentation.proc(df)
    print("Back translation done...\n")
    df = pd.concat([df, df_mnli, df_xnli], ignore_index=True)
    # Shuffle the dataframe
    df = resample(df, random_state=config.RANDOM_SEED)
    df['premise'] = df['premise'].astype(str)
    df['hypothesis'] = df['hypothesis'].astype(str)
    df.drop_duplicates(subset=['premise', 'hypothesis'], inplace=True)
    combined_thesis = df[['premise', 'hypothesis']].values.tolist()
    df['combined_thesis'] = combined_thesis

    df_train, df_test = train_test_split(df, test_size=0.3, random_state=0)
    df_test, df_val = train_test_split(df_test, test_size=0.4, random_state=0)
    train_data_loader = dataset.create_data_loader(df_train, config.tokenizer,
                                                   config.max_len,
                                                   config.batch_size)
    test_data_loader = dataset.create_data_loader(df_test, config.tokenizer,
                                                  config.max_len,
                                                  config.batch_size)
    val_data_loader = dataset.create_data_loader(df_val, config.tokenizer,
                                                 config.max_len,
                                                 config.batch_size)
    print("Processing and data augmentation done...\n")

    # Build the three-class RoBERTa classifier and move it to the device.
    device = torch.device(config.DEVICE)
    model = roberta_model(3)
    model = model.to(device)

    # Optimizer, scheduler, and loss_fn used in train_roberta_epoch and
    # eval_roberta.
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_data_loader) * config.EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    loss_fn = nn.CrossEntropyLoss().to(device)
    history = defaultdict(list)
    best_accuracy = 0

    print("Training...\n")
    for epoch in range(config.EPOCHS):
        print('Epoch {} \n'.format(epoch + 1))
        print("-" * 100)
        train_acc, train_loss = engine.train_roberta_epoch(
            model, train_data_loader, loss_fn, optimizer, device, scheduler,
            len(df_train))
        print('train_acc {} train_loss {}'.format(train_acc, train_loss))
        val_acc, val_loss = engine.eval_roberta(model, val_data_loader,
                                                loss_fn, device, len(df_val))
        print('val_acc {} val_loss {}'.format(val_acc, val_loss))
        print()
        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)
        if val_acc > best_accuracy:
            # model.state_dict() holds the model parameters; keep the best
            # checkpoint so far.
            torch.save(model.state_dict(), 'best_model.bin')
            best_accuracy = val_acc
    print("Training completed...")

    print("Testing...\n")
    # Predictions on the held-out test set
    y_thesis, y_pred, y_pred_probs, y_test = predict.get_predictions(
        model, test_data_loader, device)
    # Classification report
    print(classification_report(y_test, y_pred))
    # Confusion matrix, flattened row by row (nine values for three classes)
    ln = confusion_matrix(y_test, y_pred).ravel()
    print("Flattened confusion matrix:", ln)
    print()
    # Accuracy
    print(accuracy_score(y_test, y_pred))
    print()
    print("DONE!!")
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1])
plt.grid()
plt.savefig('acc_history.png')
plt.clf()

plt.plot(history['train_loss'], label='train loss')
plt.plot(history['val_loss'], label='val loss')
plt.title('Loss history')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.grid()
plt.savefig('loss_history.png')

# Inference with the test set
test_set = CovidDataset(df_test, tokenizer=tokenizer)
test_loader = DataLoader(test_set, batch_size=256,
                         collate_fn=batch.create_mini_batch)
predictions = predict.get_predictions(model, test_loader, device)
# FIXME: why does get_predictions return a tuple with two identical
# predicted results?

# Append predictions and write to .csv
df_pred = df_test
df_pred['prediction'] = predictions[0].tolist()
df_pred.to_csv('predict.csv', index=False)