def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("serialization_dir", help="save directory to load model")
    parser.add_argument("file", help="file to evaluate on")
    args = parser.parse_args()

    # load the trained model and switch to evaluation mode
    model = torch.load(f"{args.serialization_dir}/model.pt").eval()

    # build the evaluation dataset, reusing the vocabularies stored on the model
    dataset = TwitterDataset(args.file)
    dataset.set_vocab(model.token_vocab, model.tag_vocab)
    dataloader = torch.utils.data.DataLoader(dataset, 1)

    # run every batch through the model so it accumulates its metrics
    for batch in tqdm(dataloader):
        _ = model(**batch)

    print(json.dumps(model.get_metrics(), indent=4))
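# The evaluation snippet above assumes the rest of the script supplies its imports and
# an entry point; a minimal sketch of that scaffolding (the module providing
# TwitterDataset is an assumption, not shown in the original):
import argparse
import json

import torch
from tqdm import tqdm

from dataset import TwitterDataset  # hypothetical project module

if __name__ == "__main__":
    main()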
def load_datasets(train_dataset_params: dict, validation_dataset_params: dict):
    # load PyTorch ``Dataset`` objects for the train & validation sets
    train_dataset = TwitterDataset(**train_dataset_params)
    validation_dataset = TwitterDataset(**validation_dataset_params)

    # use tokens and tags in the training set to create `Vocabulary` objects
    token_vocab = Vocabulary(train_dataset.get_tokens_list(), add_unk_token=True)
    tag_vocab = Vocabulary(train_dataset.get_tags_list())

    # add `Vocabulary` objects to datasets for tokens/tags to ID mapping
    train_dataset.set_vocab(token_vocab, tag_vocab)
    validation_dataset.set_vocab(token_vocab, tag_vocab)

    return train_dataset, validation_dataset
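# Hypothetical call to load_datasets. The "file" keyword is an assumption based on the
# evaluation script above, which builds a TwitterDataset from a single file path; the
# actual constructor arguments may differ.
train_dataset, validation_dataset = load_datasets(
    train_dataset_params={"file": "data/train.txt"},
    validation_dataset_params={"file": "data/validation.txt"},
)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)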
#model_filename = "./model/semi_no_76.084.model"
#model = torch.load(model_filename)

# split the labeled training set into 90% train and 10% validation
X_train, X_val, y_train, y_val = train_x[:190000], train_x[190000:], y[:190000], y[190000:]

# split into 90% train and 10% validation on both the labeled and pseudo-labeled training data
#X_train, X_val, y_train, y_val = train_x[20000:], train_x[:20000], y[20000:], y[:20000]

# split into 90% train and 10% validation on both the labeled and pseudo-labeled training data
#X_train, y_train = train_x[:20000], y[:20000]
#X_val, y_val = train_x[190000:200000], y[190000:200000]
#X_train = torch.cat((X_train, train_x[200000:]), 0)
#y_train = torch.cat((y_train, y[200000:]), 0)

# wrap the tensors in Dataset objects
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)

# wrap the Datasets in DataLoaders
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=8)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=8)

# training
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model,
# loading data
print("loading testing data ...")
test_x = load_testing_data(testing_filename)

# parameters
sen_len = 32
batch_size = 16

# preprocess the test sentences into word-index sequences via the word2vec vocabulary
preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_model_filename)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()

# wrap in a Dataset (no labels at test time)
test_dataset = TwitterDataset(X=test_x, y=None)

# wrap in a DataLoader
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          num_workers=8)

# testing
print('\nload model ...')
model = torch.load(os.path.join(model_dir, 'last_semi_82.38.model'))
outputs = testing(batch_size, test_loader, model, device)

# save result to csv
tmp = pd.DataFrame({
    "id": [str(i) for i in range(len(test_x))],
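# TwitterDataset itself is not shown in these snippets. A minimal sketch of the kind of
# tensor-wrapping Dataset the training/testing code above appears to assume (attribute
# names are illustrative):
class TwitterDataset(torch.utils.data.Dataset):
    def __init__(self, X, y=None):
        self.data = X
        self.label = y

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # return only features at test time (y=None), features and label otherwise
        if self.label is None:
            return self.data[idx]
        return self.data[idx], self.label[idx]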
from utils import calc_acc

nb_classes = 2      # (Positive, Negative)
nb_words = 10000    # Size of our bag of words
load_cached = True  # If False, regenerates features from the original dataset

# Load the dataset
if load_cached:
    with open('dataset.pkl', 'rb') as pkl:
        print('Loading a cached dataset')
        dataset = pickle.load(pkl)
        print('Done loading')
else:
    with open('dataset.pkl', 'wb') as pkl:
        print('Preparing a new dataset')
        dataset = TwitterDataset('data/train.csv', nb_words, 0.1, 0.1)
        print('Done preparing the dataset, serializing')
        pickle.dump(dataset, pkl, pickle.HIGHEST_PROTOCOL)
        print('Done serializing')

# Fit several models with varying pseudocount parameter
models = dict()
for pseudocount in range(1, 30):
    # Fit the model
    print('Fitting a model with pseudocount={}'.format(pseudocount))
    model = MultinomialNaiveBayes(nb_classes, nb_words, pseudocount)
    model.fit(dataset.train)

    # Evaluate on train set
    preds_train = model.predict(dataset.train['x'])
    acc_train = calc_acc(dataset.train['y'], preds_train)
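# calc_acc is imported from utils but not shown here; a minimal sketch, assuming it is a
# plain accuracy over arrays of predicted and true labels:
import numpy as np

def calc_acc(y_true, y_pred):
    # fraction of predictions that exactly match the ground-truth labels
    return float(np.mean(np.asarray(y_true) == np.asarray(y_pred)))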
# build bag-of-words features: count each word's occurrences per sentence
for i in range(len(train_x1)):
    for w in train_x1[i]:
        x[i][word_index[w]] += 1
print(x.size())

print("\nConstructing model...", flush=True)
model = DNN(x.size(1)).to(device)
total_param = sum(p.numel() for p in model.parameters())
trainable_param = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("{} parameters with {} trainable".format(total_param, trainable_param), flush=True)

print("\nStart training...", flush=True)
train_dataset1 = TwitterDataset(x, train_y1)
train_loader1 = torch.utils.data.DataLoader(dataset=train_dataset1,
                                            batch_size=BATCH,
                                            shuffle=True,
                                            num_workers=4)
train_model(train_loader1, model, device, LR)

print("\nStart testing...", flush=True)
test_x = [
    "today is a good day , but it is hot",
    "today is hot , but it is a good day"
]
test_x = [i.split() for i in test_x]
x = torch.zeros(2, len(word_index))
for i in range(2):
    for w in test_x[i]:
                                                   shuffle=True)
print("{} initial training data and {} validation data".format(
    len(train_x1), len(val_x)), flush=True)

print("\nConstructing model...", flush=True)
model = LSTM_Net(embedding).to(device)
total_param = sum(p.numel() for p in model.parameters())
trainable_param = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("{} parameters with {} trainable".format(total_param, trainable_param), flush=True)

print("\nStart training...", flush=True)
val_dataset = TwitterDataset(val_x, val_y)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=96,
                                         shuffle=False,
                                         num_workers=8)

# semi-supervised self-training: re-initialize and re-train the model each iteration
for it in range(SEMI_ITER):
    print("\n\n=================Iter {}/{}======================".format(
        it + 1, SEMI_ITER), flush=True)
    train_dataset1 = TwitterDataset(train_x1, train_y1)
    train_loader1 = torch.utils.data.DataLoader(dataset=train_dataset1,
                                                batch_size=BATCH,
                                                shuffle=True,
                                                num_workers=8)
    model.apply(weight_init)
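# Between self-training iterations, a loop like the following typically relabels
# unlabeled data with the current model. This is a hedged sketch only: unlabeled_loader,
# the 0.9/0.1 confidence thresholds, and the sigmoid output shape are assumptions, not
# taken from the original code.
model.eval()
pseudo_x, pseudo_y = [], []
with torch.no_grad():
    for inputs in unlabeled_loader:
        probs = torch.sigmoid(model(inputs.to(device))).squeeze(-1).cpu()
        confident = (probs > 0.9) | (probs < 0.1)   # keep only high-confidence predictions
        pseudo_x.append(inputs[confident])
        pseudo_y.append((probs[confident] > 0.5).float())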