Code Example #1
File: evaluate.py  Project: zydeon/uci-statnlp
import argparse
import json

import torch
from tqdm import tqdm

# TwitterDataset is a project-local class; this import path is an assumption
from dataset import TwitterDataset


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("serialization_dir", help="save directory to load model")
    parser.add_argument("file", help="file to evaluate on")
    args = parser.parse_args()

    model = torch.load(f"{args.serialization_dir}/model.pt").eval()
    dataset = TwitterDataset(args.file)
    dataset.set_vocab(model.token_vocab, model.tag_vocab)
    dataloader = torch.utils.data.DataLoader(dataset, 1)

    for batch in tqdm(dataloader):
        _ = model(**batch)
    print(json.dumps(model.get_metrics(), indent=4))
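
The evaluation loop above assumes the model accumulates metrics internally during forward passes and reports them via get_metrics(); a minimal sketch of that pattern (the class name and tensor shapes are hypothetical, not taken from zydeon/uci-statnlp):

import torch

class TaggerWithMetrics(torch.nn.Module):
    """Hypothetical skeleton of the accumulate-then-report metric pattern."""

    def __init__(self, vocab_size=100, num_tags=10):
        super().__init__()
        self.proj = torch.nn.Linear(vocab_size, num_tags)
        self.correct = 0
        self.total = 0

    def forward(self, tokens, tags):
        # tokens: (batch, seq, vocab_size) floats; tags: (batch, seq) integer ids
        preds = self.proj(tokens).argmax(dim=-1)
        self.correct += (preds == tags).sum().item()
        self.total += tags.numel()
        return preds

    def get_metrics(self):
        return {"accuracy": self.correct / max(self.total, 1)}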
Code Example #2
def load_datasets(train_dataset_params: dict, validation_dataset_params: dict):
    # load PyTorch ``Dataset`` objects for the train & validation sets
    train_dataset = TwitterDataset(**train_dataset_params)
    validation_dataset = TwitterDataset(**validation_dataset_params)

    # use tokens and tags in the training set to create `Vocabulary` objects
    token_vocab = Vocabulary(train_dataset.get_tokens_list(),
                             add_unk_token=True)
    tag_vocab = Vocabulary(train_dataset.get_tags_list())

    # attach the `Vocabulary` objects to the datasets to map tokens/tags to IDs
    train_dataset.set_vocab(token_vocab, tag_vocab)
    validation_dataset.set_vocab(token_vocab, tag_vocab)

    return train_dataset, validation_dataset
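
A minimal usage sketch for load_datasets, assuming TwitterDataset takes a path keyword argument (the real constructor parameters are not shown in this snippet):

train_dataset, validation_dataset = load_datasets(
    train_dataset_params={"path": "data/train.txt"},          # hypothetical keyword
    validation_dataset_params={"path": "data/validation.txt"},
)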
Code Example #3
File: main.py  Project: shannon112/MareepLearning
#model_filename = "./model/semi_no_76.084.model"
#model = torch.load(model_filename)

# split the labeled training set: first 190,000 examples for training, the remainder for validation
X_train, X_val = train_x[:190000], train_x[190000:]
y_train, y_val = y[:190000], y[190000:]
# divide into 90% train and 10% validation on both the labeled and pseudo-labeled training data
#X_train, X_val, y_train, y_val = train_x[20000:], train_x[:20000], y[20000:], y[:20000]
# another variant of the same split over the labeled and pseudo-labeled training data
#X_train, y_train = train_x[:20000], y[:20000]
#X_val, y_val = train_x[190000:200000], y[190000:200000]
#X_train = torch.cat((X_train, train_x[200000:]), 0)
#y_train = torch.cat((y_train, y[200000:]), 0)
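
# An equivalent shuffled split could use torch.randperm (hypothetical
# alternative, not from the original script; assumes train_x and y are
# tensors with 200,000 aligned rows):
#perm = torch.randperm(train_x.size(0))
#split = int(0.95 * train_x.size(0))
#X_train, X_val = train_x[perm[:split]], train_x[perm[split:]]
#y_train, y_val = y[perm[:split]], y[perm[split:]]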

# to dataset
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)

# to dataloader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=8)

val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=8)

# training
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model,
         device)  # final argument assumed; the original snippet is truncated here
Code Example #4
# loading data
print("loading testing data ...")
test_x = load_testing_data(testing_filename)

# parameters
sen_len = 32
batch_size = 16

# predicting
preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_model_filename)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()

# to dataset
test_dataset = TwitterDataset(X=test_x, y=None)
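
# A dataset constructed with y=None typically branches on the label in
# __getitem__; a minimal hypothetical sketch (not the project's actual class):
class _UnlabeledStyleDataset(torch.utils.data.Dataset):
    def __init__(self, X, y=None):
        self.data, self.label = X, y
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        if self.label is None:
            return self.data[idx]                  # inference: inputs only
        return self.data[idx], self.label[idx]     # training: (input, label)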

# to dataloader
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          num_workers=8)

# testing
print('\nload model ...')
model = torch.load(os.path.join(model_dir, 'last_semi_82.38.model'))
outputs = testing(batch_size, test_loader, model, device)

# save result to csv
tmp = pd.DataFrame({
    "id": [str(i) for i in range(len(test_x))],
Code Example #5
import pickle

from utils import calc_acc

nb_classes = 2  # (Positive, Negative)
nb_words = 10000  # Size of our bag of words
load_cached = True  # If false, regenerates features from the original dataset

# Load the dataset
if load_cached:
    with open('dataset.pkl', 'rb') as pkl:
        print('Loading a cached dataset')
        dataset = pickle.load(pkl)
        print('Done loading')
else:
    with open('dataset.pkl', 'wb') as pkl:
        print('Preparing a new dataset')
        dataset = TwitterDataset('data/train.csv', nb_words, 0.1, 0.1)
        print('Done preparing the dataset, serializing')
        pickle.dump(dataset, pkl, pickle.HIGHEST_PROTOCOL)
        print('Done serializing')
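
# utils.calc_acc is project-local; an equivalent helper would be this
# hypothetical sketch (shown only to make the metric explicit):
def _calc_acc_sketch(y_true, y_pred):
    # fraction of predictions that match the gold labels
    return sum(int(t == p) for t, p in zip(y_true, y_pred)) / len(y_true)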

# Fit several models with varying pseudocount parameter
models = dict()
for pseudocount in range(1, 30):
    # Fit the model
    print('Fitting a model with pseudocount={}'.format(pseudocount))
    model = MultinomialNaiveBayes(nb_classes, nb_words, pseudocount)
    model.fit(dataset.train)

    # Evaluate on train set
    preds_train = model.predict(dataset.train['x'])
    acc_train = calc_acc(dataset.train['y'], preds_train)
Code Example #6
    for i in range(len(train_x1)):
        for w in train_x1[i]:
            x[i][word_index[w]] += 1
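    # x now holds bag-of-words counts: x[i][j] is the frequency of vocabulary
    # word j (per word_index) in sentence i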
    print(x.size())

    print("\nConstructing model...", flush=True)
    model = DNN(x.size(1)).to(device)
    total_param = sum(p.numel() for p in model.parameters())
    trainable_param = sum(p.numel() for p in model.parameters()
                          if p.requires_grad)
    print("{} parameters with {} trainable".format(total_param,
                                                   trainable_param),
          flush=True)

    print("\nStart training...", flush=True)
    train_dataset1 = TwitterDataset(x, train_y1)
    train_loader1 = torch.utils.data.DataLoader(dataset=train_dataset1,
                                                batch_size=BATCH,
                                                shuffle=True,
                                                num_workers=4)
    train_model(train_loader1, model, device, LR)

    print("\nStart testing...", flush=True)
    test_x = [
        "today is a good day , but it is hot",
        "today is hot , but it is a good day"
    ]
    test_x = [i.split() for i in test_x]
    x = torch.zeros(2, len(word_index))
    for i in range(2):
        for w in test_x[i]:
Code Example #7
                                                        shuffle=True)
    print("{} initial training data and {} validation data".format(
        len(train_x1), len(val_x)),
          flush=True)

    print("\nConstructing model...", flush=True)
    model = LSTM_Net(embedding).to(device)
    total_param = sum(p.numel() for p in model.parameters())
    trainable_param = sum(p.numel() for p in model.parameters()
                          if p.requires_grad)
    print("{} parameters with {} trainable".format(total_param,
                                                   trainable_param),
          flush=True)

    print("\nStart training...", flush=True)
    val_dataset = TwitterDataset(val_x, val_y)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=96,
                                             shuffle=False,
                                             num_workers=8)
    for it in range(SEMI_ITER):
        print("\n\n=================Iter {}/{}======================".format(
            it + 1, SEMI_ITER),
              flush=True)
        train_dataset1 = TwitterDataset(train_x1, train_y1)
        train_loader1 = torch.utils.data.DataLoader(dataset=train_dataset1,
                                                    batch_size=BATCH,
                                                    shuffle=True,
                                                    num_workers=8)

        model.apply(weight_init)
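        # weight_init is project-local; re-applying it presumably resets the
        # model to fresh weights before retraining on the training set that is
        # enlarged with pseudo-labels in each semi-supervised iteration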