Esempio n. 1
0
def train(args, labeled, resume_from, ckpt_file):
    """Train a ``TextSentiment`` classifier on AG_NEWS and save a checkpoint.

    Side effects: ensures ``./.data`` exists, loads the AG_NEWS datasets into
    the module globals ``train_dataset`` / ``test_dataset``, sets the module
    globals ``VOCAB_SIZE``, ``EMBED_DIM`` and ``NUN_CLASS``, and writes the
    checkpoint ``{"model", "optimizer"}`` to ``args["EXPT_DIR"]/ckpt_file``.

    Args:
        args: mapping providing ``"batch_size"``, ``"train_epochs"``,
            ``"N_GRAMS"``, ``"EMBED_DIM"`` and ``"EXPT_DIR"``.
        labeled: accepted for interface compatibility.
            NOTE(review): this argument is never read, so training runs over
            the whole ``train_dataset`` -- confirm whether it should be
            restricted to these indices.
        resume_from: checkpoint file name inside ``args["EXPT_DIR"]`` to
            resume from, or ``None`` to start fresh.
        ckpt_file: file name (inside ``args["EXPT_DIR"]``) for the new
            checkpoint.
    """
    batch_size = args["batch_size"]
    lr = 4.0
    epochs = args["train_epochs"]

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs('./.data', exist_ok=True)

    global train_dataset, test_dataset
    train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
        root='./.data', ngrams=args["N_GRAMS"], vocab=None)

    global VOCAB_SIZE, EMBED_DIM, NUN_CLASS
    VOCAB_SIZE = len(train_dataset.get_vocab())
    EMBED_DIM = args["EMBED_DIM"]
    NUN_CLASS = len(train_dataset.get_labels())

    trainloader = DataLoader(train_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             collate_fn=generate_batch)
    net = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(net.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

    if resume_from is not None:
        # map_location lets a checkpoint written on a GPU host be resumed on
        # a CPU-only host (and vice versa).
        ckpt = torch.load(os.path.join(args["EXPT_DIR"], resume_from),
                          map_location=device)
        net.load_state_dict(ckpt["model"])
        optimizer.load_state_dict(ckpt["optimizer"])
    else:
        getdatasetstate()

    net.train()
    # Bound before the loop so the report below also works for zero epochs.
    train_acc = 0
    for epoch in tqdm(range(epochs), desc="Training"):
        # Count correct predictions for this epoch only; the value reported
        # after the loop is therefore the last epoch's accuracy.
        train_acc = 0
        for text, offsets, cls in trainloader:
            text = text.to(device)
            offsets = offsets.to(device)
            cls = cls.to(device)

            outputs = net(text, offsets)
            loss = criterion(outputs, cls)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_acc += (outputs.argmax(1) == cls).sum().item()
        # Decay the learning rate once per epoch.
        scheduler.step()

    print("Finished Training. Saving the model as {}".format(ckpt_file))
    print("Training accuracy: {}".format(
        (train_acc / len(train_dataset) * 100)))
    ckpt = {"model": net.state_dict(), "optimizer": optimizer.state_dict()}
    torch.save(ckpt, os.path.join(args["EXPT_DIR"], ckpt_file))
Esempio n. 2
0
def test(args, ckpt_file):
    """Evaluate a saved checkpoint on the module-level ``test_dataset``.

    Relies on the module globals ``test_dataset``, ``VOCAB_SIZE``,
    ``EMBED_DIM`` and ``NUN_CLASS`` having been set beforehand.

    Args:
        args: mapping providing ``"batch_size"`` and ``"EXPT_DIR"``.
        ckpt_file: checkpoint file name inside ``args["EXPT_DIR"]``; its
            ``"model"`` entry is loaded into a fresh ``TextSentiment``.

    Returns:
        dict with ``"predictions"`` and ``"labels"``: flat lists of the
        predicted and the true class indices, in loader order.
    """
    testloader = DataLoader(test_dataset,
                            batch_size=args["batch_size"],
                            shuffle=False,
                            collate_fn=generate_batch)

    net = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)
    # map_location makes the checkpoint loadable regardless of the device it
    # was saved from.
    ckpt = torch.load(os.path.join(args["EXPT_DIR"], ckpt_file),
                      map_location=device)
    net.load_state_dict(ckpt["model"])
    net.eval()

    predictions, targets = [], []
    with torch.no_grad():
        for text, offsets, cls in tqdm(testloader, desc="Testing"):
            outputs = net(text.to(device), offsets.to(device))
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().tolist())
            # Labels are only collected, so they never need to leave the CPU.
            targets.extend(cls.tolist())

    return {"predictions": predictions, "labels": targets}
Esempio n. 3
0
def infer(args, unlabeled, ckpt_file):
    """Run a saved checkpoint over a subset of the module-level ``train_dataset``.

    Relies on the module globals ``train_dataset``, ``VOCAB_SIZE``,
    ``EMBED_DIM`` and ``NUN_CLASS`` having been set beforehand.

    Args:
        args: mapping providing ``"batch_size"`` and ``"EXPT_DIR"``.
        unlabeled: indices into ``train_dataset`` selecting the samples to
            run inference on.
        ckpt_file: checkpoint file name inside ``args["EXPT_DIR"]``; its
            ``"model"`` entry is loaded into a fresh ``TextSentiment``.

    Returns:
        ``{"outputs": results}`` where ``results`` maps the running position
        of each sample (0-based, in the order given by ``unlabeled``) to a
        dict with ``"prediction"`` (predicted class index) and
        ``"pre_softmax"`` (the raw model output row as a NumPy array).
    """
    subset = Subset(train_dataset, unlabeled)
    unlabeled_loader = torch.utils.data.DataLoader(
        subset,
        batch_size=args["batch_size"],
        shuffle=False,
        num_workers=2,
        collate_fn=generate_batch)

    net = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)
    # map_location makes the checkpoint loadable regardless of the device it
    # was saved from.
    ckpt = torch.load(os.path.join(args["EXPT_DIR"], ckpt_file),
                      map_location=device)
    net.load_state_dict(ckpt["model"])
    net.eval()

    outputs_fin = {}
    with torch.no_grad():
        for text, offsets, _ in tqdm(unlabeled_loader, desc="Inferring"):
            outputs = net(text.to(device), offsets.to(device))
            _, predicted = torch.max(outputs, 1)
            for row, pred in zip(outputs, predicted):
                # Key by the global running position. Keying by the
                # within-batch index would let every batch overwrite the
                # previous one, keeping only the last batch's results.
                outputs_fin[len(outputs_fin)] = {
                    "prediction": pred.item(),
                    "pre_softmax": row.cpu().numpy(),
                }

    return {"outputs": outputs_fin}
Esempio n. 4
0
def infer(sample):
    """Return per-class softmax probabilities for selected training samples.

    NOTE(review): the model is freshly constructed here and no checkpoint is
    loaded in this function, so the probabilities come from whatever
    ``TextSentiment`` initialises its weights to -- confirm this is intended.

    Args:
        sample: passed to ``SubsetSampler`` to select which items of the
            training dataset are fed to the model.

    Returns:
        dict mapping the running position ``r`` (0-based, in sampler order)
        to the list of softmax probabilities for that sample.
    """
    train_dataset, _, _, _ = get_loaders()

    vocab_size = len(train_dataset.get_vocab())
    embed_dim = 32
    num_class = len(train_dataset.get_labels())
    mynet = TextSentiment(vocab_size, embed_dim, num_class).to(device)
    # Inference only: put the model in eval mode so that any train-only
    # behaviour of its layers is disabled.
    mynet.eval()

    dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=1,
        num_workers=4,
        sampler=SubsetSampler(sample),
        collate_fn=generate_batch,
    )
    # Each batch holds one sample, so the softmax runs over the 1-D output
    # row of that sample.
    soft = torch.nn.Softmax(dim=0)
    infer_outs = {}
    with torch.no_grad():
        with tqdm(total=len(dataloader),
                  desc="Inferring on unlabeled ...") as tq:
            for r, (text, offsets, _) in enumerate(dataloader):
                outputs = mynet(text.to(device), offsets.to(device))
                # Move to the CPU before converting: a CUDA tensor cannot be
                # turned into a NumPy array directly.
                infer_outs[r] = soft(outputs[0]).cpu().numpy().tolist()
                tq.update(1)

    return infer_outs
Esempio n. 5
0
    logging.basicConfig(level=getattr(logging, args.logging_level))

    start_time = time.time()
    logging.info("Loading vocab from: {}".format(args.vocab))
    vocab = torch.load(args.vocab)

    logging.info("Counting training lines and labels")
    num_labels, train_num_lines = count(train_data_path)
    logging.info("Counting testing lines and labels")
    num_labels, test_num_lines = count(test_data_path)

    logging.info("Loading iterable datasets")
    train_dataset = Dataset(get_csv_iterator(train_data_path, ngrams, vocab),
                            train_num_lines, num_epochs)
    test_dataset = Dataset(get_csv_iterator(test_data_path, ngrams, vocab),
                           test_num_lines, num_epochs)

    logging.info("Creating models")
    model = TextSentiment(len(vocab), embed_dim, num_labels).to(device)
    criterion = torch.nn.CrossEntropyLoss().to(device)
    logging.info("Setup took: {:3.0f}s".format(time.time() - start_time))

    logging.info("Starting training")
    train(lr, num_epochs, train_dataset)
    test(test_dataset)

    if args.save_model_path:
        print("Saving model to {}".format(args.save_model_path))
        torch.save(model.to('cpu'), args.save_model_path)
Esempio n. 6
0
    split_ratio = args.split_ratio
    # two args for sentencepiece tokenizer
    use_sp_tokenizer = args.use_sp_tokenizer
    sp_vocab_size = args.sp_vocab_size

    logging.basicConfig(level=getattr(logging, args.logging_level))

    if not os.path.exists(data):
        print("Creating directory {}".format(data))
        os.mkdir(data)

    import hackson_dataset
    train_dataset, test_dataset = hackson_dataset.setup_datasets(
        args.dataset, root='.data', vocab_size=sp_vocab_size)
    #pdb.set_trace()
    model = TextSentiment(sp_vocab_size, embed_dim,
                          len(train_dataset.get_labels())).to(device)

    criterion = torch.nn.CrossEntropyLoss().to(device)

    # split train_dataset into train and valid
    train_len = int(len(train_dataset) * split_ratio)
    sub_train_, sub_valid_ = \
        random_split(train_dataset, [train_len, len(train_dataset) - train_len])
    train_and_valid(lr, sub_train_, sub_valid_)
    print("Test - Accuracy: {}".format(test(test_dataset)))

    if args.save_model_path:
        print("Saving model to {}".format(args.save_model_path))
        torch.save(model.to('cpu'), args.save_model_path)

    if args.dictionary is not None:
Esempio n. 7
0
def main():
    """Train ``TextSentiment`` for a fixed number of epochs and report test metrics.

    Loads the datasets via ``get_dataset``, holds out 5% of the training set
    for validation, trains with SGD (lr 4.0, decayed by 0.9 every epoch) and,
    whenever the validation loss improves, saves the model's ``state_dict``
    to ``../weights/text_news<valid_loss>.pth``. After training, the model in
    its final state (not the best checkpoint) is evaluated on the test set.
    """
    # "cuda" is the device type torch accepts for GPUs; "gpu" is rejected
    # by ``.to()``.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    train_dataset, test_dataset = get_dataset()
    VOCAB_SIZE = len(train_dataset.get_vocab())
    EMBED_DIM = 32
    NUN_CLASS = len(train_dataset.get_labels())
    model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)
    BATCH_SIZE = 16
    N_EPOCHS = 5
    min_valid_loss = float('inf')

    # Multi-class classification, hence cross-entropy.
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

    # 95% / 5% train / validation split.
    train_len = int(len(train_dataset) * 0.95)
    sub_train_, sub_valid_ = \
        random_split(train_dataset, [train_len, len(train_dataset) - train_len])
    train_loader = DataLoader(sub_train_,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              collate_fn=generate_batch)
    valid_loader = DataLoader(sub_valid_,
                              batch_size=BATCH_SIZE,
                              collate_fn=generate_batch)
    test_loader = DataLoader(test_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=generate_batch)

    for epoch in tqdm(range(N_EPOCHS)):

        start_time = time.time()
        train_loss, train_acc = train_fn(dataLoader=train_loader,
                                         model=model,
                                         optimizer=optimizer,
                                         scheduler=scheduler,
                                         criterion=criterion,
                                         device=device)
        valid_loss, valid_acc = evaluate_fn(dataLoader=valid_loader,
                                            model=model,
                                            criterion=criterion,
                                            device=device)

        # Whole minutes and leftover seconds of this epoch's wall time.
        mins, secs = divmod(int(time.time() - start_time), 60)

        print('Epoch: %d' % (epoch + 1),
              " | time in %d minutes, %d seconds" % (mins, secs))
        print(
            f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)'
        )
        print(
            f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)'
        )
        # Checkpoint only when the validation loss reaches a new minimum.
        if valid_loss < min_valid_loss:
            torch.save(model.state_dict(),
                       "../weights/text_news{}.pth".format(valid_loss))
            print(min_valid_loss, "--------->>>>>>>>", valid_loss)
            min_valid_loss = valid_loss

    print('Checking the results of test dataset...')
    test_loss, test_acc = evaluate_fn(dataLoader=test_loader,
                                      model=model,
                                      criterion=criterion,
                                      device=device)
    print(
        f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')
Esempio n. 8
0
# Module-level setup: load the datasets, pick the device and build the model.
# Earlier variant that went through the torchtext dataset registry, kept for
# reference:
# train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
#     root='./data/AG/', ngrams=NGRAMS, vocab=None, download=False)
train_dataset, test_dataset = _setup_datasets(root='./data/AG/ag_news_csv',
                                              ngrams=NGRAMS,
                                              vocab=None)

BATCH_SIZE = 16
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model dimensions are derived from the training split.
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())

# The embedding layer is 32-dimensional; NUN_CLASS is the number of labels
# (presumably 4 for this dataset -- TODO confirm); VOCAB_SIZE is the
# vocabulary size.
print("VOCAB_SIZE", "NUN_CLASS", VOCAB_SIZE, NUN_CLASS)
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)


def generate_batch(batch):
    """Collate ``(label, token_tensor, ...)`` entries into one flat batch.

    The token tensors of all entries are concatenated into a single 1-D
    tensor; ``offsets`` records where each entry starts inside it.

    Returns:
        ``(text, offsets, label)``: the concatenated tokens, the start index
        of every entry within ``text``, and the labels as a tensor.
    """
    labels = []
    sequences = []
    starts = []
    cursor = 0
    for entry in batch:
        labels.append(entry[0])
        sequences.append(entry[1])
        # An entry begins where the previous ones ended.
        starts.append(cursor)
        cursor += len(entry[1])

    label = torch.tensor(labels)
    offsets = torch.tensor(starts)
    text = torch.cat(sequences)
    return text, offsets, label

Esempio n. 9
0
    return sum(total_accuracy) / len(total_accuracy)

if __name__ == "__main__":
    # Script entry point: parse the arguments, train/validate a
    # TextSentiment model, evaluate it on the test split and optionally
    # save it.
    logger = log.GetLogger(log.logging.INFO)
    a = ArgParse()
    logger.info("batch size:{}".format(a.batch_size))
    logger.info("device:{}".format(a.device))
    logger.info("data_name:{}".format(a.data_name))
    logger.info("data_dir:{}".format(a.data_dir))

    if not os.path.exists(a.data_dir):
        print("Creating directory {}".format(a.data_dir))
        # Create the directory that was just checked and announced; the
        # previous call passed the undefined name `data`.
        os.makedirs(a.data_dir, exist_ok=True)

    train, test = text_classification.DATASETS[a.data_name](root=a.data_dir, ngrams=a.ngrams)
    model = TextSentiment(len(train.get_vocab()), a.embed_dim, len(train.get_labels())).to(a.device)

    # Hold out (1 - split_ratio) of the training data for validation.
    train_len = int(len(train) * a.split_ratio)
    train2, valid = random_split(train, [train_len, len(train) - train_len])

    TrainValid(a.num_epochs, a.num_workers, a.device, a.batch_size, a.lr, a.lr_gamma,
               train2, valid, model = model)
    acc = Test(a.batch_size, a.device, test, model)
    logger.info("Test - Accuracy: {}".format(acc))

    if a.save_model_path:
        logger.info(a.save_model_path)
        # Serialises the whole module object (moved to the CPU first), not
        # just its state_dict.
        torch.save(model.to('cpu'), a.save_model_path)

    if a.dictionary is not None:
        # NOTE(review): only the message is emitted here; no call that
        # actually writes the vocab is visible in this block.
        print("Save vocab to {}".format(a.dictionary))
Esempio n. 10
0
def train_rating_model(
    YELP_TRAIN,
    fields,
    criterion,
    N_EPOCHS=20,
    split_ratio=0.9,
    num_hidden=30,
    embed_dim=50,
    actual_embed_dim=50,
):
    """Fit a single-output ``TextSentiment`` model on a JSON dataset.

    Args:
        YELP_TRAIN: path of the JSON training file.
        fields: field mapping handed to ``data.TabularDataset``;
            ``fields["text"][1]`` is the text field whose vocab is built.
        criterion: loss passed through to ``train`` and ``evaluate``.
        N_EPOCHS: number of training epochs.
        split_ratio: fraction of the examples kept for training; the rest
            is used for validation.
        num_hidden: forwarded to ``TextSentiment`` as ``num_hidden``.
        embed_dim: dimension of the GloVe vectors used to build the vocab
            (``glove.6B.<embed_dim>d``).
        actual_embed_dim: forwarded to ``TextSentiment`` as ``embed_dim``.

    Returns:
        The trained model.
    """
    batch_size = 16

    # Read the examples and build the vocabulary with pretrained vectors.
    dataset = data.TabularDataset(path=YELP_TRAIN,
                                  format="json",
                                  fields=fields)
    print(YELP_TRAIN)
    print("NUM TRAIN", len(dataset.examples))
    assert len(dataset.examples) > 2
    text_field = fields["text"][1]
    text_field.build_vocab(dataset, vectors="glove.6B.%dd" % embed_dim)

    model = TextSentiment(
        vocab_size=len(text_field.vocab),
        vocab=text_field.vocab,
        embed_dim=actual_embed_dim,
        num_class=1,
        num_hidden=num_hidden,
    )
    optimizer = optim.Adam(model.parameters())

    # Seed before splitting so the train/validation partition is repeatable.
    random.seed(0)
    train_split, valid_split = dataset.split(
        split_ratio=split_ratio,
        random_state=random.getstate(),
    )
    train_iterator, valid_iterator = data.Iterator.splits(
        (train_split, valid_split),
        batch_size=batch_size,
        sort_key=lambda example: len(example.text),
        sort_within_batch=True,
        shuffle=True,
    )

    for epoch in range(N_EPOCHS):
        train_loss = train(model, train_iterator, optimizer, criterion)
        # Report and validate on every fifth epoch only.
        if epoch % 5 != 0:
            continue
        print(f"\tTrain Loss {epoch}: {train_loss:.3f}")
        evaluate(model, valid_iterator, criterion)

    # Final validation pass once training is complete.
    evaluate(model, valid_iterator, criterion)
    return model
from torchtext.data.utils import get_tokenizer
from dataset import get_dataset
import pickle
import argparse
import torch

# Maps the 1-based class id to its AG_NEWS category name.
ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}
WEIGHT_PATH = "../weights/text_news0.2672930294473966.pth"

# NOTE: unpickling can execute arbitrary code -- only load vocab files from
# a trusted source. The context manager closes the file handle, which the
# bare `open(...)` call left to the garbage collector.
with open(".data/save_vocab.p", "rb") as vocab_file:
    vocab = pickle.load(vocab_file)

device = "cuda" if torch.cuda.is_available() else "cpu"
# The model dimensions must match those of the checkpoint being loaded.
VOCAB_SIZE = 1308844
EMBED_DIM = 32
NUM_CLASS = 4
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS)
# Load onto the CPU first so the checkpoint is readable on any host, then
# move the model to the selected device.
checkpoint = torch.load(WEIGHT_PATH, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint)
model.to(device)


def predict(text, model, vocab, ngrams):
    """Classify a single piece of text and return its 1-based class id.

    Args:
        text: raw input string.
        model: module called as ``model(token_ids, offsets)``.
        vocab: mapping from token to integer id, indexed as ``vocab[token]``.
        ngrams: n-gram size passed to ``ngrams_iterator``.

    Returns:
        The index of the highest-scoring class plus one, matching the
        1-based keys of ``ag_news_label``.
    """
    tokenizer = get_tokenizer("basic_english")
    # Build the inputs on the device the model lives on; CPU tensors fed to
    # a model that was moved to CUDA raise a device-mismatch error.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")
    with torch.no_grad():
        token_ids = torch.tensor(
            [vocab[token] for token in ngrams_iterator(tokenizer(text), ngrams)],
            # Explicit integer dtype: an empty token list would otherwise
            # produce a float tensor, which is not a valid index tensor.
            dtype=torch.long,
            device=target,
        )
        # A single bag that starts at position 0.
        offsets = torch.tensor([0], device=target)
        output = model(token_ids, offsets)
        return output.argmax(1).item() + 1