Esempio n. 1
0
def train(trainset=trainset, valset=valset, weight_file=args.weights):
    """Train a Naive Bayes classifier on name->gender pairs and pickle it.

    Progress, the optional validation accuracy and the save location are
    printed to stdout.

    Args:
        trainset: list of name->gender tuple pairs for training
        valset (opt): list of name->gender tuple pairs for validation;
            validation is skipped when it is None or empty
        weight_file: filename to save the pickled classifier to

    """

    start = time.time()
    print("Training Naive Bayes Classifer on %d examples (%s)" %
          (len(trainset), time_since(start)))

    trainset = apply_features(_gender_features, trainset, labeled=True)
    classifier = nltk.NaiveBayesClassifier.train(trainset)

    print("Training complete. (%s)" % (time_since(start)))

    # validation
    if valset is not None and len(valset) > 0:
        valset = apply_features(_gender_features, valset, labeled=True)
        acc = nltk.classify.accuracy(classifier, valset)
        print("Validation accuracy is %.2f%% on %d examples (%s)" %
              (acc * 100, len(valset), time_since(start)))

    # save weights; the `with` block closes the file on exit
    with open(weight_file, 'wb') as f:
        pickle.dump(classifier, f)

    # report the path that was actually written, which differs from
    # args.weights whenever the caller passes an explicit weight_file
    print('Weights saved to "%s"' % (weight_file))
Esempio n. 2
0
def train(dataset=TRAINSET,
          batch_size=args.batch_size,
          num_workers=args.num_workers,
          start_ep=args.start_epoch,
          end_ep=args.end_epoch,
          save_name=save_nm,
          print_every=100,
          log_iters=args.log_iters,
          max_batches=501):
    """Train the module-level ``rnn`` and checkpoint it after every epoch.

    Each minibatch is fed to ``_train`` one (name, gender) pair at a time and
    the returned losses are summed. After every epoch the state dict is saved
    to ``<save_name>_epoch<N>.pth``; after the last epoch it is also saved to
    ``<save_name>_classification.pth``.

    Args:
        dataset: raw data handed to ``NameGenderDataset``
        batch_size: minibatch size for the ``DataLoader``
        num_workers: number of ``DataLoader`` worker processes
        start_ep: first epoch number (inclusive)
        end_ep: last epoch number (inclusive)
        save_name: filename prefix for the saved weights
        print_every: log the running loss every this many minibatches
        log_iters: when falsy, the per-minibatch loss logging is skipped
        max_batches: maximum number of minibatches processed per epoch;
            the default of 501 matches the previously hard-coded limit.
            Pass None to consume the whole loader every epoch.
    """
    rnn.train()
    print('Loading Dataset...')

    dataset = NameGenderDataset(dataset)
    collate_fn = name_gender_collate_cuda if args.cuda else name_gender_collate
    data_loader = data.DataLoader(dataset,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=True,
                                  collate_fn=collate_fn,
                                  pin_memory=True)

    start = time.time()
    print("Beginning training...")
    for epoch in range(start_ep, end_ep + 1):

        # fencepost print
        if epoch == start_ep:
            print('EPOCH %s/%s' % (start_ep, end_ep))

        ep_loss = 0
        # Counts only minibatches that were actually processed, so the
        # averages below are not skewed when the loader runs out.
        batch = 0

        # iterate over the minibatches of this epoch
        for names_tensor, genders_tensor in data_loader:
            batch_loss = 0
            for name_tensor, gender_tensor in zip(names_tensor,
                                                  genders_tensor):
                _, loss = _train(name_tensor, gender_tensor)
                batch_loss += loss

            batch += 1
            # Accumulate before logging so the running epoch average
            # includes the minibatch that `batch` already counts.
            ep_loss += batch_loss

            if log_iters and batch % print_every == 0:
                print('\tLoss[ ep%d: %.2f | mb%d: %.2f ]  (%s) ' %
                      (epoch, ep_loss / batch, batch,
                       batch_loss / len(names_tensor), time_since(start)))

            # Checked after processing so no extra minibatch is fetched.
            if max_batches is not None and batch >= max_batches:
                break

        # max(..., 1) keeps an empty loader from dividing by zero
        print(
            'EPOCH %d %d%% (%s) avg loss: %.4f' %
            (epoch, epoch / end_ep * 100, time_since(start),
             ep_loss / max(batch, 1)))

        torch.save(rnn.state_dict(),
                   save_name + '_epoch' + repr(epoch) + '.pth')
    torch.save(rnn.state_dict(), save_name + '_classification.pth')
def test(testset=testset, weight_file=args.weights):
    """Score a saved classifier on labeled name->gender pairs.

    Args:
        testset: list of name->gender tuple pairs to evaluate on
        weight_file: filename passed to load_classifier

    Returns:
        the accuracy value computed by nltk.classify.accuracy
    """
    began = time.time()
    model = load_classifier(weight_file)

    example_count = len(testset)
    print("Testing Naive Bayes Classifer on %d examples (%s)" %
          (example_count, time_since(began)))

    # turn the raw pairs into labeled feature sets before scoring
    featurized = apply_features(_gender_features, testset, labeled=True)
    accuracy = nltk.classify.accuracy(model, featurized)

    summary = (accuracy * 100, len(featurized), time_since(began))
    print("Testing accuracy is %.2f%% on %d examples (%s)" % summary)
    return accuracy
Esempio n. 4
0
def write_classifications(names,
                          weight_file=args.weights,
                          outfile=args.outfile):
    """Classify each name and write the results to a CSV file.

    The file gets a ``name,gender,probability`` header row followed by one
    row per name, holding the two values returned by ``_classify``.

    Args:
        names: iterable of names to classify
        weight_file: filename passed to load_classifier
        outfile: path of the CSV file to write (overwritten if it exists)
    """
    # Local import so the timer below does not depend on module state.
    import time

    # Time this call itself instead of relying on a `start` global that
    # may not exist when the function is used outside the script.
    start = time.time()

    classifier = load_classifier(weight_file)
    headers = ["name", "gender", "probability"]
    written = 0  # counted here so `names` may be any iterable

    # newline='' lets the csv module control line endings, as the csv
    # documentation requires for files passed to csv.writer
    with open(outfile, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(headers)
        for name in names:
            guess, prob = _classify(name, classifier)
            writer.writerow([name, guess, prob])
            written += 1

    print('\nWrote %d names to "%s" (%s)' %
          (written, outfile, time_since(start)))
Esempio n. 5
0
def test(dataset=TESTSET, verbose=log_iters):
    """Evaluate the network on a dataset and report its accuracy.

    Every name is run through ``_evaluate``; the top-scoring index is mapped
    through ``ALL_GENDERS`` and compared with the ground-truth label. A
    progress line is printed per minibatch and a total at the end.

    Args:
        dataset: raw data handed to ``NameGenderDataset``
        verbose: when truthy, print one line per classified name

    Returns:
        fraction of names classified correctly (0.0 for an empty dataset)
    """
    print('Loading Dataset...')

    dataset = NameGenderDataset(dataset)
    collate_fn = name_gender_collate_cuda if args.cuda else name_gender_collate
    # NOTE(review): batch_size and num_workers are not parameters here;
    # presumably they are module-level names - confirm.
    data_loader = data.DataLoader(dataset,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=True,
                                  collate_fn=collate_fn,
                                  pin_memory=True)

    total = len(dataset)
    print("Beginning testing on %s names:\n" % (total))
    start = time.time()
    cum = 0   # correct guesses so far
    seen = 0  # examples evaluated so far

    # iterate over all minibatches
    for names_tensor, genders_tensor in data_loader:
        batch_acc = 0
        for name_tensor, gender_tensor in zip(names_tensor,
                                              genders_tensor):
            gt = ALL_GENDERS[gender_tensor.data[0]]
            name = tensor_to_name(name_tensor)
            output = _evaluate(name_tensor)
            _, topi = output.data.topk(k=1, dim=1, largest=True)
            guess = ALL_GENDERS[topi[0][0]]
            is_correct = guess == gt
            if verbose:
                correct = '!' if is_correct else 'X (%s)' % gt
                print("\t%s -> %s %s " % (name, guess, correct))
            batch_acc += 1 if is_correct else 0

        seen += len(names_tensor)
        # Progress is a real percentage of examples seen: scaled by 100 to
        # match the "%%" in the format, and never above 100 on a partial
        # final minibatch.
        print("%.2f%% minibatch acc: %.4f (%s)" %
              (seen / total * 100,
               batch_acc / len(names_tensor), time_since(start)))
        cum += batch_acc

    # an empty dataset would otherwise divide by zero
    acc = cum / total if total else 0.0
    print()
    print("TOTAL: %d/%d (%.4f%%)" % (cum, total, acc * 100))
    return acc
Esempio n. 6
0
def classify(names, weight_file=args.weights):
    """Run ``_classify`` on every name and print a summary line.

    The values returned by ``_classify`` are discarded here.

    Args:
        names: iterable of names to classify
        weight_file: filename passed to load_classifier
    """
    # Local import so the timer below does not depend on module state.
    import time

    # Time this call itself instead of relying on a `start` global that
    # may not exist when the function is used outside the script.
    start = time.time()

    classifier = load_classifier(weight_file)
    classified = 0  # counted here so `names` may be any iterable
    for name in names:
        _classify(name, classifier)
        classified += 1
    print("\nClassified %d names (%s)" % (classified, time_since(start)))