def train(trainset=trainset, valset=valset, weight_file=args.weights):
    """Train a Naive Bayes name->gender classifier and pickle its weights.

    Args:
        trainset: list of (name, gender) tuple pairs for training.
        valset: optional list of (name, gender) tuple pairs for validation;
            skipped when None or empty.
        weight_file: filename to save the pickled classifier to.
    """
    start = time.time()
    print("Training Naive Bayes Classifer on %d examples (%s)" %
          (len(trainset), time_since(start)))
    # Lazily map names to feature dicts so the whole feature set is never
    # materialized at once.
    trainset = apply_features(_gender_features, trainset, labeled=True)
    classifier = nltk.NaiveBayesClassifier.train(trainset)
    print("Training complete. (%s)" % (time_since(start)))

    # validation (optional)
    if valset is not None and len(valset) > 0:
        valset = apply_features(_gender_features, valset, labeled=True)
        acc = nltk.classify.accuracy(classifier, valset)
        print("Validation accuracy is %.2f%% on %d examples (%s)" %
              (acc * 100, len(valset), time_since(start)))

    # save weights; the `with` block closes the file, no explicit close needed
    with open(weight_file, 'wb') as f:
        pickle.dump(classifier, f)
    # BUGFIX: report the file actually written (the weight_file parameter),
    # not args.weights — they differ when a caller overrides weight_file.
    print('Weights saved to "%s"' % (weight_file))
def train(dataset=TRAINSET, batch_size=args.batch_size, num_workers=args.num_workers,
          start_ep=args.start_epoch, end_ep=args.end_epoch, save_name=save_nm,
          print_every=100, log_iters=args.log_iters):
    """Train the module-level `rnn` on a name->gender dataset.

    Iterates epochs [start_ep, end_ep], drawing shuffled minibatches from a
    DataLoader and calling the module-level `_train` helper per example.
    Saves `rnn.state_dict()` after every epoch and once more at the end.

    Args:
        dataset: raw dataset passed to NameGenderDataset.
        batch_size: minibatch size for the DataLoader.
        num_workers: DataLoader worker process count.
        start_ep: first epoch number (inclusive).
        end_ep: last epoch number (inclusive).
        save_name: filename prefix for saved .pth checkpoints.
        print_every: print a loss line every N minibatches (when log_iters).
        log_iters: enable per-minibatch loss logging.
    """
    rnn.train()  # put the network in training mode (enables dropout etc.)
    print('Loading Dataset...')
    dataset = NameGenderDataset(dataset)
    # pick the CUDA-aware collate function when running on GPU
    collate_fn = name_gender_collate_cuda if args.cuda else name_gender_collate
    data_loader = data.DataLoader(dataset, batch_size=batch_size,
                                  num_workers=num_workers, shuffle=True,
                                  collate_fn=collate_fn, pin_memory=True)
    start = time.time()
    print("Beginning training...")
    for epoch in range(start_ep, end_ep + 1):
        ep_loss = 0
        # fencepost print
        if epoch == start_ep:
            print('EPOCH %s/%s' % (start_ep, end_ep))
        # iterate over all minibatches
        batch_iterator = iter(data_loader)
        batch = 0
        # NOTE(review): hard cap of 500 minibatches per epoch — presumably a
        # safety limit; confirm this is intended rather than leftover debug code.
        while batch <= 500:
            try:
                batch += 1
                batch_loss = 0
                names_tensor, genders_tensor = next(batch_iterator)
                # one optimizer step per example via _train (assumed — confirm)
                for name_tensor, gender_tensor in zip(names_tensor, genders_tensor):
                    output, loss = _train(name_tensor, gender_tensor)
                    batch_loss += loss
                if log_iters and batch % print_every == 0:
                    print('\tLoss[ ep%d: %.2f | mb%d: %.2f ] (%s) ' %
                          (epoch, ep_loss / batch, batch,
                           batch_loss / len(names_tensor), time_since(start)))
                ep_loss += batch_loss
            except StopIteration:
                # NOTE(review): `batch` was already incremented before next()
                # raised, so the avg-loss divisor below counts one empty
                # minibatch when the loader is exhausted — confirm intended.
                break
        print('EPOCH %d %d%% (%s) avg loss: %.4f' %
              (epoch, epoch / end_ep * 100, time_since(start), ep_loss / batch))
        ep_loss = 0
        # per-epoch checkpoint
        torch.save(rnn.state_dict(), save_name + '_epoch' + repr(epoch) + '.pth')
    # final weights after all epochs
    torch.save(rnn.state_dict(), save_name + '_classification.pth')
def test(testset=testset, weight_file=args.weights):
    """Evaluate the pickled Naive Bayes classifier on a held-out test set.

    Args:
        testset: list of (name, gender) tuple pairs to evaluate on.
        weight_file: filename of the pickled classifier to load.

    Returns:
        Accuracy on `testset` as a float in [0, 1].
    """
    start = time.time()
    classifier = load_classifier(weight_file)
    print("Testing Naive Bayes Classifer on %d examples (%s)" %
          (len(testset), time_since(start)))
    # lazily map names to feature dicts, keeping labels attached
    testset = apply_features(_gender_features, testset, labeled=True)
    acc = nltk.classify.accuracy(classifier, testset)
    print("Testing accuracy is %.2f%% on %d examples (%s)" %
          (acc * 100, len(testset), time_since(start)))
    return acc
def write_classifications(names, weight_file=args.weights, outfile=args.outfile):
    """Classify each name and write (name, gender, probability) rows to a CSV.

    Args:
        names: iterable of name strings to classify.
        weight_file: filename of the pickled classifier to load.
        outfile: path of the CSV file to write (overwritten if it exists).
    """
    # BUGFIX: `start` was never defined in this function but was used in the
    # final time_since(start) call — define it so the elapsed time is real.
    start = time.time()
    classifier = load_classifier(weight_file)
    headers = ["name", "gender", "probability"]
    # the `with` block closes the file; no explicit close needed
    with open(outfile, 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(headers)
        for name in names:
            guess, prob = _classify(name, classifier)
            writer.writerow([name, guess, prob])
    print('\nWrote %d names to "%s" (%s)' %
          (len(names), outfile, time_since(start)))
def test(dataset=TESTSET, verbose=log_iters):
    """Evaluate the module-level `rnn` on a name->gender test set.

    Args:
        dataset: raw dataset passed to NameGenderDataset.
        verbose: when truthy, print each individual name -> guess line.

    Returns:
        Overall accuracy as a float in [0, 1].
    """
    # NOTE(review): `batch_size` and `num_workers` are not parameters here —
    # they must exist at module scope (likely from args); confirm, otherwise
    # this raises NameError.
    print('Loading Dataset...')
    dataset = NameGenderDataset(dataset)
    # pick the CUDA-aware collate function when running on GPU
    collate_fn = name_gender_collate_cuda if args.cuda else name_gender_collate
    data_loader = data.DataLoader(dataset, batch_size=batch_size,
                                  num_workers=num_workers, shuffle=True,
                                  collate_fn=collate_fn, pin_memory=True)
    print("Beginning testing on %s names:\n" % (len(dataset)))
    start = time.time()
    cum = 0  # running count of correct predictions
    # iterate over all minibatches
    batch_iterator = iter(data_loader)
    batch = 0
    while True:
        try:
            batch += 1
            batch_acc = 0
            names_tensor, genders_tensor = next(batch_iterator)
            for name_tensor, gender_tensor in zip(names_tensor, genders_tensor):
                # ground-truth label index -> gender string
                gt = ALL_GENDERS[gender_tensor.data[0]]
                name = tensor_to_name(name_tensor)
                output = _evaluate(name_tensor)
                # take the single highest-scoring class
                topv, topi = output.data.topk(k=1, dim=1, largest=True)
                guess = ALL_GENDERS[topi[0][0]]
                correct = '!' if guess == gt else 'X (%s)' % gt
                if verbose:
                    print("\t%s -> %s %s " % (name, guess, correct))
                batch_acc += 1 if guess == gt else 0
            # NOTE(review): progress fraction is printed without *100 despite
            # the %% suffix — reads as a fraction, not a percent; confirm.
            print("%.2f%% minibatch acc: %.4f (%s)" %
                  (batch / (len(dataset) / batch_size),
                   batch_acc / len(names_tensor), time_since(start)))
            cum += batch_acc
        except StopIteration:
            break
    acc = cum / len(dataset)
    print()
    print("TOTAL: %d/%d (%.4f%%)" % (cum, len(dataset), acc * 100))
    return acc
def classify(names, weight_file=args.weights):
    """Classify each name with the pickled Naive Bayes classifier.

    Printing/side effects happen inside _classify; this wrapper only loads
    the classifier, drives the loop, and reports elapsed time.

    Args:
        names: iterable of name strings to classify.
        weight_file: filename of the pickled classifier to load.
    """
    # BUGFIX: `start` was never defined in this function but was used in the
    # final time_since(start) call — define it so the elapsed time is real.
    start = time.time()
    classifier = load_classifier(weight_file)
    for name in names:
        _classify(name, classifier)
    print("\nClassified %d names (%s)" % (len(names), time_since(start)))