Example #1

# NOTE: the snippets below assume the project-specific helpers
# (take_std_args, get_text_data, encode_bag_classify_data, train,
# tokenizers, optimizers, RawDataset, ClassifyBagDataset, KNNMetadata)
# come from the surrounding Proverbot9001 codebase; only the
# standard-library and torch imports are spelled out here.
import argparse
import pickle
from typing import List, Tuple

import torch


def main(arg_list: List[str]) -> None:
    args = take_std_args(
        arg_list, "non-recurrent neural network "
        "model for Proverbot9001")

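    # Load the scraped proof data and encode it as bag-of-words samples,
    # producing the tokenizer and tactic-stem embedding alongside the dataset.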
    raw_dataset = get_text_data(args)
    dataset, tokenizer, embedding = encode_bag_classify_data(
        raw_dataset, tokenizers[args.tokenizer], args.num_keywords, 2)
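    # Train the model; each yielded checkpoint pairs a network state with its
    # training loss at that epoch.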
    checkpoints = train(dataset, tokenizer.numTokens(), args.hidden_size,
                        embedding.num_tokens(), args.num_decoder_layers,
                        args.batch_size, args.learning_rate, args.gamma,
                        args.epoch_step, args.num_epochs, args.print_every,
                        optimizers[args.optimizer])

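    # Save every checkpoint to the same file, so the most recent epoch wins.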
    for epoch, (network_state, training_loss) in enumerate(checkpoints):
        state = {
            'epoch': epoch,
            'training-loss': training_loss,
            'tokenizer': tokenizer,
            'embedding': embedding,
            'network-state': network_state,
            'training-args': args,
        }
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)


def main(args_list : List[str]) -> None:
    parser = argparse.ArgumentParser(description=
                                     "A second-tier predictor which predicts tactic "
                                     "stems based on word frequency in the goal")
    parser.add_argument("--context-filter", dest="context_filter",
                        type=str, default="default")
    parser.add_argument("--num-keywords", dest="num_keywords",
                        type=int, default=100)
    parser.add_argument("--max-tuples", dest="max_tuples",
                        type=int, default=None)
    parser.add_argument("scrape_file")
    parser.add_argument("save_file")
    args = parser.parse_args(args_list)
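    # Load the scraped data and encode it with the no-fallback tokenizer.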
    dataset = get_text_data(args)
    samples, tokenizer, embedding = encode_bag_classify_data(dataset,
                                                             tokenizers["no-fallback"],
                                                             args.num_keywords, 2)

    classifier, loss = train(samples, embedding.num_tokens())

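    # Bundle the trained classifier with the tokenizer, stem embedding, and
    # human-readable training options so predictions can be made later.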
    state = {'stem-embeddings': embedding,
             'tokenizer': tokenizer,
             'classifier': classifier,
             'options': [
                 ("dataset size", str(len(samples))),
                 ("context filter", args.context_filter),
                 ("training loss", loss),
                 ("# stems", embedding.num_tokens()),
                 ("# tokens", args.num_keywords),
             ]}
    with open(args.save_file, 'wb') as f:
        pickle.dump(state, f)


# Method excerpted from one of the project's predictor classes (hence the
# `self` parameter): it encodes preprocessed raw data into bag-classification
# samples plus the metadata needed to reload the predictor later.
def _encode_data(self, data : RawDataset, arg_values : argparse.Namespace) \
        -> Tuple[ClassifyBagDataset, KNNMetadata]:
    samples, tokenizer, embedding = \
        encode_bag_classify_data(RawDataset(list(self._preprocess_data(data, arg_values))),
                                 tokenizers[arg_values.tokenizer],
                                 arg_values.num_keywords, 2)
    return samples, KNNMetadata(embedding, tokenizer, arg_values.tokenizer,
                                len(samples), arg_values.context_filter)


def main(args_list : List[str]) -> None:
    parser = argparse.ArgumentParser(description=
                                     "A second-tier predictor which predicts tactic "
                                     "stems based on word frequency in the goal")
    parser.add_argument("--learning-rate", dest="learning_rate", default=.5, type=float)
    parser.add_argument("--num-epochs", dest="num_epochs", default=10, type=int)
    parser.add_argument("--batch-size", dest="batch_size", default=256, type=int)
    parser.add_argument("--print-every", dest="print_every", default=10, type=int)
    parser.add_argument("--epoch-step", dest="epoch_step", default=5, type=int)
    parser.add_argument("--gamma", dest="gamma", default=0.5, type=float)
    parser.add_argument("--optimizer", default="SGD",
                        choices=list(optimizers.keys()), type=str)
    parser.add_argument("--context-filter", dest="context_filter",
                        type=str, default="default")
    parser.add_argument("scrape_file")
    parser.add_argument("save_file")
    args = parser.parse_args(args_list)
    print("Loading dataset...")

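    # Encode the scraped data with the char-fallback tokenizer and a fixed
    # budget of 100 keywords.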
    text_dataset = get_text_data(args)
    samples, tokenizer, embedding = encode_bag_classify_data(text_dataset,
                                                             tokenizers["char-fallback"],
                                                             100, 2)

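    # Train the linear classifier; each checkpoint carries the linear layer's
    # state together with the loss at that epoch.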
    checkpoints = train(samples, args.learning_rate,
                        args.num_epochs, args.batch_size,
                        embedding.num_tokens(), args.print_every,
                        args.gamma, args.epoch_step, args.optimizer)

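    # Write each checkpoint (plus human-readable training options) to the save
    # file, overwriting it so the latest epoch is kept.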
    for epoch, (linear_state, loss) in enumerate(checkpoints, start=1):
        state = {'epoch': epoch,
                 'text-encoder': tokenizer,
                 'linear-state': linear_state,
                 'stem-embeddings': embedding,
                 'options': [
                     ("# epochs", str(epoch)),
                     ("learning rate", str(args.learning_rate)),
                     ("batch size", str(args.batch_size)),
                     ("epoch step", str(args.epoch_step)),
                     ("gamma", str(args.gamma)),
                     ("dataset size", str(len(samples))),
                     ("optimizer", args.optimizer),
                     ("training loss", "{:10.2f}".format(loss)),
                     ("context filter", args.context_filter),
                 ]}
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".
                  format(epoch))
            torch.save(state, f)
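
For reference, here is a minimal sketch of reading back the checkpoint that the last script writes with torch.save. The dictionary keys match the state built in the loop above; the file name and the printing of the options list are illustrative assumptions, not part of the original code.

import torch

# Hypothetical path; in practice this is whatever was passed as save_file.
with open("classifier-checkpoint.dat", 'rb') as f:
    checkpoint = torch.load(f)

tokenizer = checkpoint['text-encoder']
embedding = checkpoint['stem-embeddings']
linear_state = checkpoint['linear-state']
print("Loaded checkpoint from epoch {}".format(checkpoint['epoch']))
for name, value in checkpoint['options']:
    print("{}: {}".format(name, value))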