Example #1
def main(args):
    spacy_en = spacy.load('en_core_web_sm', disable=['vectors', 'textcat', 'tagger', 'parser', 'ner'])

    postprocess = str.lower if args.lower else lambda x: x

    def tokenizer(x):
        return [postprocess(token.text) for token in spacy_en(x) if not token.is_space]

    if args.only_question:
        indices = [1]
        desc = 'question'
    elif args.only_context:
        indices = [0]
        desc = 'context'
    else:
        indices = [0, 1]
        desc = 'question_context'

    basename, ext = os.path.splitext(args.vocab_path)
    min_freq = args.min_freq if args.min_freq else ''
    max_size = args.max_size if args.max_size else ''
    filename = f'{basename}_{desc}_min-freq{min_freq}_max_size{max_size}{ext}'

    squad_tokens = load_squad_tokens(args.train_path, tokenizer, indices=indices)
    Vocabulary.build(squad_tokens, args.min_freq, args.max_size, (PAD_TOKEN, UNK_TOKEN), filename)
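For concreteness, here is the filename the pattern above produces for one illustrative set of arguments (the values are assumptions, not from the source):

# vocab_path='vocab.pkl', min_freq=5, max_size=30000, neither --only_question
# nor --only_context set:
#   basename='vocab', ext='.pkl', desc='question_context'
#   filename == 'vocab_question_context_min-freq5_max_size30000.pkl'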
Example #2
    def __init__(self, model_path, device="cpu", max_len=50, verbose=1):
        super().__init__(model_path, device)
        self.max_len = max_len
        self.verbose = verbose

        self.vocab = Vocabulary()
        self.races = Races()
        self.genders = Genders()
        self.to_tensor = ToTensor()

        self.name_transform = Compose([self.vocab, OneHot(self.vocab.size), ToTensor()])
        self.race_transform = Compose([self.races, OneHot(self.races.size), ToTensor()])
        self.gender_transform = Compose([self.genders, OneHot(self.genders.size), ToTensor()])
Example #3
def predict(args, states):
    vocab = Vocabulary(config.vocab_file)
    model = CnnTextClassifier(len(vocab))
    model.load_state_dict(states["model"])
    if torch.cuda.is_available():
        model.cuda()

    for line in args.file:
        sequence = [vocab.token_to_id(t) for t in line.strip().split()]
        sequences = autograd.Variable(torch.LongTensor([sequence]))
        if torch.cuda.is_available():
            sequences = sequences.cuda()

        probs, classes = model(sequences)
        print(classes.data[0])
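Note that torch.autograd.Variable has been a no-op wrapper since PyTorch 0.4, so the loop body can be written directly against the current API; a minimal sketch with the same names (the .item() call assumes one predicted class per input):

    model.eval()
    for line in args.file:
        sequence = [vocab.token_to_id(t) for t in line.strip().split()]
        sequences = torch.LongTensor([sequence])
        if torch.cuda.is_available():
            sequences = sequences.cuda()
        with torch.no_grad():
            probs, classes = model(sequences)
        print(classes[0].item())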
Example #4
def main(args):
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)

    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    embeddings = np.load(embed_path) if os.path.exists(embed_path) else None

    model = FastQA(len(token_to_index), args.embed, args.hidden,
                   question_limit=args.q_len, context_limit=args.c_len,
                   dropout=args.dropout, pretrained_embeddings=embeddings,
                   with_feature=not args.without_feature).build()
    opt = Adam()
    model.compile(optimizer=opt, loss_weights=[1, 1, 0, 0],
                  loss=['sparse_categorical_crossentropy', 'sparse_categorical_crossentropy', None, None])
    train_dataset = SquadReader(args.train_path)
    dev_dataset = SquadReader(args.dev_path)
    tokenizer = get_tokenizer(lower=args.lower, as_str=False)
    converter = SquadConverter(token_to_index, PAD_TOKEN, UNK_TOKEN, tokenizer,
                               question_max_len=args.q_len, context_max_len=args.c_len)
    eval_converter = SquadEvalConverter(
        token_to_index, PAD_TOKEN, UNK_TOKEN, tokenizer,
        question_max_len=args.q_len, context_max_len=args.c_len)
    train_generator = Iterator(train_dataset, args.batch, converter)
    dev_generator_loss = Iterator(dev_dataset, args.batch, converter, shuffle=False)
    dev_generator_f1 = Iterator(dev_dataset, args.batch, eval_converter, repeat=False, shuffle=False)
    trainer = SquadTrainer(model, train_generator, args.epoch, dev_generator_loss,
                           './models/fastqa.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(FastQALRScheduler(
        dev_generator_f1, val_answer_file=args.answer_path, steps=args.steps))
    trainer.add_callback(FastQACheckpoint('./models/fastqa.{steps:06d}.h5', steps=args.steps))
    if args.use_tensorboard:
        trainer.add_callback(TensorBoard(log_dir='./graph', batch_size=args.batch))
    history = trainer.run()
    dump_graph(history, 'loss_graph.png')
Example #5
def load_vocab(vocab_file, max_word_length=None):
    if max_word_length:
        return UnicodeCharsVocabulary(vocab_file,
                                      max_word_length,
                                      validate_file=True)
    else:
        return Vocabulary(vocab_file, validate_file=True)
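A hypothetical call (the vocabulary path is illustrative):

word_vocab = load_vocab('data/vocab.txt')                      # plain Vocabulary
char_vocab = load_vocab('data/vocab.txt', max_word_length=50)  # UnicodeCharsVocabulary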
Example #6
def create_vocabs(opts, checkpointer):
    if checkpointer is not None:
        vocabs = checkpointer["vocabs"]
    else:
        vocabs = Vocabulary(opts.pretrain_files, opts.trainfile, opts.dim, True)
    return vocabs
Example #7
def main(args):
    token_to_index, _ = Vocabulary.load(args.vocab_file)

    model = FastQA(len(token_to_index),
                   args.embed,
                   args.hidden,
                   question_limit=args.q_len,
                   context_limit=args.c_len,
                   with_feature=not args.without_feature).build()
    model.load_weights(args.model_path)

    test_dataset = SquadReader(args.test_path)
    tokenizer = get_tokenizer(lower=args.lower, as_str=False)
    converter = SquadEvalConverter(token_to_index,
                                   PAD_TOKEN,
                                   UNK_TOKEN,
                                   tokenizer,
                                   question_max_len=args.q_len,
                                   context_max_len=args.c_len)
    test_generator = Iterator(test_dataset, args.batch, converter,
                              repeat=False, shuffle=False)
    predictions = {}
    for inputs, (contexts, ids) in test_generator:
        _, _, start_indices, end_indices = model.predict_on_batch(inputs)

        for i, (start, end) in enumerate(zip(start_indices, end_indices)):
            prediction = ' '.join(contexts[i][j]
                                  for j in range(start, end + 1))
            predictions[ids[i]] = prediction

    basename = osp.splitext(osp.basename(args.model_path))[0]
    save_path = osp.join(args.save_dir, f'predictions_{basename}.json')

    with open(save_path, 'w') as f:
        json.dump(predictions, f, indent=2)
Example #8
def load_vocab(word_file, char_file=None, max_word_length=None):
    if max_word_length:
        return CharsVocabulary(word_file,
                               char_file,
                               max_word_length,
                               validate_file=True)
    else:
        return Vocabulary(word_file, validate_file=True)
Example #9
def tensorize(ctxs: List[str], word_vocab: Vocabulary, path_vocab: Vocabulary):
    if len(ctxs) > config.MAX_LENGTH:
        ctxs = random.sample(ctxs, config.MAX_LENGTH)
    x_s = [0] * config.MAX_LENGTH
    path = [0] * config.MAX_LENGTH
    x_t = [0] * config.MAX_LENGTH
    for i in range(config.MAX_LENGTH):
        if i < len(ctxs):
            s, p, t = ctxs[i].split(',')
            p = str(java_string_hashcode(p))
        else:
            s, p, t = '<pad>', '<pad>', '<pad>'
        x_s[i] = word_vocab.lookup_idx(s)
        path[i] = path_vocab.lookup_idx(p)
        x_t[i] = word_vocab.lookup_idx(t)
    x_s = torch.LongTensor(x_s)[None, :]
    path = torch.LongTensor(path)[None, :]
    x_t = torch.LongTensor(x_t)[None, :]
    return x_s, path, x_t
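A hypothetical call (the context strings and the two vocabularies are stand-ins for whatever the surrounding project defines):

ctxs = ['get,a|b|c,name', 'set,d|e,value']  # 'source,path,target' triples
x_s, path, x_t = tensorize(ctxs, word_vocab, path_vocab)
# each returned tensor has shape (1, config.MAX_LENGTH)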
Example #10
    def __init__(self,
                 task='sentiment',
                 batch_size=32,
                 gaze_data=None,
                 et_predictor_model=None,
                 et_predictor_vocab=None,
                 use_predictor_vocab=False,
                 filter_vocab=False):
        self.batch_size = batch_size
        self.filter_vocab = filter_vocab
        self.use_gaze = gaze_data is not None

        _zuco = ZuCo(task=task)
        self.sentences = _zuco.sentences

        # this will be overridden if we're using a trained ET predictor:
        self.sentences_et = np.array(_zuco.sentences_et)
        self.max_seq_len = max([len(s) for s in self.sentences])

        # Initialize the ET features per sentence
        if et_predictor_model and et_predictor_vocab:
            print('\nReceived ET Predictor model and vocab. Vocabulary size:',
                  len(et_predictor_vocab))
            print('Running sentences through ET predictor...')
            indexed_sentences = et_predictor_vocab.index_sentences(
                self.sentences)
            self.sentences_et = et_predictor_model.sentences_to_et(
                indexed_sentences=indexed_sentences,
                max_seq_len=self.max_seq_len)

        # Initialize Vocabulary object
        print('\nuse_predictor_vocab =', use_predictor_vocab)
        if use_predictor_vocab:  # assuming that et_predictor_vocab is provided
            self.vocabulary = et_predictor_vocab
            self.indexed_sentences = indexed_sentences
        else:
            self.vocabulary = Vocabulary(self.sentences, filter_vocab)
            self.indexed_sentences = self.vocabulary.index_sentences(
                self.sentences)

        self.task_num = (1 if task == 'sentiment' else
                         2 if task == 'normal' else 3)
        self.load_labels()
        self.num_classes = len(set(self.labels))
Example #11
    def test_load(self):
        filename = '/path/to/vocab.pkl'
        open_ = patch('data.open', mock_open()).start()
        pickle_load = patch('data.pickle.load').start()
        pickle_load.return_value = ('token_to_index', 'index_to_token')
        token_to_index, index_to_token = Vocabulary.load(filename)
        self.assertEqual(token_to_index, 'token_to_index')
        self.assertEqual(index_to_token, 'index_to_token')
        open_.assert_called_with(filename, mode='rb')
        pickle_load.assert_called_with(open_.return_value)
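One caveat with the patch(...).start() style used above: the patches stay active after the test unless they are stopped; the standard companion (real unittest.mock API) is:

    def tearDown(self):
        patch.stopall()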
Example #12
    def _load_data(self, reverse, chars, bidirectional=False):
        if chars:
            vocab = UnicodeCharsVocabulary(self._tmp_vocab, 5)
        else:
            vocab = Vocabulary(self._tmp_vocab)

        if not bidirectional:
            data = LMDataset(self._tmp_train, vocab, reverse=reverse)
        else:
            data = BidirectionalLMDataset(self._tmp_train, vocab)

        return data
Example #13
def main(args):
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)

    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    embeddings = np.load(embed_path) if os.path.exists(embed_path) else None

    batch_size = args.batch  # Batch size for training.
    epochs = args.epoch  # Number of epochs to train for.
    converter = SquadDepConverter(token_to_index, PAD_TOKEN, UNK_TOKEN)

    if args.model == 'qanet':
        model = DependencyQANet(len(token_to_index),
                                args.embed,
                                len(converter._dep_to_index),
                                args.hidden,
                                args.num_heads,
                                dropout=args.dropout,
                                num_blocks=args.encoder_layer,
                                num_convs=args.encoder_conv,
                                embeddings=embeddings).build()
    elif args.model == 'lstm':
        model = DependencyLSTM(len(token_to_index),
                               args.embed,
                               len(converter._dep_to_index),
                               args.hidden,
                               dropout=args.dropout,
                               embeddings=embeddings).build()
    else:
        raise ValueError(f'unknown model: {args.model}')

    opt = Adam(lr=0.001, beta_1=0.8, beta_2=0.999, epsilon=1e-7, clipnorm=5.)
    model.compile(optimizer=opt,
                  loss=['sparse_categorical_crossentropy'],
                  metrics=['sparse_categorical_accuracy'])
    train_dataset = SquadReader(args.train_path)
    dev_dataset = SquadReader(args.dev_path)
    train_generator = Iterator(train_dataset, batch_size, converter)
    dev_generator = Iterator(dev_dataset, batch_size, converter)
    trainer = SquadTrainer(model, train_generator, epochs, dev_generator,
                           './model/dep.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(BatchLearningRateScheduler())
    trainer.add_callback(ExponentialMovingAverage(0.999))
    if args.use_tensorboard:
        trainer.add_callback(
            TensorBoard(log_dir='./graph', batch_size=batch_size))
    history = trainer.run()
    dump_graph(history, 'loss_graph.png')

    test_dataset = SquadReader(args.test_path)
    test_generator = Iterator(test_dataset, args.batch, converter,
                              repeat=False, shuffle=False)
    print(model.evaluate_generator(test_generator, steps=len(test_generator)))
Example #14
def main(args):
    tokenizer = get_tokenizer(lower=args.lower, as_str=True)

    if args.only_question:
        indices = [1]
        desc = 'question'
    elif args.only_context:
        indices = [0]
        desc = 'context'
    else:
        indices = [0, 1]
        desc = 'question_context'

    basename, ext = os.path.splitext(args.vocab_path)
    min_freq = args.min_freq if args.min_freq else ''
    max_size = args.max_size if args.max_size else ''
    filename = f'{basename}_{desc}_min-freq{min_freq}_max_size{max_size}{ext}'

    squad_tokens = load_squad_tokens(args.train_path,
                                     tokenizer,
                                     indices=indices)
    Vocabulary.build(squad_tokens, args.min_freq, args.max_size,
                     (PAD_TOKEN, UNK_TOKEN), filename)
Example #15
    def embedding(cls, Config):
        emb_dim = Config.emb_dim if Config.encoder == 'Recurrent' else Config.d_model
        embedding = nn.Embedding(Config.max_vocab, emb_dim)

        if Config.init:  # initialize embeddings with pre-dumped fastText embeddings
            emb_file = os.path.join(
                os.path.dirname(EMB_PATH),
                f"dump/initial_{Config.dataset}_{Config.max_vocab}_{Config.emb_dim}.pt")
            try:
                embedding.load_state_dict(torch.load(emb_file))
            except FileNotFoundError as e:
                raise FileNotFoundError(
                    "First store an embedding file "
                    f"'initial_{Config.dataset}_{Config.max_vocab}_{Config.emb_dim}.pt' "
                    "under embeddings/dump/") from e

        # should always be trainable, because a considerable number of embeddings
        # are initialized randomly
        embedding.weight.requires_grad = Config.trainable
        vocab = Vocabulary(Config.vocab_path, Config.max_vocab)
        return vocab, embedding
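For reference, one way to produce such a dump file, assuming (as the load_state_dict call above implies) that it stores an nn.Embedding state_dict; the sizes, the stand-in vectors, and the path are illustrative:

import numpy as np
import torch
import torch.nn as nn

max_vocab, emb_dim = 30000, 300
pretrained = np.random.randn(max_vocab, emb_dim).astype('float32')  # stand-in for real fastText vectors
emb = nn.Embedding(max_vocab, emb_dim)
with torch.no_grad():
    emb.weight.copy_(torch.from_numpy(pretrained))
torch.save(emb.state_dict(), 'dump/initial_mydata_30000_300.pt')  # hypothetical path; dump/ must exist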
Example #16
def main(config, local):
    # random seed
    random.seed(config.random_seed)
    np.random.seed(config.random_seed)
    torch.random.manual_seed(config.random_seed)
    if config.device == 'cuda':
        torch.cuda.manual_seed_all(config.random_seed)

    vocab = Vocabulary(config)
    print('Vocabulary loaded')
    feature = Feature(config)
    print('Feature data loaded')

    setattr(config, 'char_vocab_size', 0)
    setattr(config, 'class_size', 1)

    if config.mode == 'train':
        train_question_file_path = os.path.join(config.data_dir, config.train_file_name)
        train_label_file_path = os.path.join(config.data_dir, config.train_label_file_name)
        train_dataset = Dataset(train_question_file_path, train_label_file_path,
                                vocab, feature, mode='train')
        train_data_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)

        validation_question_file_path = os.path.join(config.data_dir, config.validation_file_name)
        validation_label_file_path = os.path.join(config.data_dir, config.validation_label_file_name)
        validation_dataset = Dataset(validation_question_file_path, validation_label_file_path,
                                     vocab, feature, mode='validation')
        validation_data_loader = DataLoader(validation_dataset, batch_size=config.batch_size)
    else:
        train_data_loader = None
        validation_data_loader = None
    print(f'{config.mode} Dataset loaded')

    trainer = Trainer(config, feature, train_data_loader, validation_data_loader)
    print('Trainer loaded')

    if nsml.IS_ON_NSML:
        bind_model(trainer.model, vocab, feature, config)

        if config.pause:
            nsml.paused(scope=local)

    if config.mode == 'train':
        print('Starting training')
        trainer.train()
        print('Finishing training')
Example #17
def main(args):
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)

    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    embeddings = np.load(embed_path) if os.path.exists(embed_path) else None

    batch_size = args.batch  # Batch size for training.
    epochs = args.epoch  # Number of epochs to train for.

    model = QANet(len(token_to_index),
                  args.embed,
                  args.hidden,
                  args.num_heads,
                  encoder_num_blocks=args.encoder_layer,
                  encoder_num_convs=args.encoder_conv,
                  output_num_blocks=args.output_layer,
                  output_num_convs=args.output_conv,
                  dropout=args.dropout,
                  embeddings=embeddings).build()
    opt = Adam(lr=0.001, beta_1=0.8, beta_2=0.999, epsilon=1e-7, clipnorm=5.)
    model.compile(optimizer=opt,
                  loss=[
                      'sparse_categorical_crossentropy',
                      'sparse_categorical_crossentropy', None, None
                  ],
                  loss_weights=[1, 1, 0, 0])
    train_dataset = SquadReader(args.train_path)
    dev_dataset = SquadReader(args.dev_path)
    converter = SquadConverter(token_to_index,
                               PAD_TOKEN,
                               UNK_TOKEN,
                               lower=args.lower)
    train_generator = Iterator(train_dataset, batch_size, converter)
    dev_generator = Iterator(dev_dataset, batch_size, converter)
    trainer = SquadTrainer(model, train_generator, epochs, dev_generator,
                           './model/qanet.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(BatchLearningRateScheduler())
    # trainer.add_callback(ExponentialMovingAverage(0.999))
    if args.use_tensorboard:
        trainer.add_callback(
            TensorBoard(log_dir='./graph', batch_size=batch_size))
    history = trainer.run()
    dump_graph(history, 'loss_graph.png')
Example #18
def make_cache(data_path,
               cache_path,
               image_dim,
               validation_only=False,
               verbose=True):
    q_dict = Vocabulary()
    cache_time = time()
    for name in ('train', 'val'):
        if name != 'val' and validation_only:
            continue
        question_time = time()
        if verbose:
            print(f'cache {name}')
        images = data_path / f'{name}2014'
        questions = json.load(
            (data_path / f'v2_OpenEnded_mscoco_{name}2014_questions.json'
             ).open('r'))['questions']
        for question in questions:
            q_dict.tokenize(question['question'], insert=True)
        if verbose:
            print(
                f'{len(questions)} questions and annotations cached in {time() - question_time:.2f}s'
            )
        if Path(cache_path / f'{name}_img.hdf5').is_file() and Path(
                cache_path / f'{name}_imgmap.pkl').is_file():
            continue
        img_cache_time = time()
        img_size = (image_dim, image_dim)
        n_images = len(list(images.glob('*')))
        img_dict = {}
        with h5py.File(cache_path / f'{name}_img.hdf5', 'w') as h5:
            img_data = h5.create_dataset('images',
                                         shape=(n_images, 3, image_dim,
                                                image_dim),
                                         dtype='i')
            for i, image in enumerate(images.glob('*')):
                if i % 10000 == 0 and verbose:
                    print(f'{i} images cached')
                img_id = int(image.name.replace('.jpg', '')[-12:])
                img_dict[img_id] = i
                img = numpy.array(
                    Image.open(image).resize(img_size).convert('RGB'))
                # note: reshape reinterprets the HWC buffer rather than
                # transposing it; true channel-first data would need
                # img.transpose(2, 0, 1)
                img_data[i, :] = img.reshape((3, image_dim, image_dim))
        with Path(cache_path / f'{name}_imgmap.pkl').open('wb') as f:
            pickle.dump(img_dict, f)
        if verbose:
            print(
                f'{n_images} images cached in {time() - img_cache_time:.2f}s')
    q_dict.save(cache_path / 'vocab.pkl')
    if verbose:
        print(f'data cached in {time() - cache_time:.2f}s')
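A hypothetical invocation (the paths are illustrative; the / joins above imply pathlib.Path arguments):

from pathlib import Path
make_cache(Path('data/vqa'), Path('cache'), image_dim=224)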
Example #19
def main(args):
    token_to_index, _ = Vocabulary.load(args.vocab_path)

    if os.path.exists(args.embed_array_path) and os.path.exists(
            args.embed_dict_path):
        with open(args.embed_dict_path, 'rb') as f:
            pretrained_token_to_index = pickle.load(f)
        embeddings = extract_embeddings(token_to_index,
                                        pretrained_token_to_index,
                                        np.load(args.embed_array_path))
    else:
        if os.path.exists(args.embed_path):
            pretrained_token_to_index, embeddings = save_word_embedding_as_npy(
                args.embed_path, args.dim)
            # align the pretrained matrix with this vocabulary, as in the
            # branch above
            embeddings = extract_embeddings(token_to_index,
                                            pretrained_token_to_index,
                                            embeddings)
        else:
            raise FileNotFoundError(
                'Please download pre-trained embedding file')
    root, _ = os.path.splitext(args.vocab_path)
    basepath, basename = os.path.split(root)
    filename = f'{basepath}/embedding_{basename}.npy'
    np.save(filename, embeddings)
Example #20
    def __init__(self,
                 args,
                 kwargs,
                 root_dir,
                 hidden_size,
                 lr,
                 epochs,
                 batch_size,
                 device,
                 logfile,
                 verbose=1):
        # note: the explicit parameters above are unused; every value is read
        # from the kwargs dict
        self.root_dir = kwargs['root_dir']
        self.device = kwargs['device']
        self.verbose = kwargs['verbose']
        self.logfile = kwargs['logfile']

        # Training params
        self.lr = kwargs['lr']
        self.epochs = kwargs['epochs']
        self.batch_size = kwargs['batch_size']

        # Model params
        self.hidden_size = kwargs['hidden_size']

        # Data params
        self.vocab = Vocabulary()
        self.races = Races()
        self.genders = Genders()

        # Initialization
        self.dataset = self.init_dataset()
        self.train_loder = self.init_loader()
        self.model = self.init_model()
        self.criterion = self.init_criterion()
        self.optimizer = self.init_optimizer()

        # Initialize logging
        self.logger = Logger(os.path.join(PROJECT_ROOT, self.logfile))
Example #21
    def __init__(self,
                 root_dir,
                 hidden_size,
                 lr,
                 epochs,
                 batch_size,
                 device,
                 logfile,
                 verbose=1):
        self.root_dir = root_dir
        self.device = device
        self.verbose = verbose
        self.logfile = logfile

        # Training params
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size

        # Model params
        self.hidden_size = hidden_size

        # Data params
        self.vocab = Vocabulary()
        self.races = Races()
        self.genders = Genders()

        # Initialization
        self.dataset = self.init_dataset()
        self.train_loder = self.init_loader()
        self.model = self.init_model()
        self.criterion = self.init_criterion()
        self.optimizer = self.init_optimizer()

        # Initialize logging
        self.logger = Logger(os.path.join(PROJECT_ROOT, logfile))
Example #22
def init():
    options = tf.app.flags.FLAGS
    os.environ['CUDA_VISIBLE_DEVICES'] = options.gpus

    if not options.model_dir:
        raise Exception('You need to specify --model_dir')
    if not options.vocab_path:
        options.vocab_path = os.path.join(options.model_dir, 'vocab.txt')
    if not options.n_senses_file:
        n_senses_file = os.path.join(options.model_dir, 'n_senses.txt')
        if os.path.exists(n_senses_file):
            options.n_senses_file = n_senses_file

    vocab = Vocabulary(options.vocab_path,
                       min_occurrences=options.min_occurrences_for_vocab)
    multisense_vocab = get_multisense_vocab(options.n_senses_file, vocab,
                                            options)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.log_device_placement = True
    tf_config.allow_soft_placement = True

    return options, vocab, multisense_vocab, tf_config
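A hypothetical way to consume the returned objects (the standard TF1 session pattern; everything beyond init() is illustrative):

options, vocab, multisense_vocab, tf_config = init()
with tf.Session(config=tf_config) as sess:
    pass  # build and run the graph here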
Example #23
        "model_name": "test",
        "stop_words_file": None, # use sub-sampling instead
        "n_epochs": 10,
        "data_path": "data/hansards/training.en",
        "use_cuda": False,
        "batch_size": 500,
    }
    #################
    # note: locals().update(...) is only effective at module scope; inside a
    # function, CPython ignores writes to locals()
    locals().update(params)

    stop_words = None
    if stop_words_file:
        stop_words = read_stop_words(stop_words_file)

    sentences = SentenceIterator(data_path, stop_words=stop_words)
    vocab = Vocabulary(sentences, max_size=vocab_size)

    sgm = SkipGramModel(vocab, embedding_dim, use_cuda=use_cuda)
    optimizer = optim.SparseAdam(sgm.parameters())

    tictoc = utils.TicToc()
    epoch_losses = []

    for epoch in np.arange(1, n_epochs + 1):
        print("Running epoch: ", epoch)
        epoch_loss = utils.Mean()
        for batch in batch_iterator(sentences, vocab, batch_size, n_negative):
            batch_center, batch_context, negative_words = batch
            optimizer.zero_grad()
            loss = sgm.forward(batch_center, batch_context, negative_words)
            loss.backward()    # backpropagate before stepping the optimizer
            optimizer.step()
            epoch_loss.add(loss.item())
Example #24
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    print()

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    local_rank = args.local_rank
    # world_size = torch.cuda.device_count() # assume all local GPUs

    # Set up distributed process group
    rank = setup_dist(local_rank)

    # Set up datasets.
    train_dataset = QADataset(args, args.train_path)
    dev_dataset = QADataset(args, args.dev_path)

    # Create vocabulary and tokenizer.
    vocabulary = Vocabulary(train_dataset.samples, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    print(f'vocab words = {len(vocabulary)}')

    # Print number of samples.
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    #model = model.to(rank)
    #model = DDP(model, device_ids=[rank], output_device=rank)

    num_pretrained = model.load_pretrained_embeddings(
        vocabulary, args.embedding_path
    )
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(
        f'initialized {num_pretrained}/{len(vocabulary)} '
        f'embeddings ({pct_pretrained}%)'
    )
    print()

    # device = torch.device(f'cuda:{rank}')
    model = model.to(rank)
    model = DDP(model, device_ids=[rank], output_device=rank)

    # if args.use_gpu:
    #     model = cuda(args, model)

    if args.resume and args.model_path:
        map_location = {"cuda:0": "cuda:{}".format(rank)}
        model.load_state_dict(torch.load(args.model_path, map_location=map_location))

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    print(model)
    print()

    if args.do_train:
        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            # Perform training and evaluation steps.
            try:
                train_loss = train(args, epoch, model, train_dataset)
            except RuntimeError:
                print(f'NCCL Wait Timeout, rank: \'{args.local_rank}\' (exit)')
                exit(1)
            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            if rank == 0:
                eval_history.append(eval_loss < best_eval_loss)
                if eval_loss < best_eval_loss:
                    best_eval_loss = eval_loss
                    torch.save(model.state_dict(), args.model_path)
                
                print(
                    f'epoch = {epoch} | '
                    f'train loss = {train_loss:.6f} | '
                    f'eval loss = {eval_loss:.6f} | '
                    f"{'saving model!' if eval_history[-1] else ''}"
                )

                # If early stopping conditions are met, stop training.
                if _early_stop(args, eval_history):
                    suffix = 's' if args.early_stop > 1 else ''
                    print(
                        f'no improvement after {args.early_stop} epoch{suffix}. '
                        'early stopping...'
                    )
                    print()
                    cleanup_dist()
                    break

    if args.do_test and rank == 0:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = (
            'python3 evaluate.py '
            f'--dataset_path {args.dev_path} '
            f'--output_path {args.output_path}'
        )
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
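setup_dist and cleanup_dist are called above but not shown; a minimal sketch of what such helpers typically look like with torch.distributed (an assumption, not the author's code):

import torch
import torch.distributed as dist

def setup_dist(local_rank):
    # expects MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE in the
    # environment, e.g. as set by torchrun
    dist.init_process_group(backend='nccl')
    torch.cuda.set_device(local_rank)
    return dist.get_rank()

def cleanup_dist():
    dist.destroy_process_group()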
Example #25
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    print()

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    # Set up datasets.
    train_dataset = QADataset(args, args.train_path)
    dev_dataset = QADataset(args, args.dev_path)

    # Create vocabulary and tokenizer.
    if args.vocab_path is not None:
        print(f'loading vocabulary from file at {args.vocab_path}')
        vocabulary = Vocabulary(train_dataset.samples,
                                args.vocab_size,
                                load_from_file=True,
                                filepath=args.vocab_path)
    else:
        print("constructing the vocab from dataset examples")
        vocabulary = Vocabulary(train_dataset.samples, args.vocab_size)

    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    args.char_vocab_size = vocabulary.numCharacters()
    print(f'vocab words = {len(vocabulary)}')
    print(f'num characters = {args.char_vocab_size}')

    # Print number of samples.
    num_train_samples = len(train_dataset)
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    num_pretrained = model.load_pretrained_embeddings(vocabulary,
                                                      args.embedding_path)
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(f'initialized {num_pretrained}/{len(vocabulary)} '
          f'embeddings ({pct_pretrained}%)')
    print()

    if args.use_gpu:
        model = cuda(args, model)

    # load the model from previous checkpoint
    if args.finetune >= 1:
        print("preparing to load {} as base model".format(args.init_model))
        model.load_state_dict(torch.load(args.init_model, map_location='cpu'))

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    print(model)
    print()

    if args.do_train:
        # create tensorboard summary writer
        train_writer = tb.SummaryWriter(
            log_dir=os.path.join(args.logdir, args.run + "_train"))
        valid_writer = tb.SummaryWriter(
            log_dir=os.path.join(args.logdir, args.run + "_valid"))

        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            # Perform training and evaluation steps.
            train_loss = train(args, epoch, model, train_dataset, train_writer,
                               num_train_samples)
            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # write the loss to tensorboard
            valid_writer.add_scalar("valid_loss", eval_loss, global_step=epoch)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            eval_history.append(eval_loss < best_eval_loss)
            if eval_loss < best_eval_loss:
                best_eval_loss = eval_loss
                torch.save(model.state_dict(), args.model_path)

            print(f'epoch = {epoch} | '
                  f'train loss = {train_loss:.6f} | '
                  f'eval loss = {eval_loss:.6f} | '
                  f"{'saving model!' if eval_history[-1] else ''}")

            # If early stopping conditions are met, stop training.
            if _early_stop(args, eval_history):
                suffix = 's' if args.early_stop > 1 else ''
                print(f'no improvement after {args.early_stop} epoch{suffix}. '
                      'early stopping...')
                print()
                break

    if args.do_test:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = ('python3 evaluate.py '
                    f'--dataset_path {args.dev_path} '
                    f'--output_path {args.output_path}')
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
Example #26
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNNs for Poetry Generation')
    # data arguments
    parser.add_argument('--datadir',
                        default='data',
                        help='path to dataset',
                        type=str)
    parser.add_argument('--rawdir',
                        default=None,
                        help='path to raw dataset',
                        type=str)
    parser.add_argument('--logdir',
                        default='log',
                        help='path to log',
                        type=str)
    parser.add_argument('--tag',
                        default='tang',
                        help='poetry type for the project.',
                        type=str)
    parser.add_argument('--wordnum',
                        default=5,
                        help='The number of poetry words in the sentences.',
                        type=int)
    parser.add_argument('--sentnum',
                        default=4,
                        help='The number of poetry sentences.',
                        type=int)
    parser.add_argument('--max-len',
                        default=20,
                        help='The maximum length of poetry titles.',
                        type=int)
    parser.add_argument('--embedding-dim',
                        default=300,
                        help='The dimension of embedding .',
                        type=int)
    parser.add_argument('--hidden-dim',
                        default=150,
                        help='The dimension of hidden .',
                        type=int)
    parser.add_argument('--num_layers',
                        default=2,
                        help='The rnn layers.',
                        type=int)
    parser.add_argument('--batch-size',
                        default=30,
                        help='The batch-size of the dataset.',
                        type=int)
    parser.add_argument('--data-workers',
                        type=int,
                        default=5,
                        help='Number of subprocesses for data loading')
    parser.add_argument('--epoches',
                        default=50,
                        help='The number of training epochs.',
                        type=int)
    parser.add_argument('--bidirectional',
                        action='store_true',
                        help='Whether using bidirectional RNNs')
    parser.add_argument('--lr',
                        default=0.001,
                        type=float,
                        metavar='LR',
                        help='initial learning rate')
    parser.add_argument('--seed',
                        default=123,
                        type=int,
                        help='random seed (default: 123)')
    cuda_parser = parser.add_mutually_exclusive_group(required=False)
    cuda_parser.add_argument('--cuda', dest='cuda', action='store_true')
    cuda_parser.add_argument('--no-cuda', dest='cuda', action='store_false')
    parser.set_defaults(cuda=True)
    args = parser.parse_args()
    # preparing log
    # logging definition
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    model_name = time.strftime("%Y%m%d%H%M", time.localtime(time.time()))
    log_dir = os.path.join(
        os.getcwd(),
        args.logdir,
    )
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    log_file = os.path.join(log_dir, model_name + ".log")
    fh = logging.FileHandler(log_file, mode="w")
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
    )
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info(args)
    args.cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    assert (args.rawdir is not None)
    # preparing dataset
    poetry_path = os.path.join(
        args.datadir,
        "poet.%s._%d_%d.json" % (args.tag, args.sentnum, args.wordnum))
    if os.path.exists(poetry_path):
        logger.info("The poetry dataset has been built in path: %s" %
                    poetry_path)
    else:
        logger.info("Preparing poetry...")
        processPoetry(args.rawdir,
                      args.datadir,
                      sentNum=args.sentnum,
                      wordsNum=args.wordnum,
                      max_title_len=args.max_len,
                      tag=args.tag)
        logger.info("Poetry processed!")
    # preparing vocabulary
    vocab_path = os.path.join(args.datadir, "vocab.txt")
    if os.path.exists(vocab_path):
        logger.info("The vocabulary has been built in path: %s" %
                    os.path.join(args.datadir, "vocab.txt"))
    else:
        logger.info("Building vocabulary...")
        build_vocabulary(args.rawdir, args.datadir)
        logger.info("The vocabulary has been built.")
    VocabDataSet = Vocabulary(vocab_path)

    PoetryDataSet = Poetry(VocabDataSet, args.max_len, poetry_path)

    # preparing model

    model = LSTMPoetry(vocab_size=len(VocabDataSet),
                       embedding_dim=args.embedding_dim,
                       hidden_dim=args.hidden_dim,
                       sents_len=args.sentnum,
                       num_layers=args.num_layers,
                       name=model_name)
    criterion = torch.nn.CrossEntropyLoss()
    model.to(device)
    criterion.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # training process
    logger.info("Begin training model!")
    train(model, PoetryDataSet, criterion, optimizer, args, device)
    logger.info("End training model!")
Example #27
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    # Set up datasets.
    train_dataset = QADataset(args, args.train_path, is_train=True)
    dev_dataset = QADataset(args, args.dev_path, is_train=False)
    print("Start creating vocabulary and tokenizer")

    # Create vocabulary and tokenizer.
    vocabulary = Vocabulary(
        train_dataset.samples + train_dataset.culled_samples, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    print(f'vocab words = {len(vocabulary)}')

    # Print number of samples.
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    num_pretrained = model.load_pretrained_embeddings(vocabulary,
                                                      args.embedding_path)
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(f'initialized {num_pretrained}/{len(vocabulary)} '
          f'embeddings ({pct_pretrained}%)')
    print()

    if args.use_gpu:
        model = cuda(args, model)

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')

    if args.do_train:
        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            # Perform training and evaluation steps.
            train_loss = train(args, epoch, model, train_dataset)
            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            eval_history.append(eval_loss < best_eval_loss)
            if eval_loss < best_eval_loss:
                best_eval_loss = eval_loss
                torch.save(model.state_dict(), args.model_path)

            print(f'epoch = {epoch} | '
                  f'train loss = {train_loss:.6f} | '
                  f'eval loss = {eval_loss:.6f} | '
                  f"{'saving model!' if eval_history[-1] else ''}")

            # If early stopping conditions are met, stop training.
            if _early_stop(args, eval_history):
                suffix = 's' if args.early_stop > 1 else ''
                print(f'no improvement after {args.early_stop} epoch{suffix}. '
                      'early stopping...')
                print()
                break

    if args.do_test:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = ('python3 evaluate.py '
                    f'--dataset_path {args.dev_path} '
                    f'--output_path {args.output_path}')
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
Example #28
def main(mode='test', question=None, answers=None):
    """
    This function is used to train, predict or test

    Args:
        mode (str): train/predict/test
        question (str): this contains the question
        answers (list): this contains list of answers in string format

    Returns:
        index (integer): index of the most likely answer
    """

    # get the train and predict models
    vocabulary = Vocabulary("./data/vocab_all.txt")
    embedding_file = "./data/word2vec_100_dim.embeddings"
    qa_model = QAModel()
    train_model, predict_model = qa_model.get_bilstm_model(
        embedding_file, len(vocabulary))

    epoch = 1
    if mode == 'train':
        for i in range(epoch):
            print('Training epoch', i)

            # load training data
            qa_data = QAData()
            questions, good_answers, bad_answers = qa_data.get_training_data()

            # train the model
            Y = np.zeros(shape=(questions.shape[0], ))
            train_model.fit([questions, good_answers, bad_answers],
                            Y,
                            epochs=1,
                            batch_size=64,
                            validation_split=0.1,
                            verbose=1)

            # save the trained model
            train_model.save_weights('model/train_weights_epoch_' +
                                     str(epoch) + '.h5',
                                     overwrite=True)
            predict_model.save_weights('model/predict_weights_epoch_' +
                                       str(epoch) + '.h5',
                                       overwrite=True)
    elif mode == 'predict':
        # load the evaluation data
        data = pickle.load(open("./data/dev.pkl", 'rb'))
        random.shuffle(data)

        # load weights from trained model
        qa_data = QAData()
        predict_model.load_weights('model/lstm_predict_weights_epoch_1.h5')

        c = 0
        c1 = 0
        for i, d in enumerate(data):
            print(i, len(data))

            # pad the data and get it in desired format
            indices, answers, question = qa_data.process_data(d)

            # get the similarity score
            sims = predict_model.predict([question, answers])

            n_good = len(d['good'])
            max_r = np.argmax(sims)           # highest-scoring answer overall
            max_n = np.argmax(sims[:n_good])  # highest-scoring among the good answers
            r = rankdata(sims, method='max')
            c += 1 if max_r == max_n else 0            # top-1 hit for precision
            c1 += 1 / float(r[max_r] - r[max_n] + 1)   # reciprocal-rank credit for MRR

        precision = c / float(len(data))
        mrr = c1 / float(len(data))
        print("Precision", precision)
        print("MRR", mrr)
    elif mode == 'test':
        # question and answers come from params
        qa_data = QAData()
        answers, question = qa_data.process_test_data(question, answers)

        # load weights from the trained model
        predict_model.load_weights('model/lstm_predict_weights_epoch_1.h5')

        # get similarity score
        sims = predict_model.predict([question, answers])
        max_r = np.argmax(sims)
        return max_r
        "stopping_loss": 1
    }
    #################

    locals().update(params)

    en_stop_words, fr_stop_words = None, None
    if en_stop_words_path:
        en_stop_words = read_stop_words(en_stop_words_path)

    if fr_stop_words_path:
        fr_stop_words = read_stop_words(fr_stop_words_path)

    en_sentences = SentenceIterator(en_data_path, stop_words=en_stop_words)
    fr_sentences = SentenceIterator(fr_data_path, stop_words=fr_stop_words)
    en_vocab = Vocabulary(en_sentences, max_size=vocab_x)
    fr_vocab = Vocabulary(fr_sentences, max_size=vocab_y)

    eam = EmbedAlignModel(en_vocab,
                          fr_vocab,
                          embedding_dim,
                          random_state=random_state,
                          use_cuda=use_cuda)

    optimizer = optim.Adam(eam.parameters())

    tictoc = utils.TicToc()
    epoch_losses = []
    for epoch in np.arange(1, n_epochs + 1):
        print("Running epoch: ", epoch)
        epoch_loss = utils.Mean()
Example #30
    head = '%(asctime)-15s %(message)s'
    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] if args.gpus else [mx.gpu()]
    ngpus = len(ctx)
    rescale_loss = args.bptt

    # logging
    logging.basicConfig(level=logging.INFO, format=head)
    logging.info(args)
    logging.debug(sys.argv)

    # seeding
    mx.random.seed(args.seed)
    np.random.seed(args.seed)

    # data
    vocab = Vocabulary.from_file(args.vocab)
    ntokens = vocab.num_tokens
    train_data = mx.io.PrefetchingIter(MultiSentenceIter(args.data, vocab,
                                       args.batch_size * ngpus, args.bptt))
    # model
    model = Model(args, ntokens, rescale_loss)
    train_loss_and_states = model.train()
    eval_loss_and_states = model.eval()

    # training module
    data_names, label_names = ['data', 'mask'], ['label']
    eval_state_names = model.state_names
    num_sample_names = len(model.sample_names)
    train_state_names = model.state_names + model.sample_names

    module = CustomModule(symbol=train_loss_and_states, context=ctx,
Example #31
class RNNLayerGenerator(Generator):
    def __init__(self, model_path, device="cpu", max_len=50, verbose=1):
        super().__init__(model_path, device)
        self.max_len = max_len
        self.verbose = verbose

        self.vocab = Vocabulary()
        self.races = Races()
        self.genders = Genders()
        self.to_tensor = ToTensor()

        self.name_transform = Compose([self.vocab, OneHot(self.vocab.size), ToTensor()])
        self.race_transform = Compose([self.races, OneHot(self.races.size), ToTensor()])
        self.gender_transform = Compose([self.genders, OneHot(self.genders.size), ToTensor()])

    def _init_random_input(self, skip_random_gen=None):
        """Helper function that initializes a random letter, race and gender."""
        skip_random_gen = skip_random_gen or []  # avoid the mutable-default-argument pitfall
        random_option = ['letter', 'race', 'gender']
        letter = ''
        gender = ''
        race = ''
        
        if not skip_random_gen:
            letter = np.random.choice(self.vocab.start_letters)
            race = np.random.choice(self.races.available_races)
            gender = np.random.choice(self.genders.available_genders)
        else:
            for opt in random_option:
                if opt not in skip_random_gen:
                    if opt == 'letter':  # '==', not 'is': string identity is not guaranteed
                        letter = np.random.choice(self.vocab.start_letters)
                    elif opt == 'race':
                        race = np.random.choice(self.races.available_races)
                    elif opt == 'gender':
                        gender = np.random.choice(self.genders.available_genders)
        return letter, race, gender

    def _transform_input(self, letter, race, gender):
        """Helper function to transform input into tensors"""
        letter_tensor = self.name_transform(letter).to(self.device)
        race_tensor = self.race_transform(race).to(self.device)
        gender_tensor = self.gender_transform(gender).to(self.device)

        return letter_tensor, race_tensor, gender_tensor

    def _expand_dims(self, *tensors):
        """Add dimension along 0-axis to tensors"""
        return [torch.unsqueeze(t, 0) for t in tensors]

    def sample(self, letter, race, gender):
        """Sample name from start letter, race and gender"""
        with torch.no_grad():
            assert letter in self.vocab.start_letters, "Invalid letter"
            assert race in self.races.available_races, "Invalid race"
            assert gender in self.genders.available_genders, "Invalid gender"

            # Prepare inputs
            letter_t, race_t, gender_t = self._transform_input(letter, race, gender)
            letter_t, race_t, gender_t = self._expand_dims(letter_t, race_t, gender_t)

            # Merge all input tensors
            input = torch.cat([letter_t, race_t, gender_t], 2)
            outputs = [letter]

            # Initialize hidden states
            hx, cx = self.model.init_states(batch_size=1, device=self.device)

            while True:
                output, hx, cx = self.model(input, hx, cx, lengths=torch.tensor([1]))

                sample = OneHotCategorical(logits=output).sample()
                index = torch.argmax(sample)
                char = self.vocab.get_char(index.item())

                if char == '.' or len(outputs) == self.max_len:
                    break

                outputs.append(char)
                input = torch.cat([sample, race_t, gender_t], 2)

            name = ''.join(map(str, outputs))
            return name

    def generate(self, num_samples, in_race, in_gender):
        """Sample random names"""
        gen_names = []
        ran_gen_names = []
        if in_race != '':  # compare strings with '!=', not 'is not'
            ran_gen_names.append('race')
        if in_gender != '':
            ran_gen_names.append('gender')
        
        for _ in range(num_samples):
            letter, race, gender = self._init_random_input(ran_gen_names)
            race = race + in_race
            gender = gender + in_gender
            gen_name = self.sample(letter, race, gender)
            gen_names.append([gen_name, race, gender])

        return gen_names
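A hypothetical usage (the model path and the empty race/gender strings are illustrative):

generator = RNNLayerGenerator('models/name_rnn.pt', device='cpu')
for name, race, gender in generator.generate(3, in_race='', in_gender=''):
    print(name, race, gender)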