Example #1
def main():
    Config = config.get_args()
    set_seed(Config.seed)
    word2ix, ix2word, max_len, avg_len = build_word_dict(Config.train_path)

    test_data = CommentDataSet(Config.test_path, word2ix, ix2word)
    test_loader = DataLoader(
        test_data,
        batch_size=16,
        shuffle=False,
        num_workers=0,
        collate_fn=mycollate_fn,
    )

    # placeholder embedding matrix; the trained weights are restored from the checkpoint below
    weight = torch.zeros(len(word2ix), Config.embedding_dim)

    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    criterion = nn.CrossEntropyLoss()
    model.load_state_dict(torch.load(Config.model_save_path),
                          strict=True)  # load the trained model weights

    confuse_meter = ConfuseMeter()
    confuse_meter = test(test_loader, device, model, criterion)
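
The DataLoader above relies on a custom mycollate_fn that this page does not show. A minimal sketch, assuming each CommentDataSet item is a (token_id_tensor, label) pair with an integer label, could pad the variable-length comments like this:

import torch
from torch.nn.utils.rnn import pad_sequence

def mycollate_fn(batch):
    # batch: list of (token_id_tensor, label) pairs from CommentDataSet
    data, labels = zip(*batch)
    lengths = torch.tensor([len(seq) for seq in data])
    # pad every comment to the length of the longest one in the batch
    padded = pad_sequence(list(data), batch_first=True, padding_value=0)
    return padded, torch.tensor(labels), lengths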
Example #2
def init_from_scratch(args, train_exs, dev_exs):
    """New model, new data, new dictionary."""
    # Create a feature dict out of the annotations in the data
    logger.info('-' * 100)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    # Build a dictionary from the data questions + documents (train/dev splits)
    logger.info('-' * 100)
    logger.info('Build word dictionary')
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(word_dict))    

    # Build a char dictionary from the data questions + documents (train/dev splits)
    logger.info('-' * 100)
    logger.info('Build char dictionary')
    char_dict = utils.build_char_dict(args, train_exs + dev_exs)
    logger.info('Num chars = %d' % len(char_dict))
    # Initialize model
    model = DocReader(config.get_model_args(args), word_dict, char_dict, feature_dict)

    # Load pretrained embeddings for words in dictionary
    if args.embedding_file:
        model.load_embeddings(word_dict.tokens(), args.embedding_file)
    if args.char_embedding_file:
        model.load_char_embeddings(char_dict.tokens(), args.char_embedding_file)

    return model
Example #3
def init_from_scratch(args, train_exs):
    print('init from scratch')
    print('building word vocabulary')
    word_dict = build_word_dict(args, train_exs)
    print('building char vocabulary')
    char_dict = build_char_dict(args, train_exs)
    model = TMmodel(args, word_dict, char_dict)
    model.load_word_embedding()
    model.load_char_embedding()
    return model
Example #4
    def __init__(self):

        # we treat x,y,z as OOV
        self.vocab_dict = build_word_dict(src_path, tgt_path)
        self.vocab_size = len(self.vocab_dict)
        # self.all_tokens['x'] = len(self.all_tokens)
        # self.all_tokens['y'] = len(self.all_tokens)
        # self.all_tokens['z'] = len(self.all_tokens)
        self.reverse_vocab_dict = dict(
            zip(self.vocab_dict.values(), self.vocab_dict.keys()))
        self.dataset = ToyDataset()
        self.tested_examples = ['cake i love']
Example #5
def prepare_dataloader(word_dict=None, feature_dict=None):
    """Create data loaders for train and dev"""
    # Load examples
    logger.info('-' * 100)
    logger.info('Loading Datasets...')
    toyfile = 'toy-' if conf['debug'] else ''
    datafile = os.path.join(
        conf['data-dir'], 'bioasq_processed',
        '{}examples-y{}-train.txt'.format(toyfile, conf['year']))
    train_ex = utils.load_data(datafile)
    logger.info('{} train examples loaded'.format(len(train_ex)))
    datafile = os.path.join(
        conf['data-dir'], 'bioasq_processed',
        '{}examples-y{}-test.txt'.format(toyfile, conf['year']))
    test_ex = utils.load_data(datafile)
    logger.info('{} test examples loaded'.format(len(test_ex)))

    # Prepare feature_dict, word_dict
    if feature_dict is None:
        if len(conf['features']) > 0:
            logger.info('Building feature dictionary...')
            feature_dict = utils.build_feature_dict(train_ex)
            if conf['idf-file'] is not None and 'idf' not in feature_dict:
                feature_dict['idf'] = len(feature_dict)
            logger.info('Num features = {}'.format(len(feature_dict)))
            logger.info(feature_dict)
    if word_dict is None:
        logger.info('Build word dictionary...')
        word_dict = utils.build_word_dict(train_ex + test_ex)
        logger.info('Num words = %d' % len(word_dict))
    conf['vocab-size'] = len(word_dict)

    # Prepare DataLoaders
    logger.info('-' * 100)
    logger.info('Creating DataLoaders')
    train_dataset = utils.QaProxDataset(conf, train_ex, word_dict,
                                        feature_dict, conf['idf-file'])
    train_loader_ = DataLoader(train_dataset,
                               batch_size=conf['batch-size'],
                               sampler=sampler.RandomSampler(train_dataset),
                               collate_fn=utils.batchify,
                               num_workers=conf['num-workers'],
                               pin_memory=conf['cuda'])
    dev_dataset = utils.QaProxDataset(conf, test_ex, word_dict, feature_dict,
                                      conf['idf-file'])
    dev_loader_ = DataLoader(dev_dataset,
                             batch_size=conf['batch-size'],
                             sampler=sampler.RandomSampler(dev_dataset),
                             collate_fn=utils.batchify,
                             num_workers=conf['num-workers'],
                             pin_memory=conf['cuda'])
    return train_loader_, dev_loader_, word_dict, feature_dict
Example #6
def main():
    Config = config.get_args()
    set_seed(Config.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    word2ix, ix2word, max_len, avg_len = build_word_dict(Config.train_path)
    # placeholder embedding matrix; the trained weights are restored from the checkpoint below
    weight = torch.zeros(len(word2ix), Config.embedding_dim)
    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)
    model.load_state_dict(torch.load(Config.model_save_path),
                          strict=True)  # load the trained model weights

    # comment_str = "忘不掉的一句台词,是杜邦公司笑着对男主说:“Sue me”。我记得前段时间某件事,也是同样的说辞,“欢迎来起诉中华有为”。也是同样的跋扈。若干年后,会看到改编的电影吗。"

    result = predict(Config.comment_str, model, device, word2ix)
    print(Config.comment_str, result)
Example #7
def init_from_scratch(args, train_exs, dev_exs):
    """New model, new data, new dictionary."""
    # Create a feature dict out of the annotations in the data
    logger.info('-' * 100)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    # Build a dictionary from the data questions + words (train/dev splits)
    logger.info('-' * 100)
    logger.info('Build dictionary')
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(word_dict))

    # Initialize model
    model = ParagraphRanker(config.get_model_args(args), word_dict, feature_dict)

    # Load pretrained embeddings for words in dictionary
    if args.embedding_file and not args.no_embed:
        model.load_embeddings(word_dict.tokens(), args.embedding_file, args.fasttext)

    return model
Example #8
with open(args.train_file, 'r') as f:
    train_exs = json.load(f)
    #train_exs=train_exs[:100]

with open(args.dev_file, 'r') as f:
    dev_exs = json.load(f)
    #dev_exs=dev_exs[:100]

with open(args.test_file, 'r') as f:
    test_exs = json.load(f)
    #test_exs=test_exs[:100]
# build dict
feature_dict = build_feature_dict(
    args, train_exs
)  # e.g. feature_dict['in_question']=0, ['in_question_uncased']=1, ['in_question_lemma']=2, ['pos=NN']=3, ['pos=IN']=4, ['pos=DT']=5, ...
word_dict = build_word_dict(args, train_exs, dev_exs + test_exs)
logger.info('Num words = %d' % len(word_dict))

# --------------------------------------------------------------------------
logger.info('-' * 100)
logger.info('Make data loaders')
# single ex vectorized
train_dataset = ReaderDataset(train_exs,
                              args,
                              word_dict,
                              feature_dict,
                              if_train=True)
# sampling strategy
if args.sort_by_len:
    train_sampler = SortedBatchSampler(train_dataset.lengths(),
                                       args.batch_size,
Example #9
                        help="RNN network depth.")
    parser.add_argument("--num_hidden",
                        type=int,
                        default=50,
                        help="RNN network size.")
    parser.add_argument("--keep_prob",
                        type=float,
                        default=0.5,
                        help="dropout keep prob.")
    parser.add_argument("--learning_rate",
                        type=float,
                        default=1e-3,
                        help="learning rate.")

    parser.add_argument("--batch_size",
                        type=int,
                        default=20,
                        help="batch size.")
    parser.add_argument("--num_epochs",
                        type=int,
                        default=30,
                        help="number of epochs.")
    args = parser.parse_args()

    train_file = "ptb_data/ptb.train.txt"
    test_file = "ptb_data/ptb.test.txt"
    word_dict = build_word_dict(train_file)
    train_data = build_dataset(train_file, word_dict)
    test_data = build_dataset(test_file, word_dict)

    train(train_data, test_data, len(word_dict), args)
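
This snippet calls build_word_dict and build_dataset without showing them. A minimal sketch of a frequency-ordered vocabulary builder for a plain-text corpus such as PTB, assuming whitespace-separated tokens and hypothetical special tokens, might look like:

import collections

def build_word_dict(path):
    # count whitespace-separated tokens and index them by descending frequency
    counter = collections.Counter()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            counter.update(line.split())
    word_dict = {"<pad>": 0, "<unk>": 1, "<eos>": 2}
    for word, _ in counter.most_common():
        if word not in word_dict:
            word_dict[word] = len(word_dict)
    return word_dict

build_dataset would then presumably map each token of the corpus to these indices before batching.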
Example #10
def main():
    Config = config.get_args()
    set_seed(Config.seed)
    word2ix, ix2word, max_len, avg_len = build_word_dict(Config.train_path)

    train_data = CommentDataSet(Config.train_path, word2ix, ix2word)
    train_loader = DataLoader(
        train_data,
        batch_size=16,
        shuffle=True,
        num_workers=0,
        collate_fn=mycollate_fn,
    )
    validation_data = CommentDataSet(Config.validation_path, word2ix, ix2word)
    validation_loader = DataLoader(
        validation_data,
        batch_size=16,
        shuffle=True,
        num_workers=0,
        collate_fn=mycollate_fn,
    )
    test_data = CommentDataSet(Config.test_path, word2ix, ix2word)
    test_loader = DataLoader(
        test_data,
        batch_size=16,
        shuffle=False,
        num_workers=0,
        collate_fn=mycollate_fn,
    )

    weight = pre_weight(len(word2ix), Config.pred_word2vec_path,
                        Config.embedding_dim, word2ix, ix2word)

    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    optimizer = optim.Adam(model.parameters(), lr=Config.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=10,
                                                gamma=0.1)  # decay the learning rate every 10 epochs
    criterion = nn.CrossEntropyLoss()

    # TensorBoard writes many event files, so clear the log directory before training
    if os.path.exists(Config.tensorboard_path):
        shutil.rmtree(Config.tensorboard_path)
        os.mkdir(Config.tensorboard_path)

    for epoch in range(Config.epochs):
        train_loader = tqdm(train_loader)
        train_loader.set_description(
            '[%s%04d/%04d %s%f]' %
            ('Epoch:', epoch + 1, Config.epochs, 'lr:', scheduler.get_lr()[0]))
        train(epoch, Config.epochs, train_loader, device, model, criterion,
              optimizer, scheduler, Config.tensorboard_path)
        validate(epoch, validation_loader, device, model, criterion,
                 Config.tensorboard_path)

    # save the trained model (make sure the save directory exists first)
    os.makedirs('./modelDict/', exist_ok=True)
    torch.save(model.state_dict(), Config.model_save_path)

    confuse_meter = ConfuseMeter()
    confuse_meter = test(test_loader, device, model, criterion)
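
The pre_weight helper used above is not shown on this page. A minimal sketch, assuming Config.pred_word2vec_path points to a binary word2vec file loadable with gensim and that its vector size equals embedding_dim, could fill the embedding matrix like this:

import torch
from gensim.models import KeyedVectors

def pre_weight(vocab_size, word2vec_path, embedding_dim, word2ix, ix2word):
    # copy pretrained word2vec vectors into an embedding matrix;
    # words missing from the word2vec vocabulary keep zero vectors
    wv = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    weight = torch.zeros(vocab_size, embedding_dim)
    for ix in range(vocab_size):
        word = ix2word[ix]
        if word in wv:
            weight[ix] = torch.from_numpy(wv[word].copy())
    return weight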
Example #11
def train(train_data_dir, test_data_dir, word_dict_path, label_dict_path,
          model_save_dir):
    """
    :params train_data_path: The path of training data, if this parameter
        is not specified, imdb dataset will be used to run this example
    :type train_data_path: str
    :params test_data_path: The path of testing data, if this parameter
        is not specified, imdb dataset will be used to run this example
    :type test_data_path: str
    :params word_dict_path: The path of word dictionary, if this parameter
        is not specified, imdb dataset will be used to run this example
    :type word_dict_path: str
    :params label_dict_path: The path of label dictionary, if this parameter
        is not specified, imdb dataset will be used to run this example
    :type label_dict_path: str
    :params model_save_dir: dir where models saved
    :type model_save_dir: str
    """
    if train_data_dir is not None:
        assert word_dict_path and label_dict_path, (
            "The parameters train_data_dir, word_dict_path, and "
            "label_dict_path should be set at the same time.")

    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    use_default_data = (train_data_dir is None)

    if use_default_data:
        logger.info(("No training data are porivided, "
                     "use imdb to train the model."))
        logger.info("Please wait to build the word dictionary ...")

        word_dict = reader.imdb_word_dict()
        train_reader = paddle.batch(paddle.reader.shuffle(
            lambda: reader.imdb_train(word_dict), buf_size=1000),
                                    batch_size=100)
        test_reader = paddle.batch(lambda: reader.imdb_test(word_dict),
                                   batch_size=100)
        class_num = 2
    else:
        if word_dict_path is None or not os.path.exists(word_dict_path):
            logger.info(("Word dictionary is not given, the dictionary "
                         "is automatically built from the training data."))

            # build the word dictionary to map the original string-typed
            # words into integer-typed index
            build_word_dict(data_dir=train_data_dir,
                            save_path=word_dict_path,
                            use_col=1,
                            cutoff_fre=0)

        if not os.path.exists(label_dict_path):
            logger.info(("Label dictionary is not given, the dictionary "
                         "is automatically built from the training data."))
            # build the label dictionary to map the original string-typed
            # label into integer-typed index
            build_label_dict(data_dir=train_data_dir,
                             save_path=label_dict_path,
                             use_col=0)

        word_dict = load_dict(word_dict_path)
        label_dict = load_dict(label_dict_path)

        class_num = len(label_dict)
        logger.info("Class number is : %d." % class_num)

        train_reader = paddle.batch(paddle.reader.shuffle(
            reader.train_reader(train_data_dir, word_dict, label_dict),
            buf_size=conf.buf_size),
                                    batch_size=conf.batch_size)

        if test_data_dir is not None:
            # because training and testing data share the same format, we
            # reuse reader.train_reader to read the testing data.
            test_reader = paddle.batch(paddle.reader.shuffle(
                reader.train_reader(test_data_dir, word_dict, label_dict),
                buf_size=conf.buf_size),
                                       batch_size=conf.batch_size)
        else:
            test_reader = None

    dict_dim = len(word_dict)

    logger.info("Length of word dictionary is : %d." % (dict_dim))

    paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)

    # create optimizer
    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=conf.learning_rate,
        regularization=paddle.optimizer.L2Regularization(
            rate=conf.l2_learning_rate),
        model_average=paddle.optimizer.ModelAverage(
            average_window=conf.average_window))

    # define network topology.
    cost, prob, label = nested_net(dict_dim, class_num, is_infer=False)

    # create all the trainable parameters.
    parameters = paddle.parameters.create(cost)

    # create the trainer instance.
    trainer = paddle.trainer.SGD(cost=cost,
                                 extra_layers=paddle.evaluator.auc(
                                     input=prob, label=label),
                                 parameters=parameters,
                                 update_equation=adam_optimizer)

    # feeding dictionary
    feeding = {"word": 0, "label": 1}

    def _event_handler(event):
        """
        Define the end batch and the end pass event handler.
        """
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % conf.log_period == 0:
                logger.info(
                    "Pass %d, Batch %d, Cost %f, %s\n" %
                    (event.pass_id, event.batch_id, event.cost, event.metrics))

        if isinstance(event, paddle.event.EndPass):
            if test_reader is not None:
                result = trainer.test(reader=test_reader, feeding=feeding)
                logger.info("Test at Pass %d, %s \n" %
                            (event.pass_id, result.metrics))
            with gzip.open(
                    os.path.join(model_save_dir,
                                 "params_pass_%05d.tar.gz" % event.pass_id),
                    "w") as f:
                trainer.save_parameter_to_tar(f)

    # begin training network
    trainer.train(reader=train_reader,
                  event_handler=_event_handler,
                  feeding=feeding,
                  num_passes=conf.num_passes)

    logger.info("Training has finished.")