Exemple #1
0
def init_from_scratch(args, train_exs, dev_exs):
    """Build a brand-new DocReader together with its dictionaries.

    The feature dictionary is built from ``train_exs`` only; the word and
    character dictionaries are built from ``train_exs + dev_exs``. Pretrained
    word / character embeddings are loaded into the model when
    ``args.embedding_file`` / ``args.char_embedding_file`` are truthy.

    Args:
        args: Run configuration; forwarded to the ``utils`` builders and to
            ``config.get_model_args``.
        train_exs: Training examples.
        dev_exs: Development examples.

    Returns:
        The freshly constructed ``DocReader``.
    """
    separator = '-' * 100

    # Feature dictionary (training split only).
    logger.info(separator)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    num_features = len(feature_dict)
    logger.info('Num features = %d' % num_features)
    logger.info(feature_dict)

    # Word dictionary (train + dev splits).
    logger.info(separator)
    logger.info('Build word dictionary')
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    num_words = len(word_dict)
    logger.info('Num words = %d' % num_words)

    # Character dictionary (train + dev splits).
    logger.info(separator)
    logger.info('Build char dictionary')
    char_dict = utils.build_char_dict(args, train_exs + dev_exs)
    num_chars = len(char_dict)
    logger.info('Num chars = %d' % num_chars)

    # Instantiate the model with the dictionaries built above.
    model_args = config.get_model_args(args)
    model = DocReader(model_args, word_dict, char_dict, feature_dict)

    # Optionally load pretrained embeddings for the dictionary tokens.
    word_embedding_path = args.embedding_file
    if word_embedding_path:
        model.load_embeddings(word_dict.tokens(), word_embedding_path)
    char_embedding_path = args.char_embedding_file
    if char_embedding_path:
        model.load_char_embeddings(char_dict.tokens(), char_embedding_path)

    return model
Exemple #2
0
def prepare_dataloader(word_dict=None, feature_dict=None):
    """Load the train/test examples and wrap them in data loaders.

    Dictionaries passed in by the caller are used as-is. A missing
    ``feature_dict`` is built from the training examples (only when
    ``conf['features']`` is non-empty), and a missing ``word_dict`` is built
    from the train and test examples together. ``conf['vocab-size']`` is
    always overwritten with the size of the word dictionary.

    Args:
        word_dict: Existing word dictionary, or ``None`` to build one.
        feature_dict: Existing feature dictionary, or ``None`` to build one.

    Returns:
        Tuple ``(train_loader, dev_loader, word_dict, feature_dict)``.
    """
    separator = '-' * 100

    # ---- Load examples ----------------------------------------------------
    logger.info(separator)
    logger.info('Loading Datasets...')
    name_prefix = 'toy-' if conf['debug'] else ''

    def read_split(filename_template, loaded_message):
        # Resolve the split's file under <data-dir>/bioasq_processed, load
        # it, and log how many examples were read.
        path = os.path.join(
            conf['data-dir'], 'bioasq_processed',
            filename_template.format(name_prefix, conf['year']))
        examples = utils.load_data(path)
        logger.info(loaded_message.format(len(examples)))
        return examples

    train_ex = read_split('{}examples-y{}-train.txt',
                          '{} train examples loaded')
    test_ex = read_split('{}examples-y{}-test.txt',
                         '{} test examples loaded')

    # ---- Dictionaries -----------------------------------------------------
    if feature_dict is None and len(conf['features']) > 0:
        logger.info('Building feature dictionary...')
        feature_dict = utils.build_feature_dict(train_ex)
        # Reserve the next free index for 'idf' when an idf file is
        # configured and the feature is not registered yet.
        if not (conf['idf-file'] is None or 'idf' in feature_dict):
            feature_dict['idf'] = len(feature_dict)
        logger.info('Num features = {}'.format(len(feature_dict)))
        logger.info(feature_dict)

    if word_dict is None:
        logger.info('Build word dictionary...')
        word_dict = utils.build_word_dict(train_ex + test_ex)
        logger.info('Num words = %d' % len(word_dict))

    conf['vocab-size'] = len(word_dict)

    # ---- Data loaders -----------------------------------------------------
    logger.info(separator)
    logger.info('Creating DataLoaders')

    def build_loader(examples):
        # NOTE(review): the dev loader is also sampled randomly, exactly as
        # the train loader is; confirm that this is intended for evaluation.
        dataset = utils.QaProxDataset(conf, examples, word_dict,
                                      feature_dict, conf['idf-file'])
        return DataLoader(dataset,
                          batch_size=conf['batch-size'],
                          sampler=sampler.RandomSampler(dataset),
                          collate_fn=utils.batchify,
                          num_workers=conf['num-workers'],
                          pin_memory=conf['cuda'])

    train_loader_ = build_loader(train_ex)
    dev_loader_ = build_loader(test_ex)
    return train_loader_, dev_loader_, word_dict, feature_dict
Exemple #3
0
def init_from_scratch(args, train_exs, dev_exs):
    """Build a brand-new ParagraphRanker together with its dictionaries.

    The feature dictionary is built from ``train_exs`` only and the word
    dictionary from ``train_exs + dev_exs``. Pretrained embeddings are loaded
    only when ``args.embedding_file`` is truthy and ``args.no_embed`` is not.

    Args:
        args: Run configuration; forwarded to the ``utils`` builders and to
            ``config.get_model_args``.
        train_exs: Training examples.
        dev_exs: Development examples.

    Returns:
        The freshly constructed ``ParagraphRanker``.
    """
    rule = '-' * 100

    # Feature dictionary (training split only).
    logger.info(rule)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    feature_count = len(feature_dict)
    logger.info('Num features = %d' % feature_count)
    logger.info(feature_dict)

    # Word dictionary (train + dev splits).
    logger.info(rule)
    logger.info('Build dictionary')
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    word_count = len(word_dict)
    logger.info('Num words = %d' % word_count)

    # Instantiate the model with the dictionaries built above.
    model_args = config.get_model_args(args)
    model = ParagraphRanker(model_args, word_dict, feature_dict)

    # Skip pretrained embeddings when no file is given or they are disabled.
    if not args.embedding_file or args.no_embed:
        return model

    model.load_embeddings(
        word_dict.tokens(), args.embedding_file, args.fasttext)
    return model
Exemple #4
0
# load data
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open(args.train_file, 'r', encoding='utf-8') as f:
    train_exs = json.load(f)

with open(args.dev_file, 'r', encoding='utf-8') as f:
    dev_exs = json.load(f)

with open(args.test_file, 'r', encoding='utf-8') as f:
    test_exs = json.load(f)

# For a quick debugging run, truncate each split right here,
# e.g. train_exs = train_exs[:100].

# build dict
# The feature dictionary is built from the training examples only. It maps
# feature names to indices, e.g. feature_dict['in_question'] = 0,
# ['in_question_uncased'] = 1, ['in_question_lemma'] = 2, ['pos=NN'] = 3,
# ['pos=IN'] = 4, ['pos=DT'] = 5, ...
feature_dict = build_feature_dict(args, train_exs)
# Dev and test examples are handed over together as one combined list.
word_dict = build_word_dict(args, train_exs, dev_exs + test_exs)
logger.info('Num words = %d' % len(word_dict))

# --------------------------------------------------------------------------
logger.info('-' * 100)
logger.info('Make data loaders')
# Dataset over the training examples, built with if_train=True.
train_dataset = ReaderDataset(train_exs,
                              args,
                              word_dict,
                              feature_dict,
                              if_train=True)
# sample stategy
if args.sort_by_len: