Example #1
def build_tasks_from_file(conf_path, options=None):
    if isinstance(conf_path, str):
        conf = Config.read(conf_path)
    elif isinstance(conf_path, Config):
        conf = conf_path
    else:
        raise TypeError('Unknown configuration type. Expected str or Config.')

    if options:
        for k, v in options:
            conf.update_value(k, v)

    # Create data sets
    logger.info('Loading data sets')
    datasets = {}
    lang_datasets = defaultdict(list)
    task_datasets = defaultdict(list)
    for dataset in conf.datasets:
        parser = create_parser(dataset.parser.format, dataset.parser)
        (train_conf, dev_conf,
         test_conf) = dataset.clone(), dataset.clone(), dataset.clone()
        train_conf.update({'path': dataset.files.train, 'parser': parser})
        dev_conf.update({
            'path': dataset.files.dev,
            'parser': parser,
            'sample': None
        })
        train_dataset = create_dataset(dataset.type, train_conf)
        dev_dataset = create_dataset(dataset.type, dev_conf)
        # Not every dataset defines a test split; default to None so later
        # iterations do not silently reuse the previous dataset's test set.
        test_dataset = None
        if hasattr(dataset.files, 'test'):
            test_conf.update({
                'path': dataset.files.test,
                'parser': parser,
                'sample': None
            })
            test_dataset = create_dataset(dataset.type, test_conf)
        datasets[dataset.name] = {
            'train': train_dataset,
            'dev': dev_dataset,
            'test': test_dataset,
            'language': dataset.language,
            'task': dataset.task
        }
        lang_datasets[dataset.language].append(dataset.name)
        task_datasets[dataset.task].append(dataset.name)

    # Create vocabs
    # Only words that appear in the data sets are kept to save memory.
    # If the model will be applied to an unseen test set, it is better to keep
    # all words from the pre-trained embeddings.
    logger.info('Creating vocabularies')
    dataset_counts = {}
    lang_token_vocabs = {}
    task_label_vocabs = {}
    for name, ds in datasets.items():
        dataset_counts[name] = compute_metadata(
            [d for d in (ds['train'], ds['dev'], ds['test']) if d is not None])
    for lang, ds in lang_datasets.items():
        counts = [dataset_counts[d][0] for d in ds]
        lang_token_vocabs[lang] = count2vocab(counts,
                                              ignore_case=True,
                                              start_idx=2)
    for task, ds in task_datasets.items():
        counts = [dataset_counts[d][1] for d in ds]
        task_label_vocabs[task] = count2vocab(counts,
                                              ignore_case=False,
                                              start_idx=0,
                                              sort=True)
    char_vocab = count2vocab([c[2] for c in dataset_counts.values()],
                             ignore_case=False,
                             start_idx=1)

    # Report stats
    for lang, vocab in lang_token_vocabs.items():
        logger.info('#{} token: {}'.format(lang, len(vocab)))
    for task, vocab in task_label_vocabs.items():
        logger.info('#{} label: {}'.format(task, len(vocab)))
        logger.info(vocab)

    # Numberize datasets
    logger.info('Numberizing data sets')
    numberize_conf = []
    for ds in datasets.values():
        for split in ('train', 'dev', 'test'):
            if ds[split] is None:
                continue
            numberize_conf.append((ds[split], lang_token_vocabs[ds['language']],
                                   task_label_vocabs[ds['task']], char_vocab))
    numberize_datasets(numberize_conf,
                       token_ignore_case=True,
                       label_ignore_case=False,
                       char_ignore_case=False)

    # Initialize component confs
    logger.info('Initializing component configurations')
    word_embed_dim = char_embed_dim = lstm_output_dim = 0
    cpnt_confs = {}
    for cpnt in conf.components:
        if cpnt.model == 'embedding':
            cpnt.embedding_dim = cpnt.dimension
            word_embed_dim = cpnt.dimension
        elif cpnt.model == 'char_cnn':
            cpnt.vocab_size = len(char_vocab)
            char_embed_dim = sum([x[1] for x in cpnt.filters])
        elif cpnt.model == 'lstm':
            lstm_output_dim = cpnt.hidden_size * (2
                                                  if cpnt.bidirectional else 1)
        cpnt_confs[cpnt.name] = cpnt.clone()

    # Update component configurations
    target_task = ''
    target_lang = ''
    for task_conf in conf.tasks:
        language = task_conf.language
        task = task_conf.task
        if task_conf.get('ref', False):
            target_lang = language
            target_task = task
        model_conf = task_conf.model
        if model_conf.model != 'lstm_crf':
            continue
        # Update word embedding configuration
        cpnt_confs[model_conf.word_embed].num_embeddings = len(
            lang_token_vocabs[language])
        cpnt_confs[model_conf.word_embed].vocab = lang_token_vocabs[language]
        # Update output layer configuration
        cpnt_confs[model_conf.univ_layer].out_features = len(
            task_label_vocabs[task])
        if hasattr(model_conf, 'spec_layer'):
            cpnt_confs[model_conf.spec_layer].out_features = len(
                task_label_vocabs[task])
        # Update CRF configuration
        cpnt_confs[model_conf.crf].label_vocab = task_label_vocabs[task]

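    # Second pass: fill in dimensions that depend on other components
    # (word/char embeddings feed the LSTM; the LSTM feeds the output layers).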
    for _, cpnt_conf in cpnt_confs.items():
        if cpnt_conf.model == 'linear' and cpnt_conf.position == 'output':
            cpnt_conf.in_features = lstm_output_dim
        if cpnt_conf.model == 'lstm':
            cpnt_conf.input_size = char_embed_dim + word_embed_dim
        if cpnt_conf.model == 'highway' and cpnt_conf.position == 'char':
            cpnt_conf.size = char_embed_dim

    # Create components
    logger.info('Creating components')
    components = {k: create_module(v.model, v) for k, v in cpnt_confs.items()}

    # Construct models
    tasks = []
    for task_conf in conf.tasks:
        model_conf = task_conf.model
        language = task_conf.language
        task = task_conf.task
        if model_conf.model == 'lstm_crf':
            model = LstmCrf(
                lang_token_vocabs[language],
                task_label_vocabs[task],
                char_vocab,
                word_embedding=components[model_conf.word_embed],
                char_embedding=components[model_conf.char_embed] if hasattr(
                    model_conf, 'char_embed') else None,
                crf=components[model_conf.crf],
                lstm=components[model_conf.lstm],
                input_layer=None,
                univ_fc_layer=components[model_conf.univ_layer],
                spec_fc_layer=components[model_conf.spec_layer] if hasattr(
                    model_conf, 'spec_layer') else None,
                embed_dropout_prob=model_conf.embed_dropout,
                lstm_dropout_prob=model_conf.lstm_dropout,
                linear_dropout_prob=model_conf.linear_dropout,
                char_highway=components[model_conf.char_highway] if hasattr(
                    model_conf, 'char_highway') else None,
                use_char_embedding=model_conf.use_char_embedding if hasattr(
                    model_conf, 'use_char_embedding') else True,
            )
        # elif model_conf.model == 'cbow':
        #     pass
        else:
            raise ValueError('Unknown model: {}'.format(model_conf.model))
        logger.debug(model)

        task_classes = {'ner': NameTagging, 'pos': PosTagging}
        if task in task_classes:
            task_obj = task_classes[task](
                task_conf.name,
                model,
                datasets=datasets[task_conf.dataset],
                vocabs={
                    'token': lang_token_vocabs[language],
                    'label': task_label_vocabs[task],
                    'char': char_vocab
                },
                gpu=task_conf.gpu,
                # TODO: 'gpu' -> global config
                prob=getattr(task_conf, 'prob', 1.0),
                lr=getattr(task_conf, 'learning_rate', .001),
                momentum=getattr(task_conf, 'momentum', .9),
                decay_rate=getattr(task_conf, 'decay_rate', .9),
                decay_step=getattr(task_conf, 'decay_step', 10000),
                gradient_clipping=getattr(task_conf, 'gradient_clipping', 5.0),
                require_eval=getattr(task_conf, 'require_eval', True),
                ref=getattr(task_conf, 'ref', False),
                aux_task=task_conf.task != target_task,
                aux_lang=task_conf.language != target_lang,
            )
        else:
            raise ValueError('Unknown task {}'.format(task))
        tasks.append(task_obj)

    return tasks, {
        'lang_token_vocabs': lang_token_vocabs,
        'task_label_vocabs': task_label_vocabs,
        'components': components
    }
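
A hypothetical invocation of this helper; the config path, option key, and unpacked names below are illustrative only, not taken from the project:

# Sketch: assumes the file is readable by Config.read and that `options`
# is an iterable of (key, value) pairs, as the loop at the top implies.
tasks, resources = build_tasks_from_file(
    'experiments/multi_task.yaml',
    options=[('training.gpu', 0)])
logger.info('Built %d tasks', len(tasks))
logger.info('Components: %s', list(resources['components']))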
Example #2
# Move the model to GPU or CPU depending on the runtime flag
if use_gpu:
    lstm_crf.cuda()
else:
    lstm_crf.cpu()

# Load dataset
logger.info('Loading data')
conll_parser = ConllParser(
    Config({
        'separator': '\t',
        'token_col': 0,
        'label_col': 1,
        'skip_comment': True,
    }))
test_set = SequenceDataset(Config({'path': data_file, 'parser': conll_parser}))
numberize_datasets([(test_set, token_vocab, label_vocab, char_vocab)],
                   token_ignore_case=train_args['word_ignore_case'],
                   label_ignore_case=False,
                   char_ignore_case=False)
idx_token = {idx: token for token, idx in token_vocab.items()}
idx_label = {idx: label for label, idx in label_vocab.items()}
idx_token[C.UNKNOWN_TOKEN_INDEX] = C.UNKNOWN_TOKEN

try:
    results = []
    dataset_loss = []
    for batch in test_set.get_dataset(gpu=use_gpu,
                                      shuffle_inst=False,
                                      batch_size=100):
        tokens, labels, chars, seq_lens, char_lens = batch
        pred, loss = lstm_crf.predict(tokens, labels, seq_lens, chars,
                                      char_lens)
        results.append((pred, labels, seq_lens, tokens))
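
Once every batch has been scored, the reverse maps built above can turn the collected indices back into label strings. A minimal decoding sketch, assuming `pred` holds one sequence of label indices per sentence (the snippet itself does not show its exact structure):

for pred, labels, seq_lens, tokens in results:
    for pred_seq, seq_len in zip(pred, seq_lens):
        # Cut each sequence at its true length and map indices to labels
        tags = [idx_label[int(i)] for i in pred_seq[:int(seq_len)]]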
Example #3
                if token.lower() not in token_vocab:
                    token_vocab[token.lower()] = len(token_vocab) \
                                                 + C.EMBED_START_IDX
            except UnicodeDecodeError as e:
                logger.warning(e)
idx_token = {idx: token for token, idx in token_vocab.items()}
idx_label = {idx: label for label, idx in label_vocab.items()}
idx_token[C.UNKNOWN_TOKEN_INDEX] = C.UNKNOWN_TOKEN

# Numberize datasets
logger.info('Numberizing datasets')
numberize_datasets(
    [
        (train_set, token_vocab, label_vocab, char_vocab),
        (dev_set, token_vocab, label_vocab, char_vocab),
        (test_set, token_vocab, label_vocab, char_vocab),
    ],
    token_ignore_case=word_ignore_case,
    label_ignore_case=False,
    char_ignore_case=False
)

# Model components
logger.info('Building the model')
word_embed = Embedding(Config({
    'num_embeddings': len(token_vocab),
    'embedding_dim': args.word_embed_dim,
    'padding': C.EMBED_START_IDX,
    'padding_idx': 0,
    'sparse': True,
    'trainable': True,
    'file': embed_file,
Example #4
    'clct': idx_label_2
}

# Numberize data sets
logger.info('Numberizing data sets')
numberize_datasets(
    [
        # Target task
        (train_set_tgt, token_vocab_1, label_vocab_1, char_vocab),
        (dev_set_tgt, token_vocab_1, label_vocab_1, char_vocab),
        (test_set_tgt, token_vocab_1, label_vocab_1, char_vocab),
        # Auxiliary task: Cross-lingual
        (train_set_cl, token_vocab_2, label_vocab_1, char_vocab),
        (dev_set_cl, token_vocab_2, label_vocab_1, char_vocab),
        (test_set_cl, token_vocab_2, label_vocab_1, char_vocab),
        # Auxiliary task: Cross-task
        (train_set_ct, token_vocab_1, label_vocab_2, char_vocab),
        (dev_set_ct, token_vocab_1, label_vocab_2, char_vocab),
        (test_set_ct, token_vocab_1, label_vocab_2, char_vocab),
        # Auxiliary task: Cross-lingual Cross-task
        (train_set_clct, token_vocab_2, label_vocab_2, char_vocab),
        (dev_set_clct, token_vocab_2, label_vocab_2, char_vocab),
        (test_set_clct, token_vocab_2, label_vocab_2, char_vocab),
    ],
    token_ignore_case=word_ignore_case,
    label_ignore_case=False,
    char_ignore_case=False)

# Model components
logger.info('Building the models')
word_embed_1 = Embedding(
    Config({