Example #1
def build_tasks_from_file(conf_path, options=None):
    if isinstance(conf_path, str):
        conf = Config.read(conf_path)
    elif isinstance(conf_path, Config):
        conf = conf_path
    else:
        raise TypeError('Unknown configuration type. Expected str or Config.')

    if options:
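        # `options` is expected to be an iterable of (key, value) pairs that
        # override values in the loaded configuration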
        for k, v in options:
            conf.update_value(k, v)

    # Create data sets
    logger.info('Loading data sets')
    datasets = {}
    lang_datasets = defaultdict(list)
    task_datasets = defaultdict(list)
    for dataset in conf.datasets:
        parser = create_parser(dataset.parser.format, dataset.parser)
        (train_conf, dev_conf,
         test_conf) = dataset.clone(), dataset.clone(), dataset.clone()
        train_conf.update({'path': dataset.files.train, 'parser': parser})
        dev_conf.update({
            'path': dataset.files.dev,
            'parser': parser,
            'sample': None
        })
        train_dataset = create_dataset(dataset.type, train_conf)
        dev_dataset = create_dataset(dataset.type, dev_conf)
        # Avoid a NameError (or silently reusing the previous dataset's test
        # split) when no test file is configured
        test_dataset = None
        if hasattr(dataset.files, 'test'):
            test_conf.update({
                'path': dataset.files.test,
                'parser': parser,
                'sample': None
            })
            test_dataset = create_dataset(dataset.type, test_conf)
        datasets[dataset.name] = {
            'train': train_dataset,
            'dev': dev_dataset,
            'test': test_dataset,
            'language': dataset.language,
            'task': dataset.task
        }
        lang_datasets[dataset.language].append(dataset.name)
        task_datasets[dataset.task].append(dataset.name)

    # Create vocabs
    # Only words that appear in the data sets are kept, to save memory.
    # If the model will be applied to an unseen test set, it is better to keep
    # all words from the pre-trained embeddings.
    logger.info('Creating vocabularies')
    dataset_counts = {}
    lang_token_vocabs = {}
    task_label_vocabs = {}
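    # compute_metadata() returns a (token_count, label_count, char_count) tuple
    # per dataset; the [0], [1], [2] indices below rely on that ordering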
    for name, ds in datasets.items():
        dataset_counts[name] = compute_metadata(
            [ds['train'], ds['dev'], ds['test']])
    for lang, ds in lang_datasets.items():
        counts = [dataset_counts[d][0] for d in ds]
        lang_token_vocabs[lang] = count2vocab(counts,
                                              ignore_case=True,
                                              start_idx=2)
    for task, ds in task_datasets.items():
        counts = [dataset_counts[d][1] for d in ds]
        task_label_vocabs[task] = count2vocab(counts,
                                              ignore_case=False,
                                              start_idx=0,
                                              sort=True)
    char_vocab = count2vocab([c[2] for c in dataset_counts.values()],
                             ignore_case=False,
                             start_idx=1)

    # Report stats
    for lang, vocab in lang_token_vocabs.items():
        logger.info('#{} token: {}'.format(lang, len(vocab)))
    for task, vocab in task_label_vocabs.items():
        logger.info('#{} label: {}'.format(task, len(vocab)))
        logger.info(vocab)

    # Numberize datasets
    logger.info('Numberizing data sets')
    numberize_conf = []
    for ds in datasets.values():
        numberize_conf.append((ds['train'], lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']], char_vocab))
        numberize_conf.append((ds['dev'], lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']], char_vocab))
        numberize_conf.append((ds['test'], lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']], char_vocab))
    numberize_datasets(numberize_conf,
                       token_ignore_case=True,
                       label_ignore_case=False,
                       char_ignore_case=False)

    # Initialize component confs
    logger.info('Initializing component configurations')
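    # These dimensions are collected while cloning the component configurations
    # and used further below to wire the input/output sizes of the linear, LSTM,
    # and highway components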
    word_embed_dim = char_embed_dim = lstm_output_dim = 0
    cpnt_confs = {}
    for cpnt in conf.components:
        if cpnt.model == 'embedding':
            cpnt.embedding_dim = cpnt.dimension
            word_embed_dim = cpnt.dimension
        elif cpnt.model == 'char_cnn':
            cpnt.vocab_size = len(char_vocab)
            char_embed_dim = sum([x[1] for x in cpnt.filters])
        elif cpnt.model == 'lstm':
            lstm_output_dim = cpnt.hidden_size * (2
                                                  if cpnt.bidirectional else 1)
        cpnt_confs[cpnt.name] = cpnt.clone()

    # Update component configurations
    target_task = ''
    target_lang = ''
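    # The task configuration flagged with `ref` is treated as the target
    # task/language; every other task is auxiliary (see aux_task/aux_lang below)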
    for task_conf in conf.tasks:
        language = task_conf.language
        task = task_conf.task
        if task_conf.get('ref', False):
            target_lang = language
            target_task = task
        model_conf = task_conf.model
        if model_conf.model != 'lstm_crf':
            continue
        # Update word embedding configuration
        cpnt_confs[model_conf.word_embed].num_embeddings = len(
            lang_token_vocabs[language])
        cpnt_confs[model_conf.word_embed].vocab = lang_token_vocabs[language]
        # Update output layer configuration
        cpnt_confs[model_conf.univ_layer].out_features = len(
            task_label_vocabs[task])
        if hasattr(model_conf, 'spec_layer'):
            cpnt_confs[model_conf.spec_layer].out_features = len(
                task_label_vocabs[task])
        # Update CRF configuration
        cpnt_confs[model_conf.crf].label_vocab = task_label_vocabs[task]

    for _, cpnt_conf in cpnt_confs.items():
        if cpnt_conf.model == 'linear' and cpnt_conf.position == 'output':
            cpnt_conf.in_features = lstm_output_dim
        if cpnt_conf.model == 'lstm':
            cpnt_conf.input_size = char_embed_dim + word_embed_dim
        if cpnt_conf.model == 'highway' and cpnt_conf.position == 'char':
            cpnt_conf.size = char_embed_dim

    # Create components
    logger.info('Creating components')
    components = {k: create_module(v.model, v) for k, v in cpnt_confs.items()}

    # Construct models
    tasks = []
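    # Build one model and one task wrapper per task configuration; components
    # are shared across tasks by referencing the same entries in `components`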
    for task_conf in conf.tasks:
        model_conf = task_conf.model
        language = task_conf.language
        task = task_conf.task
        if model_conf.model == 'lstm_crf':
            model = LstmCrf(
                lang_token_vocabs[language],
                task_label_vocabs[task],
                char_vocab,
                word_embedding=components[model_conf.word_embed],
                char_embedding=components[model_conf.char_embed] if hasattr(
                    model_conf, 'char_embed') else None,
                crf=components[model_conf.crf],
                lstm=components[model_conf.lstm],
                input_layer=None,
                univ_fc_layer=components[model_conf.univ_layer],
                spec_fc_layer=components[model_conf.spec_layer] if hasattr(
                    model_conf, 'spec_layer') else None,
                embed_dropout_prob=model_conf.embed_dropout,
                lstm_dropout_prob=model_conf.lstm_dropout,
                linear_dropout_prob=model_conf.linear_dropout,
                char_highway=components[model_conf.char_highway] if hasattr(
                    model_conf, 'char_highway') else None,
                use_char_embedding=model_conf.use_char_embedding if hasattr(
                    model_conf, 'use_char_embedding') else True,
            )
        # elif model_conf.model == 'cbow':
        #     pass
        else:
            raise ValueError('Unknown model: {}'.format(model_conf.model))
        logger.debug(model)

        task_classes = {'ner': NameTagging, 'pos': PosTagging}
        if task in task_classes:
            task_obj = task_classes[task](
                task_conf.name,
                model,
                datasets=datasets[task_conf.dataset],
                vocabs={
                    'token': lang_token_vocabs[language],
                    'label': task_label_vocabs[task],
                    'char': char_vocab
                },
                gpu=task_conf.gpu,
                # TODO: 'gpu' -> global config
                prob=getattr(task_conf, 'prob', 1.0),
                lr=getattr(task_conf, 'learning_rate', .001),
                momentum=getattr(task_conf, 'momentum', .9),
                decay_rate=getattr(task_conf, 'decay_rate', .9),
                decay_step=getattr(task_conf, 'decay_step', 10000),
                gradient_clipping=getattr(task_conf, 'gradient_clipping', 5.0),
                require_eval=getattr(task_conf, 'require_eval', True),
                ref=getattr(task_conf, 'ref', False),
                aux_task=task_conf.task != target_task,
                aux_lang=task_conf.language != target_lang,
            )
        else:
            raise ValueError('Unknown task {}'.format(task))
        tasks.append(task_obj)

    return tasks, {
        'lang_token_vocabs': lang_token_vocabs,
        'task_label_vocabs': task_label_vocabs,
        'components': components
    }
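
A minimal usage sketch (the configuration path and the override key below are hypothetical; only the call signature and return values come from the function above):

tasks, resources = build_tasks_from_file(
    'config/example.json',                   # hypothetical path to a Config file
    options=[('training.max_epoch', 20)])    # hypothetical (key, value) override
components = resources['components']
lang_token_vocabs = resources['lang_token_vocabs']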
Example #2
# Load datasets
logger.info('Loading datasets')
train_set = SequenceDataset(Config({
    'path': args.train, 'parser': conll_parser, 'batch_size': args.batch_size}))
dev_set = SequenceDataset(Config({
    'path': args.dev, 'parser': conll_parser}))
test_set = SequenceDataset(Config({
    'path': args.test, 'parser': conll_parser}))
datasets = {'train': train_set, 'dev': dev_set, 'test': test_set}

# Vocabs
logger.info('Building vocabularies')
token_count, label_count, char_count = compute_metadata(
    [train_set, dev_set, test_set])
token_vocab = count2vocab([token_count],
                          start_idx=C.EMBED_START_IDX,
                          ignore_case=word_ignore_case)
label_vocab = count2vocab([label_count],
                          start_idx=0,
                          sort=True,
                          ignore_case=False)
char_vocab = count2vocab([char_count],
                         ignore_case=False,
                         start_idx=C.CHAR_EMBED_START_IDX)
if embed_file:
    logger.info('Scanning pre-trained embeddings')
    token_vocab = {}
    with open(embed_file, 'r', encoding='utf-8') as embed_r:
        if args.embed_skip_first:
            embed_r.readline()
        for line in embed_r:
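            # Hypothetical continuation (the original example is cut off here):
            # rebuild token_vocab from the tokens that have pre-trained vectors,
            # assuming a whitespace-separated "<token> <v1> ... <vn>" line format
            token = line.rstrip().split(' ')[0]
            if word_ignore_case:
                token = token.lower()
            if token not in token_vocab:
                token_vocab[token] = len(token_vocab) + C.EMBED_START_IDX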
Example #3
                     skip_comment=True)
train_set = SeqLabelDataset(args.train, parser=parser)
dev_set = SeqLabelDataset(args.dev, parser=parser)
test_set = SeqLabelDataset(args.test, parser=parser)
datasets = {'train': train_set, 'dev': dev_set, 'test': test_set}

# Vocabs
logger.info('Building vocabs')
token_count, char_count, label_count = Counter(), Counter(), Counter()
for _, ds in datasets.items():
    tc, cc, lc = ds.stats()
    token_count.update(tc)
    char_count.update(cc)
    label_count.update(lc)
token_vocab = count2vocab(token_count,
                          offset=len(C.TOKEN_PADS),
                          pads=C.TOKEN_PADS)
char_vocab = count2vocab(char_count, offset=len(C.CHAR_PADS), pads=C.CHAR_PADS)
label_vocab = count2vocab(label_count, offset=1, pads=[(C.PAD, C.PAD_INDEX)])
# print("label_vocab: ", label_vocab)  # DEBUG
# idx_token = {v: k for k, v in token_vocab.items()}  # not debug
idx_token = {v: k for k, v in token_vocab.items() if k != ''}  # DEBUG
# print(idx_token)  # DEBUG
# print(idx_token.get(243, "not found"))  # DEBUG
# print([str(k) + " " + str(v) for k, v in idx_token.items()  # DEBUG
#        if (k == 1 or v == 1 or k == '1' or v == '1')])  # DEBUG
idx_label = {v: k for k, v in label_vocab.items()}
train_set.numberize(token_vocab, label_vocab, char_vocab)
dev_set.numberize(token_vocab, label_vocab, char_vocab)
test_set.numberize(token_vocab, label_vocab, char_vocab)
# print("numberized train set:")
Example #4
    'dev': dev_set_clct,
    'test': test_set_clct
}

# Vocabs
logger.info('Building vocabularies')
token_count_tgt, label_count_tgt, char_count_tgt = compute_metadata(
    [train_set_tgt, dev_set_tgt, test_set_tgt])
token_count_cl, label_count_cl, char_count_cl = compute_metadata(
    [train_set_cl, dev_set_cl, test_set_cl])
token_count_ct, label_count_ct, char_count_ct = compute_metadata(
    [train_set_ct, dev_set_ct, test_set_ct])
token_count_clct, label_count_clct, char_count_clct = compute_metadata(
    [train_set_clct, dev_set_clct, test_set_clct])
token_vocab_1 = count2vocab([token_count_tgt, token_count_ct],
                            start_idx=C.EMBED_START_IDX,
                            ignore_case=word_ignore_case)
token_vocab_2 = count2vocab([token_count_cl, token_count_clct],
                            start_idx=C.EMBED_START_IDX,
                            ignore_case=word_ignore_case)
label_vocab_1 = count2vocab([label_count_tgt, label_count_cl], start_idx=0)
label_vocab_2 = count2vocab([label_count_ct, label_count_clct], start_idx=0)
char_vocab = count2vocab(
    [char_count_tgt, char_count_cl, char_count_ct, char_count_clct],
    start_idx=C.CHAR_EMBED_START_IDX)
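# The suffixes apparently stand for target (tgt), cross-lingual (cl), cross-task
# (ct), and cross-lingual cross-task (clct) data sets: token vocabularies are
# shared by data sets in the same language, label vocabularies by data sets with
# the same task, and the character vocabulary by all four.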

# Scan embedding file
if embed_file_1:
    logger.info('Scanning pre-trained embeddings for language 1')
    token_vocab_1 = {}
    with open(embed_file_1, 'r', encoding='utf-8') as embed_r: