def build_tasks_from_file(conf_path, options=None):
    if isinstance(conf_path, str):
        conf = Config.read(conf_path)
    elif isinstance(conf_path, Config):
        conf = conf_path
    else:
        raise TypeError('Unknown configuration type. Expect str or Config.')

    if options:
        for k, v in options:
            conf.update_value(k, v)

    # Create data sets
    logger.info('Loading data sets')
    datasets = {}
    lang_datasets = defaultdict(list)
    task_datasets = defaultdict(list)
    for dataset in conf.datasets:
        parser = create_parser(dataset.parser.format, dataset.parser)
        train_conf, dev_conf, test_conf = (
            dataset.clone(), dataset.clone(), dataset.clone())
        train_conf.update({'path': dataset.files.train, 'parser': parser})
        dev_conf.update({
            'path': dataset.files.dev,
            'parser': parser,
            'sample': None
        })
        train_dataset = create_dataset(dataset.type, train_conf)
        dev_dataset = create_dataset(dataset.type, dev_conf)
        # A data set may not provide a test split; default to None instead of
        # reusing a test_dataset left over from a previous iteration.
        test_dataset = None
        if hasattr(dataset.files, 'test'):
            test_conf.update({
                'path': dataset.files.test,
                'parser': parser,
                'sample': None
            })
            test_dataset = create_dataset(dataset.type, test_conf)
        datasets[dataset.name] = {
            'train': train_dataset,
            'dev': dev_dataset,
            'test': test_dataset,
            'language': dataset.language,
            'task': dataset.task
        }
        lang_datasets[dataset.language].append(dataset.name)
        task_datasets[dataset.task].append(dataset.name)

    # Create vocabs
    # Only words that appear in the data sets are kept to save memory.
    # If the model will be applied to an unknown test set, it is better to
    # keep all words in the pre-trained embeddings.
    logger.info('Creating vocabularies')
    dataset_counts = {}
    lang_token_vocabs = {}
    task_label_vocabs = {}
    for name, ds in datasets.items():
        dataset_counts[name] = compute_metadata(
            [ds['train'], ds['dev'], ds['test']])
    for lang, ds in lang_datasets.items():
        counts = [dataset_counts[d][0] for d in ds]
        lang_token_vocabs[lang] = count2vocab(
            counts, ignore_case=True, start_idx=2)
    for task, ds in task_datasets.items():
        counts = [dataset_counts[d][1] for d in ds]
        task_label_vocabs[task] = count2vocab(
            counts, ignore_case=False, start_idx=0, sort=True)
    char_vocab = count2vocab([c[2] for c in dataset_counts.values()],
                             ignore_case=False, start_idx=1)

    # Report stats
    for lang, vocab in lang_token_vocabs.items():
        logger.info('#{} token: {}'.format(lang, len(vocab)))
    for task, vocab in task_label_vocabs.items():
        logger.info('#{} label: {}'.format(task, len(vocab)))
        logger.info(vocab)

    # Numberize data sets
    logger.info('Numberizing data sets')
    numberize_conf = []
    for ds in datasets.values():
        numberize_conf.append((ds['train'],
                               lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']],
                               char_vocab))
        numberize_conf.append((ds['dev'],
                               lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']],
                               char_vocab))
        numberize_conf.append((ds['test'],
                               lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']],
                               char_vocab))
    numberize_datasets(numberize_conf,
                       token_ignore_case=True,
                       label_ignore_case=False,
                       char_ignore_case=False)

    # Initialize component confs
    logger.info('Initializing component configurations')
    word_embed_dim = char_embed_dim = lstm_output_dim = 0
    cpnt_confs = {}
    for cpnt in conf.components:
        if cpnt.model == 'embedding':
            cpnt.embedding_dim = cpnt.dimension
            word_embed_dim = cpnt.dimension
        elif cpnt.model == 'char_cnn':
            cpnt.vocab_size = len(char_vocab)
            char_embed_dim = sum([x[1] for x in cpnt.filters])
        elif cpnt.model == 'lstm':
            lstm_output_dim = cpnt.hidden_size * (2 if cpnt.bidirectional else 1)
        cpnt_confs[cpnt.name] = cpnt.clone()

    # Update component configurations
    target_task = ''
    target_lang = ''
    for task_conf in conf.tasks:
        language = task_conf.language
        task = task_conf.task
        if task_conf.get('ref', False):
            target_lang = language
            target_task = task
        model_conf = task_conf.model
        if model_conf.model != 'lstm_crf':
            continue
        # Update word embedding configuration
        cpnt_confs[model_conf.word_embed].num_embeddings = len(
            lang_token_vocabs[language])
        cpnt_confs[model_conf.word_embed].vocab = lang_token_vocabs[language]
        # Update output layer configuration
        cpnt_confs[model_conf.univ_layer].out_features = len(
            task_label_vocabs[task])
        if hasattr(model_conf, 'spec_layer'):
            cpnt_confs[model_conf.spec_layer].out_features = len(
                task_label_vocabs[task])
        # Update CRF configuration
        cpnt_confs[model_conf.crf].label_vocab = task_label_vocabs[task]

    for _, cpnt_conf in cpnt_confs.items():
        if cpnt_conf.model == 'linear' and cpnt_conf.position == 'output':
            cpnt_conf.in_features = lstm_output_dim
        if cpnt_conf.model == 'lstm':
            cpnt_conf.input_size = char_embed_dim + word_embed_dim
        if cpnt_conf.model == 'highway' and cpnt_conf.position == 'char':
            cpnt_conf.size = char_embed_dim

    # Create components
    logger.info('Creating components')
    components = {k: create_module(v.model, v) for k, v in cpnt_confs.items()}

    # Construct models
    tasks = []
    for task_conf in conf.tasks:
        model_conf = task_conf.model
        language = task_conf.language
        task = task_conf.task
        if model_conf.model == 'lstm_crf':
            model = LstmCrf(
                lang_token_vocabs[language],
                task_label_vocabs[task],
                char_vocab,
                word_embedding=components[model_conf.word_embed],
                char_embedding=components[model_conf.char_embed]
                if hasattr(model_conf, 'char_embed') else None,
                crf=components[model_conf.crf],
                lstm=components[model_conf.lstm],
                input_layer=None,
                univ_fc_layer=components[model_conf.univ_layer],
                # Check for 'spec_layer' (not 'spec_linear'), matching the
                # attribute that is actually read.
                spec_fc_layer=components[model_conf.spec_layer]
                if hasattr(model_conf, 'spec_layer') else None,
                embed_dropout_prob=model_conf.embed_dropout,
                lstm_dropout_prob=model_conf.lstm_dropout,
                linear_dropout_prob=model_conf.linear_dropout,
                char_highway=components[model_conf.char_highway]
                if hasattr(model_conf, 'char_highway') else None,
                use_char_embedding=model_conf.use_char_embedding
                if hasattr(model_conf, 'use_char_embedding') else True,
            )
        # elif model_conf.model == 'cbow':
        #     pass
        else:
            raise ValueError('Unknown model: {}'.format(model_conf.model))
        logger.debug(model)

        task_classes = {'ner': NameTagging, 'pos': PosTagging}
        if task in task_classes:
            task_obj = task_classes[task](
                task_conf.name,
                model,
                datasets=datasets[task_conf.dataset],
                vocabs={
                    'token': lang_token_vocabs[language],
                    'label': task_label_vocabs[task],
                    'char': char_vocab
                },
                gpu=task_conf.gpu,  # TODO: 'gpu' -> global config
                prob=getattr(task_conf, 'prob', 1.0),
                lr=getattr(task_conf, 'learning_rate', .001),
                momentum=getattr(task_conf, 'momentum', .9),
                decay_rate=getattr(task_conf, 'decay_rate', .9),
                decay_step=getattr(task_conf, 'decay_step', 10000),
                gradient_clipping=getattr(task_conf, 'gradient_clipping', 5.0),
                require_eval=getattr(task_conf, 'require_eval', True),
                ref=getattr(task_conf, 'ref', False),
                aux_task=task_conf.task != target_task,
                aux_lang=task_conf.language != target_lang,
            )
        else:
            raise ValueError('Unknown task {}'.format(task))
        tasks.append(task_obj)

    return tasks, {
        'lang_token_vocabs': lang_token_vocabs,
        'task_token_vocabs': task_label_vocabs,
        'components': components
    }
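

# --- Usage sketch (illustrative only) ---
# A minimal example of how build_tasks_from_file might be driven, assuming a
# configuration file readable by Config. The path 'cfg/multitask.conf' and the
# ('training.gpu', 0) override are hypothetical placeholders, not files or
# keys defined by this repository.
#
#   tasks, resources = build_tasks_from_file(
#       'cfg/multitask.conf',
#       options=[('training.gpu', 0)])
#   for task in tasks:
#       logger.info('Built task: %s', task)
#   components = resources['components']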
    lstm_crf.cuda()
else:
    lstm_crf.cpu()

# Load dataset
logger.info('Loading data')
conll_parser = ConllParser(Config({
    'separator': '\t',
    'token_col': 0,
    'label_col': 1,
    'skip_comment': True,
}))
test_set = SequenceDataset(Config({'path': data_file, 'parser': conll_parser}))
numberize_datasets([(test_set, token_vocab, label_vocab, char_vocab)],
                   token_ignore_case=train_args['word_ignore_case'],
                   label_ignore_case=False,
                   char_ignore_case=False)
idx_token = {idx: token for token, idx in token_vocab.items()}
idx_label = {idx: label for label, idx in label_vocab.items()}
idx_token[C.UNKNOWN_TOKEN_INDEX] = C.UNKNOWN_TOKEN

try:
    results = []
    dataset_loss = []
    for batch in test_set.get_dataset(gpu=use_gpu,
                                      shuffle_inst=False,
                                      batch_size=100):
        tokens, labels, chars, seq_lens, char_lens = batch
        pred, loss = lstm_crf.predict(tokens, labels, seq_lens,
                                      chars, char_lens)
        results.append((pred, labels, seq_lens, tokens))
            if token.lower() not in token_vocab:
                token_vocab[token.lower()] = len(token_vocab) \
                                             + C.EMBED_START_IDX
    except UnicodeDecodeError as e:
        logger.warning(e)

idx_token = {idx: token for token, idx in token_vocab.items()}
idx_label = {idx: label for label, idx in label_vocab.items()}
idx_token[C.UNKNOWN_TOKEN_INDEX] = C.UNKNOWN_TOKEN

# Numberize datasets
logger.info('Numberizing datasets')
numberize_datasets(
    [
        (train_set, token_vocab, label_vocab, char_vocab),
        (dev_set, token_vocab, label_vocab, char_vocab),
        (test_set, token_vocab, label_vocab, char_vocab),
    ],
    token_ignore_case=word_ignore_case,
    label_ignore_case=False,
    char_ignore_case=False
)

# Model components
logger.info('Building the model')
word_embed = Embedding(Config({
    'num_embeddings': len(token_vocab),
    'embedding_dim': args.word_embed_dim,
    'padding': C.EMBED_START_IDX,
    'padding_idx': 0,
    'sparse': True,
    'trainable': True,
    'file': embed_file,
    'clct': idx_label_2
}

# Numberize data sets
logger.info('Numberizing data sets')
numberize_datasets(
    [
        # Target task
        (train_set_tgt, token_vocab_1, label_vocab_1, char_vocab),
        (dev_set_tgt, token_vocab_1, label_vocab_1, char_vocab),
        (test_set_tgt, token_vocab_1, label_vocab_1, char_vocab),
        # Auxiliary task: Cross-lingual
        (train_set_cl, token_vocab_2, label_vocab_1, char_vocab),
        (dev_set_cl, token_vocab_2, label_vocab_1, char_vocab),
        (test_set_cl, token_vocab_2, label_vocab_1, char_vocab),
        # Auxiliary task: Cross-task
        (train_set_ct, token_vocab_1, label_vocab_2, char_vocab),
        (dev_set_ct, token_vocab_1, label_vocab_2, char_vocab),
        (test_set_ct, token_vocab_1, label_vocab_2, char_vocab),
        # Auxiliary task: Cross-lingual Cross-task
        (train_set_clct, token_vocab_2, label_vocab_2, char_vocab),
        (dev_set_clct, token_vocab_2, label_vocab_2, char_vocab),
        (test_set_clct, token_vocab_2, label_vocab_2, char_vocab),
    ],
    token_ignore_case=word_ignore_case,
    label_ignore_case=False,
    char_ignore_case=False)

# Model components
logger.info('Building the models')
word_embed_1 = Embedding(
    Config({