def build_tasks_from_file(conf_path, options=None):
    if type(conf_path) is str:
        conf = Config.read(conf_path)
    elif type(conf_path) is Config:
        conf = conf_path
    else:
        raise TypeError('Unknown configuration type. Expect str or Config.')

    if options:
        for k, v in options:
            conf.update_value(k, v)

    # Create data sets
    logger.info('Loading data sets')
    datasets = {}
    lang_datasets = defaultdict(list)
    task_datasets = defaultdict(list)
    for dataset in conf.datasets:
        parser = create_parser(dataset.parser.format, dataset.parser)
        train_conf, dev_conf, test_conf = (
            dataset.clone(), dataset.clone(), dataset.clone())
        train_conf.update({'path': dataset.files.train, 'parser': parser})
        dev_conf.update({
            'path': dataset.files.dev, 'parser': parser, 'sample': None
        })
        train_dataset = create_dataset(dataset.type, train_conf)
        dev_dataset = create_dataset(dataset.type, dev_conf)
        test_dataset = None  # a data set may not provide a test split
        if hasattr(dataset.files, 'test'):
            test_conf.update({
                'path': dataset.files.test, 'parser': parser, 'sample': None
            })
            test_dataset = create_dataset(dataset.type, test_conf)
        datasets[dataset.name] = {
            'train': train_dataset,
            'dev': dev_dataset,
            'test': test_dataset,
            'language': dataset.language,
            'task': dataset.task
        }
        lang_datasets[dataset.language].append(dataset.name)
        task_datasets[dataset.task].append(dataset.name)

    # Create vocabs
    # Only words that appear in the data sets are kept to save memory.
    # If the model will be applied to an unknown test set, it is better to
    # keep all words in the pre-trained embeddings.
    logger.info('Creating vocabularies')
    dataset_counts = {}
    lang_token_vocabs = {}
    task_label_vocabs = {}
    for name, ds in datasets.items():
        dataset_counts[name] = compute_metadata(
            [ds['train'], ds['dev'], ds['test']])
    for lang, ds in lang_datasets.items():
        counts = [dataset_counts[d][0] for d in ds]
        lang_token_vocabs[lang] = count2vocab(counts, ignore_case=True,
                                              start_idx=2)
    for task, ds in task_datasets.items():
        counts = [dataset_counts[d][1] for d in ds]
        task_label_vocabs[task] = count2vocab(counts, ignore_case=False,
                                              start_idx=0, sort=True)
    char_vocab = count2vocab([c[2] for c in dataset_counts.values()],
                             ignore_case=False, start_idx=1)

    # Report stats
    for lang, vocab in lang_token_vocabs.items():
        logger.info('#{} token: {}'.format(lang, len(vocab)))
    for task, vocab in task_label_vocabs.items():
        logger.info('#{} label: {}'.format(task, len(vocab)))
        logger.info(vocab)

    # Numberize data sets
    logger.info('Numberizing data sets')
    numberize_conf = []
    for ds in datasets.values():
        numberize_conf.append((ds['train'], lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']], char_vocab))
        numberize_conf.append((ds['dev'], lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']], char_vocab))
        numberize_conf.append((ds['test'], lang_token_vocabs[ds['language']],
                               task_label_vocabs[ds['task']], char_vocab))
    numberize_datasets(numberize_conf,
                       token_ignore_case=True,
                       label_ignore_case=False,
                       char_ignore_case=False)

    # Initialize component confs
    logger.info('Initializing component configurations')
    word_embed_dim = char_embed_dim = lstm_output_dim = 0
    cpnt_confs = {}
    for cpnt in conf.components:
        if cpnt.model == 'embedding':
            cpnt.embedding_dim = cpnt.dimension
            word_embed_dim = cpnt.dimension
        elif cpnt.model == 'char_cnn':
            cpnt.vocab_size = len(char_vocab)
            char_embed_dim = sum([x[1] for x in cpnt.filters])
        elif cpnt.model == 'lstm':
            lstm_output_dim = cpnt.hidden_size * (2 if cpnt.bidirectional else 1)
        cpnt_confs[cpnt.name] = cpnt.clone()

    # Update component configurations
    target_task = ''
    target_lang = ''
    for task_conf in conf.tasks:
        language = task_conf.language
        task = task_conf.task
        if task_conf.get('ref', False):
            target_lang = language
            target_task = task
        model_conf = task_conf.model
        if model_conf.model != 'lstm_crf':
            continue
        # Update word embedding configuration
        cpnt_confs[model_conf.word_embed].num_embeddings = len(
            lang_token_vocabs[language])
        cpnt_confs[model_conf.word_embed].vocab = lang_token_vocabs[language]
        # Update output layer configuration
        cpnt_confs[model_conf.univ_layer].out_features = len(
            task_label_vocabs[task])
        if hasattr(model_conf, 'spec_layer'):
            cpnt_confs[model_conf.spec_layer].out_features = len(
                task_label_vocabs[task])
        # Update CRF configuration
        cpnt_confs[model_conf.crf].label_vocab = task_label_vocabs[task]

    for _, cpnt_conf in cpnt_confs.items():
        if cpnt_conf.model == 'linear' and cpnt_conf.position == 'output':
            cpnt_conf.in_features = lstm_output_dim
        if cpnt_conf.model == 'lstm':
            cpnt_conf.input_size = char_embed_dim + word_embed_dim
        if cpnt_conf.model == 'highway' and cpnt_conf.position == 'char':
            cpnt_conf.size = char_embed_dim

    # Create components
    logger.info('Creating components')
    components = {k: create_module(v.model, v) for k, v in cpnt_confs.items()}

    # Construct models
    tasks = []
    for task_conf in conf.tasks:
        model_conf = task_conf.model
        language = task_conf.language
        task = task_conf.task
        if model_conf.model == 'lstm_crf':
            model = LstmCrf(
                lang_token_vocabs[language],
                task_label_vocabs[task],
                char_vocab,
                word_embedding=components[model_conf.word_embed],
                char_embedding=components[model_conf.char_embed]
                if hasattr(model_conf, 'char_embed') else None,
                crf=components[model_conf.crf],
                lstm=components[model_conf.lstm],
                input_layer=None,
                univ_fc_layer=components[model_conf.univ_layer],
                spec_fc_layer=components[model_conf.spec_layer]
                if hasattr(model_conf, 'spec_layer') else None,
                embed_dropout_prob=model_conf.embed_dropout,
                lstm_dropout_prob=model_conf.lstm_dropout,
                linear_dropout_prob=model_conf.linear_dropout,
                char_highway=components[model_conf.char_highway]
                if hasattr(model_conf, 'char_highway') else None,
                use_char_embedding=model_conf.use_char_embedding
                if hasattr(model_conf, 'use_char_embedding') else True,
            )
        # elif model_conf.model == 'cbow':
        #     pass
        else:
            raise ValueError('Unknown model: {}'.format(model_conf.model))
        logger.debug(model)

        task_classes = {'ner': NameTagging, 'pos': PosTagging}
        if task in task_classes:
            task_obj = task_classes[task](
                task_conf.name,
                model,
                datasets=datasets[task_conf.dataset],
                vocabs={
                    'token': lang_token_vocabs[language],
                    'label': task_label_vocabs[task],
                    'char': char_vocab
                },
                gpu=task_conf.gpu,  # TODO: 'gpu' -> global config
                prob=getattr(task_conf, 'prob', 1.0),
                lr=getattr(task_conf, 'learning_rate', .001),
                momentum=getattr(task_conf, 'momentum', .9),
                decay_rate=getattr(task_conf, 'decay_rate', .9),
                decay_step=getattr(task_conf, 'decay_step', 10000),
                gradient_clipping=getattr(task_conf, 'gradient_clipping', 5.0),
                require_eval=getattr(task_conf, 'require_eval', True),
                ref=getattr(task_conf, 'ref', False),
                aux_task=task_conf.task != target_task,
                aux_lang=task_conf.language != target_lang,
            )
        else:
            raise ValueError('Unknown task: {}'.format(task))
        tasks.append(task_obj)

    return tasks, {
        'lang_token_vocabs': lang_token_vocabs,
        'task_token_vocabs': task_label_vocabs,
        'components': components
    }
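
# A hypothetical usage sketch, kept as a comment: the config path, the 'prob'
# attribute read below, and the train_step() call are illustrative assumptions,
# not part of this module. It sketches how the returned objects might be
# consumed: build every task from a single configuration file, then sample one
# task per training step according to its mixing probability.
#
#     tasks, resources = build_tasks_from_file('path/to/multitask.conf')
#     for step in range(num_steps):
#         task = random.choices(tasks, weights=[t.prob for t in tasks])[0]
#         task.train_step()
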
# Load datasets
logger.info('Loading datasets')
train_set = SequenceDataset(Config({
    'path': args.train, 'parser': conll_parser, 'batch_size': args.batch_size}))
dev_set = SequenceDataset(Config({
    'path': args.dev, 'parser': conll_parser}))
test_set = SequenceDataset(Config({
    'path': args.test, 'parser': conll_parser}))
datasets = {'train': train_set, 'dev': dev_set, 'test': test_set}

# Vocabs
logger.info('Building vocabularies')
token_count, label_count, char_count = compute_metadata(
    [train_set, dev_set, test_set])
token_vocab = count2vocab([token_count], start_idx=C.EMBED_START_IDX,
                          ignore_case=word_ignore_case)
label_vocab = count2vocab([label_count], start_idx=0, sort=True,
                          ignore_case=False)
char_vocab = count2vocab([char_count], ignore_case=False,
                         start_idx=C.CHAR_EMBED_START_IDX)

if embed_file:
    logger.info('Scanning pre-trained embeddings')
    token_vocab = {}
    with open(embed_file, 'r', encoding='utf-8') as embed_r:
        if args.embed_skip_first:
            embed_r.readline()
        for line in embed_r:
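            # The excerpt stops mid-loop; what follows is a minimal completion
            # sketch, assuming the usual "token dim1 dim2 ..." text-embedding
            # layout (the original loop body is not shown): keep the first
            # field of each line and assign it the next free index after the
            # reserved entries.
            token = line.rstrip().split(' ')[0]
            if token not in token_vocab:
                token_vocab[token] = len(token_vocab) + C.EMBED_START_IDX
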
                          skip_comment=True)
train_set = SeqLabelDataset(args.train, parser=parser)
dev_set = SeqLabelDataset(args.dev, parser=parser)
test_set = SeqLabelDataset(args.test, parser=parser)
datasets = {'train': train_set, 'dev': dev_set, 'test': test_set}

# Vocabs
logger.info('Building vocabs')
token_count, char_count, label_count = Counter(), Counter(), Counter()
for _, ds in datasets.items():
    tc, cc, lc = ds.stats()
    token_count.update(tc)
    char_count.update(cc)
    label_count.update(lc)
token_vocab = count2vocab(token_count, offset=len(C.TOKEN_PADS),
                          pads=C.TOKEN_PADS)
char_vocab = count2vocab(char_count, offset=len(C.CHAR_PADS),
                         pads=C.CHAR_PADS)
label_vocab = count2vocab(label_count, offset=1,
                          pads=[(C.PAD, C.PAD_INDEX)])
# Reverse vocabs for mapping indices back to surface forms
idx_token = {v: k for k, v in token_vocab.items()}
idx_label = {v: k for k, v in label_vocab.items()}

train_set.numberize(token_vocab, label_vocab, char_vocab)
dev_set.numberize(token_vocab, label_vocab, char_vocab)
test_set.numberize(token_vocab, label_vocab, char_vocab)

    'dev': dev_set_clct,
    'test': test_set_clct
}

# Vocabs
logger.info('Building vocabularies')
token_count_tgt, label_count_tgt, char_count_tgt = compute_metadata(
    [train_set_tgt, dev_set_tgt, test_set_tgt])
token_count_cl, label_count_cl, char_count_cl = compute_metadata(
    [train_set_cl, dev_set_cl, test_set_cl])
token_count_ct, label_count_ct, char_count_ct = compute_metadata(
    [train_set_ct, dev_set_ct, test_set_ct])
token_count_clct, label_count_clct, char_count_clct = compute_metadata(
    [train_set_clct, dev_set_clct, test_set_clct])
token_vocab_1 = count2vocab([token_count_tgt, token_count_ct],
                            start_idx=C.EMBED_START_IDX,
                            ignore_case=word_ignore_case)
token_vocab_2 = count2vocab([token_count_cl, token_count_clct],
                            start_idx=C.EMBED_START_IDX,
                            ignore_case=word_ignore_case)
label_vocab_1 = count2vocab([label_count_tgt, label_count_cl], start_idx=0)
label_vocab_2 = count2vocab([label_count_ct, label_count_clct], start_idx=0)
char_vocab = count2vocab(
    [char_count_tgt, char_count_cl, char_count_ct, char_count_clct],
    start_idx=C.CHAR_EMBED_START_IDX)

# Scan embedding file
if embed_file_1:
    logger.info('Scanning pre-trained embeddings for language 1')
    token_vocab_1 = {}
    with open(embed_file_1, 'r', encoding='utf-8') as embed_r:
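        # The excerpt stops here; a minimal completion sketch for the rest of
        # the block, assuming the same embedding-scan pattern as the
        # single-language script above (assumed, not shown in the original):
        # rebuild token_vocab_1 from the tokens listed in the language-1
        # embedding file.
        for line in embed_r:
            token = line.rstrip().split(' ')[0]
            if token not in token_vocab_1:
                token_vocab_1[token] = len(token_vocab_1) + C.EMBED_START_IDX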