def build_all_vocabs(files, output_dir, prefix=''):
    """Build token, character, and label vocabularies from a list of CoNLL
    files and write them to <prefix>{token,char,label}.vocab.tsv in
    `output_dir`."""
    from data import ConllParser, NameTaggingDataset

    parser = ConllParser([3, -1], processor={0: C.TOKEN_PROCESSOR})

    # Accumulate token/character/label counts over all input files
    token_counter, char_counter, label_counter = Counter(), Counter(), Counter()
    for file in files:
        dataset = NameTaggingDataset(file, parser)
        tc, cc, lc = dataset.counters
        token_counter.update(tc)
        char_counter.update(cc)
        label_counter.update(lc)

    # Convert counters to vocabs (token/char vocabs reserve slots for pads)
    token_vocab = counter_to_vocab(token_counter,
                                   offset=len(C.TOKEN_PADS), pads=C.TOKEN_PADS)
    char_vocab = counter_to_vocab(char_counter,
                                  offset=len(C.CHAR_PADS), pads=C.CHAR_PADS)
    label_vocab = counter_to_vocab(label_counter)
    token_vocab = [(t, c) for t, c in token_vocab.items()]
    char_vocab = [(t, c) for t, c in char_vocab.items()]
    label_vocab = [(t, c) for t, c in label_vocab.items()]

    # Write each vocab as a tab-separated file
    with open(os.path.join(output_dir, '{}token.vocab.tsv'.format(prefix)),
              'w', encoding='utf-8') as w:
        for t, c in token_vocab:
            w.write('{}\t{}\n'.format(t, c))
    with open(os.path.join(output_dir, '{}char.vocab.tsv'.format(prefix)),
              'w', encoding='utf-8') as w:
        for t, c in char_vocab:
            w.write('{}\t{}\n'.format(t, c))
    with open(os.path.join(output_dir, '{}label.vocab.tsv'.format(prefix)),
              'w', encoding='utf-8') as w:
        for t, c in label_vocab:
            w.write('{}\t{}\n'.format(t, c))
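# Minimal usage sketch for build_all_vocabs (not part of the original code):
# the file paths, output directory, and prefix below are hypothetical
# placeholders.
#
#   files = ['data/eng/train.tsv', 'data/eng/dev.tsv', 'data/eng/test.tsv']
#   build_all_vocabs(files, output_dir='data/eng', prefix='eng.')
#
# This would write eng.token.vocab.tsv, eng.char.vocab.tsv, and
# eng.label.vocab.tsv under data/eng/.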
report_file.flush()

# Train
for dataset in datasets:
    best_model_file = os.path.join(output_dir,
                                   '{}.model.best.mdl'.format(dataset))
    dev_result_file = os.path.join(output_dir,
                                   '{}.result.dev.bio'.format(dataset))
    test_result_file = os.path.join(output_dir,
                                    '{}.result.test.bio'.format(dataset))
    logger.info('Output directory: {}'.format(output_dir))

    # data sets
    conll_parser = ConllParser(
        # use the 3rd and the last column
        [3, -1],
        # process the first selected column (the tokens, original column 3)
        # with C.TOKEN_PROCESSOR
        processor={0: C.TOKEN_PROCESSOR})
    train_set = NameTaggingDataset(
        os.path.join(args.input, dataset, '{}train.tsv'.format(args.prefix)),
        conll_parser, gpu=use_gpu)
    dev_set = NameTaggingDataset(
        os.path.join(args.input, dataset, '{}dev.tsv'.format(args.prefix)),
        conll_parser, gpu=use_gpu)
    test_set = NameTaggingDataset(
        os.path.join(args.input, dataset, '{}test.tsv'.format(args.prefix)),
        conll_parser, gpu=use_gpu)
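# Assumed input layout (illustrative only, inferred from the [3, -1] column
# selection above, not documented in this snippet): each data line in the
# *.tsv splits is tab-separated, the token is taken from column index 3 and
# the name-tagging label from the last column; other columns are ignored.
# A hypothetical line could look like:
#
#   doc-01  0  2  Chicago  NNP  (NP*)  B-GPE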
# Logging file
log_writer = None
if args.log:
    log_file = os.path.join(args.log, 'log.{}.txt'.format(timestamp))
    log_writer = open(log_file, 'a', encoding='utf-8')
    logger.addHandler(logging.FileHandler(log_file, encoding='utf-8'))
logger.info('----------')
logger.info('Parameters:')
for arg in vars(args):
    logger.info('{}: {}'.format(arg, getattr(args, arg)))
logger.info('----------')

# Data file
logger.info('Loading data sets')
parser = ConllParser(separator='\t', token_col=0, label_col=1,
                     skip_comment=True)
train_set = SeqLabelDataset(args.train, parser=parser)
dev_set = SeqLabelDataset(args.dev, parser=parser)
test_set = SeqLabelDataset(args.test, parser=parser)
datasets = {'train': train_set, 'dev': dev_set, 'test': test_set}

# Vocabs
logger.info('Building vocabs')
token_count, char_count, label_count = Counter(), Counter(), Counter()
for _, ds in datasets.items():
    tc, cc, lc = ds.stats()
    token_count.update(tc)
    char_count.update(cc)
    label_count.update(lc)
token_vocab = count2vocab(token_count,
    log_file = os.path.join(args.log, 'log.{}.txt'.format(timestamp))
    log_writer = open(log_file, 'a', encoding='utf-8')
    logger = get_logger(__name__, log_file=log_file)
else:
    logger = get_logger(__name__)
logger.info('----------')
logger.info('Parameters:')
for arg in vars(args):
    logger.info('{}: {}'.format(arg, getattr(args, arg)))
logger.info('----------')

# Parser for CoNLL format file
conll_parser = ConllParser(Config({
    'separator': '\t',
    'token_col': 0,
    'label_col': 1,
    'skip_comment': True,
}))

# Load datasets
logger.info('Loading datasets')
train_set = SequenceDataset(Config({
    'path': args.train,
    'parser': conll_parser,
    'batch_size': args.batch_size}))
dev_set = SequenceDataset(Config({
    'path': args.dev,
    'parser': conll_parser}))
test_set = SequenceDataset(Config({
    'path': args.test,
    'parser': conll_parser}))
datasets = {'train': train_set, 'dev': dev_set, 'test': test_set}

# Vocabs
logger.info('Building vocabularies')
logger.info('Output directory: {}'.format(output_dir))

# deterministic behavior
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

# set gpu device
use_gpu = torch.cuda.is_available()
if use_gpu:
    torch.cuda.set_device(args.device)
torch.set_num_threads(args.thread)

# data sets
conll_parser = ConllParser([3, -1], processor={0: C.TOKEN_PROCESSOR})
train_set = NameTaggingDataset(
    os.path.join(args.input, '{}train.tsv'.format(args.prefix)),
    conll_parser, gpu=use_gpu)
dev_set = NameTaggingDataset(
    os.path.join(args.input, '{}dev.tsv'.format(args.prefix)),
    conll_parser, gpu=use_gpu)
test_set = NameTaggingDataset(
    os.path.join(args.input, '{}test.tsv'.format(args.prefix)),
    conll_parser, gpu=use_gpu)

# embedding vocab
if args.embed_vocab:
    embed_vocab = load_vocab(args.embed_vocab)
else:
    embed_vocab = build_embedding_vocab(args.embed)
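# Note (not part of the original script): torch.cuda.manual_seed in the
# seeding block above only seeds the current GPU. For multi-GPU runs, and
# for bit-exact cuDNN determinism, one would additionally set, e.g.:
#
#   torch.cuda.manual_seed_all(args.seed)
#   torch.backends.cudnn.deterministic = True
#   torch.backends.cudnn.benchmark = False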
# Logging file
log_writer = None
if args.log:
    log_file = os.path.join(args.log, 'log.{}.txt'.format(timestamp))
    log_writer = open(log_file, 'a', encoding='utf-8')
    logger.addHandler(logging.FileHandler(log_file, encoding='utf-8'))
logger.info('----------')
logger.info('Parameters:')
for arg in vars(args):
    logger.info('{}: {}'.format(arg, getattr(args, arg)))
logger.info('----------')

# Data file
logger.info('Loading data sets')
ner_parser = ConllParser(skip_comment=True)
pos_parser = ConllParser(token_col=1, label_col=3, skip_comment=True)
train_set_tgt = SeqLabelDataset(args.train_tgt, parser=ner_parser)
dev_set_tgt = SeqLabelDataset(args.dev_tgt, parser=ner_parser)
test_set_tgt = SeqLabelDataset(args.test_tgt, parser=ner_parser)
train_set_cl = SeqLabelDataset(args.train_cl, parser=ner_parser)
dev_set_cl = SeqLabelDataset(args.dev_cl, parser=ner_parser)
test_set_cl = SeqLabelDataset(args.test_cl, parser=ner_parser)
train_set_ct = SeqLabelDataset(args.train_ct, parser=pos_parser)
dev_set_ct = SeqLabelDataset(args.dev_ct, parser=pos_parser)
test_set_ct = SeqLabelDataset(args.test_ct, parser=pos_parser)
train_set_clct = SeqLabelDataset(args.train_clct, parser=pos_parser)
# Logging file
log_writer = None
if args.log:
    log_file = os.path.join(args.log, 'log.{}.txt'.format(timestamp))
    log_writer = open(log_file, 'a', encoding='utf-8')
    logger.addHandler(logging.FileHandler(log_file, encoding='utf-8'))
logger.info('----------')
logger.info('Parameters:')
for arg in vars(args):
    logger.info('{}: {}'.format(arg, getattr(args, arg)))
logger.info('----------')

# Data file
logger.info('Loading data sets')
ner_parser = ConllParser(skip_comment=True, separator='\t')
train_set_tgt = SeqLabelDataset(args.train_tgt, parser=ner_parser)
dev_set_tgt = SeqLabelDataset(args.dev_tgt, parser=ner_parser)
test_set_tgt = SeqLabelDataset(args.test_tgt, parser=ner_parser)
train_set_cl = SeqLabelDataset(args.train_cl, parser=ner_parser)
dev_set_cl = SeqLabelDataset(args.dev_cl, parser=ner_parser)
test_set_cl = SeqLabelDataset(args.test_cl, parser=ner_parser)
datasets = {
    'tgt': {
        'train': train_set_tgt,
        'dev': dev_set_tgt,
        'test': test_set_tgt
    },
    char_highway=char_hw if train_args['use_highway'] else None)
word_embed.load_state_dict(state['model']['word_embed'])
char_embed.load_state_dict(state['model']['char_embed'])
char_hw.load_state_dict(state['model']['char_hw'])
lstm.load_state_dict(state['model']['lstm'])
crf.load_state_dict(state['model']['crf'])
linear.load_state_dict(state['model']['linear'])
lstm_crf.load_state_dict(state['model']['lstm_crf'])
if use_gpu:
    lstm_crf.cuda()

# Load dataset
logger.info('Loading data')
parser = ConllParser()
test_set = SeqLabelDataset(data_file, parser=parser)
test_set.numberize(token_vocab, label_vocab, char_vocab)
idx_token = {v: k for k, v in token_vocab.items()}
idx_label = {v: k for k, v in label_vocab.items()}
processor = SeqLabelProcessor(gpu=use_gpu)

try:
    results = []
    dataset_loss = []
    for batch in DataLoader(test_set, batch_size=50, shuffle=False,
                            collate_fn=processor.process):
        tokens, labels, chars, seq_lens, char_lens = batch
        pred, loss = lstm_crf.predict(tokens, labels, seq_lens, chars,
    log_writer = open(log_file, 'a', encoding='utf-8')
    logger = get_logger(__name__, log_file=log_file)
else:
    logger = get_logger(__name__)
logger.info('----------')
logger.info('Parameters:')
for arg in vars(args):
    logger.info('{}: {}'.format(arg, getattr(args, arg)))
logger.info('----------')

# Parser for CoNLL format file
name_tagging_parser = ConllParser(
    Config({
        'separator': '\t',
        'token_col': 0,
        'label_col': 1,
        'skip_comment': True,
    }))
pos_tagging_parser = ConllParser(
    Config({
        'separator': '\t',
        'token_col': 1,
        'label_col': 3,
        'skip_comment': True,
    }))

# Load data sets
logger.info('Loading data sets')
datasets = {}
train_set_tgt = SequenceDataset(