def main():
    """Extract per-layer encoder embeddings and dump them as JSON lines.

    Loads an MT-DNN checkpoint, runs the encoder over the processed input
    data, and writes one JSON object per sample to ``args.foutput``
    containing the (stringified) hidden states of each requested layer,
    plus the sample's tokens and uid.
    """
    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()

    # Which encoder layers to dump, e.g. "--layers 0,11".
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)

    # process data
    data, is_single_sentence = process_data(args)
    data_type = (DataFormat.PremiseOnly if is_single_sentence
                 else DataFormat.PremiseAndOneHypothesis)
    collater = Collater(gpu=args.cuda, is_train=False, data_type=data_type)
    batcher = DataLoader(data,
                         batch_size=args.batch_size,
                         collate_fn=collater.collate_fn,
                         pin_memory=args.cuda)
    opt = vars(args)

    # load model
    if os.path.exists(args.checkpoint):
        # map_location lets a CPU-only host load a checkpoint saved on GPU.
        state_dict = torch.load(args.checkpoint,
                                map_location=None if args.cuda else 'cpu')
        config = state_dict['config']
        config['dump_feature'] = True
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error(
            'Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        return
    num_all_batches = len(batcher)
    model = MTDNNModel(
        opt, state_dict=state_dict, num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()

    features_dict = {}
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta, batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [all_encoder_layers[idx].detach().cpu().numpy()
                      for idx in layer_indexes]
        uids = batch_meta['uids']
        masks = batch_data[batch_meta['mask']].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            # Number of non-padding tokens for this sample.
            slen = sum(masks[idx])
            features = {}
            for yidx, layer in enumerate(layer_indexes):
                features[layer] = str(embeddings[yidx][idx][:slen].tolist())
            features_dict[uid] = features

    # save features
    with open(args.foutput, 'w', encoding='utf-8') as writer:
        for sample in data:
            uid = sample['uid']
            tokens = sample['tokens']
            feature = features_dict[uid]
            feature['tokens'] = tokens
            feature['uid'] = uid
            writer.write('{}\n'.format(json.dumps(feature)))
parser = argparse.ArgumentParser() parser = data_config(parser) parser = model_config(parser) parser = train_config(parser) args = parser.parse_args() output_dir = args.output_dir data_dir = args.data_dir args.train_datasets = args.train_datasets.split(',') args.test_datasets = args.test_datasets.split(',') pprint(args) os.makedirs(output_dir, exist_ok=True) output_dir = os.path.abspath(output_dir) set_environment(args.seed, args.cuda) log_path = args.log_file logger = create_logger(__name__, to_disk=True, log_file=log_path) logger.info(args.answer_opt) task_defs = TaskDefs(args.task_def) encoder_type = task_defs.encoderType args.encoder_type = encoder_type def dump(path, data): with open(path, 'w') as f: json.dump(data, f) def generate_decoder_opt(enable_san, max_opt):
def main():
    """Dump per-layer encoder features for the LCP task to a JSON-lines file.

    Reads the hard-coded task definition, tokenizes/dumps the input data to a
    temp file, loads an MT-DNN checkpoint, and writes one JSON object per
    sample (stringified layer embeddings + tokens + uid) to ``args.foutput``.
    """
    # Task definition is hard-coded for the complex-word (LCP) setup.
    task_def_path = 'data_complex/lcp.yml'
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)

    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()
    encoder_type = args.encoder_type
    # Which encoder layers to dump, e.g. "--layers 0,11".
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)

    # process data (is_single_sentence is informational here; the dataset is
    # built from the task_def instead of an explicit data_type)
    data, is_single_sentence = process_data(args)
    fout_temp = '{}.tmp'.format(args.finput)
    dump_data(data, fout_temp)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    dataset = SingleTaskDataset(fout_temp, False, maxlen=args.max_seq_length,
                                task_def=task_def)
    batcher = DataLoader(dataset,
                         batch_size=args.batch_size,
                         collate_fn=collater.collate_fn,
                         pin_memory=args.cuda)
    opt = vars(args)

    # load model
    if os.path.exists(args.checkpoint):
        # map_location lets a CPU-only host load a checkpoint saved on GPU.
        state_dict = torch.load(args.checkpoint,
                                map_location=None if args.cuda else 'cpu')
        config = state_dict['config']
        config['dump_feature'] = True
        config['local_rank'] = -1
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error(
            'Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        return
    num_all_batches = len(batcher)
    model = MTDNNModel(
        opt, state_dict=state_dict, num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()

    features_dict = {}
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta, batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [all_encoder_layers[idx].detach().cpu().numpy()
                      for idx in layer_indexes]
        uids = batch_meta['uids']
        masks = batch_data[batch_meta['mask']].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            # Number of non-padding tokens for this sample.
            slen = sum(masks[idx])
            features = {}
            for yidx, layer in enumerate(layer_indexes):
                features[layer] = str(embeddings[yidx][idx][:slen].tolist())
            features_dict[uid] = features

    # save features
    with open(args.foutput, 'w', encoding='utf-8') as writer:
        for sample in data:
            uid = sample['uid']
            tokens = sample['tokens']
            feature = features_dict[uid]
            feature['tokens'] = tokens
            feature['uid'] = uid
            writer.write('{}\n'.format(json.dumps(feature)))
def main():
    """Extract encoder-layer embeddings for each sample and write JSON lines.

    Runs an MT-DNN checkpoint over the processed data and emits, per sample,
    a JSON object mapping each requested layer index to its stringified
    hidden states, keyed by the sample uid.
    """
    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()

    encoder_type = args.encoder_type
    wanted_layers = [int(tok) for tok in args.layers.split(",")]
    set_environment(args.seed)

    # process data
    data, is_single_sentence = process_data(args)
    if is_single_sentence:
        data_type = DataFormat.PremiseOnly
    else:
        data_type = DataFormat.PremiseAndOneHypothesis
    tmp_path = f"{args.finput}.tmp"
    dump_data(data, tmp_path)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    dataset = SingleTaskDataset(tmp_path, False, maxlen=args.max_seq_length)
    loader = DataLoader(dataset,
                        batch_size=args.batch_size,
                        collate_fn=collater.collate_fn,
                        pin_memory=args.cuda)
    opt = vars(args)

    # load model — bail out early when no checkpoint is available
    if not os.path.exists(args.checkpoint):
        logger.error("#" * 20)
        logger.error(
            "Could not find the init model!\n The parameters will be initialized randomly!"
        )
        logger.error("#" * 20)
        return
    state_dict = torch.load(args.checkpoint)
    config = state_dict["config"]
    config["dump_feature"] = True
    opt.update(config)

    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=len(loader))
    if args.cuda:
        model.cuda()

    features_dict = {}
    for batch_meta, batch_data in loader:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta, batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        layer_arrays = [all_encoder_layers[i].detach().cpu().numpy()
                        for i in wanted_layers]
        uids = batch_meta["uids"]
        masks = batch_data[batch_meta["mask"]].detach().cpu().numpy().tolist()
        for row, uid in enumerate(uids):
            # Count of non-padding positions for this sample.
            slen = sum(masks[row])
            features_dict[uid] = {
                layer: str(layer_arrays[pos][row][:slen].tolist())
                for pos, layer in enumerate(wanted_layers)
            }

    # save features
    with open(args.foutput, "w", encoding="utf-8") as writer:
        for sample in data:
            uid = sample["uid"]
            feature = features_dict[uid]
            feature["uid"] = uid
            writer.write(f"{json.dumps(feature)}\n")