def __init__(self, config):
    super(XfmrDecoder, self).__init__()
    self.vocab = Vocab.load(config["vocab_file"])
    with open(config["typelib_file"]) as type_f:
        self.typelib = TypeLibCodec.decode(type_f.read())
    retype_vocab_size = len(self.vocab.types)
    rename_vocab_size = len(self.vocab.names)
    self.target_embedding = nn.Embedding(
        retype_vocab_size + rename_vocab_size, config["target_embedding_size"]
    )
    self.target_transform = nn.Linear(
        config["target_embedding_size"] + config["hidden_size"],
        config["hidden_size"],
    )
    # concat variable encoding and previous target token embedding as input
    decoder_layer = TransformerDecoderLayer(
        config["hidden_size"],
        1,
        config["hidden_size"],
        config["dropout"],
        activation="gelu",
    )
    decoder_norm = LayerNorm(config["hidden_size"])
    self.decoder = TransformerDecoder(
        decoder_layer, config["num_layers"], decoder_norm
    )
    self.output = nn.Linear(
        config["hidden_size"], retype_vocab_size + rename_vocab_size
    )
    self.mem_mask = config["mem_mask"]
    self.config: Dict = config
    self.retype_vocab_size = retype_vocab_size

def __init__(self, url: str, config: Optional[Dict] = None, percent: float = 1.0):
    # support wildcards
    urls = sorted(glob.glob(url))
    urls = urls[:int(percent * len(urls))]
    super().__init__(urls)
    if config:
        # annotate example for training
        from utils.vocab import Vocab

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())
        self.max_src_tokens_len = config["max_src_tokens_len"]
        self.max_num_var = config["max_num_var"]
        annotate = self._annotate
        self.rename = config.get("rename", False)
        # sort = Dataset._sort
        sort = identity
    else:
        # for creating the vocab
        annotate = identity
        sort = identity
    self = (
        self.pipe(Dataset._file_iter_to_line_iter)
        .map(Example.from_json)
        .map(annotate)
        .shuffle(Dataset.SHUFFLE_BUFFER)
        .pipe(sort)
    )

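A hedged usage sketch of this constructor: with a config the dataset annotates examples for training, without one it streams raw examples (e.g. for vocabulary building). The typelib path and the two limits below are illustrative placeholders, not values from the original project.

# Hypothetical usage; "data/typelib.json", 512 and 32 are placeholders.
config = {
    "vocab_file": "data/vocab.bpe10000/vocab",
    "typelib_file": "data/typelib.json",
    "max_src_tokens_len": 512,
    "max_num_var": 32,
    "rename": False,
}
train_set = Dataset("data/preprocessed_data/train-shard-*.tar", config, percent=0.1)
raw_set = Dataset("data/preprocessed_data/train-shard-*.tar")  # no config: unannotated examples
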
def __init__(self, config):
    super(XfmrDecoder, self).__init__()
    self.vocab = Vocab.load(config["vocab_file"])
    with open(config["typelib_file"]) as type_f:
        self.typelib = TypeLibCodec.decode(type_f.read())
    self.typelib = self.typelib.fix()
    self.target_embedding = nn.Embedding(
        len(self.vocab.subtypes), config["target_embedding_size"]
    )
    self.target_transform = nn.Linear(
        config["target_embedding_size"] + config["hidden_size"],
        config["hidden_size"],
    )
    # self.cached_decode_mask: Dict[int, torch.Tensor] = {}
    # self.size = torch.zeros(len(self.vocab.types), dtype=torch.long)
    # concat variable encoding and previous target token embedding as input
    decoder_layer = TransformerDecoderLayer(
        config["hidden_size"],
        1,
        config["hidden_size"],
        config["dropout"],
        activation="gelu",
    )
    decoder_norm = LayerNorm(config["hidden_size"])
    self.decoder = TransformerDecoder(decoder_layer, config["num_layers"], decoder_norm)
    self.output = nn.Linear(config["hidden_size"], len(self.vocab.subtypes))
    self.config: Dict = config

def covar_analysis(args):
    model = GaussianBilinearModel.load_model(args.model)
    rel_vocab = Vocab.load(args.relation)
    rel_mats = model.relation_mats
    scores = [abs(np.linalg.det(mat)) for mat in rel_mats]
    sort_idxs = np.argsort(scores)[::-1]
    for idx in sort_idxs:
        print('{} : {}'.format(rel_vocab.get_word(idx), scores[idx]))

def build(cls, config):
    params = util.update(cls.default_params(), config)
    vocab = Vocab.load(params['vocab_file'])
    model = cls(params['ast_node_encoding_size'], params['hidden_size'],
                params['dropout'], vocab)
    model.config = params
    return model

def __init__(self, config):
    super().__init__()
    self.vocab = vocab = Vocab.load(config['vocab_file'])
    self.src_word_embed = nn.Embedding(len(vocab.source_tokens), config['source_embedding_size'])
    self.config = config
    self.decoder_cell_init = nn.Linear(config['source_encoding_size'], config['decoder_hidden_size'])
    if self.config['transformer'] == 'none':
        dropout = config['dropout']
        self.lstm_encoder = nn.LSTM(input_size=self.src_word_embed.embedding_dim,
                                    hidden_size=config['source_encoding_size'] // 2,
                                    num_layers=config['num_layers'],
                                    batch_first=True,
                                    bidirectional=True,
                                    dropout=dropout)
        self.dropout = nn.Dropout(dropout)
    elif self.config['transformer'] == 'bert':
        self.vocab_size = len(self.vocab.source_tokens) + 1
        state_dict = torch.load('saved_checkpoints/bert_2604/bert_pretrained_epoch_23_batch_140000.pth')
        keys_to_delete = ["cls.predictions.bias",
                          "cls.predictions.transform.dense.weight",
                          "cls.predictions.transform.dense.bias",
                          "cls.predictions.transform.LayerNorm.weight",
                          "cls.predictions.transform.LayerNorm.bias",
                          "cls.predictions.decoder.weight",
                          "cls.predictions.decoder.bias",
                          "cls.seq_relationship.weight",
                          "cls.seq_relationship.bias"]
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict['model'].items():
            if k in keys_to_delete:
                continue
            name = k[5:]  # remove `bert.`
            new_state_dict[name] = v
        bert_config = BertConfig(vocab_size=self.vocab_size,
                                 max_position_embeddings=512,
                                 num_hidden_layers=6,
                                 hidden_size=256,
                                 num_attention_heads=4)
        self.bert_model = BertModel(bert_config)
        self.bert_model.load_state_dict(new_state_dict)
    elif self.config['transformer'] == 'xlnet':
        self.vocab_size = len(self.vocab.source_tokens) + 1
        state_dict = torch.load('saved_checkpoints/xlnet_2704/xlnet1_pretrained_epoch_13_iter_500000.pth')
        keys_to_delete = ["lm_loss.weight", "lm_loss.bias"]
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict['model'].items():
            if k in keys_to_delete:
                continue
            if k[:12] == 'transformer.':
                name = k[12:]
            else:
                name = k
            new_state_dict[name] = v
        xlnet_config = XLNetConfig(vocab_size=self.vocab_size, d_model=256, n_layer=12)
        self.xlnet_model = XLNetModel(xlnet_config)
        self.xlnet_model.load_state_dict(new_state_dict)
    else:
        print("Error! Unknown transformer type '{}'".format(self.config['transformer']))

def build(cls, config):
    params = util.update(cls.default_params(), config)
    vocab = Vocab.load(params['vocab_file'])
    model = cls(params['variable_encoding_size'], params['hidden_size'],
                params['dropout'], params['tie_embedding'], params['input_feed'], vocab)
    model.config = params
    return model

def train_model(db_file, entity_db_file, vocab_file, word2vec, **kwargs):
    db = AbstractDB(db_file, 'r')
    entity_db = EntityDB.load(entity_db_file)
    vocab = Vocab.load(vocab_file)
    if word2vec:
        w2vec = ModelReader(word2vec)
    else:
        w2vec = None
    train.train(db, entity_db, vocab, w2vec, **kwargs)

def __init__(self, config):
    super().__init__()
    self.vocab = vocab = Vocab.load(config['vocab_file'])
    self.src_word_embed = nn.Embedding(len(vocab.source_tokens), config['source_embedding_size'])
    dropout = config['dropout']
    self.encoder = TransformerModel(self.src_word_embed.embedding_dim,
                                    1,
                                    config['source_encoding_size'],
                                    config['num_layers'],
                                    dropout=dropout)
    self.decoder_cell_init = nn.Linear(config['source_encoding_size'], config['decoder_hidden_size'])
    self.dropout = nn.Dropout(dropout)
    self.config = config

def __init__(self, config, train=True):
    self.config = config
    self.train = train
    # model specific config
    self.is_ensemble = config['encoder']['type'] == 'EnsembleModel'
    if not self.is_ensemble:
        self.vocab = Vocab.load(config['data']['vocab_file'])
        self.grammar = self.vocab.grammar
        self.use_seq_encoder = config['encoder']['type'] == 'SequentialEncoder'
        self.use_hybrid_encoder = config['encoder']['type'] == 'HybridEncoder'
        self.init_gnn_with_seq_encoding = \
            config['encoder']['type'] == 'GraphASTEncoder' \
            and config['encoder']['init_with_seq_encoding']

def __init__(self, config):
    super().__init__()
    self.vocab = vocab = Vocab.load(config["vocab_file"])
    self.src_word_embed = nn.Embedding(len(vocab.source_tokens), config["source_embedding_size"])
    dropout = config["dropout"]
    self.encoder = TransformerModel(
        self.src_word_embed.embedding_dim,
        config["num_heads"],
        config["hidden_size"],
        config["num_layers"],
        dropout=dropout,
    )
    self.dropout = nn.Dropout(dropout)
    self.config = config

def build(cls, config):
    params = util.update(GraphASTEncoder.default_params(), config)
    print(params)
    connections = params['connections']
    connection2edge_type = {
        'top_down': 1,
        'bottom_up': 1,
        'variable_master_nodes': 2,
        'terminals': 2,
        'master_node': 2,
        'var_usage': 2,
        'func_root_to_arg': 1
    }
    num_edge_types = sum(connection2edge_type[key] for key in connections)
    gnn = GatedGraphNeuralNetwork(
        hidden_size=params['gnn']['hidden_size'],
        layer_timesteps=params['gnn']['layer_timesteps'],
        residual_connections=params['gnn']['residual_connections'],
        num_edge_types=num_edge_types
    )
    vocab = Vocab.load(params['vocab_file'])
    node_type_embedder = NodeTypeEmbedder(
        len(vocab.grammar.variable_types),
        params['node_type_embedding_size']
    )
    node_content_embedder = SubTokenEmbedder(
        vocab.obj_name.subtoken_model_path,
        params['node_content_embedding_size']
    )
    model = cls(gnn,
                params['connections'],
                params['node_syntax_type_embedding_size'],
                params['decoder_hidden_size'],
                node_type_embedder,
                node_content_embedder,
                vocab,
                config=params)
    return model

def __init__(self, config, config_load=None):
    super().__init__()
    if config_load is not None:
        config = config_load
    self.encoder = Encoder.build(config["encoder"])
    self.retype = config["data"].get("retype", False)
    self.rename = config["data"].get("rename", False)
    self.interleave = config["data"].get("interleave", False)
    if self.interleave:
        self.interleave_module = InterleaveDecodeModule(config)
    else:
        if self.retype:
            self.retyping_module = RetypingDecodeModule(config)
        if self.rename:
            self.renaming_module = RenamingDecodeModule(config)
    self.config = config
    self.vocab = Vocab.load(config["data"]["vocab_file"])
    self._preprocess()
    self.soft_mem_mask = config["decoder"]["mem_mask"] == "soft"

def __init__(self, config):
    super(XfmrDecoder, self).__init__()
    self.vocab = Vocab.load(config["vocab_file"])
    with open(config["typelib_file"]) as type_f:
        self.typelib = TypeLibCodec.decode(type_f.read())
    vocab_size = (
        len(self.vocab.names)
        if config.get("rename", False)
        else len(self.vocab.types)
    )
    self.target_id_key = (
        "target_name_id" if config.get("rename", False) else "target_type_id"
    )
    self.target_embedding = nn.Embedding(
        vocab_size, config["target_embedding_size"]
    )
    self.target_transform = nn.Linear(
        config["target_embedding_size"] + config["hidden_size"],
        config["hidden_size"],
    )
    self.cached_decode_mask: Dict[int, torch.Tensor] = {}
    self.size = torch.zeros(vocab_size, dtype=torch.long)
    # concat variable encoding and previous target token embedding as input
    decoder_layer = TransformerDecoderLayer(
        config["hidden_size"],
        config["num_heads"],
        4 * config["hidden_size"],
        config["dropout"],
        activation="gelu",
    )
    decoder_norm = LayerNorm(config["hidden_size"])
    self.decoder = TransformerDecoder(
        decoder_layer, config["num_layers"], decoder_norm
    )
    self.output = nn.Linear(config["hidden_size"], vocab_size)
    self.mem_mask = config["mem_mask"]
    if config.get("rename", False):
        self.mem_mask = "none"
    self.config: Dict = config

def __init__(self, args):
    self.p = args
    if not os.path.isdir(self.p.log_dir):
        os.mkdir(self.p.log_dir)
    if not os.path.isdir(self.p.save_dir):
        os.mkdir(self.p.save_dir)
    pprint(vars(self.p))
    self.logger = get_logger(self.p.name, self.p.log_dir)
    self.logger.info(vars(self.p))
    self.save_path = os.path.join(self.p.save_dir, self.p.name) + '.pth'
    if self.p.gpu != '-1':
        self.device = torch.device('cuda')
        torch.cuda.set_rng_state(torch.cuda.get_rng_state())
        torch.backends.cudnn.deterministic = True
    else:
        self.device = torch.device('cpu')

    def lr_func(epoch):
        if epoch < 10:
            return 1.0
        elif 10 <= epoch and epoch < 25:
            return 0.3
        else:
            return 0.1

    self.data = self.load_data()
    self.vocab = Vocab.load('data/vocab.bpe10000/vocab')
    self.model = self.add_model()
    self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
    self.optim = torch.optim.Adam(self.model.parameters(), lr=self.p.lr)
    self.scheduler = torch.optim.lr_scheduler.LambdaLR(self.optim, lr_lambda=lr_func, last_epoch=-1)
    self.curr_epoch = 0
    if self.p.restore:
        self.load_model(self.save_path)

def __init__(self, config):
    super().__init__()
    self.vocab = vocab = Vocab.load(config['vocab_file'])
    self.src_word_embed = nn.Embedding(len(vocab.source_tokens), config['source_embedding_size'])
    dropout = config['dropout']
    self.lstm_encoder = nn.LSTM(
        input_size=self.src_word_embed.embedding_dim,
        hidden_size=config['source_encoding_size'] // 2,
        num_layers=config['num_layers'],
        batch_first=True,
        bidirectional=True,
        dropout=dropout)
    self.decoder_cell_init = nn.Linear(config['source_encoding_size'], config['decoder_hidden_size'])
    self.dropout = nn.Dropout(dropout)
    self.config = config

def path_analysis(args):
    ent_vocab = Vocab.load(args.entity)
    rel_vocab = RelationVocab.load(args.relation, inv_flg=True)
    triple_dat = TripletDataset.load(args.triple, ent_vocab, rel_vocab)
    pq_dat = PathQueryDataset.load(args.query, ent_vocab, rel_vocab)
    g = LabeledDiGraph(triple_dat, inv_flg=True)
    # traverse path queries
    n_rel = []
    n_tail = []
    for (sub, rels, _) in pq_dat.samples:
        cur_ents = set([sub])
        for r in rels:
            next_ents = set()
            for e in cur_ents:
                new_ents = g.walk(e, r)
                next_ents.update(new_ents)
            cur_ents = next_ents
        n_rel.append(len(rels))
        n_tail.append(len(cur_ents))
    print(n_rel)
    print(n_tail)
    print('Correlation Coefficient: {}'.format(np.corrcoef(n_rel, n_tail)[0, 1]))

def train(args):
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
    if args.gpu != '-1' and torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.set_rng_state(torch.cuda.get_rng_state())
        torch.backends.cudnn.deterministic = True
    else:
        device = torch.device('cpu')
    config = {
        'train': {
            'unchanged_variable_weight': 0.1,
            'buffer_size': 5000
        },
        'encoder': {
            'type': 'SequentialEncoder'
        },
        'data': {
            'vocab_file': 'data/vocab.bpe10000/vocab'
        }
    }
    train_set = Dataset('data/preprocessed_data/train-shard-*.tar')
    dev_set = Dataset('data/preprocessed_data/dev.tar')
    vocab = Vocab.load('data/vocab.bpe10000/vocab')
    if args.decoder:
        vocab_size = len(vocab.all_subtokens) + 1
    else:
        vocab_size = len(vocab.source_tokens) + 1
    max_iters = args.max_iters
    lr = args.lr
    warm_up = args.warm_up
    batch_size = 4096
    effective_batch_size = args.batch_size
    max_embeds = 1000 if args.decoder else 512
    bert_config = BertConfig(vocab_size=vocab_size,
                             max_position_embeddings=max_embeds,
                             num_hidden_layers=6,
                             hidden_size=256,
                             num_attention_heads=4)
    model = BertForPreTraining(bert_config)
    if args.restore:
        state_dict = torch.load(os.path.join(args.save_dir, args.res_name))
        model.load_state_dict(state_dict['model'])
        batch_count = state_dict['step']
        epoch = state_dict['epoch']
    model.train()
    model.to(device)
    if len(args.gpu) > 1 and device == torch.device('cuda'):
        model = nn.DataParallel(model)

    def lr_func(step):
        if step > warm_up:
            return (max_iters - step) / (max_iters - warm_up)
        else:
            return step / warm_up

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_func, last_epoch=-1)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
    if args.restore:
        optimizer.load_state_dict(state_dict['optim'])
        scheduler.load_state_dict(state_dict['scheduler'])
    batch_count = 0
    epoch = 0
    cum_loss = 0.0
    while True:
        # load training dataset, which is a collection of ASTs and maps of gold-standard renamings
        train_set_iter = train_set.batch_iterator(
            batch_size=batch_size,
            return_examples=False,
            config=config,
            progress=True,
            train=True,
            max_seq_len=512,
            num_readers=args.num_readers,
            num_batchers=args.num_batchers)
        epoch += 1
        print("Epoch {}".format(epoch))
        loss = 0
        num_seq = 0
        optimizer.zero_grad()
        for batch in train_set_iter:
            if args.decoder:
                input_ids = batch.tensor_dict['prediction_target']['src_with_true_var_names']
            else:
                input_ids = batch.tensor_dict['src_code_tokens']
            attention_mask = torch.ones_like(input_ids)
            attention_mask[input_ids == 0] = 0.0
            assert torch.max(input_ids) < vocab_size
            assert torch.min(input_ids) >= 0
            if input_ids.shape[0] > max_embeds:
                print("Warning - length {} is greater than max length {}. Skipping."
                      .format(input_ids.shape[0], max_embeds))
                continue
            input_ids, labels = mask_tokens(inputs=input_ids,
                                            mask_token_id=vocab_size - 1,
                                            vocab_size=vocab_size,
                                            mlm_probability=0.15)
            input_ids[attention_mask == 0] = 0
            labels[attention_mask == 0] = -100
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                labels = labels.cuda()
                attention_mask = attention_mask.cuda()
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            masked_lm_labels=labels)
            unreduced_loss = loss_fn(
                outputs[0].view(-1, bert_config.vocab_size),
                labels.view(-1)).reshape(labels.shape) / (
                    torch.sum(labels != -100, axis=1).unsqueeze(1) + 1e-7)
            loss += unreduced_loss.sum()
            num_seq += input_ids.shape[0]
            if num_seq > effective_batch_size:
                batch_count += 1
                loss /= num_seq
                cum_loss += loss.item()
                if batch_count % 20 == 0:
                    print("{} batches, Loss : {:.4}, LR : {:.6}".format(
                        batch_count, cum_loss / 20, scheduler.get_lr()[0]))
                    cum_loss = 0.0
                if batch_count % 10000 == 0:
                    fname1 = os.path.join(
                        args.save_dir,
                        'bert_{}_step_{}.pth'.format(
                            ('decoder' if args.decoder else 'encoder'), batch_count))
                    fname2 = os.path.join(
                        args.save_dir,
                        'bert_{}.pth'.format(
                            ('decoder' if args.decoder else 'encoder'), batch_count))
                    state = {
                        'epoch': epoch,
                        'step': batch_count,
                        'model': model.module.state_dict(),
                        'optim': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict()
                    }
                    torch.save(state, fname1)
                    torch.save(state, fname2)
                    print("Saved file to path {}".format(fname1))
                    print("Saved file to path {}".format(fname2))
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                loss = 0
                num_seq = 0
                if batch_count == max_iters:
                    print(f'[Learner] Reached max iters', file=sys.stderr)
                    exit()
        print("Max_len = {}".format(max_len))
        break

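The `mask_tokens` helper called above is not defined in this snippet. A minimal sketch, assuming a standard BERT-style 80/10/10 masking scheme and the call signature used above; the project's actual implementation may differ.

def mask_tokens(inputs, mask_token_id, vocab_size, mlm_probability=0.15):
    # Hypothetical sketch of BERT-style MLM masking, not the original helper.
    labels = inputs.clone()
    # sample which positions the model has to predict
    probability_matrix = torch.full(labels.shape, mlm_probability)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # ignore non-selected positions in the loss
    inputs = inputs.clone()
    # 80% of the selected positions become the mask token
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = mask_token_id
    # 10% become a random token; the remaining 10% stay unchanged
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    return inputs, labels
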
p.add_argument('--entity')
p.add_argument('--relation')
args = p.parse_args()
assert args.task in ['kbc', 'pq'], 'Invalid task: {}'.format(args.task)
assert args.metric in ['mrr', 'hits'], 'Invalid metric: {}'.format(args.metric)
if args.metric == 'hits':
    assert args.nbest, 'Please indicate n-best when using hits'

model = GaussianBilinearModel.load_model(args.model)

print('Preparing dataset...')
if args.task == 'kbc':
    ent_vocab = Vocab.load(args.entity)
    rel_vocab = Vocab.load(args.relation)
    dataset = TripletDataset.load(args.data, ent_vocab, rel_vocab)
elif args.task == 'pq':
    ent_vocab = Vocab.load(args.entity)
    rel_vocab = RelationVocab.load(args.relation, inv_flg=True)
    dataset = PathQueryDataset.load(args.data, ent_vocab, rel_vocab)
    if not hasattr(model, 'inv_flg') or not model.inv_flg:
        print('initializing inverse relation representations...')
        model.init_inverse()

print('Start evaluation...')
if args.metric == 'mrr':
    from evaluation import mrr
    # res = mrr.cal_mrr(model, dataset)
    res = mrr.multi_cal_mrr(model, dataset)

def main(args):
    # print argument values
    print("Info: arguments\n\t" + "\n\t".join(
        ["{}: {}".format(a, v) for a, v in vars(args).items()]),
        file=sys.stderr)

    # set seed
    if not args.seed:
        seed = random.randint(1, MAX_SEED)
        args.seed = seed
    print("using seed: ", args.seed)
    init_dynet(args)

    assert os.path.exists(args.data)
    if args.task == SENTIMENT:
        assert args.trg_domain in SENTIMENT_DOMAINS, \
            f'Error: {args.trg_domain} is not a sentiment domain.'
        assert args.src_domain is not None, 'Error: A source domain must be specified.'
    elif args.task == POS:
        assert args.trg_domain in POS_DOMAINS, \
            f'Error: {args.trg_domain} is not a POS domain.'
    if args.task == SENTIMENT:
        assert args.max_vocab_size == 5000, f'Error: Max vocab size is not 5000.'

    # create the model and log directories if they do not exist
    for dir_path in [args.model_dir, os.path.dirname(args.log_file)]:
        print("Check if directory exists:", dir_path)
        if not os.path.exists(dir_path):
            print('Creating %s...' % dir_path)
            os.makedirs(dir_path)

    # create predictions folder if it does not exist
    if args.output_predictions:
        if not os.path.exists(args.output_predictions):
            print('Creating output predictions folder: {}'.format(args.output_predictions))
            os.makedirs(args.output_predictions)

    if args.strategy not in [BASE, MTTRI_BASE]:
        # check that pre-trained models exist
        assert args.start_model_dir is not None, \
            'Error: start_model_dir needs to be provided.'
        for suffix in ['.model', '.params.pickle']:
            if args.strategy != TRI_TRAINING:
                # tri-training w/ disagreement is enabled with --disagreement
                model_file = os.path.join(
                    args.start_model_dir,
                    args.start + "_run" + str(args.start_run) + suffix)
                assert os.path.exists(model_file), \
                    'Error: %s does not exist.' % model_file
            else:
                # check if 3 exists for tri_training
                model_name = args.start + "_bootstrap3_run" + str(args.start_run) + suffix
                model_file = os.path.join(args.start_model_dir, model_name)
                assert os.path.exists(model_file), \
                    'Error: %s does not exist.' % model_file

    if args.task == POS:
        pos_path = os.path.join(args.data, 'gweb_sancl', 'pos_fine')
        assert os.path.exists(pos_path)
        train_path = os.path.join(pos_path, 'wsj', 'gweb-wsj-train.conll')
        dev_path = os.path.join(pos_path, 'wsj', 'gweb-wsj-dev.conll')
        unlabeled_path = os.path.join(
            args.data, 'gweb_sancl', 'unlabeled',
            'gweb-%s.unlabeled.txt' % args.trg_domain)
        dev_test_path = os.path.join(pos_path, args.trg_domain,
                                     'gweb-%s-dev.conll' % args.trg_domain)
        test_path = os.path.join(pos_path, args.trg_domain,
                                 'gweb-%s-test.conll' % args.trg_domain)
    elif args.task == SENTIMENT:
        sentiment_path = os.path.join(args.data, 'processed_acl')
        train_path = dev_path = os.path.join(sentiment_path, args.src_domain)
        # since there is no target domain test set, we just tune hyperparams
        # on book->dvd
        unlabeled_path = dev_test_path = test_path = os.path.join(
            sentiment_path, args.trg_domain)
    else:
        raise ValueError()

    # load the data and save it to a pickle file
    split2data = {}
    read_data = data_readers.task2read_data_func(args.task)
    for split, path_ in zip(
            ['train', 'dev', 'dev_test', 'test', 'unlabeled'],
            [train_path, dev_path, dev_test_path, test_path, unlabeled_path]):
        if split == 'unlabeled':
            data = read_data(path_, unlabeled=True,
                             max_unlabeled=args.max_unlabeled)  # [[instances],[]]
        else:
            data = read_data(path_, unlabeled=False,
                             max_train=args.max_train)  # keeps [[instances],[labels]]

        # the DANN paper uses somewhat different splits than the standard, so
        # we create the splits here
        if args.task == SENTIMENT:
            if split == 'train':
                # in the DANN paper, they use all 2000 training examples
                pass
            elif split == 'dev':
                # the DANN paper uses 200 target samples for testing,
                # which are read from the unlabeled file
                continue
            elif split == 'unlabeled':
                # in the DANN paper, we use the content of the unlabeled file
                # for testing
                split = 'test'
                data, data_dev = (data[0][:-200], data[1][:-200]), (data[0][-200:], data[1][-200:])
                # we use 200 labeled samples for validation
                split2data['dev'] = list(data_dev)
            elif split == 'test':
                # in the DANN set-up, we use this data as unlabeled data
                split = 'unlabeled'
                data = data[0], []
        elif split == 'unlabeled':
            data = data[0], []
        elif args.max_unlabeled and split == 'unlabeled':
            print('Restricting # of unlabeled examples to', args.max_unlabeled, file=sys.stderr)
            new_data = data[0][:args.max_unlabeled], data[1][:args.max_unlabeled]
            if len(new_data[0]) < args.max_unlabeled:
                args.max_unlabeled = len(new_data[0])  # set if |unlabeled| < --max-unlabeled
            data = new_data
        elif args.max_train and split == 'train':
            print('Restricting # of labeled training examples to', args.max_train, file=sys.stderr)
            data = data[0][:args.max_train], data[1][:args.max_train]
        split2data[split] = list(data)
        print('# of %s examples: %d.' % (split, len(data[0])))

    vocab_dir = args.model_dir if args.strategy in [BASE, MTTRI_BASE] else args.start_model_dir
    vocab_path = os.path.join(vocab_dir, 'vocab.txt')
    vocab = Vocab(vocab_path, max_vocab_size=args.max_vocab_size)
    if not os.path.exists(vocab_path):
        # build the vocabulary
        assert args.strategy in [BASE, MTTRI_BASE], \
            'Error: Vocabulary should only be created with the base model.'
        vocab.create(split2data['train'][0] + split2data['unlabeled'][0],
                     lowercase=args.lowercase)
    else:
        vocab.load()

    if args.task == SENTIMENT:
        print('Creating binary training data...')
        split2data = data_utils.get_tfidf_data(split2data, vocab, tfidf=True)
    elif args.task.startswith('pos'):
        print('Using words as training data for POS tagging...')
    elif args.task == 'parsing':
        print('Using CoNLL entries as training data for parsing. Using word forms to extract feature representations...')
        for split, data in split2data.items():
            split2data[split][0] = [
                [conll_entry.form for conll_entry in conll_entries]
                for conll_entries in data[0]]
    else:
        raise ValueError('Training data retrieval for task %s is not implemented.' % args.task)

    run_scores = []
    train_func = task_utils.task2train_func(args.task, args.strategy)
    for i in range(args.num_runs):
        run_num = i + 1
        print('\nRun %d/%d.' % (run_num, args.num_runs))
        val_score, test_score = train_func(
            vocab, args,
            *itertools.chain.from_iterable([
                split2data['train'], split2data['dev'], split2data['dev_test'],
                split2data['test'], split2data['unlabeled']
            ]), run_num)
        print('Validation score: %.3f. Test score: %.3f' % (val_score, test_score))
        run_scores.append((val_score, test_score))

    if args.num_runs > 1:
        # log the results of multiple runs to a file
        data_utils.log_to_file(args, run_scores)

def train(args):
    if args.log:
        log_dir = args.log
    else:
        log_dir = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            '{}'.format(datetime.now().strftime('%Y%m%d_%H:%M')))
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # setting for logging
    logger = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    log_path = os.path.join(log_dir, 'log')
    file_handler = logging.FileHandler(log_path)
    fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(fmt)
    logger.addHandler(file_handler)

    logger.info('Arguments...')
    for arg, val in vars(args).items():
        logger.info('{} : {}'.format(arg, val))

    logger.info('Preparing dataset...')
    if not args.entity or not args.relation:
        # make vocab from train set
        logger.info('Making entity/relation vocab from train data...')
        raise NotImplementedError()
    else:
        ent_vocab = Vocab.load(args.entity)
        rel_vocab = Vocab.load(args.relation)
    n_entity, n_relation = len(ent_vocab), len(rel_vocab)
    train_dat = TripletDataset.load(args.train, ent_vocab, rel_vocab)
    logger.info('')
    if args.valid:
        assert args.metric in ['mrr', 'hits'], 'Invalid evaluation metric: {}'.format(args.metric)
        assert args.metric, 'Please indicate evaluation metric for validation'
        if args.metric == 'hits':
            assert args.nbest, 'Please indicate nbest for hits'
        valid_dat = TripletDataset.load(args.valid, ent_vocab, rel_vocab)

    if args.restart:
        logger.info('Restarting training: {}'.format(args.restart))
        model = GaussianBilinearModel.load_model(args.restart)
    else:
        logger.info('Building new model')
        opt = SGD(args.lr, args.gradclip)
        model = GaussianBilinearModel(n_entity, n_relation, args.dim, args.cmin,
                                      args.cmax, opt, args.tri, args.init_sigma)

    best_model = None
    best_val = -1
    for epoch in range(args.epoch):
        logger.info('start {} epoch'.format(epoch + 1))
        sum_loss = 0
        start = time.time()
        for i, pos_sample in enumerate(data_iter(train_dat)):
            neg_samples = [(pos_sample[0], pos_sample[1], np.random.randint(n_entity))
                           for _ in range(args.num_negative)]
            for neg_sample in neg_samples:
                loss = model.update(pos_sample, neg_sample)
                sum_loss += loss
                # logger.info('loss: {}'.format(loss))
            # logger.info('processing {} samples in this epoch'.format(i+1))
            print('processing {} samples in this epoch'.format(i + 1))
        logger.info('sum loss: {}'.format(sum_loss))
        logger.info('{} sec/epoch for training'.format(time.time() - start))
        model_path = os.path.join(log_dir, 'model{}'.format(epoch + 1))
        model.save_model(model_path)
        if args.valid and (epoch + 1) % args.evalstep == 0:
            val = evaluation(model, valid_dat, args.metric, args.nbest)
            logger.info('{} in validation: {}'.format(args.metric, val))
            if val > best_val:
                best_model = copy.deepcopy(model)
                best_val = val
                best_epoch = epoch + 1

    if args.valid:
        logger.info('best model is {} epoch'.format(best_epoch))
        model_path = os.path.join(log_dir, 'bestmodel')
        best_model.save_model(model_path)
    logger.info('done all')

from helper import *
from utils.vocab import PAD_ID, Vocab

vocab = Vocab.load('data/vocab.bpe10000/vocab')


def tokens_to_word(inp_seq):
    output = ''
    for t in inp_seq:
        c = vocab.all_subtokens.id2word[t]
        if c == '<s>':
            c = ''
        if c == '</s>':
            c = ''
        if c == '<pad>':
            c = ''
        output += c
    return output


class HistogramBins(object):
    def __init__(self, thresholds, key_func):
        self.thresholds = np.array(thresholds)
        self.thresholds = np.concatenate((self.thresholds, [math.inf]))
        self.key_func = key_func
        self.bins = {thresh: [] for thresh in self.thresholds}

    def process(self, data):
        for x in data:
            key = self.key_func(x)
            bin_id = np.argmax((self.thresholds - key) > 0)
            self.bins[self.thresholds[bin_id]].append(x)
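
A quick usage sketch for HistogramBins with hypothetical data: each item lands in the bin of the first threshold strictly greater than its key, and the appended math.inf bin catches everything else.

# Hypothetical example: bin token sequences by length (thresholds and data are illustrative).
sequences = [[3, 7, 2], [1], [4, 4, 4, 4, 4, 4]]
hist = HistogramBins(thresholds=[2, 5, 10], key_func=len)
hist.process(sequences)
for thresh, items in hist.bins.items():
    print('< {}: {} item(s)'.format(thresh, len(items)))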