def export_vocab(text_hist, program_hist, schema_lexical_vocab, world_vocab, args):
    if schema_lexical_vocab is not None:
        # Merge the lexicon based on the natural language text and the database schema
        for v in schema_lexical_vocab:
            text_hist[v] = -1
    text_vocab = Vocabulary('text', func_token_index=functional_token_index, tu=utils.get_trans_utils(args))
    full_vocab = Vocabulary('full', func_token_index=functional_token_index, tu=utils.get_trans_utils(args))
    for v in text_hist:
        text_vocab.index_token(v, True, text_hist[v])
    text_vocab_path = get_vocab_path(args, 'nl')
    text_vocab.save_to_disk(text_vocab_path)
    program_vocab = Vocabulary('program', func_token_index=functional_token_index)
    for v in program_hist:
        program_vocab.index_token(v, True, program_hist[v])
    program_vocab_path = get_vocab_path(args, 'cm')
    program_vocab.save_to_disk(program_vocab_path)
    # Combine text and program vocabularies
    full_vocab.merge_with(text_vocab)
    full_vocab.merge_with(program_vocab)
    full_vocab_path = get_vocab_path(args, 'full')
    full_vocab.save_to_disk(full_vocab_path)
    world_vocab_path = get_vocab_path(args, 'world')
    world_vocab.save_to_disk(world_vocab_path)
def __init__(self, args, in_vocab=None, out_vocab=None):
    super().__init__()
    self.share_vocab = args.share_vocab
    self.in_vocab, self.out_vocab = in_vocab, out_vocab
    self.input_vocab_size = self.in_vocab.full_size
    self.output_vocab_size = self.out_vocab.size
    self.tu = utils.get_trans_utils(args)
    self.max_in_seq_len = self.tu.tokenizer.max_len if args.pretrained_transformer != 'null' else args.max_in_seq_len
    self.max_out_seq_len = args.max_out_seq_len
    self.encoder_input_dim = args.encoder_input_dim
    self.decoder_input_dim = args.decoder_input_dim
    self.emb_dropout = args.emb_dropout_rate
    self.res_dropout = (args.res_input_dropout_rate, args.res_layer_dropout_rate)
    self.cross_attn_dropout = args.cross_attn_dropout_rate
    self.cross_attn_num_heads = args.cross_attn_num_heads
    self.xavier_initialization = args.xavier_initialization
    self.pretrained_transformer = args.pretrained_transformer
    self.pretrained_lm_dropout = args.pretrained_lm_dropout_rate
    self.fix_pretrained_transformer_parameters = args.fix_pretrained_transformer_parameters
    self.data_parallel = args.data_parallel
    self.dataset_name = args.dataset_name
    self.encoder_embeddings = None
    self.decoder_embeddings = None
    self.encoder = None
    self.decoder = None
def load_vocabs_seq2seq_ptr(args):
    if args.pretrained_transformer:
        tu = utils.get_trans_utils(args)
        text_vocab = Vocabulary(tag='text', func_token_index=None, tu=tu)
        for v in tu.tokenizer.vocab:
            text_vocab.index_token(v, in_vocab=True, check_for_seen_vocab=True)
    else:
        text_vocab_path = get_vocab_path(args, 'nl')
        text_vocab = load_vocab(text_vocab_path, args.text_vocab_min_freq, tu=utils.get_trans_utils(args))
    program_vocab = sql_reserved_tokens if args.pretrained_transformer else sql_reserved_tokens_revtok
    print('* text vocab size = {}'.format(text_vocab.size))
    print('* program vocab size = {}'.format(program_vocab.size))
    print()
    vocabs = {'text': text_vocab, 'program': program_vocab}
    return vocabs
def load_vocabs_seq2seq(args):
    if args.share_vocab:
        vocab_path = get_vocab_path(args, 'full')
        vocab = load_vocab(vocab_path, args.vocab_min_freq, tu=utils.get_trans_utils(args))
        text_vocab, program_vocab = vocab, vocab
    else:
        text_vocab_path = get_vocab_path(args, 'nl')
        text_vocab = load_vocab(text_vocab_path, args.text_vocab_min_freq, tu=utils.get_trans_utils(args))
        program_vocab_path = get_vocab_path(args, 'cm')
        program_vocab = load_vocab(program_vocab_path, args.program_vocab_min_freq)
    print('* text vocab size = {}'.format(text_vocab.size))
    print('* program vocab size = {}'.format(program_vocab.size))
    vocabs = {'text': text_vocab, 'program': program_vocab}
    return vocabs
def get_tokenizers(args):
    if args.pretrained_transformer == 'null':
        text_tokenize = revtok_tokenize_with_functional
        program_tokenize = revtok_sql_tokenize
        post_process = revtok_de_tokenize
        transformer_utils = None
    else:
        transformer_utils = utils.get_trans_utils(args)
        text_tokenize = transformer_utils.tokenizer.tokenize

        def p_tokenize(sql, **kwargs):
            return sql_tokenize(sql, text_tokenize, **kwargs)

        program_tokenize = p_tokenize

        def p_detokenize(tokens, **kwargs):
            return trans_de_tokenize(tokens, transformer_utils, **kwargs)

        post_process = p_detokenize
    return text_tokenize, program_tokenize, post_process, transformer_utils
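# Usage sketch (illustrative only): `args` is assumed to be the parsed experiment
# configuration, and the example strings below are hypothetical.
#
#     text_tokenize, program_tokenize, post_process, tu = get_tokenizers(args)
#     q_tokens = text_tokenize('How many singers do we have ?')
#     sql_tokens = program_tokenize('SELECT count(*) FROM singer')
#     question = post_process(q_tokens)  # detokenize back to a plain string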
def __init__(self, args):
    super(LFramework, self).__init__()
    self.model = args.model
    self.model_id = args.model_id
    self.tu = utils.get_trans_utils(args)
    self.schema_graphs = None

    # Training hyperparameters
    self.args = args
    _, _, _, self.tu = tok.get_tokenizers(args)
    self.dataset = args.dataset_name
    self.model_dir = args.model_dir
    self.train_batch_size = args.train_batch_size
    self.dev_batch_size = args.dev_batch_size
    self.start_step = args.start_step
    self.num_steps = args.num_steps
    self.num_peek_steps = args.num_peek_steps
    self.num_log_steps = args.num_log_steps
    self.num_accumulation_steps = args.num_accumulation_steps
    self.save_best_model_only = args.save_best_model_only
    self.optimizer = args.optimizer
    self.bert_finetune_rate = args.bert_finetune_rate
    self.learning_rate = args.learning_rate
    self.learning_rate_scheduler = learning_rate_scheduler_sigs[args.learning_rate_scheduler]
    self.ft_learning_rate_scheduler = learning_rate_scheduler_sigs[args.trans_learning_rate_scheduler]
    self.warmup_init_lr = args.warmup_init_lr
    self.warmup_init_ft_lr = args.warmup_init_ft_lr
    self.num_warmup_steps = args.num_warmup_steps
    self.grad_norm = args.grad_norm
    self.adam_beta1 = args.adam_beta1
    self.adam_beta2 = args.adam_beta2
    self.optim = None
    self.lr_scheduler = None
    self.decoding_algorithm = args.decoding_algorithm
    self.beam_size = args.beam_size
    self.save_all_checkpoints = args.save_all_checkpoints

    # Visualization saver
    self.vis_writer = LayerVisualizationDataWriter(log_dir=args.viz_dir)
import torch
import torch.nn as nn

import src.common.lr_scheduler as lrs
from src.common.nn_modules import *
import src.common.ops as ops
from src.data_processor.processor_utils import get_table_aware_transformer_encoder_inputs
from src.data_processor.schema_loader import load_schema_graphs
import src.data_processor.tokenizers as tok
from src.trans_checker.args import args
# from src.utils.trans import bert_utils as bu
import src.utils.utils as utils

# torch.cuda.set_device('cuda:{}'.format(args.gpu))
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bu = utils.get_trans_utils(args)


class SpanExtractor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.start_pred = Linear(input_dim, 1)
        self.end_pred = Linear(input_dim, 1)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, encoder_hiddens, text_masks):
        """
        :param encoder_hiddens: [batch_size, encoder_seq_len, hidden_dim]
        :param text_masks: [batch_size, text_len + text_start_offset]
        """
        # [batch_size, encoder_seq_len]
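        # NOTE: the forward body is truncated in the original excerpt. The lines below are an
        # illustrative sketch of a standard span-prediction head (start/end pointers over the
        # text positions), not necessarily the original implementation.
        start_logits = self.start_pred(encoder_hiddens).squeeze(-1)
        end_logits = self.end_pred(encoder_hiddens).squeeze(-1)
        # Assumption: text_masks is 1 for valid text positions and is aligned with the first
        # text_masks.size(1) encoder positions; all other positions are masked out.
        mask = torch.zeros_like(start_logits)
        mask[:, :text_masks.size(1)] = text_masks.float()
        start_logits = start_logits.masked_fill(mask == 0, -1e9)
        end_logits = end_logits.masked_fill(mask == 0, -1e9)
        return self.log_softmax(start_logits), self.log_softmax(end_logits)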