Code Example #1
def export_vocab(text_hist, program_hist, schema_lexical_vocab, world_vocab,
                 args):

    if schema_lexical_vocab is not None:
        # Merge schema lexical tokens into the text histogram so the text
        # vocabulary covers both the natural language questions and the
        # database schema (-1 is a placeholder count for schema-only tokens)
        for v in schema_lexical_vocab:
            text_hist[v] = -1

    text_vocab = Vocabulary('text',
                            func_token_index=functional_token_index,
                            tu=utils.get_trans_utils(args))
    full_vocab = Vocabulary('full',
                            func_token_index=functional_token_index,
                            tu=utils.get_trans_utils(args))
    for v in text_hist:
        text_vocab.index_token(v, True, text_hist[v])
    text_vocab_path = get_vocab_path(args, 'nl')
    text_vocab.save_to_disk(text_vocab_path)

    program_vocab = Vocabulary('program',
                               func_token_index=functional_token_index)
    for v in program_hist:
        program_vocab.index_token(v, True, program_hist[v])
    program_vocab_path = get_vocab_path(args, 'cm')
    program_vocab.save_to_disk(program_vocab_path)

    # Combine text and program vocabularies
    full_vocab.merge_with(text_vocab)
    full_vocab.merge_with(program_vocab)
    full_vocab_path = get_vocab_path(args, 'full')
    full_vocab.save_to_disk(full_vocab_path)

    world_vocab_path = get_vocab_path(args, 'world')
    world_vocab.save_to_disk(world_vocab_path)
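
For context, a minimal sketch of how the two histograms might be built before calling export_vocab, assuming they are plain token-to-count mappings (e.g. collections.Counter) and that text_tokenize / program_tokenize come from get_tokenizers(args) as in Code Example #5; the driver names here are hypothetical:

from collections import Counter

def build_histograms(examples, text_tokenize, program_tokenize):
    # Count token occurrences over (question, SQL) pairs; export_vocab only
    # needs dict-style iteration and lookup, which Counter provides.
    text_hist, program_hist = Counter(), Counter()
    for question, sql in examples:
        text_hist.update(text_tokenize(question))
        program_hist.update(program_tokenize(sql))
    return text_hist, program_hist

# text_hist, program_hist = build_histograms(train_examples, text_tokenize, program_tokenize)
# export_vocab(text_hist, program_hist, schema_lexical_vocab, world_vocab, args)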
Code Example #2
File: encoder_decoder.py  Project: TooTouch/SPARTA
    def __init__(self, args, in_vocab=None, out_vocab=None):
        super().__init__()
        self.share_vocab = args.share_vocab
        self.in_vocab, self.out_vocab = in_vocab, out_vocab
        self.input_vocab_size = self.in_vocab.full_size
        self.output_vocab_size = self.out_vocab.size
        self.tu = utils.get_trans_utils(args)
        self.max_in_seq_len = self.tu.tokenizer.max_len if args.pretrained_transformer != 'null' else args.max_in_seq_len
        self.max_out_seq_len = args.max_out_seq_len
        self.encoder_input_dim = args.encoder_input_dim
        self.decoder_input_dim = args.decoder_input_dim
        self.emb_dropout = args.emb_dropout_rate
        self.res_dropout = (args.res_input_dropout_rate,
                            args.res_layer_dropout_rate)
        self.cross_attn_dropout = args.cross_attn_dropout_rate
        self.cross_attn_num_heads = args.cross_attn_num_heads
        self.xavier_initialization = args.xavier_initialization

        self.pretrained_transformer = args.pretrained_transformer
        self.pretrained_lm_dropout = args.pretrained_lm_dropout_rate
        self.fix_pretrained_transformer_parameters = args.fix_pretrained_transformer_parameters
        self.data_parallel = args.data_parallel

        self.dataset_name = args.dataset_name

        self.encoder_embeddings = None
        self.decoder_embeddings = None
        self.encoder = None
        self.decoder = None
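
The base class leaves the embeddings, encoder, and decoder as None; subclasses are expected to fill them in. A minimal sketch of what that could look like with standard torch.nn layers, assuming the class above is named EncoderDecoderBase (the actual SPARTA subclasses and class name may differ):

import torch.nn as nn

class SimpleSeq2Seq(EncoderDecoderBase):  # hypothetical subclass
    def __init__(self, args, in_vocab=None, out_vocab=None):
        super().__init__(args, in_vocab=in_vocab, out_vocab=out_vocab)
        # Embedding tables sized from the vocabularies captured by the base class.
        self.encoder_embeddings = nn.Embedding(self.input_vocab_size, self.encoder_input_dim)
        self.decoder_embeddings = nn.Embedding(self.output_vocab_size, self.decoder_input_dim)
        # Any encoder/decoder pair with matching dimensions would slot in here.
        self.encoder = nn.LSTM(self.encoder_input_dim, self.encoder_input_dim, batch_first=True)
        self.decoder = nn.LSTM(self.decoder_input_dim, self.decoder_input_dim, batch_first=True)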
Code Example #3
def load_vocabs_seq2seq_ptr(args):
    if args.pretrained_transformer:
        tu = utils.get_trans_utils(args)
        text_vocab = Vocabulary(tag='text', func_token_index=None, tu=tu)
        for v in tu.tokenizer.vocab:
            text_vocab.index_token(v, in_vocab=True, check_for_seen_vocab=True)
    else:
        text_vocab_path = get_vocab_path(args, 'nl')
        text_vocab = load_vocab(text_vocab_path,
                                args.text_vocab_min_freq,
                                tu=utils.get_trans_utils(args))

    program_vocab = sql_reserved_tokens if args.pretrained_transformer else sql_reserved_tokens_revtok

    print('* text vocab size = {}'.format(text_vocab.size))
    print('* program vocab size = {}'.format(program_vocab.size))
    print()
    vocabs = {'text': text_vocab, 'program': program_vocab}
    return vocabs
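
A hedged sketch of how these vocabularies could then be handed to an encoder-decoder module like the one in Code Example #2; SimpleSeq2Seq is the hypothetical subclass sketched after that example, not a class from the project:

vocabs = load_vocabs_seq2seq_ptr(args)
model = SimpleSeq2Seq(args, in_vocab=vocabs['text'], out_vocab=vocabs['program'])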
Code Example #4
def load_vocabs_seq2seq(args):
    if args.share_vocab:
        vocab_path = get_vocab_path(args, 'full')
        vocab = load_vocab(vocab_path,
                           args.vocab_min_freq,
                           tu=utils.get_trans_utils(args))
        text_vocab, program_vocab = vocab, vocab
    else:
        text_vocab_path = get_vocab_path(args, 'nl')
        text_vocab = load_vocab(text_vocab_path,
                                args.text_vocab_min_freq,
                                tu=utils.get_trans_utils(args))
        program_vocab_path = get_vocab_path(args, 'cm')
        program_vocab = load_vocab(program_vocab_path,
                                   args.program_vocab_min_freq)

    print('* text vocab size = {}'.format(text_vocab.size))
    print('* program vocab size = {}'.format(program_vocab.size))
    vocabs = {'text': text_vocab, 'program': program_vocab}
    return vocabs
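
One behavioral detail worth noting: when args.share_vocab is set, both dictionary entries refer to the same Vocabulary object loaded from the 'full' vocabulary file, which the following hypothetical check makes explicit:

vocabs = load_vocabs_seq2seq(args)
if args.share_vocab:
    # text and program share one Vocabulary instance, not two equal copies.
    assert vocabs['text'] is vocabs['program']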
Code Example #5
def get_tokenizers(args):
    if args.pretrained_transformer == 'null':
        text_tokenize = revtok_tokenize_with_functional
        program_tokenize = revtok_sql_tokenize
        post_process = revtok_de_tokenize
        transformer_utils = None
    else:
        transformer_utils = utils.get_trans_utils(args)
        text_tokenize = transformer_utils.tokenizer.tokenize
        def p_tokenize(sql, **kwargs):
            return sql_tokenize(sql, text_tokenize, **kwargs)
        program_tokenize = p_tokenize
        def p_detokenize(tokens, **kwargs):
            return trans_de_tokenize(tokens, transformer_utils, **kwargs)
        post_process = p_detokenize
    return text_tokenize, program_tokenize, post_process, transformer_utils
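
A hypothetical usage of the returned callables; the exact token format depends on whether revtok or the pretrained transformer tokenizer is active, and the project's SQL tokenizer may accept additional keyword arguments:

text_tokenize, program_tokenize, post_process, trans_utils = get_tokenizers(args)

question_tokens = text_tokenize('How many singers do we have?')
sql_tokens = program_tokenize('SELECT count(*) FROM singer')
restored_question = post_process(question_tokens)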
Code Example #6
    def __init__(self, args):
        super(LFramework, self).__init__()
        self.model = args.model
        self.model_id = args.model_id

        self.tu = utils.get_trans_utils(args)
        self.schema_graphs = None

        # Training hyperparameters
        self.args = args
        # Reuse the transformer utilities returned alongside the tokenizers
        # (None when no pretrained transformer is configured)
        _, _, _, self.tu = tok.get_tokenizers(args)
        self.dataset = args.dataset_name
        self.model_dir = args.model_dir
        self.train_batch_size = args.train_batch_size
        self.dev_batch_size = args.dev_batch_size

        self.start_step = args.start_step
        self.num_steps = args.num_steps
        self.num_peek_steps = args.num_peek_steps
        self.num_log_steps = args.num_log_steps
        self.num_accumulation_steps = args.num_accumulation_steps
        self.save_best_model_only = args.save_best_model_only

        self.optimizer = args.optimizer
        self.bert_finetune_rate = args.bert_finetune_rate
        self.learning_rate = args.learning_rate
        self.learning_rate_scheduler = learning_rate_scheduler_sigs[
            args.learning_rate_scheduler]
        self.ft_learning_rate_scheduler = learning_rate_scheduler_sigs[
            args.trans_learning_rate_scheduler]
        self.warmup_init_lr = args.warmup_init_lr
        self.warmup_init_ft_lr = args.warmup_init_ft_lr
        self.num_warmup_steps = args.num_warmup_steps
        self.grad_norm = args.grad_norm
        self.adam_beta1 = args.adam_beta1
        self.adam_beta2 = args.adam_beta2
        self.optim = None
        self.lr_scheduler = None

        self.decoding_algorithm = args.decoding_algorithm
        self.beam_size = args.beam_size

        self.save_all_checkpoints = args.save_all_checkpoints

        # Visualization saver
        self.vis_writer = LayerVisualizationDataWriter(log_dir=args.viz_dir)
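
self.optim is left as None here, which suggests concrete subclasses construct the optimizer from the stored hyperparameters. A minimal sketch under that assumption (the subclass and the method name define_optimizer are hypothetical):

import torch.optim as optim

class MyFramework(LFramework):  # hypothetical subclass
    def define_optimizer(self, parameters):
        # Build an Adam optimizer from the hyperparameters stored in __init__.
        if self.optimizer == 'adam':
            self.optim = optim.Adam(parameters, lr=self.learning_rate,
                                    betas=(self.adam_beta1, self.adam_beta2))
        else:
            raise NotImplementedError('unsupported optimizer: {}'.format(self.optimizer))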
Code Example #7
File: trans_checker.py  Project: TooTouch/SPARTA
import torch
import torch.nn as nn

import src.common.lr_scheduler as lrs
from src.common.nn_modules import *
import src.common.ops as ops
from src.data_processor.processor_utils import get_table_aware_transformer_encoder_inputs
from src.data_processor.schema_loader import load_schema_graphs
import src.data_processor.tokenizers as tok
from src.trans_checker.args import args
# from src.utils.trans import bert_utils as bu
import src.utils.utils as utils

# torch.cuda.set_device('cuda:{}'.format(args.gpu))
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bu = utils.get_trans_utils(args)


class SpanExtractor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.start_pred = Linear(input_dim, 1)
        self.end_pred = Linear(input_dim, 1)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, encoder_hiddens, text_masks):
        """
        :param encoder_hiddens: [batch_size, encoder_seq_len, hidden_dim]
        :param text_masks: [batch_size, text_len + text_start_offset]
        """
        # [batch_size, encoder_seq_len]