def build_sent_encoder(args, vocab, d_emb, tasks, embedder, cove_layer):
    # Build single sentence encoder: the main component of interest.
    # Needs special handling for language modeling.
    # Note: sent_enc is expected to apply dropout to its input _and_ output if needed.
    tfm_params = Params({'input_dim': d_emb,
                         'hidden_dim': args.d_hid,
                         'projection_dim': args.d_tproj,
                         'feedforward_hidden_dim': args.d_ff,
                         'num_layers': args.n_layers_enc,
                         'num_attention_heads': args.n_heads})
    rnn_params = Params({'input_size': d_emb,
                         'bidirectional': True,
                         'hidden_size': args.d_hid,
                         'num_layers': args.n_layers_enc})

    # Make sentence encoder
    if any(isinstance(task, LanguageModelingTask) for task in tasks) or \
            args.sent_enc == 'bilm':
        assert_for_log(args.sent_enc in ['rnn', 'bilm'], "Only RNNLM supported!")
        if args.elmo:
            assert_for_log(args.elmo_chars_only, "LM with full ELMo not supported")
        bilm = BiLMEncoder(d_emb, args.d_hid, args.d_hid, args.n_layers_enc)
        sent_encoder = SentenceEncoder(vocab, embedder, args.n_layers_highway,
                                       bilm, skip_embs=args.skip_embs,
                                       dropout=args.dropout,
                                       sep_embs_for_skip=args.sep_embs_for_skip,
                                       cove_layer=cove_layer)
        d_sent = 2 * args.d_hid
        log.info("Using BiLM architecture for shared encoder!")
    elif args.sent_enc == 'bow':
        sent_encoder = BoWSentEncoder(vocab, embedder)
        log.info("Using BoW architecture for shared encoder!")
        assert_for_log(not args.skip_embs,
                       "Skip connection not currently supported with `bow` encoder.")
        d_sent = d_emb
    elif args.sent_enc == 'rnn':
        sent_rnn = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
        sent_encoder = SentenceEncoder(vocab, embedder, args.n_layers_highway,
                                       sent_rnn, skip_embs=args.skip_embs,
                                       dropout=args.dropout,
                                       sep_embs_for_skip=args.sep_embs_for_skip,
                                       cove_layer=cove_layer)
        d_sent = 2 * args.d_hid
        log.info("Using BiLSTM architecture for shared encoder!")
    elif args.sent_enc == 'transformer':
        transformer = StackedSelfAttentionEncoder.from_params(copy.deepcopy(tfm_params))
        sent_encoder = SentenceEncoder(vocab, embedder, args.n_layers_highway,
                                       transformer, dropout=args.dropout,
                                       skip_embs=args.skip_embs,
                                       cove_layer=cove_layer,
                                       sep_embs_for_skip=args.sep_embs_for_skip)
        d_sent = args.d_hid  # transformer output width (hidden_dim); must be set so the return below is defined
        log.info("Using Transformer architecture for shared encoder!")
    elif args.sent_enc == 'null':
        # Expose word representation layer (GloVe, ELMo, etc.) directly.
        assert_for_log(args.skip_embs,
                       f"skip_embs must be set for '{args.sent_enc}' encoder")
        phrase_layer = NullPhraseLayer(rnn_params['input_size'])
        sent_encoder = SentenceEncoder(vocab, embedder, args.n_layers_highway,
                                       phrase_layer, skip_embs=args.skip_embs,
                                       dropout=args.dropout,
                                       sep_embs_for_skip=args.sep_embs_for_skip,
                                       cove_layer=cove_layer)
        d_sent = 0  # skip connection added by the caller (build_model)
        log.info("No shared encoder (just using word embeddings)!")
    else:
        assert_for_log(False, "No valid sentence encoder specified.")
    return sent_encoder, d_sent
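
# Quick reference (summarizing the branches above): the second return value,
# d_sent, is the width of the shared encoder output seen by downstream task
# modules, before the caller adds d_emb for the optional skip connection:
#   'bilm' / any LanguageModelingTask : 2 * args.d_hid
#   'rnn'                             : 2 * args.d_hid
#   'transformer'                     : args.d_hid
#   'bow'                             : d_emb
#   'null'                            : 0 (word representations flow in only via skip_embs)
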

def build_model(args, vocab, pretrained_embs, tasks):
    '''Build model according to args '''
    # Build embeddings.
    if args.openai_transformer:
        # Note: incompatible with other embedders, but logic in preprocess.py
        # should prevent these from being enabled anyway.
        from .openai_transformer_lm.utils import OpenAIEmbedderModule
        log.info("Using OpenAI transformer model; skipping other embedders.")
        cove_layer = None
        embedder = OpenAIEmbedderModule(args)
        d_emb = embedder.get_output_dim()
    else:
        # Default case, used for ELMo, CoVe, word embeddings, etc.
        d_emb, embedder, cove_layer = build_embeddings(args, vocab, tasks,
                                                       pretrained_embs)

    # Build single sentence encoder: the main component of interest.
    # Needs special handling for language modeling.
    # Note: sent_enc is expected to apply dropout to its input _and_ output if
    # needed, so embedding modules and classifier modules should not apply
    # dropout there. The encoder-building logic lives in build_sent_encoder above.
    sent_encoder, d_sent = build_sent_encoder(args, vocab, d_emb, tasks,
                                              embedder, cove_layer)
    d_sent += args.skip_embs * d_emb

    # Build model and classifiers
    model = MultiTaskModel(args, sent_encoder, vocab)
    if args.is_probing_task:
        # TODO: move this logic to preprocess.py;
        # current implementation reloads MNLI data, which is slow.
        train_task_whitelist, eval_task_whitelist = get_task_whitelist(args)
        tasks_to_build, _, _ = get_tasks(train_task_whitelist, eval_task_whitelist,
                                         args.max_seq_len, path=args.data_dir,
                                         scratch_path=args.exp_dir)
    else:
        tasks_to_build = tasks

    # Attach task-specific params.
    for task in set(tasks + tasks_to_build):
        task_params = get_task_specific_params(args, task.name)
        log.info("\tTask '%s' params: %s", task.name,
                 json.dumps(task_params.as_dict(), indent=2))
        # Store task-specific params in case we want to access them later.
        setattr(model, '%s_task_params' % task.name, task_params)

    # Actually construct modules.
    for task in tasks_to_build:
        # If this task is configured to reuse another task's classifier
        # (via `use_classifier`), skip building a module for it.
        if task.name != model._get_task_params(task.name).get('use_classifier',
                                                              task.name):
            continue
        build_module(task, model, d_sent, d_emb, vocab, embedder, args)

    model = model.cuda() if args.cuda >= 0 else model
    log.info(model)

    param_count = 0
    trainable_param_count = 0
    for name, param in model.named_parameters():
        param_count += np.prod(param.size())
        if param.requires_grad:
            trainable_param_count += np.prod(param.size())
            log.info(">> Trainable param %s: %s = %d",
                     name, str(param.size()), np.prod(param.size()))
    log.info("Total number of parameters: {ct:d} ({ct:g})".format(ct=param_count))
    log.info("Number of trainable parameters: {ct:d} ({ct:g})".format(
        ct=trainable_param_count))
    return model
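
# Standalone sketch (illustrative only, not part of the original module) of the
# parameter-counting loop at the end of build_model. `count_parameters` is a
# hypothetical helper name; it works for any torch.nn.Module.
def count_parameters(module):
    """Return (total, trainable) parameter counts, mirroring build_model's logging loop."""
    import numpy as np  # local import keeps this sketch self-contained
    total, trainable = 0, 0
    for param in module.parameters():
        n = int(np.prod(param.size()))
        total += n
        if param.requires_grad:
            trainable += n
    return total, trainable

# Example: for torch.nn.Linear(10, 5), count_parameters returns (55, 55)
# (a 5x10 weight matrix plus a 5-element bias).
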

# Earlier variant of build_model: it handles bidirectional vs. unidirectional
# language modeling explicitly and predates build_sent_encoder, the
# sep_embs_for_skip option, and the OpenAI transformer embedder.
def build_model(args, vocab, pretrained_embs, tasks):
    '''Build model according to args '''
    # Build embeddings.
    d_emb, embedder, cove_emb = build_embeddings(args, vocab, pretrained_embs)
    d_sent = args.d_hid

    # Build single sentence encoder: the main component of interest.
    # Needs special handling for language modeling.
    tfm_params = Params({'input_dim': d_emb, 'hidden_dim': args.d_hid,
                         'projection_dim': args.d_tproj,
                         'feedforward_hidden_dim': args.d_ff,
                         'num_layers': args.n_layers_enc,
                         'num_attention_heads': args.n_heads})
    rnn_params = Params({'input_size': d_emb, 'bidirectional': args.bidirectional,
                         'hidden_size': args.d_hid, 'num_layers': args.n_layers_enc})

    if sum([isinstance(task, LanguageModelingTask) for task in tasks]):
        if args.bidirectional:
            rnn_params['bidirectional'] = False
            if args.sent_enc == 'rnn':
                fwd = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
                bwd = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
            elif args.sent_enc == 'transformer':
                fwd = MaskedStackedSelfAttentionEncoder.from_params(copy.deepcopy(tfm_params))
                bwd = MaskedStackedSelfAttentionEncoder.from_params(copy.deepcopy(tfm_params))
            sent_encoder = BiLMEncoder(vocab, embedder, args.n_layers_highway,
                                       fwd, bwd, dropout=args.dropout,
                                       skip_embs=args.skip_embs, cove_layer=cove_emb)
        else:  # not bidirectional
            if args.sent_enc == 'rnn':
                fwd = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
            elif args.sent_enc == 'transformer':
                fwd = MaskedStackedSelfAttentionEncoder.from_params(copy.deepcopy(tfm_params))
            sent_encoder = SentenceEncoder(vocab, embedder, args.n_layers_highway,
                                           fwd, skip_embs=args.skip_embs,
                                           dropout=args.dropout, cove_layer=cove_emb)
    elif args.sent_enc == 'bow':
        sent_encoder = BoWSentEncoder(vocab, embedder)
        d_sent = d_emb
    elif args.sent_enc == 'rnn':
        sent_rnn = s2s_e.by_name('lstm').from_params(copy.deepcopy(rnn_params))
        sent_encoder = SentenceEncoder(vocab, embedder, args.n_layers_highway,
                                       sent_rnn, skip_embs=args.skip_embs,
                                       dropout=args.dropout, cove_layer=cove_emb)
        d_sent = (1 + args.bidirectional) * args.d_hid
    elif args.sent_enc == 'transformer':
        transformer = StackedSelfAttentionEncoder.from_params(copy.deepcopy(tfm_params))
        sent_encoder = SentenceEncoder(vocab, embedder, args.n_layers_highway,
                                       transformer, dropout=args.dropout,
                                       skip_embs=args.skip_embs, cove_layer=cove_emb)
    else:
        assert_for_log(False, "No valid sentence encoder specified.")
    d_sent += args.skip_embs * d_emb

    # Build model and classifiers
    model = MultiTaskModel(args, sent_encoder, vocab)
    if args.is_probing_task:
        # TODO: move this logic to preprocess.py;
        # current implementation reloads MNLI data, which is slow.
        train_task_whitelist, eval_task_whitelist = get_task_whitelist(args)
        tasks_to_build, _, _ = get_tasks(train_task_whitelist, eval_task_whitelist,
                                         args.max_seq_len, path=args.data_dir,
                                         scratch_path=args.exp_dir)
    else:
        tasks_to_build = tasks

    # Attach task-specific params.
    for task in set(tasks + tasks_to_build):
        task_params = get_task_specific_params(args, task.name)
        log.info("\tTask '%s' params: %s", task.name,
                 json.dumps(task_params.as_dict(), indent=2))
        # Store task-specific params in case we want to access them later.
        setattr(model, '%s_task_params' % task.name, task_params)

    # Actually construct modules.
    for task in tasks_to_build:
        build_module(task, model, d_sent, vocab, embedder, args)

    model = model.cuda() if args.cuda >= 0 else model
    log.info(model)

    param_count = 0
    trainable_param_count = 0
    for name, param in model.named_parameters():
        param_count += np.prod(param.size())
        if param.requires_grad:
            trainable_param_count += np.prod(param.size())
    log.info("Total number of parameters: {}".format(param_count))
    log.info("Number of trainable parameters: {}".format(trainable_param_count))
    return model
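
# Usage sketch (illustrative only, not taken from the original source):
# build_model is invoked after preprocessing has produced the vocabulary, any
# pretrained embeddings, and the task list (see the preprocess.py references
# above), roughly as
#
#     model = build_model(args, vocab, pretrained_embs, tasks)
#
# where `args` is the parsed experiment config.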