def build_model(vocab: Vocabulary, args, **kwargs) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 200
    # turn the tokens into EMBED_DIMS-dim embeddings, then turn the embeddings into encodings
    if args.pretrained_WE_path:
        embedder = BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=EMBED_DIMS,
                                 num_embeddings=vocab_size,
                                 pretrained_file=args.pretrained_WE_path,
                                 vocab=vocab)})
    else:
        embedder = BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)})

    # num_filters is a bit dangerous: we get this many filters for EACH ngram filter size,
    # so the output dim is num_filters * len(ngram_filter_sizes)
    encoder = CnnEncoder(embedding_dim=EMBED_DIMS, ngram_filter_sizes=(2, 3, 5), num_filters=5)
    # encoder = BertPooler("bert-base-cased")

    # construct the regularizer applicator
    regularizer_applicator = None
    if args.use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg), ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder, regularizer_applicator, **kwargs)
def build_model(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300
    # turn the tokens into 300-dim embeddings, then turn the embeddings into encodings
    embedder = BasicTextFieldEmbedder(
        {"tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)})

    # num_filters is a bit dangerous: we get this many filters for EACH ngram filter size,
    # so the output dim is num_filters * len(ngram_filter_sizes)
    encoder = CnnEncoder(embedding_dim=EMBED_DIMS, ngram_filter_sizes=(2, 3, 4, 5), num_filters=5)
    # encoder = BertPooler("bert-base-cased")

    # construct the regularizer applicator
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg), ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return DecompensationClassifier(vocab, embedder, encoder, regularizer_applicator)
def test_l2_regularization(self):
    model = torch.nn.Sequential(torch.nn.Linear(5, 10), torch.nn.Linear(10, 5))
    initializer = InitializerApplicator([(".*", lambda tensor: constant_(tensor, 0.5))])
    initializer(model)
    value = RegularizerApplicator([("", L2Regularizer(1.0))])(model)
    assert value.data.numpy() == 28.75
def test_regularizer_applicator_respects_regex_matching(self):
    model = torch.nn.Sequential(torch.nn.Linear(5, 10), torch.nn.Linear(10, 5))
    initializer = InitializerApplicator([(".*", lambda tensor: constant_(tensor, 1.0))])
    initializer(model)
    value = RegularizerApplicator(
        [("weight", L2Regularizer(0.5)), ("bias", L1Regularizer(1.0))])(model)
    assert value.data.numpy() == 65.0
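# --- Quick check of the expected values in the two tests above ---
# A standalone sanity check (not part of the test suite), assuming AllenNLP's
# L1Regularizer computes alpha * sum(|p|) and L2Regularizer computes alpha * sum(p ** 2)
# over every parameter whose name matches the regex.
# Sequential(Linear(5, 10), Linear(10, 5)) has 5*10 + 10 plus 10*5 + 5 parameters.
n_weight = 5 * 10 + 10 * 5   # 100 weight entries
n_bias = 10 + 5              # 15 bias entries

# test_l2_regularization: every parameter is 0.5, the empty regex "" matches everything,
# alpha = 1.0, so the penalty is 1.0 * 115 * 0.5**2
assert 1.0 * (n_weight + n_bias) * 0.5 ** 2 == 28.75

# test_regularizer_applicator_respects_regex_matching: every parameter is 1.0,
# L2(0.5) applies only to the weights and L1(1.0) only to the biases
assert 0.5 * n_weight * 1.0 ** 2 + 1.0 * n_bias * 1.0 == 65.0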
def test_l2_regularization(self):
    model = torch.nn.Sequential(torch.nn.Linear(5, 10), torch.nn.Linear(10, 5))
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 0.5}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(model)
    value = RegularizerApplicator([("", L2Regularizer(1.0))])(model)
    assert value.data.numpy() == 28.75
def test_regularizer_applicator_respects_regex_matching(self):
    model = torch.nn.Sequential(torch.nn.Linear(5, 10), torch.nn.Linear(10, 5))
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.0}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(model)
    value = RegularizerApplicator(
        [("weight", L2Regularizer(0.5)), ("bias", L1Regularizer(1.0))])(model)
    assert value.data.numpy() == 65.0
def build_model_Transformer(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300
    # the transformer embedder turns tokens into contextual embeddings, which BertPooler
    # then turns into a single encoding per sequence
    embedder = PretrainedTransformerEmbedder(BERT_MODEL_NAME)
    encoder = BertPooler(BERT_MODEL_NAME)

    # construct the regularizer applicator
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg), ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder, regularizer_applicator)
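# The first element of each regex tuple is matched against parameter names with re.search,
# so it is worth checking what "embedder", "encoder" and "classifier" actually catch once
# the model is instantiated. A minimal, hypothetical inspection helper (plain PyTorch;
# `model` stands in for the classifier built above):
import re

def matched_parameters(model, regexes):
    """Print which named parameters each regularizer regex would apply to."""
    for name, _ in model.named_parameters():
        hits = [regex for regex, _ in regexes if re.search(regex, name)]
        if hits:
            print(name, "<-", hits)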
def run_training_loop():
    tokenizer = BERTTokenizer(
        vocab_file='/Users/tianhongzxy/Downloads/BiSentESIM/BiSentESIM/My-pipeline/allennlp_tutorial/BertTokenizer/vocab.txt')
    # tokenizer = BERTTokenizer('bert-base-multilingual-cased')  # same as above

    # Try to use ELMo
    # tokenindexer = ELMoTokenCharactersIndexer()
    # elmo_tokens = tokenindexer.tokens_to_indices([Token("happy")], None)
    # print(len(elmo_tokens["elmo_tokens"][0]), elmo_tokens)

    # Try to use BERT
    # tokenizer = PretrainedTransformerTokenizer(
    #     model_name="bert-base-multilingual-cased",
    #     add_special_tokens=True,
    #     max_length=512
    # )
    # token_indexer = PretrainedTransformerIndexer(
    #     model_name="bert-base-multilingual-cased",
    #     max_length=512,
    # )

    cached_directory = None  # "cached_dir"
    dataset_reader = ClassificationTsvReader(tokenizer=tokenizer, cache_directory=cached_directory)
    print("Reading data")
    train_data = dataset_reader.read(
        file_path='/Users/tianhongzxy/Downloads/contradictory-my-dear-watson/train.txt')

    pretrained_files = None  # {"tokens": "/Users/tianhongzxy/Downloads/BiSentESIM/BiSentESIM/embedding/glove.6B.300d.txt"}
    cuda_device = -1
    batch_size = 8
    vocab = build_vocab(train_data, pretrained_files=pretrained_files,
                        include_full_pretrained_words=False)

    init_uniform = XavierUniformInitializer()         # init_uniform(model.embedder.token_embedder_tokens.weight)
    init_const = ConstantInitializer(val=0)            # init_const(model.classifier.bias)
    init_normal = NormalInitializer(mean=0., std=1.)   # init_normal(model.classifier.weight)
    applicator = InitializerApplicator(
        regexes=[
            ('embedder.*', init_uniform),
            ('classifier.*weight', init_normal),
            ('classifier.*bias', init_const)
        ]
    )
    regularizer = RegularizerApplicator(
        regexes=[
            ('embedder.*', L2Regularizer(alpha=1e-3)),
            ('classifier.*weight', L2Regularizer(alpha=1e-3)),
            # ('classifier.*bias', L1Regularizer(alpha=1e-2))  # do not regularize the bias, otherwise the model tends to underfit
        ]
    )
    model = build_model(vocab,
                        embedding_dim=10,
                        pretrained_file=None,  # pretrained_files["tokens"]
                        initializer=applicator,
                        regularizer=regularizer)
    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    # split train data into train & dev data
    from allennlp.data.dataset_readers import AllennlpDataset
    print('origin train data size: ', len(train_data))
    train_data, dev_data = train_test_split(train_data, test_size=0.2, random_state=20020206)
    assert type(train_data[0]) == type(dev_data[0]) == Instance
    train_data, dev_data = AllennlpDataset(train_data), AllennlpDataset(dev_data)
    print('train data size: ', len(train_data), 'dev data size', len(dev_data))
    assert type(train_data) == type(dev_data) == AllennlpDataset
    train_data.index_with(vocab)
    dev_data.index_with(vocab)
    train_loader, dev_loader = build_data_loaders(train_data=train_data,
                                                  dev_data=dev_data,
                                                  batch_size=batch_size)

    with tempfile.TemporaryDirectory() as serialization_dir:
        # serialization_dir = 'temp_dir/'
        trainer = build_trainer(
            model=model,
            serialization_dir=serialization_dir,
            train_loader=train_loader,
            dev_loader=dev_loader,
            num_epochs=5,
            cuda_device=cuda_device,
            patience=5
        )
        print("Starting training")
        trainer.train()
        print("Finished training")

    # Evaluate model on test data
    # print("Starting testing")
    # test_data = dataset_reader.read('test.txt')
    # test_data.index_with(vocab)
    # data_loader = DataLoader(test_data, batch_size=batch_size)
    # results = evaluate(model, data_loader, cuda_device=cuda_device)
    # print('Test results: ', results)
    # outputs = model.forward_on_instances(instances)
    # print(outputs)

    return model, dataset_reader
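# Handing `regularizer=...` to the model, as above, lets AllenNLP add the penalty to the
# training loss for you (recent versions expose it via model.get_regularization_penalty(),
# which the trainer adds to the batch loss). If you drive the optimization yourself, the
# manual equivalent is a minimal sketch like this, assuming `loss` is your batch loss tensor:
penalty = regularizer(model)   # scalar tensor: sum of alpha-weighted penalties over matched parameters
loss = loss + penalty
loss.backward()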
def train(self, args_hpo, index):
    """
    Trains the model and returns the metrics to the meta optimizer.
    :param args_hpo:
    :param index:
    :return:
    """
    PrintColors.prYellow('\n===== training with: {}'.format(args_hpo))
    PrintColors.prGreen('----- in {} mode -----'.format('train'))

    ''' ============ LOAD DATA ================================================================ '''
    starting_time = time.time()
    lm_dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)
    train_data, val_data = (lm_dataset_reader.read(folder)
                            for folder in [_train_data_path, _val_data_path])
    lm_vocabulary = Vocabulary.from_instances(train_data + val_data)
    iterator = BasicIterator(batch_size=args_hpo.batch_size)
    iterator.index_with(lm_vocabulary)

    ''' ============ DEFINE MODEL ============================================================= '''
    ''' A Params instance 'pops' its parameters, i.e. they disappear after first use, so we
        instantiate a fresh Params object for each model-defining call. Params also turns dicts
        into MutableMappings and destroys the original dict, hence the deepcopy.
        So here's your copy, allennlp. Thanks. (I still love you.) '''
    token_embedding = Embedding.from_params(
        vocab=lm_vocabulary,
        params=Params(copy.deepcopy(GLOBAL_CONSTANTS.GLOVE_PARAMS_CONFIG)))
    token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({'tokens': token_embedding})

    ''' define an encoder to wrap up an LSTM feature extractor '''
    contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(input_size=args_hpo.word_embedding_size,
                      hidden_size=args_hpo.ed_ncoder_size,
                      bidirectional=False,
                      batch_first=True))
    model = LanguageModel(
        vocab=lm_vocabulary,
        text_field_embedder=token_embedder,
        contextualizer=contextualizer,
        dropout=args_hpo.dropout,
        regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=args_hpo.l2))]),
    ).cuda(_device)

    ''' ============ TRAIN ==================================================================== '''
    ''' callbacks '''
    if index == 0:
        for file in os.listdir(os.path.join(*['.', 'lm_models'])):
            path = os.path.join(*['.', 'lm_models', file])
            if os.path.isfile(path):
                os.remove(path)
            else:
                shutil.rmtree(path)
    serialization_path = 'models_lm_{}_{}'.format(_tag, index)
    serialization_path_longer = os.path.join(*['.', 'lm_models', serialization_path])
    vocab_path = 'vocab_lm_{}_{}'.format(_tag, index)
    vocab_dir_longer = os.path.join(*['.', 'lm_models', vocab_path])
    if not os.path.exists(serialization_path_longer):
        os.mkdir(serialization_path_longer)

    callbacks = list()
    ''' for validation '''
    callbacks.append(validate.Validate(validation_data=val_data, validation_iterator=iterator))
    ''' for early stopping; it tracks the 'loss' returned by model.forward() '''
    callbacks.append(track_metrics.TrackMetrics(patience=3))
    ''' for gradient clipping '''
    callbacks.append(gradient_norm_and_clip.GradientNormAndClip(grad_clipping=args_hpo.clip))
    ''' for checkpointing. NOTE: the serialization path cannot exist before training '''
    model_checkpointer = checkpointer.Checkpointer(
        serialization_dir=serialization_path_longer,
        num_serialized_models_to_keep=1)
    callbacks.append(checkpoint.Checkpoint(checkpointer=model_checkpointer))
    ''' for sample generations '''

    callback_trainer = CallbackTrainer(
        model=model,
        training_data=train_data,
        iterator=iterator,
        optimizer=torch.optim.Adam(model.parameters(), lr=args_hpo.lr),
        num_epochs=_n_epochs,
        serialization_dir=serialization_path_longer,
        cuda_device=_device,
        callbacks=callbacks
    )
    ''' the trainer saves the model, but the vocabulary needs to be saved, too '''
    lm_vocabulary.save_to_files(vocab_dir_longer)
    ''' check the metric names to stay synchronized with the class '''
    metrics = callback_trainer.train()
    metrics['time_consumed(hrs)'] = round((time.time() - starting_time) / 3600, 4)
    return metrics
def build_model(
        vocab,
        embed_dim: int = 100,
        hid_dim: int = 100,
        min_dec_step: int = 2,
        max_decoding_steps: int = 3,
        fix_edu_num: int = -1,
        use_elmo: bool = False,
        dropout=0.5,
        dropout_emb=0.2,
        span_encoder_type='self_attentive',
        attn_type='dot',
        schedule_ratio_from_ground_truth=0.7,
        pretrain_embedding=None,
        nenc_lay: int = 1,
        mult_orac_sampling: bool = True,
        compression: bool = True,
        word_token_indexers=None,
        alpha: float = 1.0,
        dbg: bool = False,
        dec_avd_trigram_rep: bool = True,
        aggressive_compression: int = -1,
        keep_threshold: float = 0.5,
        weight_alpha=0.0,
        bias_alpha=0.0,
        abs_board_file: str = "/home/cc/exComp/board.txt",
        compress_leadn=-1,
        gather='mean',
        abs_dir_root: str = "/scratch/cluster/jcxu",
        serilization_name="",
        load_save_model: str = None
):
    model = Seq2IdxSum(
        vocab=vocab,
        word_embedding_dim=embed_dim,
        hidden_dim=hid_dim,
        min_dec_step=min_dec_step,
        max_decoding_steps=max_decoding_steps,
        fix_edu_num=fix_edu_num,
        use_elmo=use_elmo,
        span_encoder_type=span_encoder_type,
        dropout=dropout,
        dropout_emb=dropout_emb,
        attn_type=attn_type,
        schedule_ratio_from_ground_truth=schedule_ratio_from_ground_truth,
        pretrain_embedding_file=pretrain_embedding,
        nenc_lay=nenc_lay,
        mult_orac_sampling=mult_orac_sampling,
        word_token_indexers=word_token_indexers,
        compression=compression,
        alpha=alpha,
        dbg=dbg,
        dec_avd_trigram_rep=dec_avd_trigram_rep,
        aggressive_compression=aggressive_compression,
        keep_threshold=keep_threshold,
        regularizer=RegularizerApplicator([("weight", L2Regularizer(weight_alpha)),
                                           ("bias", L1Regularizer(bias_alpha))]),
        abs_board_file=abs_board_file,
        gather=gather,
        compress_leadn=compress_leadn,
        abs_dir_root=abs_dir_root,
        serilization_name=serilization_name
    )
    if load_save_model:
        # e.g. model.load_state_dict(torch.load("/path/to/model/weights.th"))
        model.load_state_dict(torch.load(load_save_model, map_location=get_device()))
    # model = torch.nn.DataParallel(model)
    device = get_device()
    model = model.to(device)
    return model
''' the language model used GloVe, but here we just build an embedder to load the trained parameters '''
token_embedding = Embedding(
    num_embeddings=vocabulary.get_vocab_size(namespace='tokens'),
    embedding_dim=combination.word_embedding_size,
    padding_index=0)
token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({'tokens': token_embedding})

''' define an encoder to wrap up an LSTM feature extractor '''
contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(input_size=combination.word_embedding_size,
                  hidden_size=combination.ed_ncoder_size,
                  bidirectional=False,
                  batch_first=True))
model = LanguageModel(
    vocab=vocabulary,
    text_field_embedder=token_embedder,
    contextualizer=contextualizer,
    dropout=combination.dropout,
    regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=combination.l2))]),
).cuda(device)
model.load_state_dict(torch.load(open(language_model_path, 'rb')), strict=True)

dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)
language_model_predictor = Predictor(model=model, dataset_reader=dataset_reader)
val_data_path = os.path.join('.', 'data_seg_val_toytoy')
instances = dataset_reader.read(val_data_path)
predictions = [language_model_predictor.predict_instance(instance) for instance in instances]
class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(2, 3)
        self.linear2 = torch.nn.Linear(3, 2)
        self.conv = torch.nn.Conv1d(2, 2, 2)

    def forward(self, inputs):
        pass


print('Using individual regularizers:')
model = Net()
init_const = ConstantInitializer(val=10.)
init_const(model.linear1.weight)
init_const(model.linear2.weight)

l1_regularizer = L1Regularizer(alpha=0.01)
print(l1_regularizer(model.linear1.weight))  # 0.01 * 10 * 6 = 0.6
l2_regularizer = L2Regularizer(alpha=0.01)
print(l2_regularizer(model.linear2.weight))  # 0.01 * 10^2 * 6 = 6

print('Using an applicator:')
applicator = RegularizerApplicator(
    regexes=[('linear1.weight', L1Regularizer(alpha=.01)),
             ('linear2.weight', L2Regularizer())])
print(applicator(model))  # 0.6 + 6 = 6.6
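# A quick numeric check of the printed values, assuming L1 gives alpha * sum(|p|) and
# L2 gives alpha * sum(p**2); linear1.weight and linear2.weight each hold 6 entries of 10,
# while conv and the biases match no regex and so contribute nothing. (Uses the `model`
# and `applicator` defined above; the tolerance just absorbs float32 rounding.)
expected_l1 = 0.01 * 10 * 6        # 0.6
expected_l2 = 0.01 * 10 ** 2 * 6   # 6.0
assert abs(float(applicator(model)) - (expected_l1 + expected_l2)) < 1e-4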