def build(self):
    """Build all sub-modules of the model.

    Creates the image-feature encoder, the optional trace encoder, the
    BERT encoder-decoder backbone (pretrained or shallow from-scratch),
    the optional contrastive / attention-transform heads, and the
    discrete VAE plus embeddings used to tokenize and embed images.
    All choices are driven by ``self.config``.
    """

    def _shallow_codec_config(num_hidden_layers, max_position_embeddings=None):
        # The 2-layer and 3-layer variants differ only in depth (and the
        # 2-layer encoder's enlarged position table), so build both tower
        # configs here instead of duplicating the block per branch.
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        if max_position_embeddings is not None:
            config_encoder.max_position_embeddings = max_position_embeddings
        config_encoder.num_hidden_layers = num_hidden_layers
        config_decoder.num_hidden_layers = num_hidden_layers
        return EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )

    self.image_feature_module = build_image_encoder(
        self.config.image_feature_processor, direct_features=True
    )
    if self.config.concate_trace:
        self.trace_feature_module = build_encoder(self.config.trace_feature_encoder)

    if self.config.base_model_name == "bert-base-uncased":
        self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased"
        )
    elif self.config.base_model_name == "2layer-base":
        # 1090 positions — presumably sized for image + trace + text input;
        # TODO(review): confirm against the caller's sequence construction.
        self.codec_config = _shallow_codec_config(2, max_position_embeddings=1090)
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    elif self.config.base_model_name == "3layer-base":
        self.codec_config = _shallow_codec_config(3)
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)

    if self.config.loop_contrastive:
        self.trace_caption_contrastive = TraceCaptionContrastiveModel(
            self.config.tc_contrastive_aggregate_method
        )

    if (
        hasattr(self.config, "pretrans_attention")
        and self.config.pretrans_attention
    ):
        tempconf = self.encoderdecoder.config.encoder
        num_heads = tempconf.num_attention_heads
        num_layers = tempconf.num_hidden_layers
        self.attention_trans = AttentionTransform(num_layers, num_heads, 100)

    # 101 is the [CLS] token id in the bert-base-uncased vocabulary,
    # used here as the beginning-of-sequence marker.
    self.BOS_ID = 101

    # Discrete VAE tokenizes images into a grid of codebook indices;
    # each index gets a learned embedding plus an axial position embedding.
    self.vae = OpenAIDiscreteVAE()
    image_code_dim = 768
    image_fmap_size = self.vae.image_size // (2 ** self.vae.num_layers)
    self.image_seq_len = image_fmap_size ** 2
    self.image_emb = torch.nn.Embedding(self.vae.num_tokens, image_code_dim)
    self.image_pos_emb = AxialPositionalEmbedding(
        image_code_dim,
        axial_shape=(image_fmap_size, image_fmap_size)
    )
def set_model_config(args, tokenizer):
    """Build the (sentence_config, document_config) BertConfig pair.

    The sentence-level encoder is sized from the ``*1`` hyper-parameters
    and the per-block token length; the document-level encoder from the
    ``*2`` hyper-parameters and the maximum number of blocks.
    """
    sentence_config = BertConfig()
    for attr, value in (
        ("vocab_size", tokenizer.get_vocab_size()),
        ("num_hidden_layers", args.num_layers1),
        ("hidden_size", args.hidden_size1),
        ("num_attention_heads", args.attention_heads1),
        ("max_position_embeddings", args.block_length),
    ):
        setattr(sentence_config, attr, value)

    document_config = BertConfig()
    for attr, value in (
        ("vocab_size", tokenizer.get_vocab_size()),
        ("num_hidden_layers", args.num_layers2),
        ("hidden_size", args.hidden_size2),
        ("num_attention_heads", args.attention_heads2),
        ("num_masked_blocks", args.max_blocks),
        ("max_position_embeddings", args.max_blocks),
    ):
        setattr(document_config, attr, value)

    return sentence_config, document_config
def __init__(self):
    """Wrap a freshly initialized 12-layer BERT sequence classifier.

    The config is the transformers default (bert-base-uncased geometry);
    the model is built from the config, i.e. randomly initialized, not
    loaded from pretrained weights.
    """
    super().__init__()
    from transformers import BertConfig

    configuration = BertConfig()
    configuration.num_hidden_layers = 12  # default depth, made explicit
    self.encoder = BertForSequenceClassification(configuration)
    # Echo the architecture at construction time (kept from original).
    print(self.encoder)
def build(self):
    """Assemble the model's sub-modules as dictated by ``self.config``:
    image encoder, optional trace encoder, BERT encoder-decoder backbone,
    and the optional contrastive / attention-transform heads.
    """
    self.image_feature_module = build_image_encoder(
        self.config.image_feature_processor, direct_features=True)
    if self.config.concate_trace:
        self.trace_feature_module = build_encoder(
            self.config.trace_feature_encoder)

    base_name = self.config.base_model_name
    depth_by_name = {"2layer-base": 2, "3layer-base": 3}
    if base_name == "bert-base-uncased":
        self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased")
    elif base_name in depth_by_name:
        # Shallow from-scratch variant: both towers share the default
        # BertConfig except for the number of hidden layers.
        depth = depth_by_name[base_name]
        enc_cfg = BertConfig()
        dec_cfg = BertConfig()
        enc_cfg.num_hidden_layers = depth
        dec_cfg.num_hidden_layers = depth
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            enc_cfg, dec_cfg)
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)

    if self.config.loop_contrastive:
        self.trace_caption_contrastive = TraceCaptionContrastiveModel(
            self.config.tc_contrastive_aggregate_method)

    if getattr(self.config, "pretrans_attention", False):
        enc_conf = self.encoderdecoder.config.encoder
        self.attention_trans = AttentionTransform(
            enc_conf.num_hidden_layers, enc_conf.num_attention_heads, 100)

    self.BOS_ID = 101
r'C:\Users\David\Documents\Machine_learning\NLP\CardioExplorer\vocab.txt', lowercase=True) pretrain = True sentence_block_length = 32 max_sentence_blocks = 48 hidden_size = 256 batch_size = 4 shuffle = True drop_last = True sentence_block_vector = torch.normal(mean=0.0, std=1.0, size=[hidden_size]) sentence_config = BertConfig() sentence_config.vocab_size = tokenizer.get_vocab_size() sentence_config.num_hidden_layers = 6 sentence_config.hidden_size = 256 sentence_config.num_attention_heads = 4 sentence_config.max_position_embeddings = sentence_block_length # sentence_block_length document_config = BertConfig() document_config.vocab_size = tokenizer.get_vocab_size() document_config.num_hidden_layers = 3 document_config.hidden_size = 256 document_config.num_attention_heads = 4 document_config.max_position_embeddings = max_sentence_blocks # sentence_block_length dataset = Dataset(file_path, tokenizer, sentence_block_length, max_sentence_blocks,
def get_model(enable_model_name, is_pretraining, pretrained_path):
    """Construct a Mahjong model and optionally warm-start it.

    Args:
        enable_model_name: one of 'discard', 'reach', 'chow', 'pong',
            'kong' — selects the task head (ignored when pretraining).
        is_pretraining: if True, return a MahjongPretrainingModel.
        pretrained_path: path to a checkpoint to load non-strictly,
            or '' to skip loading.

    Returns:
        The constructed model, or None when ``enable_model_name`` is
        unrecognized (and ``pretrained_path`` is empty).
    """
    # Vocabulary layout:
    # tile(37), menzen(2), reach_state(2), n_reach(3),
    # reach_ippatsu(2), dans(21), rates(19), oya(4),
    # scores(13), n_honba(3), n_round(12), sanma_or_yonma(2),
    # han_or_ton(2), aka_ari(2), kui_ari(2), special_token(4),
    # plus who(4) + sum_discards(6) + shanten(8).
    vocab_size = (37 + 2 + 2 + 3 + 2 + 21 + 19 + 4 + 13 + 3 + 12
                  + 2 + 2 + 2 + 2 + 4 + 4 + 6 + 8)
    hidden_size = 768
    num_attention_heads = 12
    # base sequence + who(1) + sum_discards(1) + shanten(1)
    max_position_embeddings = 239

    def _bert_config(num_hidden_layers):
        # All heads share the same geometry; only depth differs
        # (12 for pretraining/discard, 24 for the call-decision heads).
        config = BertConfig()
        config.vocab_size = vocab_size
        config.hidden_size = hidden_size
        config.num_attention_heads = num_attention_heads
        config.max_position_embeddings = max_position_embeddings
        config.num_hidden_layers = num_hidden_layers
        return config

    if is_pretraining:
        return MahjongPretrainingModel(_bert_config(12))

    model = None
    if enable_model_name == 'discard':
        model = MahjongDiscardModel(_bert_config(12))
    elif enable_model_name in ('reach', 'chow', 'pong', 'kong'):
        # All four call decisions share one model class and geometry.
        model = MahjongReachChowPongKongModel(_bert_config(24))

    if pretrained_path != '':
        # NOTE(review): if enable_model_name was unrecognized, model is
        # None here and load_state_dict raises AttributeError — behavior
        # preserved from the original implementation.
        checkpoint = torch.load(pretrained_path,
                                map_location=catalyst.utils.get_device())
        # strict=False: pretraining checkpoints lack the task head weights.
        model.load_state_dict(checkpoint['model_state_dict'], strict=False)
    return model
def __init__(self, config, language_pretrained_model_path=None):
    """Build the visual-linguistic BERT decoder.

    Args:
        config: model configuration carrying BERT sizes plus the
            visual_* fields read below.
        language_pretrained_model_path: optional path; when given,
            language-side weights are loaded after initialization.
    """
    super(VisualLinguisticBertDecoder, self).__init__(config)
    self.config = config

    # embeddings
    self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
    # single learned "end" embedding (one-row table)
    self.end_embedding = nn.Embedding(1, config.hidden_size)
    self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
    self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
    self.embedding_LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
    self.embedding_dropout = nn.Dropout(config.hidden_dropout_prob)

    # for compatibility of roberta (position ids start after the pad index)
    self.position_padding_idx = config.position_padding_idx

    # visual transform: project visual features to hidden_size only when
    # the sizes differ; otherwise features pass through unprojected.
    self.visual_1x1_text = None
    self.visual_1x1_object = None
    if config.visual_size != config.hidden_size:
        self.visual_1x1_text = nn.Linear(config.visual_size, config.hidden_size)
        self.visual_1x1_object = nn.Linear(config.visual_size, config.hidden_size)
    if config.visual_ln:
        # LayerNorm per visual stream; weights are re-filled with the
        # visual_scale_*_init values after init_weights below.
        self.visual_ln_text = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.visual_ln_object = BertLayerNorm(config.hidden_size, eps=1e-12)
    else:
        # No LayerNorm: use plain learnable scalar scales instead.
        visual_scale_text = nn.Parameter(torch.as_tensor(
            self.config.visual_scale_text_init, dtype=torch.float),
            requires_grad=True)
        self.register_parameter('visual_scale_text', visual_scale_text)
        visual_scale_object = nn.Parameter(torch.as_tensor(
            self.config.visual_scale_object_init, dtype=torch.float),
            requires_grad=True)
        self.register_parameter('visual_scale_object', visual_scale_object)

    # *********************************************
    # FM addition - Set-up decoder layer for MT
    # Initializing a BERT bert-base-uncased style configuration
    configuration = BertConfig()
    configuration.vocab_size = config.vocab_size
    # FM edit: reduce size - 12 layers doesn't fit in single 12GB GPU
    configuration.num_hidden_layers = 6
    configuration.is_decoder = True
    # Initializing a model from the bert-base-uncased style configuration
    self.decoder = BertModel(configuration)
    # *********************************************

    if self.config.with_pooler:
        self.pooler = BertPooler(config)

    # init weights; must run BEFORE the visual_ln fills below, which
    # deliberately overwrite the default LayerNorm weight of 1.0.
    self.apply(self.init_weights)
    if config.visual_ln:
        self.visual_ln_text.weight.data.fill_(
            self.config.visual_scale_text_init)
        self.visual_ln_object.weight.data.fill_(
            self.config.visual_scale_object_init)

    # load language pretrained model (after weight init, so loaded
    # weights are not clobbered)
    if language_pretrained_model_path is not None:
        self.load_language_pretrained_model(language_pretrained_model_path)

    if config.word_embedding_frozen:
        for p in self.word_embeddings.parameters():
            p.requires_grad = False
        # Special words stay trainable in a separate table, seeded from
        # the first NUM_SPECIAL_WORDS rows of the frozen embeddings.
        self.special_word_embeddings = nn.Embedding(
            NUM_SPECIAL_WORDS, config.hidden_size)
        self.special_word_embeddings.weight.data.copy_(
            self.word_embeddings.weight.data[:NUM_SPECIAL_WORDS])
# Fix the RNG seed for reproducible weight init across runs.
torch.manual_seed(42)

# Timestamped output directory under <project root>/output/.
output_base_dir = PRJ_ROOT / "output" / datetime.now().strftime(
    "train%Y%m%d%H%M%S")
output_base_dir.mkdir(exist_ok=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
logger.info("device: {}".format(device))

# Training corpus: CSV with one document per row.
# NOTE(review): columns are renamed positionally — assumes the CSV has
# exactly two columns in (doc_id, sents) order; verify against TRAIN_PATH.
df_train = pd.read_csv(TRAIN_PATH)
df_train.columns = ["doc_id", "sents"]
sents_list = df_train["sents"].values.tolist()
logger.info("len(sents_list): {}".format(len(sents_list)))

# BERT-base geometry (768 hidden, 12 heads, 3072 FFN, 512 positions)
# but only 3 transformer layers.
config = BertConfig()
config.num_hidden_layers = 3
config.num_attention_heads = 12
config.hidden_size = 768
config.intermediate_size = 3072
config.max_position_embeddings = 512
config.vocab_size = 32000

# Choose the pretraining objective: with or without next-sentence
# prediction, per the USE_NSP flag.
logger.info("USE_NSP: {}".format(USE_NSP))
if USE_NSP:
    model = BertForPreTraining(config)
else:
    model = BertForPreTrainingWithoutNSP(config)
model.to(device)
logger.info(config)
logger.info(model)
torch.tensor(np.array([i for i in nsp_df['input_ids'].values])[val_idx].astype("int32"), dtype=torch.long), torch.tensor(np.array([i for i in nsp_df['input_mask'].values])[val_idx].astype("int32"), dtype=torch.long) ] y_train_torch = torch.tensor(y_train, dtype=torch.float32) y_val_torch = torch.tensor(y_val, dtype=torch.float32) train_dataset = TensorDataset(*features, y_train_torch) val_dataset = TensorDataset(*val_features, y_val_torch) train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True, drop_last=True) val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True, drop_last=True) BERT_MODEL_PATH = '/content/drive/My Drive/PyTorch版/' bert_config = BertConfig(BERT_MODEL_PATH+'bert_config.json') bert_config.layer_norm_eps=1e-12 bert_config.num_hidden_layers = 6 model = BertForNextSentencePrediction(bert_config) model.to(device) lr = 1e-5 criterion = torch.nn.BCEWithLogitsLoss().cuda() optimizer = torch.optim.Adam(model.parameters(), lr=lr) param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()] scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch) from tqdm import tqdm from sklearn.metrics import accuracy_score def train_one_epoch(model, train_loader, criterion, optimizer, device, steps_upd_logging=500, accumulation_steps=1, multi_loss=None):