def __init__(self, n_classes: int):
    super(SentimentClassifier, self).__init__()
    self.model = BertModel.from_pretrained(config.PRETRAINED_MODEL_NAME)
    self.dropout = nn.Dropout(p=0.3)
    # the backbone is stored as self.model, so the hidden size must be read from it
    # (the original referenced a nonexistent self.bert here)
    self.out = nn.Linear(self.model.config.hidden_size, n_classes)
    self.softmax = nn.Softmax(dim=1)
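
# A minimal forward pass to pair with the constructor above -- a sketch, assuming
# a transformers version (v4+) where the model returns a ModelOutput and the
# classifier feeds BERT's pooled [CLS] representation (`pooler_output`) through
# dropout and the output head.
def forward(self, input_ids, attention_mask):
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
    pooled = outputs.pooler_output  # (batch_size, hidden_size)
    return self.softmax(self.out(self.dropout(pooled)))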
def __init__(self, config, num_labels=3):
    super(BertForABSA, self).__init__(config)
    self.num_labels = num_labels
    self.bert = BertModel(config)
    self.hsum = HSUM(4, config, num_labels)
    self.init_weights()
["This is a sample", "This is another longer sample text"], pad_to_max_length= True # First sentence will have some PADDED tokens to match second sequence length ) for i in range(2): print("Tokens (int) : {}".format(tokens['input_ids'][i])) print("Tokens (str) : {}".format( [tokenizer.convert_ids_to_tokens(s) for s in tokens['input_ids'][i]])) print("Tokens (attn_mask): {}".format(tokens['attention_mask'][i])) print() from transformers import TFBertModel, BertModel # Let's load a BERT model for TensorFlow and PyTorch model_tf = TFBertModel.from_pretrained('bert-base-cased') model_pt = BertModel.from_pretrained('bert-base-cased') # transformers generates a ready to use dictionary with all the required parameters for the specific framework. input_tf = tokenizer.encode_plus("This is a sample input", return_tensors="tf") input_pt = tokenizer.encode_plus("This is a sample input", return_tensors="pt") # Let's compare the outputs output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt) # Models outputs 2 values (The value for each tokens, the pooled representation of the input sentence) # Here we compare the output differences between PyTorch and TensorFlow. for name, o_tf, o_pt in zip(["output", "pooled"], output_tf, output_pt): print("{} differences: {}".format(name, (o_tf.numpy() - o_pt.numpy()).sum()))
parser.add_argument(
    '--df_path', type=str,
    help='must have the following columns: seqs, num_seqs, and note_id either as a column or index')
parser.add_argument('--model_path', type=str)
parser.add_argument('--output_path', type=str)
parser.add_argument('--emb_method', default='last', const='last', nargs='?',
                    choices=['last', 'sum4', 'cat4'],
                    help='how to extract embeddings from BERT output')
args = parser.parse_args()

tokenizer = BertTokenizer.from_pretrained(args.model_path)
config = BertConfig.from_pretrained(args.model_path, output_hidden_states=True)
model = BertModel.from_pretrained(args.model_path, config=config)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(f'Using {device} with {n_gpu} GPUs')
# if n_gpu > 1:
#     model = torch.nn.DataParallel(model)

print('Reading dataframe...')
df = pd.read_pickle(args.df_path)
if 'note_id' in df.columns:
    df = df.set_index('note_id')


def convert_input_example(note_id, text, seqIdx, subj_id, gender):
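
# Separate from the truncated function above: a sketch of how the three
# --emb_method choices could map onto the hidden states returned when
# output_hidden_states=True. Assumption: 'sum4' and 'cat4' combine the last
# four encoder layers, the common convention; this helper is illustrative and
# not part of the original script.
def extract_embedding(hidden_states, method):
    # hidden_states: tuple of (num_layers + 1) tensors, each (batch, seq_len, hidden)
    if method == 'last':
        return hidden_states[-1]
    if method == 'sum4':
        return torch.stack(hidden_states[-4:]).sum(dim=0)
    if method == 'cat4':
        return torch.cat(hidden_states[-4:], dim=-1)
    raise ValueError(f'unknown emb_method: {method}')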
# datasets['test'] = TensorDataset(*[t[:100] for t in datasets['test'].tensors])
train_dataloader = DataLoader(datasets['train'], batch_size=1, shuffle=False)
dev_dataloader = DataLoader(datasets['dev'], batch_size=100)
test_dataloader = DataLoader(datasets['test'], batch_size=100)

# Language Model
bert_folder_path = Path(
    f'./experiments/transformers/{bert_model_name}/{bert_model_size_type}/{bert_tokenizer_type}/{bert_version}')
if bert_tokenizer_type == 'wordpiece_roots':
    bert_folder_path = Path(
        f'./experiments/transformers/{bert_model_name}/{bert_model_size_type}/wordpiece/{bert_version}')
    logging.info(f'Loading roots tokenizer BERT from: {str(bert_folder_path)}')
    bert_tokenizer = AlefBERTRootTokenizer(str(bert_folder_path / 'vocab.txt'))
    bert = BertModel.from_pretrained(str(bert_folder_path))
elif bert_model_name == 'mBERT':
    logging.info(f'Loading {bert_model_name}')
    # 'bert-base-multilingual-cased' is the published checkpoint name;
    # plain 'bert-base-multilingual' does not exist on the hub
    bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
    bert = BertModel.from_pretrained('bert-base-multilingual-cased')
elif bert_model_name == 'heBERT':
    logging.info(f'Loading {bert_model_name}')
    bert_tokenizer = BertTokenizerFast.from_pretrained(f'avichr/{bert_model_name}')
    bert = BertModel.from_pretrained(f'avichr/{bert_model_name}')
else:
    logging.info(f'Loading BERT from: {str(bert_folder_path)}')
    bert_tokenizer = BertTokenizerFast.from_pretrained(str(bert_folder_path))
    bert = BertModel.from_pretrained(str(bert_folder_path))
logging.info('BERT model and tokenizer loaded')
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Some global variables
train_batch_size = 40
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
learning_rate = 1e-5
num_epoch = 10

# Define the student and teacher models
# Teacher model: 12 layers
bert_config = BertConfig(num_hidden_layers=12, hidden_size=60, intermediate_size=60,
                         output_hidden_states=True, output_attentions=True)
teacher_model = BertModel(bert_config)

# Student model: 3 layers
bert_config = BertConfig(num_hidden_layers=3, hidden_size=60, intermediate_size=60,
                         output_hidden_states=True, output_attentions=True)
student_model = BertModel(bert_config)

### Train data loader
input_ids = torch.LongTensor(np.random.randint(100, 1000, (100000, 50)))
attention_mask = torch.LongTensor(np.ones((100000, 50)))
token_type_ids = torch.LongTensor(np.zeros((100000, 50)))
train_data = TensorDataset(input_ids, attention_mask, token_type_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=train_batch_size)
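
# A minimal distillation step for the teacher/student pair defined above -- a
# sketch, assuming the student is trained to match the teacher's last hidden
# state with an MSE loss (the simplest form of feature distillation).
mse_loss = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(student_model.parameters(), lr=learning_rate)
teacher_model.eval()
for input_ids, attention_mask, token_type_ids in train_dataloader:
    with torch.no_grad():
        teacher_out = teacher_model(input_ids, attention_mask=attention_mask,
                                    token_type_ids=token_type_ids)
    student_out = student_model(input_ids, attention_mask=attention_mask,
                                token_type_ids=token_type_ids)
    loss = mse_loss(student_out[0], teacher_out[0])  # match the last hidden states
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()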
def __init__(self, config):
    super(BertForQuestionAnswering, self).__init__(config)
    self.bert = BertModel(config)
    # TODO check with Google if it's normal there is no dropout on the token
    # classifier of SQuAD in the TF version
    self.apply(self.init_bert_weights)
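
# For context, the token classifier the TODO above refers to is typically a
# single linear layer producing start/end logits -- a sketch, not necessarily
# the line elided from this constructor:
# self.qa_outputs = nn.Linear(config.hidden_size, 2)  # start and end logits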
def __init__(self, label_size, bert_model):
    super().__init__()
    self.label_size = label_size
    self.hidden_size = 768 * 2
    self.bert = BertModel.from_pretrained(bert_model)
    self.linear = nn.Linear(self.hidden_size, self.label_size)
def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str):
    """
    Args:
        model: BertModel PyTorch model instance to be converted
        ckpt_dir: TensorFlow model directory
        model_name: model name

    Currently supported HF models:
        - Y BertModel
        - N BertForMaskedLM
        - N BertForPreTraining
        - N BertForMultipleChoice
        - N BertForNextSentencePrediction
        - N BertForSequenceClassification
        - N BertForQuestionAnswering
    """
    tensors_to_transpose = ("dense.weight", "attention.self.query",
                            "attention.self.key", "attention.self.value")
    var_map = (
        ("layer.", "layer_"),
        ("word_embeddings.weight", "word_embeddings"),
        ("position_embeddings.weight", "position_embeddings"),
        ("token_type_embeddings.weight", "token_type_embeddings"),
        (".", "/"),
        ("LayerNorm/weight", "LayerNorm/gamma"),
        ("LayerNorm/bias", "LayerNorm/beta"),
        ("weight", "kernel"),
    )

    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    state_dict = model.state_dict()

    def to_tf_var_name(name: str):
        for patt, repl in iter(var_map):
            name = name.replace(patt, repl)
        return "bert/{}".format(name)

    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name,
                                 initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([tf_var]))
        session.run(tf_var)
        return tf_var

    tf.reset_default_graph()
    with tf.Session() as session:
        for var_name in state_dict:
            tf_name = to_tf_var_name(var_name)
            torch_tensor = state_dict[var_name].numpy()
            # TF stores dense and attention kernels transposed relative to PyTorch
            if any([x in var_name for x in tensors_to_transpose]):
                torch_tensor = torch_tensor.T
            tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
            tf.keras.backend.set_value(tf_var, torch_tensor)
            tf_weight = session.run(tf_var)
            print("Successfully created {}: {}".format(
                tf_name, np.allclose(tf_weight, torch_tensor)))

        saver = tf.train.Saver(tf.trainable_variables())
        saver.save(session,
                   os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
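
# Illustrative invocation of the converter above -- a sketch; the checkpoint
# directory and model name are placeholders. Note the function uses the
# TF1-style API (tf.Session, tf.get_variable), so it needs TensorFlow 1.x or
# tf.compat.v1.
if __name__ == "__main__":
    model = BertModel.from_pretrained("bert-base-uncased")
    convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir="./tf_ckpt",
                                     model_name="bert-base-uncased")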
def bert_embedding(self, bert_path, text_data):
    tokenizer = BertTokenizer(vocab_file=bert_path + "vocab.txt")  # initialize the tokenizer
    # If the first text has 10 segments this records [0, 10]; the list has one
    # more element than there are tags, and tag[i] covers the segments
    # separated_points[i]:separated_points[i+1]
    separated_texts = []
    separated_points = [0]
    max_text_length = 512
    for slices in text_data:
        for text in slices:
            separated_texts.append(text)
        separated_points.append(len(separated_texts))
    max_len = max([len(single) for single in separated_texts])  # longest segment
    self.logger.info("data_size: %d" % len(text_data))
    self.logger.info("max_seq_len: %d" % max_len)
    self.logger.info("avg_seq_len: %d" % np.mean([len(single) for single in separated_texts]))

    bert_model = BertModel.from_pretrained(bert_path).to(self.device)
    bert_model.eval()
    batch_size = config.bert_batch_size
    n_batch = math.ceil(len(separated_texts) / batch_size)
    embeds = []
    for i in range(n_batch):
        if i % 100 == 0:
            self.logger.info("Embedding, %d / %d" % (i, n_batch))
        sta = i * batch_size
        end = (i + 1) * batch_size
        tokens, segments, input_masks = [], [], []
        for text in separated_texts[sta:end]:
            indexed_tokens = tokenizer.encode(text)  # list of token ids
            if len(indexed_tokens) > max_text_length:
                indexed_tokens = indexed_tokens[:max_text_length]
            tokens.append(indexed_tokens)
            segments.append([0] * len(indexed_tokens))
            input_masks.append([1] * len(indexed_tokens))

        max_len = max([len(single) for single in tokens])  # longest sentence in the batch
        for j in range(len(tokens)):
            padding = [0] * (max_len - len(tokens[j]))
            tokens[j] += padding
            segments[j] += padding
            input_masks[j] += padding
        # segments is all zeros because there is only one sentence (no sentence B).
        # In input_masks the 1s mark real tokens and the trailing 0s mark padding,
        # which only keeps the batch rectangular; it tells BertModel to ignore
        # the padded positions.

        # Convert to PyTorch tensors
        tokens_tensor = torch.tensor(tokens).to(self.device)
        segments_tensors = torch.tensor(segments).to(self.device)
        input_masks_tensors = torch.tensor(input_masks).to(self.device)
        output = bert_model(tokens_tensor,
                            token_type_ids=segments_tensors,
                            attention_mask=input_masks_tensors)
        last_encode = output[0]
        output_mask = input_masks_tensors.unsqueeze(-1).repeat(1, 1, last_encode.shape[-1])
        masked_output = last_encode * output_mask
        self.logger.debug(masked_output.shape)
        # mean-pool over real tokens only: dividing by the mask sum keeps the
        # padded positions out of the denominator (torch.mean over dim=1 would
        # wrongly divide by the padded length)
        pooled_output = masked_output.sum(dim=1) / output_mask.sum(dim=1)
        self.logger.debug(pooled_output.shape)
        embed = pooled_output.cpu().detach().tolist()
        self.logger.debug(len(embed))
        embeds.extend(embed)
        torch.cuda.empty_cache()

    # regroup the embeddings per original text
    embeds_per_text = []
    for i in range(len(separated_points) - 1):
        embeds_per_text.append(embeds[separated_points[i]:separated_points[i + 1]])
    return embeds_per_text
def trainBaseline(trainPath, vocabPath, labelPath, epoch=1, useBert=0, load=0,
                  modelPath='', saveName="baseline"):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    dl = DataLoader(vocabPath, labelPath)
    if useBert == 0:
        trainArticles = dl.readData_ub(trainPath)
    else:
        trainArticles = dl.readData_bert(trainPath)
    sample = list(torch.utils.data.WeightedRandomSampler(dl.indices, dl.noSample,
                                                         replacement=False))
    lenArt = dl.lenArt

    baseline = Baseline(64, dl.voc.keysize, 32, 2, useBert=useBert)
    docLoss = nn.CrossEntropyLoss()
    opt = torch.optim.Adam(baseline.parameters(), lr=0.001)
    if load == 1:
        checkpoint = torch.load(modelPath)
        baseline.load_state_dict(checkpoint['baseline_model'])
        opt.load_state_dict(checkpoint['optimizer'])
    if useBert == 1:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert = BertModel.from_pretrained('bert-base-uncased').to(device)

    # Iterate over articles in the dataset
    for e in range(0, epoch):
        truelabels = []
        predictions = []
        for n, ind in enumerate(sample):
            a = trainArticles[ind]
            id, article, l = a
            aTemp = torch.tensor([])
            if l > 0:
                l = torch.tensor([1], dtype=torch.long)
                truelabels.append(1)
            else:
                l = torch.tensor([0], dtype=torch.long)
                truelabels.append(0)
            for i, s in enumerate(article):
                if useBert == 1:
                    # truncate to 510 tokens to leave room for [CLS] and [SEP]
                    sTemp = bert(torch.tensor(
                        tokenizer(s[:510] if len(s) > 510 else s)['input_ids'],
                        device=device).unsqueeze(0))[0].detach().squeeze(dim=0).to('cpu')
                else:
                    sTemp = s
                aTemp = torch.cat((aTemp, sTemp.float()), dim=0)
            label = baseline(aTemp.unsqueeze(0))
            loss = docLoss(label, l)
            # record the predicted class: the label itself when the argmax matches,
            # otherwise the other of the two classes
            if l.item() == label.argmax().item():
                predictions.append(l.item())
            else:
                predictions.append(1 - l.item())
            opt.zero_grad()
            loss.backward()
            opt.step()
        print('f1 score: ', f1_score(truelabels, predictions, average=None))

    torch.save({'baseline_model': baseline.state_dict(),
                'optimizer': opt.state_dict()},
               'dataset/{}.tar'.format(saveName))
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]',
                          'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define the sentence A and B indices associated with the 1st and 2nd sentences (see the paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load the pre-trained model (weights)
model = BertModel.from_pretrained('bert-large-uncased')

# Set the model in evaluation mode to deactivate the DropOut modules.
# This is IMPORTANT for reproducible results during evaluation!
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict hidden-state features for each layer
with torch.no_grad():
    # See the models' docstrings for the details of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # Transformers models always output tuples.
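
# Continuing the snippet above -- a sketch of unpacking the tuple (this older,
# pre-v4 interface returns the last hidden state as the first element):
encoded_layers = outputs[0]  # shape: (1, sequence_length, hidden_size)
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)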
def __init__(self, n_class):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_class)
def __init__(self, config):
    super().__init__()
    self.num_labels = config.num_labels
    self.bert = BertModel(config)
    self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
def get_model(self):
    from transformers import BertModel
    _model = BertModel.from_pretrained(self.tmp_model_path)
    self.to_device(_model)
    return _model
    hidden_states = self.decoder(hidden_states)
    return hidden_states


"""------"""
# class Bert(nn.Module):
#     def __init__(self):
#         super(Bert, self).__init__()
#         self.model = BertModel.from_pretrained('hfl/chinese-bert-wwm', config=config)
#
#     def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
#         output = self.model(input_ids=input_ids, attention_mask=attention_mask,
#                             token_type_ids=token_type_ids)
#         return output

BERT = BertModel.from_pretrained('hfl/chinese-bert-wwm', config=config)


class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


#%%
class Seq2Seq(nn.Module):
    def __init__(self, mode):
        super(Seq2Seq, self).__init__()
        assert mode in ['G', 'R']
        # bring the BERT model in
def __init__(self):
    super().__init__()
    config = BertConfig.from_pretrained("bert-base-uncased")
    self.model = BertModel(config)
class SpERT(BertPreTrainedModel):
    """ Span-based model to jointly extract entities and relations """

    VERSION = '1.1'

    def __init__(self, config: BertConfig, cls_token: int, relation_types: int,
                 entity_types: int, size_embedding: int, prop_drop: float,
                 freeze_transformer: bool, max_pairs: int = 100):
        super(SpERT, self).__init__(config)

        # BERT model
        self.bert = BertModel(config)

        # layers
        self.rel_classifier = nn.Linear(config.hidden_size * 3 + size_embedding * 2,
                                        relation_types)
        self.entity_classifier = nn.Linear(config.hidden_size * 2 + size_embedding,
                                           entity_types)
        self.size_embeddings = nn.Embedding(100, size_embedding)
        self.dropout = nn.Dropout(prop_drop)

        self._cls_token = cls_token
        self._relation_types = relation_types
        self._entity_types = entity_types
        self._max_pairs = max_pairs

        # weight initialization
        self.init_weights()

        if freeze_transformer:
            print("Freeze transformer weights")
            # freeze all transformer weights
            for param in self.bert.parameters():
                param.requires_grad = False

    def _forward_train(self, encodings: torch.tensor, context_masks: torch.tensor,
                       entity_masks: torch.tensor, entity_sizes: torch.tensor,
                       relations: torch.tensor, rel_masks: torch.tensor):
        # get contextualized token embeddings from the last transformer layer
        context_masks = context_masks.float()
        h = self.bert(input_ids=encodings, attention_mask=context_masks)['last_hidden_state']

        batch_size = encodings.shape[0]

        # classify entities
        size_embeddings = self.size_embeddings(entity_sizes)  # embed entity candidate sizes
        entity_clf, entity_spans_pool = self._classify_entities(encodings, h, entity_masks,
                                                                size_embeddings)

        # classify relations
        h_large = h.unsqueeze(1).repeat(1, max(min(relations.shape[1], self._max_pairs), 1), 1, 1)
        rel_clf = torch.zeros([batch_size, relations.shape[1], self._relation_types]).to(
            self.rel_classifier.weight.device)

        # obtain relation logits, processing in chunks to reduce memory usage
        for i in range(0, relations.shape[1], self._max_pairs):
            # classify relation candidates
            chunk_rel_logits = self._classify_relations(entity_spans_pool, size_embeddings,
                                                        relations, rel_masks, h_large, i)
            rel_clf[:, i:i + self._max_pairs, :] = chunk_rel_logits

        return entity_clf, rel_clf

    def _forward_eval(self, encodings: torch.tensor, context_masks: torch.tensor,
                      entity_masks: torch.tensor, entity_sizes: torch.tensor,
                      entity_spans: torch.tensor, entity_sample_masks: torch.tensor):
        # get contextualized token embeddings from the last transformer layer
        context_masks = context_masks.float()
        h = self.bert(input_ids=encodings, attention_mask=context_masks)['last_hidden_state']

        batch_size = encodings.shape[0]
        ctx_size = context_masks.shape[-1]

        # classify entities
        size_embeddings = self.size_embeddings(entity_sizes)  # embed entity candidate sizes
        entity_clf, entity_spans_pool = self._classify_entities(encodings, h, entity_masks,
                                                                size_embeddings)

        # ignore entity candidates that do not constitute an actual entity for
        # relations (based on the classifier)
        relations, rel_masks, rel_sample_masks = self._filter_spans(entity_clf, entity_spans,
                                                                    entity_sample_masks, ctx_size)

        rel_sample_masks = rel_sample_masks.float().unsqueeze(-1)
        h_large = h.unsqueeze(1).repeat(1, max(min(relations.shape[1], self._max_pairs), 1), 1, 1)
        rel_clf = torch.zeros([batch_size, relations.shape[1], self._relation_types]).to(
            self.rel_classifier.weight.device)

        # obtain relation logits, processing in chunks to reduce memory usage
        for i in range(0, relations.shape[1], self._max_pairs):
            # classify relation candidates
            chunk_rel_logits = self._classify_relations(entity_spans_pool, size_embeddings,
                                                        relations, rel_masks, h_large, i)
            # apply sigmoid
            chunk_rel_clf = torch.sigmoid(chunk_rel_logits)
            rel_clf[:, i:i + self._max_pairs, :] = chunk_rel_clf

        rel_clf = rel_clf * rel_sample_masks  # mask

        # apply softmax
        entity_clf = torch.softmax(entity_clf, dim=2)

        return entity_clf, rel_clf, relations

    def _classify_entities(self, encodings, h, entity_masks, size_embeddings):
        # max pool entity candidate spans
        m = (entity_masks.unsqueeze(-1) == 0).float() * (-1e30)
        entity_spans_pool = m + h.unsqueeze(1).repeat(1, entity_masks.shape[1], 1, 1)
        entity_spans_pool = entity_spans_pool.max(dim=2)[0]

        # get cls token as candidate context representation
        entity_ctx = get_token(h, encodings, self._cls_token)

        # create candidate representations including context, max pooled span and size embedding
        entity_repr = torch.cat([entity_ctx.unsqueeze(1).repeat(1, entity_spans_pool.shape[1], 1),
                                 entity_spans_pool, size_embeddings], dim=2)
        entity_repr = self.dropout(entity_repr)

        # classify entity candidates
        entity_clf = self.entity_classifier(entity_repr)

        return entity_clf, entity_spans_pool

    def _classify_relations(self, entity_spans, size_embeddings, relations, rel_masks, h,
                            chunk_start):
        batch_size = relations.shape[0]

        # create chunks if necessary
        if relations.shape[1] > self._max_pairs:
            relations = relations[:, chunk_start:chunk_start + self._max_pairs]
            rel_masks = rel_masks[:, chunk_start:chunk_start + self._max_pairs]
            h = h[:, :relations.shape[1], :]

        # get pairs of entity candidate representations
        entity_pairs = util.batch_index(entity_spans, relations)
        entity_pairs = entity_pairs.view(batch_size, entity_pairs.shape[1], -1)

        # get corresponding size embeddings
        size_pair_embeddings = util.batch_index(size_embeddings, relations)
        size_pair_embeddings = size_pair_embeddings.view(batch_size,
                                                         size_pair_embeddings.shape[1], -1)

        # relation context (context between entity candidate pair)
        # mask non entity candidate tokens
        m = ((rel_masks == 0).float() * (-1e30)).unsqueeze(-1)
        rel_ctx = m + h
        # max pooling
        rel_ctx = rel_ctx.max(dim=2)[0]
        # set the context vector of neighboring or adjacent entity candidates to zero
        rel_ctx[rel_masks.to(torch.uint8).any(-1) == 0] = 0

        # create relation candidate representations including context,
        # max pooled entity candidate pairs and corresponding size embeddings
        rel_repr = torch.cat([rel_ctx, entity_pairs, size_pair_embeddings], dim=2)
        rel_repr = self.dropout(rel_repr)

        # classify relation candidates
        chunk_rel_logits = self.rel_classifier(rel_repr)
        return chunk_rel_logits

    def _filter_spans(self, entity_clf, entity_spans, entity_sample_masks, ctx_size):
        batch_size = entity_clf.shape[0]
        # get entity type (including none)
        entity_logits_max = entity_clf.argmax(dim=-1) * entity_sample_masks.long()
        batch_relations = []
        batch_rel_masks = []
        batch_rel_sample_masks = []

        for i in range(batch_size):
            rels = []
            rel_masks = []
            sample_masks = []

            # get spans classified as entities
            non_zero_indices = (entity_logits_max[i] != 0).nonzero().view(-1)
            non_zero_spans = entity_spans[i][non_zero_indices].tolist()
            non_zero_indices = non_zero_indices.tolist()

            # create relations and masks
            for i1, s1 in zip(non_zero_indices, non_zero_spans):
                for i2, s2 in zip(non_zero_indices, non_zero_spans):
                    if i1 != i2:
                        rels.append((i1, i2))
                        rel_masks.append(sampling.create_rel_mask(s1, s2, ctx_size))
                        sample_masks.append(1)

            if not rels:
                # case: fewer than two spans classified as entities
                batch_relations.append(torch.tensor([[0, 0]], dtype=torch.long))
                batch_rel_masks.append(torch.tensor([[0] * ctx_size], dtype=torch.bool))
                batch_rel_sample_masks.append(torch.tensor([0], dtype=torch.bool))
            else:
                # case: at least two spans classified as entities
                batch_relations.append(torch.tensor(rels, dtype=torch.long))
                batch_rel_masks.append(torch.stack(rel_masks))
                batch_rel_sample_masks.append(torch.tensor(sample_masks, dtype=torch.bool))

        # stack
        device = self.rel_classifier.weight.device
        batch_relations = util.padded_stack(batch_relations).to(device)
        batch_rel_masks = util.padded_stack(batch_rel_masks).to(device)
        batch_rel_sample_masks = util.padded_stack(batch_rel_sample_masks).to(device)

        return batch_relations, batch_rel_masks, batch_rel_sample_masks

    def forward(self, *args, evaluate=False, **kwargs):
        if not evaluate:
            return self._forward_train(*args, **kwargs)
        else:
            return self._forward_eval(*args, **kwargs)
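
# How the dispatching forward() above is typically driven -- a sketch; keyword
# names mirror the private method signatures, and the tensors are placeholders
# produced by SpERT's sampling code:
# entity_clf, rel_clf = model(encodings=..., context_masks=..., entity_masks=...,
#                             entity_sizes=..., relations=..., rel_masks=...)  # training
# entity_clf, rel_clf, relations = model(encodings=..., context_masks=...,
#                                        entity_masks=..., entity_sizes=...,
#                                        entity_spans=..., entity_sample_masks=...,
#                                        evaluate=True)  # inference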
def get_vision_text_model(self, vision_config, text_config):
    vision_model = CLIPVisionModel(vision_config).eval()
    text_model = BertModel(text_config).eval()
    return vision_model, text_model
if args.seed == 0:
    args.seed = random.randint(0, 100)
set_seed(args)

helper = DataHelper(gz=True, config=args)
args.n_type = helper.n_type  # 2

# Set datasets
Full_Loader = helper.train_loader
# Subset_Loader = helper.train_sub_loader
dev_example_dict = helper.dev_example_dict
dev_feature_dict = helper.dev_feature_dict
eval_dataset = helper.dev_loader

roberta_config = BC.from_pretrained(args.bert_model)
encoder = BertModel.from_pretrained(args.bert_model)
args.input_dim = roberta_config.hidden_size
model = BertSupportNet(config=args, encoder=encoder)
if args.trained_weight is not None:
    model.load_state_dict(torch.load(args.trained_weight))
if args.n_gpu > 0 and args.model_gpu != '-1':
    model.to('cuda')

# Initialize the optimizer and criterions
lr = args.lr
t_total = len(Full_Loader) * args.epochs // args.gradient_accumulation_steps
warmup_steps = 0.1 * t_total
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=t_total)
def __init__(self):
    super(BertEncoder, self).__init__()
    # note: the keyword is output_hidden_states (plural); the misspelled
    # singular form would be silently ignored
    self.model_config = BertConfig.from_pretrained('MTBERT', output_hidden_states=True)
    self.bert = BertModel.from_pretrained('MTBERT', config=self.model_config)
def __init__(self, state_path, top_model=CLSTopModel(), use_loc_ids=False):
    super(EndToEnd, self).__init__()
    self.bert = BertModel.from_pretrained(state_path)
    self.top_model = top_model
    self.clip_param_grad = None
    self.use_loc_ids = use_loc_ids
def __init__(self, opt):
    super().__init__()
    self.bert = BertModel.from_pretrained(opt.bert_path)
    self.linear = nn.Linear(opt.bert_hid_size, 1)
def AE(df):
    model_type = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_type)
    model = BertModel.from_pretrained(model_type, return_dict=True)
    mask_model = BertForMaskedLM.from_pretrained(model_type, return_dict=True)
    sep_token = '[SEP]'
    mask_token = '[MASK]'
    mask_id = tokenizer(mask_token)['input_ids'][1]
    sep_id = tokenizer(sep_token)['input_ids'][1]
    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)

    # build auxiliary sentences: "the aspect term is <term> [SEP] <original tokens>"
    auxiliary_tokens = ['the', 'aspect', 'term', 'is']
    df['mask_tokens'] = 0
    df['auxiliary_tokens'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        auxiliary_sents = []
        for j in range(len(df['aspect_terms'].iloc[i])):
            aspect_terms = df['aspect_terms'].iloc[i][j]
            auxiliary_sent = auxiliary_tokens + [aspect_terms] + [sep_token] + df['tokens'].iloc[i]
            auxiliary_sents.append(auxiliary_sent)
        mask_sent = auxiliary_tokens + [mask_token] + [sep_token] + df['tokens'].iloc[i]
        df['mask_tokens'].iloc[i] = mask_sent
        df['auxiliary_tokens'].iloc[i] = auxiliary_sents

    # measure how much masking each token moves the [MASK] embedding
    df['distance'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        tokenized = tokenizer.encode(df['mask_tokens'].iloc[i])
        sep_index = tokenized.index(sep_id)
        mask_index = tokenized.index(mask_id)
        tokenized = pd.Series([tokenized])
        padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long", value=0,
                               truncating="post", padding="post")
        attention_mask = np.where(padded != 0, 1, 0)
        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)
        with torch.no_grad():
            last_hidden_states = model(input_ids, attention_mask=attention_mask)
        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu().numpy()

        distance = []
        for perturbed_index in range(sep_index + 1, MAX_LEN):
            padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long", value=0,
                                   truncating="post", padding="post")
            if padded[0][perturbed_index] != 0 and padded[0][perturbed_index] != sep_id:
                cur_id = padded[0][perturbed_index]
                padded[0][perturbed_index] = mask_id
                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))
        df['distance'].iloc[i] = distance

    # pick positions whose distance falls below the threshold and that are not
    # part of an aspect term (labels B or I)
    df['perturbed_mask_index'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(np.array(df['distance'].iloc[i])[:, 1],
                                             std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if (df['distance'].iloc[i][dis_index][1] < mask_threshold
                    and df['labels'].iloc[i][dis_index] != 'B'
                    and df['labels'].iloc[i][dis_index] != 'I'):
                perturbed_mask_index.append(dis_index)
        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index

    # replace each selected position with the masked LM's top prediction
    df['augment_token_id'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        augment_tokenizeds = []
        for j in range(len(df['aspect_terms'].iloc[i])):
            tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j])
            tokenized = torch.Tensor(tokenized).unsqueeze(0).to(torch.int64).to(device)
            augment_tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j])
            for k in range(len(df['perturbed_mask_index'].iloc[i])):
                mask_tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j])
                sep_index = mask_tokenized.index(sep_id)
                perturbed_mask_index = df['perturbed_mask_index'].iloc[i][k] + sep_index + 1
                mask_tokenized[perturbed_mask_index] = mask_id
                mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(torch.int64).to(device)
                outputs = mask_model(mask_tokenized, labels=tokenized)
                augment_tokenized[perturbed_mask_index] = int(
                    outputs.logits[:, perturbed_mask_index, :].argmax().cpu().numpy())
            augment_tokenizeds.append(augment_tokenized)
        df['augment_token_id'].iloc[i] = augment_tokenizeds

    # decode the augmented ids back to tokens, keeping only the part after [SEP]
    df['augment_tokens'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        tokens_lists = []
        for j in range(len(df['aspect_terms'].iloc[i])):
            tokens_list = []
            for k in range(1, len(df['augment_token_id'].iloc[i][j]) - 1):
                tokens_list.append(tokenizer.decode([df['augment_token_id'].iloc[i][j][k]]))
            sep_index = tokens_list.index(sep_token)
            tokens_list = tokens_list[sep_index + 1:]
            tokens_lists.append(tokens_list)
        df['augment_tokens'].iloc[i] = tokens_lists
    return df
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue
        sel_log(f' --------------------------- start fold {fold} --------------------------- ',
                logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'], axis=1)
        # use only original rows
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' '))
            + fold_trn_df.question_body.apply(lambda x: x.split(' '))
            + fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=True,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='all_one',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=True,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='all_one',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpointed model, optimizer and scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader, DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')
            scheduler.step()

            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [val_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [val_metric, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [val_metric_raws, ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)

            model = model.to('cpu')
            model = model.module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                            model, optimizer, scheduler, histories,
                            val_y_preds, val_y_trues, val_qa_ids,
                            fold, epoch, val_loss, val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer, clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
def __init__(self, drop_rate, output_size):
    super().__init__()
    self.bert = BertModel.from_pretrained("bert-base-uncased")
    self.drop = torch.nn.Dropout(drop_rate)
    # 768 dimensions to match BERT's hidden output size
    self.fc = torch.nn.Linear(768, output_size)
import os
import sys
import json

from transformers import BertModel, TFBertModel

if __name__ == '__main__':
    path = sys.argv[1]
    print(path)
    conf_path = os.path.join(path, 'config.json')
    c = json.load(open(conf_path))
    c['model_type'] = 'bert'
    json.dump(c, open(conf_path, 'w'))

    # convert the PyTorch weights to TensorFlow and save them alongside
    model = TFBertModel.from_pretrained(path, from_pt=True)
    model.save_pretrained(path)

    # test loads
    model = BertModel.from_pretrained(path)
    model = TFBertModel.from_pretrained(path)
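
# Assumed command line for the script above (the script name is illustrative;
# the path argument points at a directory containing pytorch_model.bin and
# config.json):
#
#   python convert_to_tf.py /path/to/model_dir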
# coding:utf-8
import sys
sys.path.append('../')

import torch
import numpy as np
import torch.nn as nn
from common.tree import head_to_adj
from common.transformer_encoder import TransformerEncoder
from common.RGAT import RGATEncoder
from transformers import BertModel, BertConfig

bert_config = BertConfig.from_pretrained("bert-base-uncased")
bert_config.output_hidden_states = True
bert_config.num_labels = 3
bert = BertModel.from_pretrained("bert-base-uncased", config=bert_config)


class RGATABSA(nn.Module):
    def __init__(self, args, emb_matrix=None):
        super().__init__()
        in_dim = args.hidden_dim + args.bert_out_dim
        self.args = args
        self.enc = ABSAEncoder(args)
        self.classifier = nn.Linear(in_dim, args.num_class)
        self.dropout = nn.Dropout(0.1)

    def forward(self, inputs):
        outputs = self.enc(inputs)
        outputs = self.dropout(outputs)
        logits = self.classifier(outputs)
        return logits
        targets=df.score.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=1
    )


train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# freeze all the BERT parameters
for param in bert_model.parameters():
    param.requires_grad = False


class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.relu = nn.ReLU()
        self.dense1 = nn.Linear(768, 512)  # 768 = hidden state size of BERT
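
    # A plausible forward pass for the layers declared above -- a sketch only:
    # the original forward is not shown, and the 512-d output of dense1 would
    # still need a final projection to n_classes (not invented here).
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = self.drop(outputs.pooler_output)
        return self.relu(self.dense1(x))  # (batch_size, 512) features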
# -*- coding: utf-8 -*-
# @project: wholee_keyword
# @author: caojinlei
# @file: bert_embedding.py
# @time: 2021/05/07
import torch
from transformers import BertTokenizer, BertModel, BertConfig

model_name = 'uncased_L-12_H-768_A-12'
tokenizer = BertTokenizer.from_pretrained(model_name)
model_config = BertConfig.from_pretrained(model_name)
model_config.output_hidden_states = True
model_config.output_attentions = True
bert_model = BertModel.from_pretrained(model_name, config=model_config)

# s = 'i have a pen'
s = 'i have a apple'
sen_code = tokenizer.encode(s)
print(sen_code)
sen_word = tokenizer.convert_ids_to_tokens(sen_code)
tokens_tensor = torch.LongTensor([sen_code])
segments_tensors = torch.zeros(len(sen_code), dtype=int)

# Static word vectors: direct lookups in the embedding table, independent of context
emb = bert_model.embeddings.word_embeddings.weight.data
sen_emb = []
for i in sen_code:
    sen_emb.append(emb[i])
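
# Contextual counterpart to the static lookups above -- a sketch: running the
# full encoder gives each token a vector that depends on its sentence context.
with torch.no_grad():
    outputs = bert_model(tokens_tensor)
contextual_emb = outputs[0].squeeze(0)  # (seq_len, 768): one context-aware vector per token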