def find_paragraph(tokenizer: BertTokenizer,
                   model: BertForNextSentencePrediction,
                   question: str,
                   context: str,
                   max_len=256,
                   batch_size=16):
    q_len = len(tokenizer.tokenize(question))
    context_tokens = tokenizer.tokenize(context)
    part_len = max_len - q_len - 3
    parts = []
    n = 0
    while n < len(context_tokens):
        parts += [context_tokens[n:n + part_len]]
        n += part_len // 2
    results = []
    all_parts = parts[:]
    while len(parts) > 0:
        batch = tokenizer.batch_encode_plus(list(
            zip([question] * batch_size, parts[:batch_size])),
                                            max_length=max_len,
                                            truncation=True,
                                            pad_to_max_length=True,
                                            return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model(**batch)[0]
        results += [a - b for a, b in output.cpu().tolist()]
        parts = parts[batch_size:]
    return np.array(results), [
        tokenizer.decode(tokenizer.encode(part), skip_special_tokens=True)
        for part in all_parts
    ]
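# A minimal usage sketch for find_paragraph: it ranks overlapping context windows by
# the NSP "is next sentence" logit margin. The checkpoint name and the example
# question/context below are illustrative assumptions, not part of the original code.
from transformers import BertTokenizer, BertForNextSentencePrediction

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased").to("cuda").eval()

document = ("Johann Wolfgang von Goethe wrote the tragic play Faust. "
            "It was published in two parts, the second appearing after his death.")
scores, paragraphs = find_paragraph(tokenizer, model,
                                    question="Who wrote Faust?",
                                    context=document)
best_paragraph = paragraphs[scores.argmax()]  # window with the highest NSP margin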
def collate(data: List[Tuple[str, int]], tokenizer: BertTokenizer,
            block_size: int) -> Dict:
    texts, labels = list(map(list, zip(*data)))
    input_data = tokenizer.batch_encode_plus(texts,
                                             max_length=block_size,
                                             truncation=True,
                                             pad_to_max_length=True,
                                             return_tensors="pt").to(
                                                 args.device)
    input_data['labels'] = torch.tensor(labels).to(args.device)
    return input_data
def create_data_loader(sentences, tokenizer_vocab, labels=[], train_mode=True):
    """
    Create a DataLoader for BERT.

    :param iterable sentences: text instances
    :param string tokenizer_vocab: path to the tokenizer vocabulary file
    :param iterable labels: sentiment or emotion class labels
    """
    logger.info("Loading tokenizer and encoding data..")
    tokenizer = BertTokenizer(tokenizer_vocab, do_lower_case=True)
    encoded_sents = tokenizer.batch_encode_plus(sentences,
                                                add_special_tokens=True,
                                                return_attention_mask=True,
                                                padding=True,
                                                max_length=256,
                                                truncation=True,
                                                return_tensors="pt")
    sent_ids = encoded_sents["input_ids"]
    attention_masks = encoded_sents["attention_mask"]
    if len(labels) > 0:
        labels = torch.tensor(labels)
        data = TensorDataset(sent_ids, attention_masks, labels)
    else:
        data = TensorDataset(sent_ids, attention_masks)
    logger.info("Creating data loaders...")
    batch_size = int(CFG["MODELS"]["batch_size"])
    if train_mode:
        dataloader = DataLoader(data,
                                sampler=RandomSampler(data),
                                batch_size=batch_size)
    else:
        dataloader = DataLoader(data,
                                sampler=SequentialSampler(data),
                                batch_size=batch_size)
    return dataloader
def collate(data: List, tokenizer: BertTokenizer, block_size: int) -> Dict:
    questions = [item[0] for item in data]
    labels = [0 if item[1] == 'NONE' else 1 for item in data]
    input_data = tokenizer.batch_encode_plus(questions,
                                             max_length=block_size,
                                             truncation=True,
                                             pad_to_max_length=True,
                                             return_tensors="pt").to(
                                                 args.device)
    input_data['labels'] = torch.tensor(labels).to(args.device)
    return input_data
def collate(data: List, tokenizer: BertTokenizer, block_size: int) -> Dict:
    questions = [item[0] for item in data]
    texts = [item[1] for item in data]
    labels = [item[2] for item in data]
    input_data = tokenizer.batch_encode_plus(list(zip(questions, texts)),
                                             max_length=block_size,
                                             truncation='only_second',
                                             pad_to_max_length=True,
                                             return_tensors="pt").to(
                                                 args.device)
    input_data['next_sentence_label'] = torch.tensor(labels).to(args.device)
    return input_data
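# A minimal sketch of how a collate function like the one above is typically wired
# into a DataLoader via functools.partial. The dataset, tokenizer checkpoint, and
# the args namespace below are illustrative assumptions, not part of the original code.
from functools import partial
from types import SimpleNamespace
from torch.utils.data import DataLoader
from transformers import BertTokenizer

args = SimpleNamespace(device="cpu")  # stands in for the global argparse namespace
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = [("Is water wet?", "Water covers most of the planet.", 1),
           ("Is water wet?", "The stock market closed higher today.", 0)]
loader = DataLoader(dataset,
                    batch_size=2,
                    collate_fn=partial(collate, tokenizer=tokenizer, block_size=128))
batch = next(iter(loader))  # dict with input_ids, attention_mask, next_sentence_label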
def encode_sentences(settings: Settings, tokenizer: BertTokenizer,
                     sentences: list) -> Tuple[list, list]:
    # Use the pretrained BERT transfer model
    # return as an array of token id's
    # converts tokens to id's and includes CLS and SEP
    # can be converted back with convert_ids_to_tokens
    encoding_dict = tokenizer.batch_encode_plus(
        sentences,
        pad_to_max_length=True,
        max_length=settings.get_max_tokens_length(),
        add_special_tokens=True,
    )
    return encoding_dict['input_ids'], encoding_dict['attention_mask']
class TransformersBertPreprocessor(Component):
    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = False,
                 max_seq_length: int = 512,
                 tokenize_chinese_chars: bool = True,
                 **kwargs):
        vocab_file = expand_path(vocab_file)
        self.tokenizer = BertTokenizer(
            vocab_file=vocab_file,
            do_lower_case=do_lower_case,
            tokenize_chinese_chars=tokenize_chinese_chars)
        self.max_seq_length = max_seq_length

    def __call__(self, tokens_batch: Union[List[str], List[List[str]]]) -> \
            Tuple[List[List[str]], List[List[str]],
                  np.ndarray, np.ndarray, np.ndarray]:
        if isinstance(tokens_batch[0], str):  # skip for already tokenized text
            tokens_batch = [
                self.tokenizer.basic_tokenizer.tokenize(
                    sentence, self.tokenizer.all_special_tokens)
                for sentence in tokens_batch
            ]
        startofword_markers_batch = []
        subtokens_batch = []
        for tokens in tokens_batch:
            startofword_markers = [0]
            subtokens = ['[CLS]']
            for token in tokens:
                for i, subtoken in enumerate(
                        self.tokenizer.wordpiece_tokenizer.tokenize(token)):
                    startofword_markers.append(int(i == 0))
                    subtokens.append(subtoken)
            startofword_markers.append(0)
            subtokens.append('[SEP]')
            if len(subtokens) > self.max_seq_length:
                raise RuntimeError(
                    f"input sequence after bert tokenization"
                    f" cannot exceed {self.max_seq_length} tokens.")
            startofword_markers_batch.append(startofword_markers)
            subtokens_batch.append(subtokens)

        encoded = self.tokenizer.batch_encode_plus(
            [[subtokens, None] for subtokens in subtokens_batch],
            add_special_tokens=False)

        return (tokens_batch, subtokens_batch,
                _pad(encoded['input_ids'], value=self.tokenizer.pad_token_id),
                _pad(startofword_markers_batch),
                _pad(encoded['attention_mask']))
def collate(data: List, tokenizer: BertTokenizer, block_size: int) -> Dict:
    questions = [item[3] for item in data]
    texts = [item[1] for item in data]
    label2id = {
        'YES': 1,
        'NO': 0,
        'NONE': 2
    }
    labels = [label2id[item[-1]] for item in data]
    input_data = tokenizer.batch_encode_plus(list(zip(questions, texts)),
                                             max_length=block_size,
                                             truncation='only_second',
                                             pad_to_max_length=True,
                                             return_tensors="pt").to(
                                                 args.device)
    input_data['labels'] = torch.tensor(labels).to(args.device)
    return input_data
def context_score(tokenizer: BertTokenizer,
                  model: BertForNextSentencePrediction,
                  question: str,
                  context: List[str],
                  max_len=64):
    batch = tokenizer.batch_encode_plus(list(
        zip([question] * len(context), context)),
                                        max_length=max_len,
                                        truncation=True,
                                        pad_to_max_length=True,
                                        return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model(**batch)[0]
    return np.array([a - b for a, b in output.cpu().numpy()])
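# A minimal sketch of scoring a few candidate contexts against a question with
# context_score and ranking them by the NSP margin. The checkpoint name and the
# candidate sentences are illustrative assumptions, not from the original code.
from transformers import BertTokenizer, BertForNextSentencePrediction

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased").to("cuda").eval()

candidates = ["Goethe wrote Faust in two parts.",
              "The weather was sunny yesterday."]
scores = context_score(tokenizer, model, "Who wrote Faust?", candidates)
ranked = [candidates[i] for i in scores.argsort()[::-1]]  # highest margin first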
def encode_sentences(tokenizer: BertTokenizer, sentences: list) -> (list, list):
    # Use the pretrained BERT transfer model
    # return as an array of token id's
    encoding_dict = tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=sentences,
        pad_to_max_length=True,
        max_length=512,
        add_special_tokens=True,
        return_tensors='pt')
    # converts tokens to id's and includes CLS and SEP
    # can be converted back with convert_ids_to_tokens
    print(encoding_dict.keys())
    # print(encoding_dict['tokens'])
    return encoding_dict['input_ids'], encoding_dict['attention_mask']
def collate(data: List, tokenizer: BertTokenizer, block_size: int) -> Dict:
    starts = [item['start'] for item in data]
    ends = [item['end'] for item in data]
    questions = [item['question'] for item in data]
    contexts = [item['context'] for item in data]
    input_data = tokenizer.batch_encode_plus(list(zip(questions, contexts)),
                                             max_length=block_size,
                                             truncation='only_second',
                                             pad_to_max_length=True,
                                             return_tensors="pt").to(
                                                 args.device)
    input_data['start_positions'] = torch.tensor(starts).to(args.device)
    input_data['end_positions'] = torch.tensor(ends).to(args.device)
    return input_data
def embed_sentence(modelFolderPath, vocabFilePath, seq, MAX_LEN):
    device = 'cpu'
    model = BertModel.from_pretrained(modelFolderPath)
    model = model.to(device)
    model = model.eval()
    tokenizer = BertTokenizer(vocabFilePath, do_lower_case=False)
    ids = tokenizer.batch_encode_plus(seq,
                                      add_special_tokens=True,
                                      padding=True,
                                      truncation=True,
                                      max_length=MAX_LEN)
    tokenized_sequences = torch.tensor(ids["input_ids"]).to(model.device)
    attention_mask = torch.tensor(ids["attention_mask"]).to(model.device)

    with torch.no_grad():
        embeddings = model(input_ids=tokenized_sequences,
                           attention_mask=attention_mask)[0]
    print(embeddings.shape)

    embeddings = embeddings.clone().detach()
    # sum over the batch dimension, then average over sequence positions
    protein_embd = embeddings.sum(dim=0).mean(dim=0)
    print(protein_embd.shape)
    '''
    pooling = pool_strategy({"token_embeddings": embeddings,
                             "cls_token_embeddings": embeddings[:, 0],
                             "attention_mask": attention_mask,
                             })
    pooling = pooling.cpu().numpy()
    print(pooling.shape)

    embeddings = embeddings.cpu().numpy()
    print(embeddings.shape)
    features = []
    for seq_num in range(len(embeddings)):
        seq_len = (attention_mask[seq_num] == 1).sum()
        seq_emd = embeddings[seq_num][1:seq_len-1]
        features.append(seq_emd)
    # print(len(features))
    '''
    return protein_embd
def predict(model: ProtTransClassification, dataloader: DataLoader,
            tokenizer: BertTokenizer, device) -> (np.array, np.array):
    logits = []
    with torch.no_grad():
        for data in tqdm(dataloader):
            inputs = tokenizer.batch_encode_plus(data,
                                                 add_special_tokens=True,
                                                 padding=True,
                                                 truncation=True,
                                                 max_length=102,
                                                 return_tensors="pt")
            output = model(inputs["input_ids"].to(device),
                           inputs["token_type_ids"].to(device),
                           inputs["attention_mask"].to(device))
            logits.append(output["logits"])
    logits = torch.cat(logits)
    _, preds = torch.max(torch.exp(logits), 1)
    # Detach and convert to numpy
    logits = logits.cpu().detach().numpy()
    preds = preds.cpu().detach().numpy()
    return logits, preds
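# A minimal sketch of calling predict. It assumes a trained ProtTransClassification
# instance `model` and a ProtBert vocabulary file are already available; the vocab
# path and sequence strings below are illustrative placeholders.
from torch.utils.data import DataLoader
from transformers import BertTokenizer

tokenizer = BertTokenizer("vocab.txt", do_lower_case=False)
sequences = ["M S R E E V E S L", "H L Q S T P Q N L"]  # space-separated residues
dataloader = DataLoader(sequences, batch_size=2, shuffle=False)
logits, preds = predict(model, dataloader, tokenizer, device="cpu")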
def generate_protbert_features(root_dir):
    t0 = time()
    modelUrl = 'https://www.dropbox.com/s/dm3m1o0tsv9terq/pytorch_model.bin?dl=1'
    configUrl = 'https://www.dropbox.com/s/d3yw7v4tvi5f4sk/bert_config.json?dl=1'
    vocabUrl = 'https://www.dropbox.com/s/jvrleji50ql5m5i/vocab.txt?dl=1'

    downloadFolderPath = root_dir + '/inputs/ProtBert_model/'
    modelFolderPath = downloadFolderPath
    modelFilePath = os.path.join(modelFolderPath, 'pytorch_model.bin')
    configFilePath = os.path.join(modelFolderPath, 'config.json')
    vocabFilePath = os.path.join(modelFolderPath, 'vocab.txt')

    if not os.path.exists(modelFolderPath):
        os.makedirs(modelFolderPath)

    def download_file(url, filename):
        response = requests.get(url, stream=True)
        with tqdm.wrapattr(open(filename, "wb"), "write",
                           miniters=1,
                           total=int(response.headers.get('content-length', 0)),
                           desc=filename) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)

    if not os.path.exists(modelFilePath):
        download_file(modelUrl, modelFilePath)
    if not os.path.exists(configFilePath):
        download_file(configUrl, configFilePath)
    if not os.path.exists(vocabFilePath):
        download_file(vocabUrl, vocabFilePath)

    tokenizer = BertTokenizer(vocabFilePath, do_lower_case=False)
    model = BertModel.from_pretrained(modelFolderPath)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model = model.eval()

    def make_aseq(seq):
        protAlphabet = 'ACDEFGHIKLMNPQRSTVWYX'
        return ' '.join([protAlphabet[x] for x in seq])

    # data = ['MSREEVESLIQEVLEVYPEKARKDRNKHLAVNDPAVTQSKKCIISNKKSQPGLMTIRGCAYAGSKGVVWGPIKDMIHISHGPVGCGQYSRAGRRNYYIGTTGVNAFVTMNFTSDFQEKDIVFGGDKKLAKLIDEVETLFPLNKGISVQSECPIGLIGDDIESVSKVKGAELSKTIVPVRCEGFRGVSQSLGHHIANDAVRDWVLGKRDEDTTFASTPYDVAIIGDYNIGGDAWSSRILLEEMGLRCVAQWSGDGSISEIELTPKVKLNLVHCYRSMNYISRHMEEKYGIPWMEYNFFGPTKTIESLRAIAAKFDESIQKKCEEVIAKYKPEWEAVVAKYRPRLEGKRVMLYIGGLRPRHVIGAYEDLGMEVVGTGYEFAHNDDYDRTMKEMGDSTLLYDDVTGYEFEEFVKRIKPDLIGSGIKEKFIFQKMGIPFREMHSWDYSGPYHGFDGFAIFARDMDMTLNNPCWKKLQAPWEASQQVDKIKASYPLFLDQDYKDM',
    #         'HLQSTPQNLVSNAPIAETAGIAEPPDDDLQARLNTLKKQ']

    sequences = []
    with open(root_dir + '/inputs/protein_list.txt', 'r') as f:
        protein_list = f.readlines()
    for protein in protein_list:
        seq = open(
            root_dir + '/inputs/fasta_files/{}.fasta'.format(protein.strip()),
            'r').readlines()
        sequences += [seq[1].strip()]

    sequences_Example = [' '.join(list(seq)) for seq in sequences]
    sequences_Example = [
        re.sub(r"[-UZOB]", "X", sequence) for sequence in sequences_Example
    ]

    all_protein_features = []
    for i, seq in enumerate(sequences_Example):
        ids = tokenizer.batch_encode_plus([seq],
                                          add_special_tokens=True,
                                          pad_to_max_length=True)
        input_ids = torch.tensor(ids['input_ids']).to(device)
        attention_mask = torch.tensor(ids['attention_mask']).to(device)

        with torch.no_grad():
            embedding = model(input_ids=input_ids,
                              attention_mask=attention_mask)[0]
        embedding = embedding.cpu().numpy()
        features = []
        for seq_num in range(len(embedding)):
            seq_len = (attention_mask[seq_num] == 1).sum()
            seq_emd = embedding[seq_num][1:seq_len - 1]
            features.append(seq_emd)
        # print(features.__len__())
        # print(features[0].shape)
        # print(all_protein_sequences['all_protein_complex_pdb_ids'][i])
        # print(features)
        all_protein_features += features

    pickle.dump({'ProtBert_features': all_protein_features},
                gzip.open(root_dir + '/inputs/ProtBert_features.pkl.gz', 'wb'))
    print('Total time spent for ProtBERT:', time() - t0)
count = 0
with open("./residuesequences.txt", "r") as f:
    for seq in f.readlines():
        desc = str(seq).rstrip('\n')
        sequences_Example.append(desc)
        count += 1
print("Total data points(Clean): ", str(count))

# Replace "UZOB" with "X"
sequences_Example = [
    re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example
]

# Tokenizing input sequences
ids = tokenizer.batch_encode_plus(sequences_Example,
                                  add_special_tokens=True,
                                  pad_to_max_length=True)
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)

# Generating Embeddings
prefix = join(pwd, "Embeddings")
if not os.path.exists(prefix):
    os.makedirs(prefix)
bs = 16
batch = 0
count = 0
i = 0
embedding = np.zeros((128, 1632, 1024), dtype=np.float32)
limit = len(sequences_Example) // bs * bs
def main():
    modelUrl = 'https://www.dropbox.com/s/dm3m1o0tsv9terq/pytorch_model.bin?dl=1'
    configUrl = 'https://www.dropbox.com/s/d3yw7v4tvi5f4sk/bert_config.json?dl=1'
    vocabUrl = 'https://www.dropbox.com/s/jvrleji50ql5m5i/vocab.txt?dl=1'

    modelFolderPath = '/home/a/aditi/pfs/packages/language_models/ProtBert/'
    modelFilePath = os.path.join(modelFolderPath, 'pytorch_model.bin')
    configFilePath = os.path.join(modelFolderPath, 'config.json')
    vocabFilePath = os.path.join(modelFolderPath, 'vocab.txt')

    parser = argparse.ArgumentParser(description="Main script to run models")
    parser.add_argument("--fastafilepath", type=str, help="directory containing FASTA files")
    parser.add_argument("--mappings", type=str, help="CSV mapping Uniprot IDs to localizations")
    parser.add_argument("--output", type=str, help="output HDF5 file for embeddings")
    args = parser.parse_args()

    if not os.path.exists(modelFilePath):
        download_file(modelUrl, modelFilePath)
    if not os.path.exists(configFilePath):
        download_file(configUrl, configFilePath)
    if not os.path.exists(vocabFilePath):
        download_file(vocabUrl, vocabFilePath)

    fasta_dirr = args.fastafilepath + '/'
    mappings = args.mappings
    outputfile = args.output

    df = pd.read_csv(mappings, sep=",", names=["Uniprot_ID", "Localization"])
    print(df)

    # sequence_list = []
    # target_list = []
    sample = []
    for filename in os.listdir(fasta_dirr):
        b_dict = {}
        file_path = fasta_dirr + filename
        afile = open(file_path, 'r')
        sequence = read_fasta_sequence(afile)
        if (filename[:-6]) in df['Uniprot_ID'].values:
            tar = (df.loc[df['Uniprot_ID'] == (
                filename[:-6])]['Localization'].values)[0]
            b_dict['seq'] = sequence
            b_dict['label'] = tar
            blist = b_dict.copy()
            sample.append(blist)
            # sequence_list.append(sequence)
            # target_list.append(str(tar))

    df1 = pd.DataFrame(sample)
    print(df1)
    seqlist = df1['seq'].tolist()
    seqlist = [re.sub(r"[UZOB]", "X", sequence) for sequence in seqlist]
    # print(seqlist)
    tarlist = df1['label'].tolist()
    tarlist = np.char.encode(tarlist)
    npuniq = np.unique(tarlist)
    print(npuniq)

    tokenizer = BertTokenizer(vocabFilePath, do_lower_case=False)
    ids = tokenizer.batch_encode_plus(seqlist,
                                      add_special_tokens=False,
                                      padding=True,
                                      truncation=True,
                                      max_length=2000)
    print(type(ids))

    device = torch.device('cpu')
    print(device)
    model = BertModel.from_pretrained(modelFolderPath)
    model = model.to(device)
    model = model.eval()

    input_ids = torch.tensor(ids['input_ids']).to(device)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)

    with torch.no_grad():
        embedding = model(input_ids=input_ids,
                          attention_mask=attention_mask)[0]

    pooling = pool_strategy({
        "token_embeddings": embedding,
        "cls_token_embeddings": embedding[:, 0],
        "attention_mask": attention_mask,
    })
    pooling = pooling.cpu().numpy()
    print(pooling.shape)
    # embedding = embedding.cpu().numpy()
    # print(embedding.shape)

    with h5py.File(outputfile, "w") as embeddings_file:
        embeddings_file.create_dataset("labels", data=tarlist)
        embeddings_file.create_dataset('features', data=pooling)

    """
    features = []
    for seq_num in range(len(embedding)):
        seq_len = (attention_mask[seq_num] == 1).sum()
        seq_emd = embedding[seq_num][1:seq_len-1]
        features.append(seq_emd)
        print(seq_emd.shape)
    print(len(features))
    """
    """
    encoder_features = 1024
    model = BertModel.from_pretrained(modelFolderPath)
    label_set = "CYT,ERE,EXC,GLG,LYS,MEM,MIT,NUC,PEX,PLS"
    # Label Encoder
    label_encoder = LabelEncoder(label_set.split(","), reserved_labels=[])
    label_encoder.unknown_index = None
    print(label_encoder)
    device = torch.device('cpu')
    model = model.eval()
    ids = tokenizer.batch_encode_plus(sequence_list,
                                      add_special_tokens=True,
                                      pad_to_max_length=True)
    input_ids = torch.tensor(ids['input_ids']).to(device)
    print(input_ids)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)
    # Embedding has shape (N, 3, 1024) where N = number of proteins
    with torch.no_grad():
        embedding = model(input_ids=input_ids, attention_mask=attention_mask)[0]
    pooling = pool_strategy({"token_embeddings": embedding,
                             "cls_token_embeddings": embedding[:, 0],
                             "attention_mask": attention_mask,
                             })
    print(pooling.shape)
    pooling = pooling.cpu().numpy()
    embedding = embedding.cpu().numpy()
    # print(embedding.shape)
    attention_mask = np.asarray(attention_mask)
    target_list = np.char.encode(np.array(target_list), encoding='utf8')
    print(target_list.shape)
    """
class ReFoodBERT(nn.Module):
    def __init__(self, device, dropout_rate=0.1):
        super(ReFoodBERT, self).__init__()
        # Load pretrained foodbert
        self.food_bert: BertModel = BertModel.from_pretrained(
            pretrained_model_name_or_path='foodbert/data/mlm_output/checkpoint-final')
        with open('foodbert/data/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(vocab_file='foodbert/data/bert-base-cased-vocab.txt',
                                       do_lower_case=False,
                                       max_len=128,
                                       never_split=used_ingredients)
        self.loss_fct = nn.BCEWithLogitsLoss()
        self.hidden_size = self.food_bert.config.hidden_size
        self.cls_fc_layer = FCLayer(self.hidden_size, self.hidden_size, dropout_rate)
        # use weight sharing between the two layers dealing with entities
        self.e_fc_layer = FCLayer(self.hidden_size, self.hidden_size, dropout_rate)
        self.label_classifier = FCLayer(self.hidden_size * 3, 2, dropout_rate, use_activation=False)

        self.ingr_sep_id_1 = self.tokenizer.convert_tokens_to_ids('$')
        self.ingr_sep_id_2 = self.tokenizer.convert_tokens_to_ids('£')
        self.device = device

    def compute_embedding_for_entities(self, sequence_outputs, input_ids):
        ingr_sep_1_idxs = torch.nonzero((input_ids == self.ingr_sep_id_1))
        assert len(ingr_sep_1_idxs) == input_ids.shape[0] * 2
        ingr_sep_1_idxs = ingr_sep_1_idxs[::2]  # get first occurrence of $
        ingr_sep_2_idxs = torch.nonzero((input_ids == self.ingr_sep_id_2))
        assert len(ingr_sep_2_idxs) == input_ids.shape[0] * 2
        ingr_sep_2_idxs = ingr_sep_2_idxs[::2]  # get first occurrence of £
        ingr_1_idxs = ingr_sep_1_idxs[:, 1] + 1  # get next index after first $
        ingr_2_idxs = ingr_sep_2_idxs[:, 1] + 1  # get next index after first £

        # https://medium.com/analytics-vidhya/understanding-indexing-with-pytorch-gather-33717a84ebc4
        # If we want to index a 3d tensor like 32x128x768 in dim=1, our index tensor should have shape 32xNx768.
        e1_h = torch.gather(sequence_outputs, 1,
                            ingr_1_idxs.unsqueeze(1).repeat(1, self.hidden_size).unsqueeze(1)).squeeze(1)
        e2_h = torch.gather(sequence_outputs, 1,
                            ingr_2_idxs.unsqueeze(1).repeat(1, self.hidden_size).unsqueeze(1)).squeeze(1)

        return e1_h, e2_h

    def compute_avg_embedding_for_entities(self, sequence_outputs, input_ids):
        ingr_sep_1_idxs = torch.nonzero((input_ids == self.ingr_sep_id_1))
        assert len(ingr_sep_1_idxs) == input_ids.shape[0] * 2
        beginning_ingr_sep_1_idxs = ingr_sep_1_idxs[::2]  # get first occurrence of $
        end_ingr_sep_1_idxs = ingr_sep_1_idxs[1::2]  # get second occurrence of $
        ingr_sep_2_idxs = torch.nonzero((input_ids == self.ingr_sep_id_2))
        assert len(ingr_sep_2_idxs) == input_ids.shape[0] * 2
        beginning_ingr_sep_2_idxs = ingr_sep_2_idxs[::2]  # get first occurrence of £
        end_ingr_sep_2_idxs = ingr_sep_2_idxs[1::2]  # get second occurrence of £

        e1_h = []
        e2_h = []
        for idx, sequence_output in enumerate(sequence_outputs):
            e1_h.append((sequence_output[beginning_ingr_sep_1_idxs[idx, 1] + 1:end_ingr_sep_1_idxs[idx, 1]]).mean(dim=0))
            e2_h.append((sequence_output[beginning_ingr_sep_2_idxs[idx, 1] + 1:end_ingr_sep_2_idxs[idx, 1]]).mean(dim=0))
        e1_h = torch.stack(e1_h)
        e2_h = torch.stack(e2_h)

        return e1_h, e2_h

    def forward(self, sentences, labels=None):
        encoded_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoded_dict['input_ids'].to(self.device)
        attention_mask = encoded_dict['attention_mask'].to(self.device)
        token_type_ids = encoded_dict['token_type_ids'].to(self.device)

        outputs = self.food_bert(input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids)  # sequence_output, pooled_output, (hidden_states), (attentions)
        sequence_output = outputs[0]
        pooled_output = outputs[1]  # [CLS]/pooled output

        e1_h, e2_h = self.compute_embedding_for_entities(sequence_output, input_ids)

        # Dropout -> tanh -> fc_layer
        pooled_output = self.cls_fc_layer(pooled_output)
        e1_h = self.e_fc_layer(e1_h)
        e2_h = self.e_fc_layer(e2_h)

        # Concat -> fc_layer
        concat_h = torch.cat([pooled_output, e1_h, e2_h], dim=-1)
        logits = self.label_classifier(concat_h)
        outputs = (None, logits)

        if labels is not None:
            logits, labels = logits.flatten(), labels.flatten()
            known_mask = (labels != -1)
            loss = self.loss_fct(logits[known_mask], labels[known_mask])
            outputs = (loss, logits)

        return outputs  # (loss/None, logits)
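# A minimal usage sketch for ReFoodBERT, assuming the foodbert checkpoint and vocab
# files referenced in __init__ exist locally; the example sentence and the $ ... $
# and £ ... £ entity markers below are illustrative.
import torch

model = ReFoodBERT(device='cpu').eval()
sentences = ["Whisk the $ butter $ together with the £ sugar £ until creamy."]
with torch.no_grad():
    loss, logits = model(sentences)   # loss is None when no labels are given
    probs = torch.sigmoid(logits)     # per-class relation scores for the entity pair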
model = BertModel.from_pretrained(modelFolderPath)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model = model.eval()

sequences_Example = final1
sequences_Example = [
    re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example
]

ids = tokenizer.batch_encode_plus(sequences_Example,
                                  add_special_tokens=True,
                                  padding=True)
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)

with torch.no_grad():
    embedding = model(input_ids=input_ids, attention_mask=attention_mask)[0]
embedding = embedding.cpu().numpy()

features = []
for seq_num in range(len(embedding)):
    seq_len = (attention_mask[seq_num] == 1).sum()
    seq_emd = embedding[seq_num][1:seq_len - 1]
    features.append(seq_emd)