def find_paragraph(tokenizer: BertTokenizer,
                   model: BertForNextSentencePrediction,
                   question: str,
                   context: str,
                   max_len=256,
                   batch_size=16):
    q_len = len(tokenizer.tokenize(question))
    context_tokens = tokenizer.tokenize(context)
    part_len = max_len - q_len - 3
    parts = []
    n = 0
    while n < len(context_tokens):
        parts += [context_tokens[n:n + part_len]]
        n += part_len // 2
    results = []
    all_parts = parts[:]
    while len(parts) > 0:
        batch = tokenizer.batch_encode_plus(list(
            zip([question] * batch_size, parts[:batch_size])),
                                            max_length=max_len,
                                            truncation=True,
                                            pad_to_max_length=True,
                                            return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model(**batch)[0]
        results += [a - b for a, b in output.cpu().tolist()]
        parts = parts[batch_size:]
    return np.array(results), [
        tokenizer.decode(tokenizer.encode(part), skip_special_tokens=True)
        for part in all_parts
    ]
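# A minimal usage sketch for find_paragraph: it ranks overlapping context windows by
# the NSP "is next sentence" logit margin. The checkpoint name and the example
# question/context below are illustrative assumptions, not part of the original code.
from transformers import BertTokenizer, BertForNextSentencePrediction

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased").to("cuda").eval()

document = ("Johann Wolfgang von Goethe wrote the tragic play Faust. "
            "It was published in two parts, the second appearing after his death.")
scores, paragraphs = find_paragraph(tokenizer, model,
                                    question="Who wrote Faust?",
                                    context=document)
best_paragraph = paragraphs[scores.argmax()]  # window with the highest NSP margin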
def collate(data: List[Tuple[str, int]], tokenizer: BertTokenizer,
            block_size: int) -> Dict:
    texts, labels = list(map(list, zip(*data)))
    input_data = tokenizer.batch_encode_plus(texts,
                                             max_length=block_size,
                                             truncation=True,
                                             pad_to_max_length=True,
                                             return_tensors="pt").to(
                                                 args.device)
    input_data['labels'] = torch.tensor(labels).to(args.device)
    return input_data
def create_data_loader(sentences, tokenizer_vocab, labels=[], train_mode=True):
    """
    Create a DataLoader for BERT.

    :param iterable sentences: text instances
    :param string tokenizer_vocab: path to the tokenizer vocabulary file
    :param iterable labels: sentiment or emotion class labels
    """
    logger.info("Loading tokenizer and encoding data..")
    tokenizer = BertTokenizer(tokenizer_vocab, do_lower_case=True)
    encoded_sents = tokenizer.batch_encode_plus(sentences,
                                                add_special_tokens=True,
                                                return_attention_mask=True,
                                                padding=True,
                                                max_length=256,
                                                truncation=True,
                                                return_tensors="pt")
    sent_ids = encoded_sents["input_ids"]
    attention_masks = encoded_sents["attention_mask"]
    if len(labels) > 0:
        labels = torch.tensor(labels)
        data = TensorDataset(sent_ids, attention_masks, labels)
    else:
        data = TensorDataset(sent_ids, attention_masks)
    logger.info("Creating data loaders...")
    batch_size = int(CFG["MODELS"]["batch_size"])
    if train_mode:
        dataloader = DataLoader(data,
                                sampler=RandomSampler(data),
                                batch_size=batch_size)
    else:
        dataloader = DataLoader(data,
                                sampler=SequentialSampler(data),
                                batch_size=batch_size)
    return dataloader
def collate(data: List, tokenizer: BertTokenizer, block_size: int) -> Dict:
    questions = [item[0] for item in data]
    labels = [0 if item[1] == 'NONE' else 1 for item in data]
    input_data = tokenizer.batch_encode_plus(questions,
                                             max_length=block_size,
                                             truncation=True,
                                             pad_to_max_length=True,
                                             return_tensors="pt").to(
                                                 args.device)
    input_data['labels'] = torch.tensor(labels).to(args.device)
    return input_data
def collate(data: List, tokenizer: BertTokenizer, block_size: int) -> Dict:
    questions = [item[0] for item in data]
    texts = [item[1] for item in data]
    labels = [item[2] for item in data]
    input_data = tokenizer.batch_encode_plus(list(zip(questions, texts)),
                                             max_length=block_size,
                                             truncation='only_second',
                                             pad_to_max_length=True,
                                             return_tensors="pt").to(
                                                 args.device)
    input_data['next_sentence_label'] = torch.tensor(labels).to(args.device)
    return input_data
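# A minimal sketch of how a collate function like the one above is typically wired
# into a DataLoader via functools.partial. The dataset, tokenizer checkpoint, and
# the args namespace below are illustrative assumptions, not part of the original code.
from functools import partial
from types import SimpleNamespace
from torch.utils.data import DataLoader
from transformers import BertTokenizer

args = SimpleNamespace(device="cpu")  # stands in for the global argparse namespace
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = [("Is water wet?", "Water covers most of the planet.", 1),
           ("Is water wet?", "The stock market closed higher today.", 0)]
loader = DataLoader(dataset,
                    batch_size=2,
                    collate_fn=partial(collate, tokenizer=tokenizer, block_size=128))
batch = next(iter(loader))  # dict with input_ids, attention_mask, next_sentence_label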
def encode_sentences(settings: Settings, tokenizer: BertTokenizer,
                     sentences: list) -> Tuple[list, list]:
    # Use the pretrained BERT transfer model
    # return as an array of token id's
    # converts tokens to id's and includes CLS and SEP
    # can be converted back with convert_ids_to_tokens
    encoding_dict = tokenizer.batch_encode_plus(
        sentences,
        pad_to_max_length=True,
        max_length=settings.get_max_tokens_length(),
        add_special_tokens=True,
    )
    return encoding_dict['input_ids'], encoding_dict['attention_mask']
class TransformersBertPreprocessor(Component):
    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = False,
                 max_seq_length: int = 512,
                 tokenize_chinese_chars: bool = True,
                 **kwargs):
        vocab_file = expand_path(vocab_file)
        self.tokenizer = BertTokenizer(
            vocab_file=vocab_file,
            do_lower_case=do_lower_case,
            tokenize_chinese_chars=tokenize_chinese_chars)
        self.max_seq_length = max_seq_length

    def __call__(self, tokens_batch: Union[List[str], List[List[str]]]) -> \
            Tuple[List[List[str]], List[List[str]],
                  np.ndarray, np.ndarray, np.ndarray]:
        if isinstance(tokens_batch[0], str):  # skip for already tokenized text
            tokens_batch = [
                self.tokenizer.basic_tokenizer.tokenize(
                    sentence, self.tokenizer.all_special_tokens)
                for sentence in tokens_batch
            ]
        startofword_markers_batch = []
        subtokens_batch = []
        for tokens in tokens_batch:
            startofword_markers = [0]
            subtokens = ['[CLS]']
            for token in tokens:
                for i, subtoken in enumerate(
                        self.tokenizer.wordpiece_tokenizer.tokenize(token)):
                    startofword_markers.append(int(i == 0))
                    subtokens.append(subtoken)
            startofword_markers.append(0)
            subtokens.append('[SEP]')
            if len(subtokens) > self.max_seq_length:
                raise RuntimeError(
                    f"input sequence after bert tokenization"
                    f" cannot exceed {self.max_seq_length} tokens.")
            startofword_markers_batch.append(startofword_markers)
            subtokens_batch.append(subtokens)

        encoded = self.tokenizer.batch_encode_plus(
            [[subtokens, None] for subtokens in subtokens_batch],
            add_special_tokens=False)

        return (tokens_batch, subtokens_batch,
                _pad(encoded['input_ids'], value=self.tokenizer.pad_token_id),
                _pad(startofword_markers_batch),
                _pad(encoded['attention_mask']))
def collate(data: List, tokenizer: BertTokenizer, block_size: int) -> Dict:
    questions = [item[3] for item in data]
    texts = [item[1] for item in data]
    label2id = {
        'YES': 1,
        'NO': 0,
        'NONE': 2
    }
    labels = [label2id[item[-1]] for item in data]
    input_data = tokenizer.batch_encode_plus(list(zip(questions, texts)),
                                             max_length=block_size,
                                             truncation='only_second',
                                             pad_to_max_length=True,
                                             return_tensors="pt").to(
                                                 args.device)
    input_data['labels'] = torch.tensor(labels).to(args.device)
    return input_data
def context_score(tokenizer: BertTokenizer,
                  model: BertForNextSentencePrediction,
                  question: str,
                  context: List[str],
                  max_len=64):
    batch = tokenizer.batch_encode_plus(list(
        zip([question] * len(context), context)),
                                        max_length=max_len,
                                        truncation=True,
                                        pad_to_max_length=True,
                                        return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model(**batch)[0]
    return np.array([a - b for a, b in output.cpu().numpy()])
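# A minimal sketch of scoring a few candidate contexts against a question with
# context_score and ranking them by the NSP margin. The checkpoint name and the
# candidate sentences are illustrative assumptions, not from the original code.
from transformers import BertTokenizer, BertForNextSentencePrediction

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased").to("cuda").eval()

candidates = ["Goethe wrote Faust in two parts.",
              "The weather was sunny yesterday."]
scores = context_score(tokenizer, model, "Who wrote Faust?", candidates)
ranked = [candidates[i] for i in scores.argsort()[::-1]]  # highest margin first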
def encode_sentences(tokenizer: BertTokenizer, sentences: list) -> (list, list):
    # Use the pretrained BERT transfer model
    # return as an array of token id's
    encoding_dict = tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=sentences,
        pad_to_max_length=True,
        max_length=512,
        add_special_tokens=True,
        return_tensors='pt')
    # converts tokens to id's and includes CLS and SEP
    # can be converted back with convert_ids_to_tokens
    print(encoding_dict.keys())
    # print(encoding_dict['tokens'])
    return encoding_dict['input_ids'], encoding_dict['attention_mask']
def collate(data: List, tokenizer: BertTokenizer, block_size: int) -> Dict:
    starts = [item['start'] for item in data]
    ends = [item['end'] for item in data]
    questions = [item['question'] for item in data]
    contexts = [item['context'] for item in data]
    input_data = tokenizer.batch_encode_plus(list(zip(questions, contexts)),
                                             max_length=block_size,
                                             truncation='only_second',
                                             pad_to_max_length=True,
                                             return_tensors="pt").to(
                                                 args.device)
    input_data['start_positions'] = torch.tensor(starts).to(args.device)
    input_data['end_positions'] = torch.tensor(ends).to(args.device)
    return input_data
def embed_sentence(modelFolderPath, vocabFilePath, seq, MAX_LEN):
    device = 'cpu'
    model = BertModel.from_pretrained(modelFolderPath)
    model = model.to(device)
    model = model.eval()
    tokenizer = BertTokenizer(vocabFilePath, do_lower_case=False)
    ids = tokenizer.batch_encode_plus(seq,
                                      add_special_tokens=True,
                                      padding=True,
                                      truncation=True,
                                      max_length=MAX_LEN)
    tokenized_sequences = torch.tensor(ids["input_ids"]).to(model.device)
    attention_mask = torch.tensor(ids["attention_mask"]).to(model.device)

    with torch.no_grad():
        embeddings = model(input_ids=tokenized_sequences,
                           attention_mask=attention_mask)[0]
    print(embeddings.shape)

    embeddings = embeddings.clone().detach()
    # sum over the batch dimension, then average over sequence positions
    protein_embd = embeddings.sum(dim=0).mean(dim=0)
    print(protein_embd.shape)
    '''
    pooling = pool_strategy({"token_embeddings": embeddings,
                             "cls_token_embeddings": embeddings[:, 0],
                             "attention_mask": attention_mask,
                             })
    pooling = pooling.cpu().numpy()
    print(pooling.shape)

    embeddings = embeddings.cpu().numpy()
    print(embeddings.shape)
    features = []
    for seq_num in range(len(embeddings)):
        seq_len = (attention_mask[seq_num] == 1).sum()
        seq_emd = embeddings[seq_num][1:seq_len-1]
        features.append(seq_emd)
    # print(len(features))
    '''
    return protein_embd
def predict(model: ProtTransClassification, dataloader: DataLoader,
            tokenizer: BertTokenizer, device) -> (np.array, np.array):
    logits = []
    with torch.no_grad():
        for data in tqdm(dataloader):
            inputs = tokenizer.batch_encode_plus(data,
                                                 add_special_tokens=True,
                                                 padding=True,
                                                 truncation=True,
                                                 max_length=102,
                                                 return_tensors="pt")
            output = model(inputs["input_ids"].to(device),
                           inputs["token_type_ids"].to(device),
                           inputs["attention_mask"].to(device))
            logits.append(output["logits"])
    logits = torch.cat(logits)
    _, preds = torch.max(torch.exp(logits), 1)
    # Detach and convert to numpy
    logits = logits.cpu().detach().numpy()
    preds = preds.cpu().detach().numpy()
    return logits, preds
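# A minimal sketch of calling predict. It assumes a trained ProtTransClassification
# instance `model` and a ProtBert vocabulary file are already available; the vocab
# path and sequence strings below are illustrative placeholders.
from torch.utils.data import DataLoader
from transformers import BertTokenizer

tokenizer = BertTokenizer("vocab.txt", do_lower_case=False)
sequences = ["M S R E E V E S L", "H L Q S T P Q N L"]  # space-separated residues
dataloader = DataLoader(sequences, batch_size=2, shuffle=False)
logits, preds = predict(model, dataloader, tokenizer, device="cpu")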
def generate_protbert_features(root_dir):
    t0 = time()
    modelUrl = 'https://www.dropbox.com/s/dm3m1o0tsv9terq/pytorch_model.bin?dl=1'
    configUrl = 'https://www.dropbox.com/s/d3yw7v4tvi5f4sk/bert_config.json?dl=1'
    vocabUrl = 'https://www.dropbox.com/s/jvrleji50ql5m5i/vocab.txt?dl=1'

    downloadFolderPath = root_dir + '/inputs/ProtBert_model/'
    modelFolderPath = downloadFolderPath
    modelFilePath = os.path.join(modelFolderPath, 'pytorch_model.bin')
    configFilePath = os.path.join(modelFolderPath, 'config.json')
    vocabFilePath = os.path.join(modelFolderPath, 'vocab.txt')

    if not os.path.exists(modelFolderPath):
        os.makedirs(modelFolderPath)

    def download_file(url, filename):
        response = requests.get(url, stream=True)
        with tqdm.wrapattr(open(filename, "wb"), "write",
                           miniters=1,
                           total=int(response.headers.get('content-length', 0)),
                           desc=filename) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)

    if not os.path.exists(modelFilePath):
        download_file(modelUrl, modelFilePath)
    if not os.path.exists(configFilePath):
        download_file(configUrl, configFilePath)
    if not os.path.exists(vocabFilePath):
        download_file(vocabUrl, vocabFilePath)

    tokenizer = BertTokenizer(vocabFilePath, do_lower_case=False)
    model = BertModel.from_pretrained(modelFolderPath)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model = model.eval()

    def make_aseq(seq):
        protAlphabet = 'ACDEFGHIKLMNPQRSTVWYX'
        return ' '.join([protAlphabet[x] for x in seq])

    # data = ['MSREEVESLIQEVLEVYPEKARKDRNKHLAVNDPAVTQSKKCIISNKKSQPGLMTIRGCAYAGSKGVVWGPIKDMIHISHGPVGCGQYSRAGRRNYYIGTTGVNAFVTMNFTSDFQEKDIVFGGDKKLAKLIDEVETLFPLNKGISVQSECPIGLIGDDIESVSKVKGAELSKTIVPVRCEGFRGVSQSLGHHIANDAVRDWVLGKRDEDTTFASTPYDVAIIGDYNIGGDAWSSRILLEEMGLRCVAQWSGDGSISEIELTPKVKLNLVHCYRSMNYISRHMEEKYGIPWMEYNFFGPTKTIESLRAIAAKFDESIQKKCEEVIAKYKPEWEAVVAKYRPRLEGKRVMLYIGGLRPRHVIGAYEDLGMEVVGTGYEFAHNDDYDRTMKEMGDSTLLYDDVTGYEFEEFVKRIKPDLIGSGIKEKFIFQKMGIPFREMHSWDYSGPYHGFDGFAIFARDMDMTLNNPCWKKLQAPWEASQQVDKIKASYPLFLDQDYKDM',
    #         'HLQSTPQNLVSNAPIAETAGIAEPPDDDLQARLNTLKKQ']

    sequences = []
    with open(root_dir + '/inputs/protein_list.txt', 'r') as f:
        protein_list = f.readlines()
    for protein in protein_list:
        seq = open(
            root_dir + '/inputs/fasta_files/{}.fasta'.format(protein.strip()),
            'r').readlines()
        sequences += [seq[1].strip()]

    sequences_Example = [' '.join(list(seq)) for seq in sequences]
    sequences_Example = [
        re.sub(r"[-UZOB]", "X", sequence) for sequence in sequences_Example
    ]

    all_protein_features = []
    for i, seq in enumerate(sequences_Example):
        ids = tokenizer.batch_encode_plus([seq],
                                          add_special_tokens=True,
                                          pad_to_max_length=True)
        input_ids = torch.tensor(ids['input_ids']).to(device)
        attention_mask = torch.tensor(ids['attention_mask']).to(device)

        with torch.no_grad():
            embedding = model(input_ids=input_ids,
                              attention_mask=attention_mask)[0]
        embedding = embedding.cpu().numpy()
        features = []
        for seq_num in range(len(embedding)):
            seq_len = (attention_mask[seq_num] == 1).sum()
            seq_emd = embedding[seq_num][1:seq_len - 1]
            features.append(seq_emd)
        # print(features.__len__())
        # print(features[0].shape)
        # print(all_protein_sequences['all_protein_complex_pdb_ids'][i])
        # print(features)
        all_protein_features += features

    pickle.dump({'ProtBert_features': all_protein_features},
                gzip.open(root_dir + '/inputs/ProtBert_features.pkl.gz', 'wb'))
    print('Total time spent for ProtBERT:', time() - t0)
count = 0
with open("./residuesequences.txt", "r") as f:
    for seq in f.readlines():
        desc = str(seq).rstrip('\n')
        sequences_Example.append(desc)
        count += 1
print("Total data points(Clean): ", str(count))

# Replace "UZOB" with "X"
sequences_Example = [
    re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example
]

# Tokenizing input sequences
ids = tokenizer.batch_encode_plus(sequences_Example,
                                  add_special_tokens=True,
                                  pad_to_max_length=True)
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)

# Generating Embeddings
prefix = join(pwd, "Embeddings")
if not os.path.exists(prefix):
    os.makedirs(prefix)
bs = 16
batch = 0
count = 0
i = 0
embedding = np.zeros((128, 1632, 1024), dtype=np.float32)
limit = len(sequences_Example) // bs * bs
def main():
    modelUrl = 'https://www.dropbox.com/s/dm3m1o0tsv9terq/pytorch_model.bin?dl=1'
    configUrl = 'https://www.dropbox.com/s/d3yw7v4tvi5f4sk/bert_config.json?dl=1'
    vocabUrl = 'https://www.dropbox.com/s/jvrleji50ql5m5i/vocab.txt?dl=1'

    modelFolderPath = '/home/a/aditi/pfs/packages/language_models/ProtBert/'
    modelFilePath = os.path.join(modelFolderPath, 'pytorch_model.bin')
    configFilePath = os.path.join(modelFolderPath, 'config.json')
    vocabFilePath = os.path.join(modelFolderPath, 'vocab.txt')

    parser = argparse.ArgumentParser(description="Main script to run models")
    parser.add_argument("--fastafilepath", type=str, help="directory containing FASTA files")
    parser.add_argument("--mappings", type=str, help="CSV mapping Uniprot IDs to localizations")
    parser.add_argument("--output", type=str, help="output HDF5 file for embeddings")
    args = parser.parse_args()

    if not os.path.exists(modelFilePath):
        download_file(modelUrl, modelFilePath)
    if not os.path.exists(configFilePath):
        download_file(configUrl, configFilePath)
    if not os.path.exists(vocabFilePath):
        download_file(vocabUrl, vocabFilePath)

    fasta_dirr = args.fastafilepath + '/'
    mappings = args.mappings
    outputfile = args.output

    df = pd.read_csv(mappings, sep=",", names=["Uniprot_ID", "Localization"])
    print(df)

    # sequence_list = []
    # target_list = []
    sample = []
    for filename in os.listdir(fasta_dirr):
        b_dict = {}
        file_path = fasta_dirr + filename
        afile = open(file_path, 'r')
        sequence = read_fasta_sequence(afile)
        if (filename[:-6]) in df['Uniprot_ID'].values:
            tar = (df.loc[df['Uniprot_ID'] == (
                filename[:-6])]['Localization'].values)[0]
            b_dict['seq'] = sequence
            b_dict['label'] = tar
            blist = b_dict.copy()
            sample.append(blist)
            # sequence_list.append(sequence)
            # target_list.append(str(tar))

    df1 = pd.DataFrame(sample)
    print(df1)
    seqlist = df1['seq'].tolist()
    seqlist = [re.sub(r"[UZOB]", "X", sequence) for sequence in seqlist]
    # print(seqlist)
    tarlist = df1['label'].tolist()
    tarlist = np.char.encode(tarlist)
    npuniq = np.unique(tarlist)
    print(npuniq)

    tokenizer = BertTokenizer(vocabFilePath, do_lower_case=False)
    ids = tokenizer.batch_encode_plus(seqlist,
                                      add_special_tokens=False,
                                      padding=True,
                                      truncation=True,
                                      max_length=2000)
    print(type(ids))

    device = torch.device('cpu')
    print(device)
    model = BertModel.from_pretrained(modelFolderPath)
    model = model.to(device)
    model = model.eval()

    input_ids = torch.tensor(ids['input_ids']).to(device)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)

    with torch.no_grad():
        embedding = model(input_ids=input_ids,
                          attention_mask=attention_mask)[0]

    pooling = pool_strategy({
        "token_embeddings": embedding,
        "cls_token_embeddings": embedding[:, 0],
        "attention_mask": attention_mask,
    })
    pooling = pooling.cpu().numpy()
    print(pooling.shape)
    # embedding = embedding.cpu().numpy()
    # print(embedding.shape)

    with h5py.File(outputfile, "w") as embeddings_file:
        embeddings_file.create_dataset("labels", data=tarlist)
        embeddings_file.create_dataset('features', data=pooling)

    """
    features = []
    for seq_num in range(len(embedding)):
        seq_len = (attention_mask[seq_num] == 1).sum()
        seq_emd = embedding[seq_num][1:seq_len-1]
        features.append(seq_emd)
        print(seq_emd.shape)
    print(len(features))
    """
    """
    encoder_features = 1024
    model = BertModel.from_pretrained(modelFolderPath)
    label_set = "CYT,ERE,EXC,GLG,LYS,MEM,MIT,NUC,PEX,PLS"
    # Label Encoder
    label_encoder = LabelEncoder(label_set.split(","), reserved_labels=[])
    label_encoder.unknown_index = None
    print(label_encoder)
    device = torch.device('cpu')
    model = model.eval()
    ids = tokenizer.batch_encode_plus(sequence_list,
                                      add_special_tokens=True,
                                      pad_to_max_length=True)
    input_ids = torch.tensor(ids['input_ids']).to(device)
    print(input_ids)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)
    # Embedding has shape (N, 3, 1024) where N = number of proteins
    with torch.no_grad():
        embedding = model(input_ids=input_ids, attention_mask=attention_mask)[0]
    pooling = pool_strategy({"token_embeddings": embedding,
                             "cls_token_embeddings": embedding[:, 0],
                             "attention_mask": attention_mask,
                             })
    print(pooling.shape)
    pooling = pooling.cpu().numpy()
    embedding = embedding.cpu().numpy()
    # print(embedding.shape)
    attention_mask = np.asarray(attention_mask)
    target_list = np.char.encode(np.array(target_list), encoding='utf8')
    print(target_list.shape)
    """
class ReFoodBERT(nn.Module):
    def __init__(self, device, dropout_rate=0.1):
        super(ReFoodBERT, self).__init__()
        # Load pretrained foodbert
        self.food_bert: BertModel = BertModel.from_pretrained(
            pretrained_model_name_or_path='foodbert/data/mlm_output/checkpoint-final')
        with open('foodbert/data/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(vocab_file='foodbert/data/bert-base-cased-vocab.txt',
                                       do_lower_case=False,
                                       max_len=128,
                                       never_split=used_ingredients)
        self.loss_fct = nn.BCEWithLogitsLoss()
        self.hidden_size = self.food_bert.config.hidden_size
        self.cls_fc_layer = FCLayer(self.hidden_size, self.hidden_size, dropout_rate)
        # use weight sharing between the two layers dealing with entities
        self.e_fc_layer = FCLayer(self.hidden_size, self.hidden_size, dropout_rate)
        self.label_classifier = FCLayer(self.hidden_size * 3, 2, dropout_rate, use_activation=False)

        self.ingr_sep_id_1 = self.tokenizer.convert_tokens_to_ids('$')
        self.ingr_sep_id_2 = self.tokenizer.convert_tokens_to_ids('£')
        self.device = device

    def compute_embedding_for_entities(self, sequence_outputs, input_ids):
        ingr_sep_1_idxs = torch.nonzero((input_ids == self.ingr_sep_id_1))
        assert len(ingr_sep_1_idxs) == input_ids.shape[0] * 2
        ingr_sep_1_idxs = ingr_sep_1_idxs[::2]  # get first occurrence of $
        ingr_sep_2_idxs = torch.nonzero((input_ids == self.ingr_sep_id_2))
        assert len(ingr_sep_2_idxs) == input_ids.shape[0] * 2
        ingr_sep_2_idxs = ingr_sep_2_idxs[::2]  # get first occurrence of £
        ingr_1_idxs = ingr_sep_1_idxs[:, 1] + 1  # get next index after first $
        ingr_2_idxs = ingr_sep_2_idxs[:, 1] + 1  # get next index after first £

        # https://medium.com/analytics-vidhya/understanding-indexing-with-pytorch-gather-33717a84ebc4
        # If we want to index a 3d tensor like 32x128x768 in dim=1, our index tensor should have shape 32xNx768.
        e1_h = torch.gather(sequence_outputs, 1,
                            ingr_1_idxs.unsqueeze(1).repeat(1, self.hidden_size).unsqueeze(1)).squeeze(1)
        e2_h = torch.gather(sequence_outputs, 1,
                            ingr_2_idxs.unsqueeze(1).repeat(1, self.hidden_size).unsqueeze(1)).squeeze(1)

        return e1_h, e2_h

    def compute_avg_embedding_for_entities(self, sequence_outputs, input_ids):
        ingr_sep_1_idxs = torch.nonzero((input_ids == self.ingr_sep_id_1))
        assert len(ingr_sep_1_idxs) == input_ids.shape[0] * 2
        beginning_ingr_sep_1_idxs = ingr_sep_1_idxs[::2]  # get first occurrence of $
        end_ingr_sep_1_idxs = ingr_sep_1_idxs[1::2]  # get second occurrence of $
        ingr_sep_2_idxs = torch.nonzero((input_ids == self.ingr_sep_id_2))
        assert len(ingr_sep_2_idxs) == input_ids.shape[0] * 2
        beginning_ingr_sep_2_idxs = ingr_sep_2_idxs[::2]  # get first occurrence of £
        end_ingr_sep_2_idxs = ingr_sep_2_idxs[1::2]  # get second occurrence of £

        e1_h = []
        e2_h = []
        for idx, sequence_output in enumerate(sequence_outputs):
            e1_h.append((sequence_output[beginning_ingr_sep_1_idxs[idx, 1] + 1:end_ingr_sep_1_idxs[idx, 1]]).mean(dim=0))
            e2_h.append((sequence_output[beginning_ingr_sep_2_idxs[idx, 1] + 1:end_ingr_sep_2_idxs[idx, 1]]).mean(dim=0))
        e1_h = torch.stack(e1_h)
        e2_h = torch.stack(e2_h)

        return e1_h, e2_h

    def forward(self, sentences, labels=None):
        encoded_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoded_dict['input_ids'].to(self.device)
        attention_mask = encoded_dict['attention_mask'].to(self.device)
        token_type_ids = encoded_dict['token_type_ids'].to(self.device)

        outputs = self.food_bert(input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids)  # sequence_output, pooled_output, (hidden_states), (attentions)
        sequence_output = outputs[0]
        pooled_output = outputs[1]  # [CLS]/pooled output

        e1_h, e2_h = self.compute_embedding_for_entities(sequence_output, input_ids)

        # Dropout -> tanh -> fc_layer
        pooled_output = self.cls_fc_layer(pooled_output)
        e1_h = self.e_fc_layer(e1_h)
        e2_h = self.e_fc_layer(e2_h)

        # Concat -> fc_layer
        concat_h = torch.cat([pooled_output, e1_h, e2_h], dim=-1)
        logits = self.label_classifier(concat_h)
        outputs = (None, logits)

        if labels is not None:
            logits, labels = logits.flatten(), labels.flatten()
            known_mask = (labels != -1)
            loss = self.loss_fct(logits[known_mask], labels[known_mask])
            outputs = (loss, logits)

        return outputs  # (loss/None, logits)
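# A minimal usage sketch for ReFoodBERT, assuming the foodbert checkpoint and vocab
# files referenced in __init__ exist locally; the example sentence and the $ ... $
# and £ ... £ entity markers below are illustrative.
import torch

model = ReFoodBERT(device='cpu').eval()
sentences = ["Whisk the $ butter $ together with the £ sugar £ until creamy."]
with torch.no_grad():
    loss, logits = model(sentences)   # loss is None when no labels are given
    probs = torch.sigmoid(logits)     # per-class relation scores for the entity pair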
model = BertModel.from_pretrained(modelFolderPath)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model = model.eval()

sequences_Example = final1
sequences_Example = [
    re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example
]

ids = tokenizer.batch_encode_plus(sequences_Example,
                                  add_special_tokens=True,
                                  padding=True)
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)

with torch.no_grad():
    embedding = model(input_ids=input_ids, attention_mask=attention_mask)[0]
embedding = embedding.cpu().numpy()

features = []
for seq_num in range(len(embedding)):
    seq_len = (attention_mask[seq_num] == 1).sum()
    seq_emd = embedding[seq_num][1:seq_len - 1]
    features.append(seq_emd)