Example #1
def test_basic():
    import torch
    from tape import ProteinBertModel, ProteinBertConfig, TAPETokenizer  # type: ignore

    # A deliberately tiny config so the test builds and runs quickly.
    config = ProteinBertConfig(hidden_size=12,
                               intermediate_size=12 * 4,
                               num_hidden_layers=2)
    model = ProteinBertModel(config)
    tokenizer = TAPETokenizer(vocab='iupac')

    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    output = model(token_ids)
    sequence_output = output[0]  # noqa  (per-residue embeddings)
    pooled_output = output[1]  # noqa  (pooled whole-sequence embedding)
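
With hidden_size=12 in the config above, both outputs carry a 12-dimensional feature axis. A minimal sanity check that could be appended at the end of test_basic (my addition, not part of the original test; assumes TAPE's batch-first output convention):

    # Hypothetical extra assertions, appended inside test_basic:
    assert sequence_output.shape == (1, token_ids.shape[1], 12)
    assert pooled_output.shape == (1, 12)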
Example #2
def test_forcedownload():
    import os
    import time

    import torch
    from tape import ProteinBertModel, TAPETokenizer
    # BERT_PRETRAINED_MODEL_ARCHIVE_MAP, url_to_filename, get_etag and
    # get_cache are assumed to come from TAPE's internal file utilities.

    model = ProteinBertModel.from_pretrained('bert-base')
    url = BERT_PRETRAINED_MODEL_ARCHIVE_MAP['bert-base']
    filename = url_to_filename(url, get_etag(url))
    wholepath = get_cache() / filename
    oldtime = time.ctime(os.path.getmtime(wholepath))
    # Re-download and check that the cached file's mtime changed.
    model = ProteinBertModel.from_pretrained('bert-base', force_download=True)
    newtime = time.ctime(os.path.getmtime(wholepath))
    assert newtime != oldtime
    # Deploy model
    # iupac is the vocab for TAPE models, use unirep for the UniRep model
    tokenizer = TAPETokenizer(vocab='iupac')
    # Pfam Family: Hexapep, Clan: CL0536
    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    model(token_ids)
Example #3
# Assumed imports for this class (not shown in the original snippet):
# import torch
# import pandas as pd
# from tape import ProteinBertModel, TAPETokenizer
class encoding_tape(object):
    def __init__(self, dataset_sequences):
        # Expects a DataFrame with a 'sequence' column.
        self.dataset_sequences = dataset_sequences
        self.model = ProteinBertModel.from_pretrained('bert-base')
        self.tokenizer = TAPETokenizer(
            vocab='iupac'
        )  # iupac is the vocab for TAPE models, use unirep for the UniRep model

    def apply_encoding(self):
        matrix_encoding = []
        for i in range(len(self.dataset_sequences)):
            try:
                token_ids = torch.tensor([
                    self.tokenizer.encode(
                        self.dataset_sequences['sequence'][i])
                ])
                output = self.model(token_ids)
                sequence_output = output[0]

                # Mean-pool the per-residue embeddings over the length axis,
                # giving one fixed-length vector per sequence.
                encoding_avg = sequence_output[0].cpu().detach().numpy().mean(axis=0)
                matrix_encoding.append(encoding_avg)
            except Exception:
                # Skip sequences that fail to encode (e.g. invalid residues).
                pass

        # One column name per embedding dimension.
        header = ["P_" + str(i + 1) for i in range(len(matrix_encoding[0]))]
        self.dataset_encoding = pd.DataFrame(matrix_encoding, columns=header)
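
A hedged usage sketch for the class above (the toy DataFrame and the printed shape are my own; 768 is the hidden size of bert-base):

import pandas as pd

df = pd.DataFrame({'sequence': ['GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ']})
encoder = encoding_tape(df)
encoder.apply_encoding()
print(encoder.dataset_encoding.shape)  # expected: (1, 768) for bert-base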
Example #4
def UniRep_Embed(input_seq):
    # Assumed imports / globals (not shown in the original snippet):
    # import sys
    # import time
    # import torch
    # import pandas as pd
    # from tape import UniRepModel, TAPETokenizer
    # DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    T0 = time.time()
    UNIREPEB_ = []
    PID = []
    print("UniRep Embedding...")

    model = UniRepModel.from_pretrained('babbler-1900')
    model = model.to(DEVICE)
    tokenizer = TAPETokenizer(vocab='unirep')

    for key, value in input_seq.items():
        sequence = value
        if len(sequence) == 0:
            print('# WARNING: sequence',
                  key,
                  'has length=0. Skipping.',
                  file=sys.stderr)
            continue
        # Record the ID only for sequences that are actually embedded,
        # so the index below stays aligned with the feature rows.
        PID.append(key)
        with torch.no_grad():
            token_ids = torch.tensor([tokenizer.encode(sequence)])
            token_ids = token_ids.to(DEVICE)
            output = model(token_ids)
            unirep_output = output[0]
            # Drop the batch dimension, then average over the length axis
            # to get one 1900-d vector per sequence.
            unirep_output = torch.squeeze(unirep_output)
            unirep_output = unirep_output.mean(0)
            unirep_output = unirep_output.cpu().numpy()
            UNIREPEB_.append(unirep_output.tolist())

    unirep_feature = pd.DataFrame(UNIREPEB_)
    col = ["UniRep_F" + str(i + 1) for i in range(0, 1900)]
    unirep_feature.columns = col
    unirep_feature.index = PID
    # print(unirep_feature.shape)
    unirep_feature.to_csv("./dataset/unirep_feature.csv")
    print("Getting Deep Representation Learning Features with UniRep is done.")
    print("It took %0.3f mins.\n" % ((time.time() - T0) / 60))

    return unirep_feature
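
A hedged usage sketch (the toy input dict is my own; the function also assumes a ./dataset/ directory exists for its CSV output):

seqs = {'P1': 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'}
features = UniRep_Embed(seqs)
print(features.shape)  # one 1900-d averaged UniRep vector per sequence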
Example #5
# Assumed context (not shown in the original snippet): train_seqs, test_seqs,
# train_labels and test_labels are defined elsewhere, and model is a
# pretrained TAPE ProteinBertModel, e.g.:
#   import torch
#   from tape import ProteinBertModel, TAPETokenizer
#   model = ProteinBertModel.from_pretrained('bert-base')
import numpy as np

tokenizer = TAPETokenizer(vocab='iupac')

num_of_features = 768  # hidden size of bert-base
X = np.zeros((len(train_seqs), num_of_features))
y = np.zeros(len(train_seqs))

ind_X = np.zeros((len(test_seqs), num_of_features))
ind_y = np.zeros(len(test_seqs))

# Populate X: one mean-pooled BERT embedding per training sequence.
i = 0
for s in train_seqs:
    token_ids = torch.tensor([tokenizer.encode(s)])
    output = model(token_ids)
    sequence_output = output[0]
    pooled_output = output[1]

    X[i] = np.array(np.mean(sequence_output.detach().numpy(), axis=1))
    i = i + 1

y = np.array(train_labels)

# Populate ind_X the same way for the independent test set
# (test_labels is assumed to be defined alongside train_labels).
i = 0
for s in test_seqs:
    token_ids = torch.tensor([tokenizer.encode(s)])
    output = model(token_ids)
    sequence_output = output[0]

    ind_X[i] = np.array(np.mean(sequence_output.detach().numpy(), axis=1))
    i = i + 1

ind_y = np.array(test_labels)
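
With X/y and ind_X/ind_y populated, a downstream classifier can be fit directly on the pooled embeddings. A minimal sketch, assuming scikit-learn is available and the labels are class labels:

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000).fit(X, y)
print("independent test accuracy:", clf.score(ind_X, ind_y))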
Example #6
def DRLF_Embed(fastaFile, outFile, device=-2):
    # Assumed imports / globals (not shown in the original snippet):
    # import sys
    # import torch
    # import pandas as pd
    # from tape import UniRepModel, TAPETokenizer
    # fasta, load_model and embed_sequence come from this project's own
    # modules; DEVICE is a module-level torch.device.

    path = fastaFile
    count = 0
    SSAEMB_ = []
    UNIREPEB_ = []
    # Read the FASTA file into a table of IDs and sequences.
    inData = fasta.fasta2csv(path)
    Seqs = inData["Seq"]

    PID_ = []

    # SSA embedding
    print("SSA Embedding...")
    lm_embed, lstm_stack, proj = load_model(
        "./src/PretrainedModel/SSA_embed.model", use_cuda=True)

    with open(path, 'rb') as f:
        for name, sequence in fasta.parse_stream(f):

            pid = str(name.decode('utf-8'))
            if len(sequence) == 0:
                print('# WARNING: sequence',
                      pid,
                      'has length=0. Skipping.',
                      file=sys.stderr)
                continue

            PID_.append(pid)

            # Average-pooled SSA embedding for this sequence.
            z = embed_sequence(sequence,
                               lm_embed,
                               lstm_stack,
                               proj,
                               final_only=True,
                               pool='avg',
                               use_cuda=True)

            SSAEMB_.append(z)
            count += 1
            print('# {} sequences processed...'.format(count),
                  file=sys.stderr,
                  end='\r')
    print("SSA embedding finished.")

    ssa_feature = pd.DataFrame(SSAEMB_)
    col = ["SSA_F" + str(i + 1) for i in range(0, 121)]
    ssa_feature.columns = col

    # UniRep embedding
    print("UniRep Embedding...")
    print("Loading UniRep Model...", file=sys.stderr, end='\r')

    model = UniRepModel.from_pretrained('babbler-1900')
    model = model.to(DEVICE)
    tokenizer = TAPETokenizer(vocab='unirep')

    count = 0
    # NOTE: assumes no sequences were skipped above, so that this ID list
    # stays aligned with both feature tables.
    PID_ = inData["PID"]

    for pid, sequence in zip(PID_, Seqs):

        if len(sequence) == 0:
            print('# WARNING: sequence',
                  pid,
                  'has length=0. Skipping.',
                  file=sys.stderr)
            continue

        with torch.no_grad():
            token_ids = torch.tensor([tokenizer.encode(sequence)])
            token_ids = token_ids.to(DEVICE)
            output = model(token_ids)
            unirep_output = output[0]
            # Drop the batch dimension, then average over the length axis
            # to get one 1900-d vector per sequence.
            unirep_output = torch.squeeze(unirep_output)
            unirep_output = unirep_output.mean(0)
            unirep_output = unirep_output.cpu().numpy()

            UNIREPEB_.append(unirep_output.tolist())
            count += 1
            print('# {} sequences processed...'.format(count),
                  file=sys.stderr,
                  end='\r')

    unirep_feature = pd.DataFrame(UNIREPEB_)

    col = ["UniRep_avg_F" + str(i + 1) for i in range(0, 1900)]
    unirep_feature.columns = col
    print("UniRep embedding finished.")
    Features = pd.concat([ssa_feature, unirep_feature], axis=1)
    Features.index = PID_
    Features.to_csv(outFile)
    print("Getting Deep Representation Learning Features is done.")

    return Features, inData
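
Design note: the two representations are simply concatenated column-wise, so each protein ends up with 121 SSA features plus 1900 averaged UniRep features (2021 columns in total), indexed by its FASTA ID.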
Example #7
import torch
from tape import ProteinBertModel, TAPETokenizer
model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(
    vocab='iupac'
)  # iupac is the vocab for TAPE models, use unirep for the UniRep model

# Pfam Family: Hexapep, Clan: CL0536
sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
token_ids = torch.tensor([tokenizer.encode(sequence)])
output = model(token_ids)
sequence_output = output[0]  # per-residue embeddings, shape (1, seq_len, 768)
pooled_output = output[1]  # pooled whole-sequence embedding, shape (1, 768)
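
To reduce the per-residue output to a single fixed-length protein vector, the other examples on this page mean-pool over the sequence axis; a minimal sketch of the same idea, continuing from the snippet above:

# Mean-pool the per-residue embeddings into one 768-d protein vector.
protein_embedding = sequence_output.mean(dim=1).squeeze(0)
print(protein_embedding.shape)  # torch.Size([768])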