Example 1
    def _get_network_emissions(self, original_aa_string, pssm, token):
        aa = original_aa_string if self.use_aa else -1
        evo = pssm if self.use_pssm else -1
        tok = token if self.use_token else -1
        packed_input_sequences = self.embed(aa, evo, tok)
        minibatch_size = int(packed_input_sequences[1][0])
        self.init_hidden(minibatch_size)

        if self.use_gpu:
            packed_input_sequences = packed_input_sequences.cuda()

        (data, bi_lstm_batches, _,
         _), self.hidden_layer = self.bi_lstm(packed_input_sequences,
                                              self.hidden_layer)
        emissions_padded, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
            torch.nn.utils.rnn.PackedSequence(self.hidden_to_labels(data),
                                              bi_lstm_batches))
        emissions = emissions_padded.transpose(0, 1).transpose(
            1, 2)  # minibatch_size, self.mixture_size, -1
        emissions = self.batch_norm(emissions)
        emissions = emissions.transpose(1, 2).transpose(
            0, 1)  # back to (-1, minibatch_size, self.mixture_size)

        output_angles = self._dehidrals(emissions, self.alphabet)
        backbone_atoms_padded, _ = \
            get_backbone_positions_from_angular_prediction(output_angles,
                                                           batch_sizes,
                                                           self.use_gpu)
        return output_angles, backbone_atoms_padded, batch_sizes
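
The two pairs of transposes around batch_norm above exist because torch.nn.BatchNorm1d normalises each channel and expects input shaped (batch, channels, length), while the padded recurrent output is (max_len, batch, features). A standalone sketch of the same pattern on dummy tensors (the sizes and the batch_norm module here are illustrative, not taken from the model above):

import torch
import torch.nn as nn

mixture_size = 500                               # illustrative value
emissions = torch.randn(7, 3, mixture_size)      # (max_len, minibatch, mixture)

batch_norm = nn.BatchNorm1d(mixture_size)

# BatchNorm1d wants (batch, channels, length), hence the transposes
# before and after the normalisation.
x = emissions.transpose(0, 1).transpose(1, 2)    # (minibatch, mixture, max_len)
x = batch_norm(x)
x = x.transpose(1, 2).transpose(0, 1)            # back to (max_len, minibatch, mixture)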
Example 2
    def forward(self, latent_space):
        # Decoding: takes a latent code for each element of the batch,
        # where the input is either teacher forcing or the predictions
        # from the previous step.

        # batch_first = True for this.
        prev_out, hidden = self.decoder(latent_space)
        #predict sequences:
        pred_seqs = F.log_softmax(self.dense2_post_dec(prev_out), dim=2)

        # Alternative to the angle mixture model: predict the dihedrals
        # directly, then either scale them into +/-pi (e.g. np.pi * F.tanh(...))
        # or recover them with atan2.
        output_angles = self.latent_to_dihedral2(
            F.elu(self.latent_to_dihedral1(prev_out))).permute(
                [1, 0, 2])  # max size, minibatch size, 3 (angles)
        #print('output angles shape::: ', output_angles.shape)
        ###print('output angles::: ', output_angles)

        # Angle mixture model variant (kept for reference):
        '''x = self.hidden_to_labels(prev_out)
        x = self.bn(x.permute([0,2,1])).permute([0,2,1]).contiguous()
        #x = x.transpose(1,2) #(minibatch_size, -1, self.mixture_size)
        p = torch.exp(self.soft(x))
        output_angles = self.softmax_to_angle(p).transpose(0,1) # max size, minibatch size, 3 (angles)'''

        # Batch sizes used to be fed in here (they could come from the encoder),
        # but all that is needed is the length, taken from latent_space below.
        backbone_atoms_padded, batch_sizes_backbone = get_backbone_positions_from_angular_prediction(
            output_angles, latent_space.shape[0], self.device)

        if torch.isnan(backbone_atoms_padded).sum() > 0:
            print('backbone atoms contain NaN: angles are not valid!')

        return pred_seqs, output_angles, backbone_atoms_padded, batch_sizes_backbone
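
The commented-out alternative above (predict the dihedrals directly and squash them into +/-pi, or recover the angle with atan2) could look roughly like the sketch below; hidden_dim, to_angles and to_sincos are placeholder names, not layers of the model above.

import numpy as np
import torch
import torch.nn as nn

hidden_dim, seq_len, batch = 64, 5, 2                       # placeholder sizes
prev_out = torch.randn(seq_len, batch, hidden_dim)

# Option 1: a 3-unit head squashed into (-pi, pi) with tanh.
to_angles = nn.Linear(hidden_dim, 3)
angles_tanh = np.pi * torch.tanh(to_angles(prev_out))       # (seq_len, batch, 3)

# Option 2: predict (sin, cos) pairs and recover each angle with atan2,
# which avoids the hard cut-off at +/-pi.
to_sincos = nn.Linear(hidden_dim, 6)
sincos = to_sincos(prev_out).view(seq_len, batch, 3, 2)
angles_atan2 = torch.atan2(sincos[..., 0], sincos[..., 1])  # (seq_len, batch, 3)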
Example 3
    def _get_network_emissions(self, original_aa_string):

        # (seqlen x features), (seqlen x batch x features )
        packed_input_sequences, input_sequences, batch_sizes = self.embed(
            original_aa_string)

        # (batch x seqlen x features )
        reshaped_packed_input_sequences = input_sequences.data.permute(1, 0, 2)
        # Turn (batch x seqlen x features) into (batch x features x seqlen) for CNN
        flipped_reshaped_packed_input_sequences = reshaped_packed_input_sequences.transpose(
            1, 2)
        batch_size, embedding_size, sequence_length = flipped_reshaped_packed_input_sequences.shape

        # The hidden state is re-initialised on every forward pass, i.e. the
        # LSTM is used statelessly (note that bi_lstm below is also called
        # without the stored hidden state). The commented block would instead
        # retain the state while the minibatch size stays the same:
        # if self.minibatch_size != minibatch_size:
        #     self.minibatch_size = minibatch_size
        #     self.init_hidden(self.minibatch_size)
        # else:
        #     self.retain_hidden()
        self.init_hidden(sequence_length)

        x_conv = self.conv_init(flipped_reshaped_packed_input_sequences)
        x_conv = x_conv.transpose(1, 2)
        x_lstm = torch.cat(
            (x_conv, reshaped_packed_input_sequences),
            2)  # Concatenate original input and output CNN to LSTM

        # performing the fwd pass
        data, self.hidden_layer = self.bi_lstm(x_lstm)
        x_total = torch.cat((data, x_conv), 2)
        emissions = self.hidden_to_labels(x_total)
        emissions = emissions.view(batch_size, self.mixture_size, -1)
        emissions = self.batch_norm(emissions)
        emissions = emissions.view(batch_size, -1, self.mixture_size)
        probabilities = torch.exp(self.soft(emissions))

        output_angles = self.softmax_to_angle(probabilities) \
            .transpose(0, 1)  # max size, minibatch size, 3 (angles)

        assert not np.isnan(data.cpu().detach().numpy()).any()

        backbone_atoms_padded, _ = \
            get_backbone_positions_from_angular_prediction(output_angles,
                                                           batch_sizes,
                                                           self.use_gpu)
        assert not np.isnan(backbone_atoms_padded.cpu().detach().numpy()).any()

        return output_angles, backbone_atoms_padded, batch_sizes
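
The permute/transpose pair at the top of this example is only there to satisfy nn.Conv1d, which expects (batch, channels, length) rather than the (seq_len, batch, features) layout of the padded sequence. A self-contained sketch of the same reshuffling (all sizes and the conv layer are made up):

import torch
import torch.nn as nn

seq_len, batch, features = 12, 4, 21
x = torch.randn(seq_len, batch, features)        # padded, sequence-first layout

x_btf = x.permute(1, 0, 2)                       # (batch, seq_len, features)
x_bft = x_btf.transpose(1, 2)                    # (batch, features, seq_len) for Conv1d

conv = nn.Conv1d(in_channels=features, out_channels=32, kernel_size=3, padding=1)
y = conv(x_bft)                                  # (batch, 32, seq_len)
y = y.transpose(1, 2)                            # (batch, seq_len, 32), ready to concatenate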
Example 4
    def _get_network_emissions(self, original_aa_string):

        packed_input_sequences, input_sequences, batch_sizes = self.embed(
            original_aa_string)

        minibatch_size = int(packed_input_sequences[1][0])

        # Re-initialise the hidden state only when the minibatch size changes;
        # otherwise the state is retained across forward passes. (The commented
        # call below would make the LSTM fully stateless instead.)
        if self.minibatch_size != minibatch_size:
            self.minibatch_size = minibatch_size
            self.init_hidden(self.minibatch_size)
        else:
            self.retain_hidden()

        # stateless alternative:
        # self.init_hidden(minibatch_size)

        # performing the fwd pass
        (data, bi_lstm_batches, _,
         _), self.hidden_layer = self.bi_lstm(packed_input_sequences,
                                              self.hidden_layer)

        assert not np.isnan(data.cpu().detach().numpy()).any()

        emissions_padded, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
            torch.nn.utils.rnn.PackedSequence(self.hidden_to_labels(data),
                                              bi_lstm_batches))

        assert not np.isnan(emissions_padded.cpu().detach().numpy()).any()
        emissions = emissions_padded.transpose(0, 1)\
            .transpose(1, 2)  # minibatch_size, self.mixture_size, -1
        emissions = self.batch_norm(emissions)
        emissions = emissions.transpose(
            1, 2)  # (minibatch_size, -1, self.mixture_size)
        probabilities = torch.exp(self.soft(emissions))
        output_angles = self.softmax_to_angle(probabilities)\
            .transpose(0, 1)  # max size, minibatch size, 3 (angles)

        assert not np.isnan(output_angles.cpu().detach().numpy()).any()

        backbone_atoms_padded, _ = \
            get_backbone_positions_from_angular_prediction(output_angles,
                                                           batch_sizes,
                                                           self.use_gpu)
        assert not np.isnan(backbone_atoms_padded.cpu().detach().numpy()).any()

        return output_angles, backbone_atoms_padded, batch_sizes
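
The PackedSequence(self.hidden_to_labels(data), bi_lstm_batches) construction works because a packed sequence keeps every time step flattened in .data, so an element-wise layer can be applied to that tensor and the result re-wrapped with the original batch sizes before unpacking. A minimal standalone version (layer and sizes are arbitrary):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import PackedSequence, pack_sequence, pad_packed_sequence

hidden_to_labels = nn.Linear(8, 3)               # illustrative layer

# Two variable-length sequences with 8 features per time step.
seqs = [torch.randn(5, 8), torch.randn(3, 8)]
packed = pack_sequence(seqs, enforce_sorted=True)

# Apply the layer to the flat .data tensor, re-wrap it with the same
# batch_sizes, then unpack to a padded (max_len, batch, 3) tensor.
labels_packed = PackedSequence(hidden_to_labels(packed.data), packed.batch_sizes)
labels_padded, lengths = pad_packed_sequence(labels_packed)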
Example 5
    def _get_network_emissions(self, original_aa_string, pssm=-1):
        packed_input_sequences = self.embed(original_aa_string)
        minibatch_size = int(packed_input_sequences[1][0])
        self.init_hidden(minibatch_size)
        (data, bi_lstm_batches, _, _), self.hidden_layer = self.bi_lstm(
            packed_input_sequences, self.hidden_layer)
        emissions_padded, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
            torch.nn.utils.rnn.PackedSequence(self.hidden_to_labels(data), bi_lstm_batches))
        emissions = emissions_padded.transpose(0, 1)\
            .transpose(1, 2)  # minibatch_size, self.mixture_size, -1
        emissions = self.batch_norm(emissions)
        emissions = emissions.transpose(1, 2)  # (minibatch_size, -1, self.mixture_size)
        probabilities = torch.exp(self.soft(emissions))
        output_angles = self.softmax_to_angle(probabilities)\
            .transpose(0, 1)  # max size, minibatch size, 3 (angles)
        backbone_atoms_padded, _ = \
            get_backbone_positions_from_angular_prediction(output_angles,
                                                           batch_sizes,
                                                           self.use_gpu)
        return output_angles, backbone_atoms_padded, batch_sizes
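
softmax_to_angle itself is not part of this excerpt. One plausible implementation, assuming it mixes the softmax probabilities with a learned alphabet of angles through a circular (sin/cos) mean and recovers the result with atan2, is sketched below; it illustrates the idea and is not the project's actual module.

import torch
import torch.nn as nn

class SoftmaxToAngle(nn.Module):
    """Illustrative only: map mixture probabilities to (phi, psi, omega)."""

    def __init__(self, mixture_size):
        super().__init__()
        # A learnable alphabet of candidate angles, one triple per mixture component.
        self.alphabet = nn.Parameter(torch.zeros(mixture_size, 3).uniform_(-3.14, 3.14))

    def forward(self, probabilities):
        # probabilities: (batch, seq_len, mixture_size)
        sin_mix = probabilities @ torch.sin(self.alphabet)   # (batch, seq_len, 3)
        cos_mix = probabilities @ torch.cos(self.alphabet)
        return torch.atan2(sin_mix, cos_mix)                 # angles in (-pi, pi)

angles = SoftmaxToAngle(500)(torch.softmax(torch.randn(2, 10, 500), dim=2))  # (2, 10, 3)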
Example 6
    def _get_network_emissions(self, original_aa_string, pssm, primary_token):

        # set input
        aa = original_aa_string if self.use_aa else -1
        evo = pssm if self.use_pssm else -1
        tok = primary_token if self.use_token else -1
        packed_input_sequences = self.embed(aa, evo, tok)
        minibatch_size = int(packed_input_sequences[1][0])

        if self.src_mask is None or self.src_mask.size(
                0) != packed_input_sequences[1].size(0):
            mask = self._generate_square_subsequent_mask(
                packed_input_sequences[1].size(0)).to(self.device)
            self.src_mask = mask

        # transformer encoder
        state = self.W(packed_input_sequences[0].to(self.device))
        state = F.relu(state)
        state, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
            torch.nn.utils.rnn.PackedSequence(state,
                                              packed_input_sequences[1]))
        positional_encodings = self.pos_encoder(state)

        output = self.transformer_encoder(positional_encodings, self.src_mask)

        # convert internal representation to label
        output_angles = self._dehidrals(
            output,
            self.alphabet,
        )

        # coordinates
        backbone_atoms_padded, _ = get_backbone_positions_from_angular_prediction(
            output_angles, batch_sizes, self.use_gpu)

        return output_angles, backbone_atoms_padded, batch_sizes
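
_generate_square_subsequent_mask is not shown in this excerpt; the usual implementation (as in the PyTorch transformer tutorial) builds an additive mask that is 0 on and below the diagonal and -inf above it, so position i can only attend to positions <= i. A sketch, assuming that is what the method does here:

import torch

def generate_square_subsequent_mask(size):
    # Boolean upper triangle (excluding the diagonal) -> additive float mask.
    upper = torch.triu(torch.ones(size, size), diagonal=1).bool()
    return torch.zeros(size, size).masked_fill(upper, float('-inf'))

print(generate_square_subsequent_mask(4))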
Example 7
def process_file(input_file, output_file, use_gpu):
    print("Processing raw data file", input_file)

    # create output file
    f = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = f.create_dataset('primary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='int32')
    dset2 = f.create_dataset('tertiary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                             maxshape=(None, MAX_SEQUENCE_LENGTH, 9),
                             dtype='float')
    dset3 = f.create_dataset('mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')

    input_file_pointer = open("raw_data/casp11/" + input_file, "r")

    while True:
        # while there are more proteins to process
        print(input_file_pointer)
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            print("Dropping protein as length too long:", sequence_length)
            continue

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size,MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size,MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size,MAX_SEQUENCE_LENGTH))

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # Masking and padding happen here so that every stored entry has the
        # same size; the padding is removed again when the data is loaded.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length,9)).T

        tertiary_padded[:,:sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']

        mask = torch.Tensor(mask_padded).type(dtype=torch.uint8)

        prim = torch.masked_select(torch.Tensor(primary_padded).type(dtype=torch.long), mask)
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask).view(9, -1).transpose(0, 1).unsqueeze(1) / 100

        if use_gpu:
            pos = pos.cuda()

        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(pos, [len(prim)], use_gpu=use_gpu)

        tertiary, _ = get_backbone_positions_from_angular_prediction(angles, batch_sizes, use_gpu=use_gpu)
        tertiary = tertiary.squeeze(1)

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))

        length_after_mask_removed = len(prim)

        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        current_buffer_allocation += 1

    print("Wrote output to", current_buffer_allocation, "proteins to", output_file)
Example 8
    def _get_network_emissions(self, original_aa_string):

        # (seqlen x features), (seqlen x batch x features )
        packed_input_sequences, input_sequences, batch_sizes = self.embed(
            original_aa_string)

        # (batch x seqlen x features )
        reshaped_packed_input_sequences = input_sequences.data.permute(1, 0, 2)
        # Turn (batch x seqlen x features) into (batch x features x seqlen) for CNN
        flipped_reshaped_packed_input_sequences = reshaped_packed_input_sequences.transpose(
            1, 2)
        batch_size, embedding_size, sequence_length = flipped_reshaped_packed_input_sequences.shape

        # performing the fwd pass
        init = F.leaky_relu(
            self.conv_bn_init(
                self.conv_init(flipped_reshaped_packed_input_sequences)))

        res = self.conv1(F.leaky_relu(self.conv_bn1(init)))
        res = self.conv2(F.leaky_relu(self.conv_bn2(res)))

        x_conv = res + init

        res = self.conv4(F.leaky_relu(self.conv_bn4(x_conv)))
        res = self.conv5(F.leaky_relu(self.conv_bn5(res)))

        x_conv = self.conv6_expand(x_conv)
        x_conv += res

        res = self.conv7(F.leaky_relu(self.conv_bn7(x_conv)))
        res = self.conv8(F.leaky_relu(self.conv_bn8(res)))

        x_conv = self.conv9_expand(x_conv)
        x_conv += res

        res = self.conv10(F.leaky_relu(self.conv_bn10(x_conv)))
        res = self.conv11(F.leaky_relu(self.conv_bn11(res)))

        x_conv = self.conv12_expand(x_conv)
        x_conv += res

        res = self.conv13(F.leaky_relu(self.conv_bn13(x_conv)))
        res = self.conv14(F.leaky_relu(self.conv_bn14(res)))

        x_conv = self.conv15_expand(x_conv)
        x_conv += res

        res = self.conv16(F.leaky_relu(self.conv_bn16(x_conv)))
        res = self.conv17(F.leaky_relu(self.conv_bn17(res)))

        x_conv = self.conv18_expand(x_conv)
        x_conv += res

        res = self.conv19(F.leaky_relu(self.conv_bn19(x_conv)))
        res = self.conv20(F.leaky_relu(self.conv_bn20(res)))

        x_conv = self.conv21_expand(x_conv)
        x_conv += res

        res = self.conv22(F.leaky_relu(self.conv_bn22(x_conv)))
        res = self.conv23(F.leaky_relu(self.conv_bn23(res)))

        x_conv = self.conv24_expand(x_conv)
        x_conv += res

        x_conv = x_conv.transpose(1, 2)
        x_lstm = torch.cat(
            (x_conv, reshaped_packed_input_sequences),
            2)  # Concatenate original input and output CNN to LSTM

        # The hidden state is re-initialised on every forward pass, i.e. the
        # LSTM is used statelessly (bi_lstm below is also called without the
        # stored hidden state). The commented block would instead retain the
        # state while the minibatch size stays the same:
        # if self.minibatch_size != minibatch_size:
        #     self.minibatch_size = minibatch_size
        #     self.init_hidden(self.minibatch_size)
        # else:
        #     self.retain_hidden()
        self.init_hidden(sequence_length)
        data, self.hidden_layer = self.bi_lstm(x_lstm)
        x_total = torch.cat((data, x_conv), 2)
        emissions = self.hidden_to_labels(x_total)
        emissions = emissions.view(batch_size, self.mixture_size, -1)
        emissions = self.batch_norm(emissions)
        emissions = emissions.view(batch_size, -1, self.mixture_size)
        probabilities = torch.exp(self.soft(emissions))

        output_angles = self.softmax_to_angle(probabilities) \
            .transpose(0, 1)  # max size, minibatch size, 3 (angles)

        assert not np.isnan(data.cpu().detach().numpy()).any()

        backbone_atoms_padded, _ = \
            get_backbone_positions_from_angular_prediction(output_angles,
                                                           batch_sizes,
                                                           self.use_gpu)
        assert not np.isnan(backbone_atoms_padded.cpu().detach().numpy()).any()

        return output_angles, backbone_atoms_padded, batch_sizes
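
The long chain of conv/conv_bn/conv_expand calls above follows a pre-activation residual pattern: each block applies batch norm, leaky ReLU and a convolution twice, and a 1x1 "expand" convolution brings the skip connection to the right channel count before the addition. A condensed sketch of one such block (names, channel counts and kernel size are illustrative):

import torch
import torch.nn as nn
import torch.nn.functional as F

class PreActResidualBlock1d(nn.Module):
    """Illustrative pre-activation residual block for (batch, channels, length) input."""

    def __init__(self, in_channels, out_channels, kernel_size=3):
        super().__init__()
        pad = kernel_size // 2
        self.bn1 = nn.BatchNorm1d(in_channels)
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, padding=pad)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, padding=pad)
        # 1x1 convolution so the skip connection matches the residual branch.
        self.expand = nn.Conv1d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        res = self.conv1(F.leaky_relu(self.bn1(x)))
        res = self.conv2(F.leaky_relu(self.bn2(res)))
        return self.expand(x) + res

y = PreActResidualBlock1d(21, 32)(torch.randn(4, 21, 50))   # (4, 32, 50)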
Example 9
def process_file(input_file, output_file, use_gpu):
    print("Processing raw data file", input_file)

    # create output file
    f = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = f.create_dataset(
        'primary', (current_buffer_size, MAX_SEQUENCE_LENGTH),
        maxshape=(None, MAX_SEQUENCE_LENGTH),
        dtype='int32'
    )  # creates an empty dataset with given dimension, axes with none in maxshape are unlimited
    dset2 = f.create_dataset('tertiary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                             maxshape=(None, MAX_SEQUENCE_LENGTH, 9),
                             dtype='float')
    dset3 = f.create_dataset('mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')

    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there are more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            print("Dropping protein as length too long:", sequence_length)
            continue

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # Masking and padding happen here so that every stored entry has the
        # same size; the padding is removed again when the data is loaded.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(
            next_protein['tertiary']).T)  # flattens the array into 1-D
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T

        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']

        mask = torch.Tensor(mask_padded).type(dtype=torch.uint8)

        prim = torch.masked_select(
            torch.Tensor(primary_padded).type(dtype=torch.long), mask
        )  # keeps only the residues marked '+' in the mask (nonzero after conversion)
        # Divide by 100 because ProteinNet stores the coordinates scaled by a
        # factor of 100; the unsqueeze adds a minibatch dimension of 1 for
        # calculate_dihedral_angles_over_minibatch.
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
                  .view(9, -1).transpose(0, 1).unsqueeze(1) / 100

        if use_gpu:
            pos = pos.cuda()

        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(
            pos, [len(prim)], use_gpu=use_gpu)

        tertiary, _ = get_backbone_positions_from_angular_prediction(
            angles, batch_sizes, use_gpu=use_gpu)
        tertiary = tertiary.squeeze(1)

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))

        length_after_mask_removed = len(prim)

        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu(
        ).numpy()
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        mask_padded[:length_after_mask_removed] = np.ones(
            length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        current_buffer_allocation += 1

    print("Wrote output to", current_buffer_allocation, "proteins to",
          output_file)
Example 10
def process_file(input_file, output_file, use_gpu, max_sequence_length, use_mask=True, vocab='iupac'):
    print("Processing raw data file", input_file)

    # set tokenizer
    tokenizer = TAPETokenizer(vocab=vocab)

    # create output file
    file = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = file.create_dataset('primary', (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length), dtype='int32')
    dset2 = file.create_dataset('tertiary', (current_buffer_size, max_sequence_length, 9),
                                maxshape=(None, max_sequence_length, 9), dtype='float')
    dset3 = file.create_dataset('mask', (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length),
                                dtype='uint8')
    dset4 = file.create_dataset('pssm', (current_buffer_size, max_sequence_length, 21),
                                maxshape=(None, max_sequence_length, 21), dtype='float')
    dset5 = file.create_dataset('primary_token', (current_buffer_size, 2 * max_sequence_length),
                                maxshape=(None, 2 * max_sequence_length), dtype='int32')

    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there are more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > max_sequence_length:
            # print("Dropping protein as length too long:", sequence_length)
            continue
        print("Process protein with length", sequence_length)
        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, max_sequence_length))
            dset2.resize((current_buffer_size, max_sequence_length, 9))
            dset3.resize((current_buffer_size, max_sequence_length))
            dset4.resize((current_buffer_size, max_sequence_length, 21))
            dset5.resize((current_buffer_size, 2 * max_sequence_length))

        primary_padded = np.zeros(max_sequence_length)
        tertiary_padded = np.zeros((9, max_sequence_length))
        mask_padded = np.zeros(max_sequence_length)
        pssm_padded = np.zeros((21, max_sequence_length))
        primary_token_padded = np.zeros(2 * max_sequence_length)

        # Masking and padding happen here so that every stored entry has the
        # same size; the padding is removed again when the data is loaded.
        primary_padded[:sequence_length] = next_protein['primary']

        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T

        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']
        pssm_padded[:, :sequence_length] = np.array(next_protein['evolutionary'])

        if use_mask:
            mask = torch.Tensor(mask_padded).type(dtype=torch.bool)

            prim = torch.masked_select(torch.Tensor(primary_padded)
                                       .type(dtype=torch.long), mask)
            seq_token = torch.Tensor(tokenization(tokenizer, next_protein['seq'], next_protein['mask']))

            pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
                      .view(9, -1).transpose(0, 1).unsqueeze(1) / 100

            pssm = torch.masked_select(torch.Tensor(pssm_padded), mask).view(21, -1).transpose(0, 1)

            if use_gpu:
                pos = pos.cuda()

            angles, batch_sizes = calculate_dihedral_angles_over_minibatch(pos,
                                                                           [len(prim)],
                                                                           use_gpu=use_gpu)

            tertiary, _ = get_backbone_positions_from_angular_prediction(angles,
                                                                         batch_sizes,
                                                                         use_gpu=use_gpu)
            tertiary = tertiary.squeeze(1)

            primary_padded = np.zeros(max_sequence_length)
            tertiary_padded = np.zeros((max_sequence_length, 9))
            pssm_padded = np.zeros((max_sequence_length, 21))

            length_after_mask_removed = len(prim)

            primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
            primary_token_padded[:len(seq_token)] = seq_token.cpu().numpy()
            tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
            pssm_padded[:length_after_mask_removed, :] = pssm.data.cpu().numpy()
            mask_padded = np.zeros(max_sequence_length)
            mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = pssm_padded
        dset5[current_buffer_allocation] = primary_token_padded
        current_buffer_allocation += 1

    print("Wrote output to", current_buffer_allocation, "proteins to", output_file)
Example 11
def process_file(input_file, output_file, device, want_trimmed, want_pure):
    print("Processing raw data file", input_file)

    # create output file
    f = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = f.create_dataset('primary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='int32')
    dset2 = f.create_dataset('tertiary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                             maxshape=(None, MAX_SEQUENCE_LENGTH, 9),
                             dtype='float')
    dset3 = f.create_dataset('mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')
    dset4 = f.create_dataset('padding_mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')
    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there are more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            print("Dropping protein as length too long:", sequence_length)
            continue

        if want_pure:
            unpadded_mask = torch.Tensor(
                next_protein['mask']).type(dtype=torch.uint8)
            if unpadded_mask.sum() != unpadded_mask.shape[0]:
                print('dropping protein, mask has holes')
                continue

        elif want_trimmed:
            s = [str(i) for i in next_protein['mask']]
            s.append('0')  # needed for masks that do not end with a masked-out position
            res = "".join(s)
            if len(res.split('10')) > 2:
                print("dropping protein, mask isn't just on edges")
                continue

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # Masking and padding happen here so that every stored entry has the
        # same size; the padding is removed again when the data is loaded.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T

        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']
        mask = torch.Tensor(mask_padded).type(dtype=torch.uint8)

        prim = torch.masked_select(
            torch.Tensor(primary_padded).type(dtype=torch.long), mask)
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask).view(
            9, -1).transpose(0, 1).unsqueeze(1) / 100

        pos = pos.to(device)

        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(
            pos, [len(prim)], device)
        # this step appears to be where the NaNs come from; the cause is unclear
        tertiary, _ = get_backbone_positions_from_angular_prediction(
            angles, batch_sizes, device)
        tertiary = tertiary.squeeze(1)

        if torch.isnan(tertiary).sum() > 0:
            print('there is a nan in tertiary! Dropping and printing mask')
            print(next_protein['mask'])
            continue

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))

        length_after_mask_removed = len(prim)

        if length_after_mask_removed == 0:
            print('Sequence length is zero after the mask was applied. Dropping!')
            continue

        #print('final size', length_after_mask_removed)
        #print('tertiary ', tertiary)

        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu(
        ).numpy()
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        mask_padded[:length_after_mask_removed] = np.ones(
            length_after_mask_removed)

        padding_mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        # This mask covers both the padding and the residues without angle data;
        # it has become redundant now that proteins with missing data are dropped above.
        padding_mask_padded[:length_after_mask_removed] = np.ones(
            length_after_mask_removed)

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset4.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = padding_mask_padded
        current_buffer_allocation += 1

    print("Wrote output to", current_buffer_allocation, "proteins to",
          output_file)
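
The string-splitting trick in the want_trimmed branch (join the mask digits, append a '0', and count the '10' boundaries) checks that the known residues form a single contiguous block, with missing data only at the edges. A similar, arguably more readable check with a regular expression is sketched below; note that it also rejects an all-zero mask, which the split-based version lets through.

import re

def mask_is_trimmable(mask):
    """True if the mask is zeros, then one contiguous run of ones, then zeros."""
    bits = "".join(str(int(b)) for b in mask)
    return re.fullmatch(r"0*1+0*", bits) is not None

print(mask_is_trimmable([0, 0, 1, 1, 1, 0]))   # True: missing data only at the edges
print(mask_is_trimmable([1, 0, 1, 1]))         # False: a hole in the middle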