Example 1
    def compute_loss(self, minibatch):
        (original_aa_string, actual_coords_list, _) = minibatch

        emissions, _backbone_atoms_padded, _batch_sizes = \
            self._get_network_emissions(original_aa_string)
        actual_coords_list_padded = torch.nn.utils.rnn.pad_sequence(actual_coords_list)
        if self.use_gpu:
            actual_coords_list_padded = actual_coords_list_padded.cuda()
        start = time.time()
        if isinstance(_batch_sizes[0], int):
            _batch_sizes = torch.tensor(_batch_sizes)
        emissions_actual, _ = \
            calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                     _batch_sizes,
                                                     self.use_gpu)
        # drmsd_avg = calc_avg_drmsd_over_minibatch(backbone_atoms_padded,
        #                                           actual_coords_list_padded,
        #                                           batch_sizes)
        write_out("Angle calculation time:", time.time() - start)
        if self.use_gpu:
            emissions_actual = emissions_actual.cuda()
            # drmsd_avg = drmsd_avg.cuda()
        angular_loss = calc_angular_difference(emissions, emissions_actual)

        return angular_loss  # + drmsd_avg
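Note: the loss above is the circular difference between predicted and target dihedral angles, computed by calc_angular_difference. The snippet below is only an illustrative sketch of such a wrap-aware angle comparison (the function name and formula are assumptions, not this project's implementation):

import torch

def circular_difference_sketch(pred_angles, actual_angles):
    # Hypothetical illustration: compare two tensors of angles (in radians)
    # while respecting the wrap-around at +/- pi.
    diff = pred_angles - actual_angles
    wrapped = torch.atan2(torch.sin(diff), torch.cos(diff))
    return wrapped.abs().mean()

# A difference that crosses the +/- pi boundary stays small:
a = torch.tensor([3.13])   # just below +pi
b = torch.tensor([-3.13])  # just above -pi
print(circular_difference_sketch(a, b))  # ~0.02 rad, not ~6.26 rad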
Example 2
    def compute_loss(self, minibatch):
        (original_aa_string, actual_coords_list, _) = minibatch

        if any(np.isnan(x.cpu().detach().numpy()).any() for x in original_aa_string) or \
                any(np.isnan(x.cpu().detach().numpy()).any() for x in actual_coords_list):
            return None

        emissions, _backbone_atoms_padded, _batch_sizes = \
            self._get_network_emissions(original_aa_string)
        assert not np.isnan(emissions.cpu().detach().numpy()).any()
        actual_coords_list_padded, batch_sizes_coords = torch.nn.utils.rnn\
            .pad_packed_sequence(
                torch.nn.utils.rnn.pack_sequence(actual_coords_list))
        assert not np.isnan(
            actual_coords_list_padded.cpu().detach().numpy()).any()
        if self.use_gpu:
            actual_coords_list_padded = actual_coords_list_padded.cuda()

        start = time.time()
        emissions_actual, _ = \
            calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                     batch_sizes_coords,
                                                     self.use_gpu)
        # drmsd_avg = calc_avg_drmsd_over_minibatch(backbone_atoms_padded,
        #                                           actual_coords_list_padded,
        #                                           batch_sizes)
        write_out("Angle calculation time:", time.time() - start)
        if self.use_gpu:
            emissions_actual = emissions_actual.cuda()
            # drmsd_avg = drmsd_avg.cuda()
        angular_loss = calc_angular_difference(emissions, emissions_actual)

        return angular_loss  # + drmsd_avg
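Note: unlike Example 1, this variant recovers the per-protein lengths by packing the variable-length coordinate tensors and immediately padding them back. A tiny standalone demonstration of that PyTorch round-trip (shapes invented for the demo):

import torch

# Three "coordinate" sequences of decreasing length (length x 9 backbone values).
seqs = [torch.randn(5, 9), torch.randn(4, 9), torch.randn(3, 9)]

packed = torch.nn.utils.rnn.pack_sequence(seqs)
padded, lengths = torch.nn.utils.rnn.pad_packed_sequence(packed)

print(padded.shape)  # torch.Size([5, 3, 9]) -> (max_len, batch, features)
print(lengths)       # tensor([5, 4, 3])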
Example 3
    def compute_loss(self, minibatch, processed_minibatches, minimum_updates):
        (original_aa_string, actual_coords_list, _) = minibatch

        emissions, _backbone_atoms_padded, _batch_sizes = \
            self._get_network_emissions(original_aa_string)
        actual_coords_list_padded = torch.nn.utils.rnn.pad_sequence(
            actual_coords_list)
        if self.use_gpu:
            actual_coords_list_padded = actual_coords_list_padded.cuda()
        start = time.time()
        if isinstance(_batch_sizes[0], int):
            _batch_sizes = torch.tensor(_batch_sizes)
        emissions_actual, _ = \
            calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                    _batch_sizes,
                                                    self.use_gpu)
        drmsd_avg = calc_avg_drmsd_over_minibatch(_backbone_atoms_padded,
                                                  actual_coords_list_padded,
                                                  _batch_sizes)

        write_out("Angle calculation time:", time.time() - start)
        if self.use_gpu:
            emissions_actual = emissions_actual.cuda()
            drmsd_avg = drmsd_avg.cuda()
        angular_loss = calc_angular_difference(emissions, emissions_actual)

        multiplier = 0.4

        if processed_minibatches < minimum_updates * 0.4:
            multiplier = processed_minibatches / minimum_updates

        normalized_angular_loss = angular_loss / 5
        normalized_drmsd_avg = drmsd_avg / 100
        return (normalized_drmsd_avg * multiplier) + (normalized_angular_loss *
                                                      (1 - multiplier))
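Note: the return value blends dRMSD and angular loss, ramping the dRMSD weight linearly until 40% of minimum_updates have been processed and holding it at 0.4 afterwards. A standalone sketch of that schedule (numbers chosen only for illustration):

def blend_weight(processed_minibatches, minimum_updates):
    # Same schedule as above: linear ramp until 40% of minimum_updates,
    # then a constant weight of 0.4 on the dRMSD term.
    multiplier = 0.4
    if processed_minibatches < minimum_updates * 0.4:
        multiplier = processed_minibatches / minimum_updates
    return multiplier

for step in (0, 100, 200, 400, 1000):
    print(step, blend_weight(step, minimum_updates=1000))
# -> 0.0, 0.1, 0.2, 0.4, 0.4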
Example 4
    def forward(self, seq, batch_sizes, tert):
        # dealing with the sequences first:
        # embeds = self.embeddings(inputs).view((1, -1))
        packed_input_sequences = embed(seq, batch_sizes, self.device)
        packed_output, hidden = self.encoder_seq(packed_input_sequences)
        # the batch dimension comes second here (shape[1])
        # mean pooling over the sequence encoder is commented out below; unpack so the
        # meta LSTM can consume the padded outputs
        out_padded_seq, lengths = torch.nn.utils.rnn.pad_packed_sequence(
            packed_output)
        # seq_hidden_means = torch.sum(out_padded, dim=0) / lengths.view(-1,1).expand(-1, self.ENCODING_LSTM_OUTPUT*2).type(torch.float)
        # Now handle the tertiary structure: convert coordinates to dihedral angles.
        # batch_sizes is None because tert is not padded and carries no batch size yet.
        # print('pre dihedral', tert)
        tert_angles = calculate_dihedral_angles_over_minibatch(tert,
                                                               None,
                                                               self.device,
                                                               is_padded=False)
        # convert the angles into a packed sequence
        # print('pre packing', tert_angles)
        packed_tert_angles = torch.nn.utils.rnn.pack_sequence(tert_angles).to(
            self.device)
        # the real dihedral angles, padded and returned for the loss function:
        padded_real_angles, _ = torch.nn.utils.rnn.pad_packed_sequence(
            packed_tert_angles)
        # encode the dihedral angles:
        packed_output, hidden = self.encoder_tert(packed_tert_angles)
        # unpack and compute the means:
        out_padded_tert, lengths = torch.nn.utils.rnn.pad_packed_sequence(
            packed_output)
        # tert_hidden_means = torch.sum(out_padded, dim=0) / lengths.view(-1,1).expand(-1, self.ENCODING_LSTM_OUTPUT*2).type(torch.float)
        # take the mean of all hidden states, concatenate it with the tertiary encoding
        # and pass it through the dense layers.

        # meta encoder LSTM: concatenate the hidden states from every time step
        res = torch.cat((out_padded_seq, out_padded_tert), dim=2)
        res = torch.nn.utils.rnn.pack_padded_sequence(res, lengths)
        res, hidden = self.encoder_meta(res)
        res, lengths = torch.nn.utils.rnn.pad_packed_sequence(res)
        res = torch.sum(res, dim=0) / lengths.view(-1, 1).expand(
            -1, self.META_ENCODING_LSTM_OUTPUT * 2).type(torch.float).to(
                self.device)
        # res = torch.cat((seq_hidden_means, tert_hidden_means), dim=1)
        # batch norm is disabled for now:
        # res = self.batchnorm(res)
        res = self.dense2_enc(F.elu(
            self.dense1_enc(res)))  # previously F.tanh here
        # padded_real_angles holds the dihedral angles of the real structure
        return res, padded_real_angles
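Note: the mean over hidden states divides a time-summed padded output by each sequence's true length, so padding does not dilute the average. A self-contained sketch of that length-aware pooling (shapes and names invented; an explicit mask is used instead of relying on zero padding):

import torch

max_len, batch, hidden = 6, 2, 4
out_padded = torch.randn(max_len, batch, hidden)  # (time, batch, features)
lengths = torch.tensor([6, 3])                    # true length of each sequence

# Zero out the padded steps, then divide each sum by the true length.
time_idx = torch.arange(max_len).unsqueeze(1)            # (max_len, 1)
valid = (time_idx < lengths.unsqueeze(0)).unsqueeze(2)   # (max_len, batch, 1)
means = (out_padded * valid).sum(dim=0) / lengths.view(-1, 1).float()
print(means.shape)  # torch.Size([2, 4])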
Example 5
    def compute_loss(self, minibatch):
        (original_aa_string, actual_coords_list, _, pssms, token) = minibatch

        emissions, _backbone_atoms_padded, _batch_sizes = self._get_network_emissions(
            original_aa_string, pssms, token)
        actual_coords_list_padded, batch_sizes_coords = torch.nn.utils.rnn.pad_packed_sequence(
            torch.nn.utils.rnn.pack_sequence(actual_coords_list))
        if self.use_gpu:
            actual_coords_list_padded = actual_coords_list_padded.cuda()
        start = time.time()
        emissions_actual, _ = calculate_dihedral_angles_over_minibatch(
            actual_coords_list_padded, batch_sizes_coords, self.use_gpu)
        drmsd_avg = calc_avg_drmsd_over_minibatch(_backbone_atoms_padded,
                                                  actual_coords_list_padded,
                                                  _batch_sizes)
        write_out("Angle calculation time:", time.time() - start)
        if self.use_gpu:
            emissions_actual = emissions_actual.cuda()
            drmsd_avg = drmsd_avg.cuda()
        angular_loss = calc_angular_difference(emissions, emissions_actual)

        return angular_loss, drmsd_avg
Example 6
    def _get_network_emissions(self, original_aa_string):
        initial_aa_pos = initial_pos_from_aa_string(original_aa_string)
        packed_input_sequences = self.embed(original_aa_string)
        backbone_atoms_padded, batch_sizes_backbone \
            = structures_to_backbone_atoms_padded(initial_aa_pos)
        if self.use_gpu:
            backbone_atoms_padded = backbone_atoms_padded.cuda()
        embedding_padded, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
            torch.nn.utils.rnn.PackedSequence(packed_input_sequences))
        for _ in range(self.recurrent_steps):
            combined_features = torch.cat((embedding_padded, backbone_atoms_padded), dim=2)
            for idx, aa_features in enumerate(combined_features.transpose(0, 1)):
                msg = pass_messages(aa_features,
                                    self.apply_message_function,
                                    self.use_gpu)  # aa_count * output size
                backbone_atoms_padded[:, idx] = self.linear_transform(
                    torch.cat((aa_features, msg), dim=1))

        output, batch_sizes = calculate_dihedral_angles_over_minibatch(original_aa_string,
                                                                       backbone_atoms_padded,
                                                                       batch_sizes_backbone,
                                                                       self.use_gpu)
        return output, backbone_atoms_padded, batch_sizes
Example 7
def prediction():

    list_of_files = glob.glob('output/models/*')
    default_model_path = max(list_of_files, key=os.path.getctime)

    parser = argparse.ArgumentParser(
        description="OpenProtein - Prediction CLI")
    parser.add_argument('--input_sequence', dest='input_sequence')
    parser.add_argument('--model_path',
                        dest='model_path',
                        default=default_model_path)
    parser.add_argument('--use_gpu', dest='use_gpu', default=False, type=bool)

    args, _ = parser.parse_known_args()

    print("Using model:", args.model_path)

    model = torch.load(args.model_path)

    input_sequences = [args.input_sequence]

    input_sequences_encoded = list(
        torch.IntTensor(encode_primary_string(aa)) for aa in input_sequences)

    predicted_dihedral_angles, predicted_backbone_atoms, batch_sizes = \
        model(input_sequences_encoded)

    if predicted_dihedral_angles == []:
        predicted_dihedral_angles, _ = calculate_dihedral_angles_over_minibatch(
            predicted_backbone_atoms, batch_sizes, args.use_gpu)
    write_to_pdb(
        get_structure_from_angles(input_sequences_encoded[0],
                                  predicted_dihedral_angles[:, 0]),
        "prediction")

    print("Wrote prediction to output/protein_prediction.pdb")
Example 8
    def evaluate_model(self, data_loader):
        loss = 0
        data_total = []
        dRMSD_list = []
        RMSD_list = []
        for _, data in enumerate(data_loader, 0):
            primary_sequence, tertiary_positions, _mask = data
            start = time.time()
            predicted_angles, backbone_atoms, batch_sizes = self(primary_sequence)
            write_out("Apply model to validation minibatch:", time.time() - start)

            if predicted_angles == []:
                # model didn't provide angles, so we'll compute them here
                output_angles, _ = calculate_dihedral_angles_over_minibatch(backbone_atoms,
                                                                            batch_sizes,
                                                                            self.use_gpu)
            else:
                output_angles = predicted_angles

            cpu_predicted_angles = output_angles.transpose(0, 1).cpu().detach()
            if backbone_atoms == []:
                # model didn't provide backbone atoms, we need to compute that
                output_positions, _ = \
                    get_backbone_positions_from_angles(predicted_angles,
                                                       batch_sizes,
                                                       self.use_gpu)
            else:
                output_positions = backbone_atoms

            cpu_predicted_backbone_atoms = output_positions.transpose(0, 1).cpu().detach()

            minibatch_data = list(zip(primary_sequence,
                                      tertiary_positions,
                                      cpu_predicted_angles,
                                      cpu_predicted_backbone_atoms))
            data_total.extend(minibatch_data)
            start = time.time()
            for primary_sequence, tertiary_positions, _predicted_pos, predicted_backbone_atoms\
                    in minibatch_data:
                actual_coords = tertiary_positions.transpose(0, 1).contiguous().view(-1, 3)

                predicted_coords = predicted_backbone_atoms[:len(primary_sequence)]\
                    .transpose(0, 1).contiguous().view(-1, 3).detach()
                rmsd = calc_rmsd(predicted_coords, actual_coords)
                drmsd = calc_drmsd(predicted_coords, actual_coords)
                RMSD_list.append(rmsd)
                dRMSD_list.append(drmsd)
                error = rmsd
                loss += error

                end = time.time()
            write_out("Calculate validation loss for minibatch took:", end - start)
        loss /= data_loader.dataset.__len__()
        self.historical_rmsd_avg_values.append(float(torch.Tensor(RMSD_list).mean()))
        self.historical_drmsd_avg_values.append(float(torch.Tensor(dRMSD_list).mean()))

        prim = data_total[0][0]
        pos = data_total[0][1]
        pos_pred = data_total[0][3]
        if self.use_gpu:
            pos = pos.cuda()
            pos_pred = pos_pred.cuda()
        angles = calculate_dihedral_angles(pos, self.use_gpu)
        angles_pred = calculate_dihedral_angles(pos_pred, self.use_gpu)
        write_to_pdb(get_structure_from_angles(prim, angles), "test")
        write_to_pdb(get_structure_from_angles(prim, angles_pred), "test_pred")

        data = {}
        data["pdb_data_pred"] = open("output/protein_test_pred.pdb", "r").read()
        data["pdb_data_true"] = open("output/protein_test.pdb", "r").read()
        data["phi_actual"] = list([math.degrees(float(v)) for v in angles[1:, 1]])
        data["psi_actual"] = list([math.degrees(float(v)) for v in angles[:-1, 2]])
        data["phi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[1:, 1]])
        data["psi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[:-1, 2]])
        data["rmsd_avg"] = self.historical_rmsd_avg_values
        data["drmsd_avg"] = self.historical_drmsd_avg_values

        prediction_data = None

        return (loss, data, prediction_data)
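Note: the validation loop scores each protein with calc_rmsd and calc_drmsd. For reference only, dRMSD compares all pairwise intra-structure distances and therefore needs no superposition; the following is a generic textbook-style sketch, not this project's implementation:

import torch

def drmsd_sketch(coords_a, coords_b):
    # coords_*: (n_atoms, 3). Compare the two pairwise-distance matrices.
    dist_a = torch.cdist(coords_a, coords_a)
    dist_b = torch.cdist(coords_b, coords_b)
    n = coords_a.shape[0]
    # Mean squared difference over the n*(n-1) ordered off-diagonal pairs.
    return torch.sqrt(((dist_a - dist_b) ** 2).sum() / (n * (n - 1)))

a = torch.randn(10, 3)
print(float(drmsd_sketch(a, a)))        # 0.0 for identical structures
print(float(drmsd_sketch(a, a + 1.0)))  # ~0.0: invariant to rigid translation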
Example 9
def process_file(input_file, output_file, use_gpu):
    write_out("Processing raw data file", input_file)
    # create output file
    file = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = file.create_dataset('primary', (current_buffer_size, MAX_SEQUENCE_LENGTH),
                                maxshape=(None, MAX_SEQUENCE_LENGTH), dtype='int32')
    dset2 = file.create_dataset('tertiary', (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                                maxshape=(None, MAX_SEQUENCE_LENGTH, 9), dtype='float')
    dset3 = file.create_dataset('mask', (current_buffer_size, MAX_SEQUENCE_LENGTH),
                                maxshape=(None, MAX_SEQUENCE_LENGTH),
                                dtype='uint8')
    dset4 = file.create_dataset('angle', (current_buffer_size, MAX_SEQUENCE_LENGTH, 3),
                                maxshape=(None, MAX_SEQUENCE_LENGTH, 3), dtype='float')
    dset5 = file.create_dataset('secondary', (current_buffer_size, MAX_SEQUENCE_LENGTH),
                                maxshape=(None, MAX_SEQUENCE_LENGTH), dtype='int32')
    dset6 = file.create_dataset('id', (current_buffer_size, 1),
                                maxshape=(None, 1), dtype=h5py.string_dtype())

    input_file_pointer = open(input_file, "r")

    while True:
        # while there's more proteins to process
        next_protein, missing_aa = read_protein_from_file(input_file_pointer)
        if next_protein is None: # no more proteins to process
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            write_out("Dropping protein as length too long:", sequence_length)
            continue
        if missing_aa is True:
            continue
        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset4.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 3))
            dset5.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset6.resize((current_buffer_size, 1))

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        secondary_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # masking and padding here happens so that the stored dataset is of the same size.
        # when the data is loaded in this padding is removed again.
        primary_padded[:sequence_length] = next_protein['primary']
        
        secondary_padded[:sequence_length] = next_protein['secondary']

        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T
        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']
        mask = torch.Tensor(mask_padded).type(dtype=torch.bool)
        prim = torch.masked_select(torch.Tensor(primary_padded)\
                                   .type(dtype=torch.long), mask)
        sec = torch.masked_select(torch.Tensor(secondary_padded)\
                                  .type(dtype=torch.long), mask)
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask)\
                  .view(9, -1).transpose(0, 1).unsqueeze(1)
        pos_angstrom = pos / 100

        if use_gpu:
            pos_angstrom = pos_angstrom.cuda()

        # map to angles and back to tertiary
        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(pos_angstrom,
                                                                       torch.tensor([len(prim)]),
                                                                       use_gpu=use_gpu)

        tertiary, _ = get_backbone_positions_from_angles(angles,
                                                         batch_sizes,
                                                         use_gpu=use_gpu)
        tertiary = tertiary.squeeze(1)
        angles = angles.squeeze(1)


        # create variables to store padded sequences in
        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        angle_padded = np.zeros((MAX_SEQUENCE_LENGTH, 3))
        secondary_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # store padded sequences
        length_after_mask_removed = len(prim)
        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        secondary_padded[:length_after_mask_removed] = sec.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
        mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)
        angle_padded[:length_after_mask_removed, :] = angles.data.cpu().numpy()

        # save padded sequences on disk
        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = angle_padded
        dset5[current_buffer_allocation] = secondary_padded
        dset6[current_buffer_allocation] = next_protein['id']
        current_buffer_allocation += 1
    if current_buffer_allocation == 0:
        write_out("Preprocessing was selected but no proteins in the input file "
                  "were accepted. Please check your input.")
        os._exit(1)
    write_out("Wrote output of", current_buffer_allocation, "proteins to", output_file)
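Note: the preprocessing functions in these examples all rely on the same h5py idiom: create each dataset with a single row, mark the first axis as unlimited via maxshape=None, and resize by one row per accepted protein. A minimal standalone illustration of the pattern (file name and sizes are arbitrary):

import h5py
import numpy as np

MAX_LEN = 8
with h5py.File("demo_grow.h5", "w") as f:
    dset = f.create_dataset("primary", (1, MAX_LEN),
                            maxshape=(None, MAX_LEN), dtype="int32")
    rows_used = 0
    for protein in (np.arange(5), np.arange(3)):       # two toy "sequences"
        if rows_used >= dset.shape[0]:
            dset.resize((dset.shape[0] + 1, MAX_LEN))  # grow by one row
        padded = np.zeros(MAX_LEN, dtype="int32")
        padded[:len(protein)] = protein
        dset[rows_used] = padded
        rows_used += 1
    print(dset.shape)  # (2, 8)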
Example 10
    def evaluate_model(self, data_loader):
        loss = 0
        angular_loss = 0
        data_total = []
        dRMSD_list = []
        RMSD_list = []
        for _, data in enumerate(data_loader, 0):
            primary_sequence, tertiary_positions, _mask, pssm, token = data
            start = time.time()
            predicted_angles, backbone_atoms, _batch_sizes = self(
                primary_sequence, pssm, token)
            write_out("Apply model to validation minibatch:",
                      time.time() - start)
            cpu_predicted_angles = predicted_angles.transpose(
                0, 1).cpu().detach()
            cpu_predicted_backbone_atoms = backbone_atoms.transpose(
                0, 1).cpu().detach()
            minibatch_data = list(
                zip(primary_sequence, tertiary_positions, cpu_predicted_angles,
                    cpu_predicted_backbone_atoms))
            data_total.extend(minibatch_data)
            actual_coords_list_padded, batch_sizes_coords = torch.nn.utils.rnn.pad_packed_sequence(
                torch.nn.utils.rnn.pack_sequence(tertiary_positions))
            if self.use_gpu:
                actual_coords_list_padded = actual_coords_list_padded.cuda()
            emissions_actual, _ = calculate_dihedral_angles_over_minibatch(
                actual_coords_list_padded, batch_sizes_coords, self.use_gpu)

            start = time.time()
            for primary_sequence, tertiary_positions, _predicted_pos, predicted_backbone_atoms \
                    in minibatch_data:
                actual_coords = tertiary_positions.transpose(
                    0, 1).contiguous().view(-1, 3)
                predicted_coords = predicted_backbone_atoms[:len(primary_sequence)] \
                    .transpose(0, 1).contiguous().view(-1, 3).detach()

                if self.use_gpu:
                    emissions_actual = emissions_actual.cuda()
                angular_loss += float(
                    calc_angular_difference(predicted_angles,
                                            emissions_actual))

                rmsd = calc_rmsd(predicted_coords, actual_coords)
                drmsd = calc_drmsd(predicted_coords, actual_coords)
                RMSD_list.append(rmsd)
                dRMSD_list.append(drmsd)
                error = float(drmsd)
                loss += error
                end = time.time()
            write_out("Calculate validation loss for minibatch took:",
                      end - start)
        loss /= data_loader.dataset.__len__()
        angular_loss /= data_loader.dataset.__len__()
        self.historical_rmsd_avg_values.append(
            float(torch.Tensor(RMSD_list).mean()))
        self.historical_drmsd_avg_values.append(
            float(torch.Tensor(dRMSD_list).mean()))

        prim = data_total[0][0]
        pos = data_total[0][1]
        pos_pred = data_total[0][3]
        if self.use_gpu:
            pos = pos.cuda()
            pos_pred = pos_pred.cuda()
        angles = calculate_dihedral_angles(pos, self.use_gpu)
        angles_pred = calculate_dihedral_angles(pos_pred, self.use_gpu)

        write_to_pdb(get_structure_from_angles(prim, angles), "test")
        write_to_pdb(get_structure_from_angles(prim, angles_pred), "test_pred")

        data = {}
        data["pdb_data_pred"] = open("output/protein_test_pred.pdb",
                                     "r").read()
        data["pdb_data_true"] = open("output/protein_test.pdb", "r").read()
        data["phi_actual"] = list(
            [math.degrees(float(v)) for v in angles[1:, 1]])
        data["psi_actual"] = list(
            [math.degrees(float(v)) for v in angles[:-1, 2]])
        data["phi_predicted"] = list(
            [math.degrees(float(v)) for v in angles_pred[1:, 1]])
        data["psi_predicted"] = list(
            [math.degrees(float(v)) for v in angles_pred[:-1, 2]])
        data["rmsd_avg"] = self.historical_rmsd_avg_values
        data["drmsd_avg"] = self.historical_drmsd_avg_values

        prediction_data = None

        return loss, data, prediction_data, angular_loss
Example 11
def process_file(input_file, output_file, use_gpu):
    print("Processing raw data file", input_file)

    # create output file
    f = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = f.create_dataset('primary', (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH), dtype='int32')
    dset2 = f.create_dataset('tertiary', (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                             maxshape=(None, MAX_SEQUENCE_LENGTH, 9), dtype='float')
    dset3 = f.create_dataset('mask', (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH), dtype='uint8')

    input_file_pointer = open("raw_data/casp11/" + input_file, "r")

    while True:
        # while there's more proteins to process
        print(input_file_pointer)
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            print("Dropping protein as length too long:", sequence_length)
            continue

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # masking and padding here happens so that the stored dataset is of the same size.
        # when the data is loaded in this padding is removed again.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T

        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']

        mask = torch.Tensor(mask_padded).type(dtype=torch.uint8)

        prim = torch.masked_select(torch.Tensor(primary_padded).type(dtype=torch.long), mask)
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask).view(9, -1).transpose(0, 1).unsqueeze(1) / 100

        if use_gpu:
            pos = pos.cuda()

        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(pos, [len(prim)], use_gpu=use_gpu)

        tertiary, _ = get_backbone_positions_from_angular_prediction(angles, batch_sizes, use_gpu=use_gpu)
        tertiary = tertiary.squeeze(1)

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))

        length_after_mask_removed = len(prim)

        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        current_buffer_allocation += 1

    print("Wrote output of", current_buffer_allocation, "proteins to", output_file)
Example 12
def process_file(input_file, output_file, use_gpu):
    print("Processing raw data file", input_file)

    # create output file
    f = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = f.create_dataset(
        'primary', (current_buffer_size, MAX_SEQUENCE_LENGTH),
        maxshape=(None, MAX_SEQUENCE_LENGTH),
        dtype='int32'
    )  # creates an empty dataset with given dimension, axes with none in maxshape are unlimited
    dset2 = f.create_dataset('tertiary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                             maxshape=(None, MAX_SEQUENCE_LENGTH, 9),
                             dtype='float')
    dset3 = f.create_dataset('mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')

    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there's more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            print("Dropping protein as length too long:", sequence_length)
            continue

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # masking and padding here happens so that the stored dataset is of the same size.
        # when the data is loaded in this padding is removed again.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(
            next_protein['tertiary']).T)  # flattens the array into 1-D
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T

        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']

        mask = torch.Tensor(mask_padded).type(dtype=torch.uint8)

        prim = torch.masked_select(
            torch.Tensor(primary_padded).type(dtype=torch.long), mask
        )  # keeps only the residues whose mask entry is set
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask).view(
            9, -1
        ).transpose(0, 1).unsqueeze(1) / 100
        # divide by 100: ProteinNet stores coordinates scaled up by a factor of 100;
        # unsqueeze(1) adds the minibatch dimension expected downstream.

        if use_gpu:
            pos = pos.cuda()

        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(
            pos, [len(prim)], use_gpu=use_gpu)

        tertiary, _ = get_backbone_positions_from_angular_prediction(
            angles, batch_sizes, use_gpu=use_gpu)
        tertiary = tertiary.squeeze(1)

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))

        length_after_mask_removed = len(prim)

        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu(
        ).numpy()
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        mask_padded[:length_after_mask_removed] = np.ones(
            length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        current_buffer_allocation += 1

    print("Wrote output of", current_buffer_allocation, "proteins to",
          output_file)
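Note: both versions of process_file use torch.masked_select to drop padded and missing residues before recomputing angles; because masked_select flattens its result, the code reshapes it back to nine coordinates per residue. A tiny standalone example of that reshaping (numbers invented for the demo):

import torch

# 4 residues with 9 coordinate values each, stored as (9, seq_len) like tertiary_padded.
tertiary = torch.arange(36, dtype=torch.float).view(9, 4)
mask = torch.tensor([1, 0, 1, 1], dtype=torch.bool)  # residue 1 is missing

# masked_select flattens, so reshape back to (9, kept) and put residues first.
pos = torch.masked_select(tertiary, mask).view(9, -1).transpose(0, 1)
print(pos.shape)  # torch.Size([3, 9]) -> three kept residues, nine values each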
Example 13
def process_file(input_file, output_file, use_gpu, max_sequence_length, use_mask=True, vocab='iupac'):
    print("Processing raw data file", input_file)

    # set tokenizer
    tokenizer = TAPETokenizer(vocab=vocab)

    # create output file
    file = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = file.create_dataset('primary', (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length), dtype='int32')
    dset2 = file.create_dataset('tertiary', (current_buffer_size, max_sequence_length, 9),
                                maxshape=(None, max_sequence_length, 9), dtype='float')
    dset3 = file.create_dataset('mask', (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length),
                                dtype='uint8')
    dset4 = file.create_dataset('pssm', (current_buffer_size, max_sequence_length, 21),
                                maxshape=(None, max_sequence_length, 21), dtype='float')
    dset5 = file.create_dataset('primary_token', (current_buffer_size, 2 * max_sequence_length),
                                maxshape=(None, 2 * max_sequence_length), dtype='int32')

    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there's more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > max_sequence_length:
            # print("Dropping protein as length too long:", sequence_length)
            continue
        print("Process protein with length", sequence_length)
        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, max_sequence_length))
            dset2.resize((current_buffer_size, max_sequence_length, 9))
            dset3.resize((current_buffer_size, max_sequence_length))
            dset4.resize((current_buffer_size, max_sequence_length, 21))
            dset5.resize((current_buffer_size, 2 * max_sequence_length))

        primary_padded = np.zeros(max_sequence_length)
        tertiary_padded = np.zeros((9, max_sequence_length))
        mask_padded = np.zeros(max_sequence_length)
        pssm_padded = np.zeros((21, max_sequence_length))
        primary_token_padded = np.zeros(2 * max_sequence_length)

        # masking and padding here happens so that the stored dataset is of the same size.
        # when the data is loaded in this padding is removed again.
        primary_padded[:sequence_length] = next_protein['primary']

        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T

        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']
        pssm_padded[:, :sequence_length] = np.array(next_protein['evolutionary'])

        if use_mask:
            mask = torch.Tensor(mask_padded).type(dtype=torch.bool)

            prim = torch.masked_select(torch.Tensor(primary_padded)
                                       .type(dtype=torch.long), mask)
            seq_token = torch.Tensor(tokenization(tokenizer, next_protein['seq'], next_protein['mask']))

            pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
                      .view(9, -1).transpose(0, 1).unsqueeze(1) / 100

            pssm = torch.masked_select(torch.Tensor(pssm_padded), mask).view(21, -1).transpose(0, 1)

            if use_gpu:
                pos = pos.cuda()

            angles, batch_sizes = calculate_dihedral_angles_over_minibatch(pos,
                                                                           [len(prim)],
                                                                           use_gpu=use_gpu)

            tertiary, _ = get_backbone_positions_from_angular_prediction(angles,
                                                                         batch_sizes,
                                                                         use_gpu=use_gpu)
            tertiary = tertiary.squeeze(1)

            primary_padded = np.zeros(max_sequence_length)
            tertiary_padded = np.zeros((max_sequence_length, 9))
            pssm_padded = np.zeros((max_sequence_length, 21))

            length_after_mask_removed = len(prim)

            primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
            primary_token_padded[:len(seq_token)] = seq_token.cpu().numpy()
            tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
            pssm_padded[:length_after_mask_removed, :] = pssm.data.cpu().numpy()
            mask_padded = np.zeros(max_sequence_length)
            mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = pssm_padded
        dset5[current_buffer_allocation] = primary_token_padded
        current_buffer_allocation += 1

    print("Wrote output of", current_buffer_allocation, "proteins to", output_file)
Example 14
def process_file(input_file, output_file, device, want_trimmed, want_pure):
    print("Processing raw data file", input_file)

    # create output file
    f = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = f.create_dataset('primary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='int32')
    dset2 = f.create_dataset('tertiary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                             maxshape=(None, MAX_SEQUENCE_LENGTH, 9),
                             dtype='float')
    dset3 = f.create_dataset('mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')
    dset4 = f.create_dataset('padding_mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')
    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there's more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            print("Dropping protein as length too long:", sequence_length)
            continue

        if want_pure:
            unpadded_mask = torch.Tensor(
                next_protein['mask']).type(dtype=torch.uint8)
            if unpadded_mask.sum() != unpadded_mask.shape[0]:
                print('dropping protein, mask has holes')
                continue

        elif want_trimmed:
            s = [str(i) for i in next_protein['mask']]
            s.append('0')  # so a mask ending in 1s still yields its final 1 -> 0 transition
            res = "".join(s)
            if len(res.split('10')) > 2:
                print("dropping protein, mask isn't just on the edges")
                continue

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # masking and padding here happens so that the stored dataset is of the same size.
        # when the data is loaded in this padding is removed again.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T

        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']
        mask = torch.Tensor(mask_padded).type(dtype=torch.uint8)

        prim = torch.masked_select(
            torch.Tensor(primary_padded).type(dtype=torch.long), mask)
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask).view(
            9, -1).transpose(0, 1).unsqueeze(1) / 100

        pos = pos.to(device)

        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(
            pos, [len(prim)], device)
        # this step appears to introduce the NaNs; the cause is unclear
        tertiary, _ = get_backbone_positions_from_angular_prediction(
            angles, batch_sizes, device)
        tertiary = tertiary.squeeze(1)

        if torch.isnan(tertiary).sum() > 0:
            print("NaN found in tertiary coordinates; dropping protein and printing its mask")
            print(next_protein['mask'])
            continue

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))

        length_after_mask_removed = len(prim)

        if length_after_mask_removed == 0:
            print('sequence length is zero after the mask was applied, dropping protein')
            continue

        #print('final size', length_after_mask_removed)
        #print('tertiary ', tertiary)

        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu(
        ).numpy()
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        mask_padded[:length_after_mask_removed] = np.ones(
            length_after_mask_removed)

        padding_mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        # this mask covers both the padding and the residues without angle data;
        # it is largely redundant now that proteins with missing residues are dropped above.
        padding_mask_padded[:length_after_mask_removed] = np.ones(
            length_after_mask_removed)

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset4.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = padding_mask_padded
        current_buffer_allocation += 1

    print("Wrote output of", current_buffer_allocation, "proteins to",
          output_file)
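Note: the want_trimmed branch encodes the mask as a string and counts '10' transitions: more than one 1-to-0 transition means the mask has a hole in the middle rather than only trimmed ends. A quick standalone check of that trick (masks invented for the demo):

def mask_only_on_edges(mask_values):
    # Mirrors the check above: append '0' so a mask ending in 1s still yields
    # its final 1 -> 0 transition, then count how many pieces '10' splits off.
    s = "".join(str(int(v)) for v in mask_values) + "0"
    return len(s.split("10")) <= 2

print(mask_only_on_edges([0, 0, 1, 1, 1, 1, 0]))  # True: only the edges are masked
print(mask_only_on_edges([1, 1, 0, 1, 1, 1]))     # False: hole in the middle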