def compute_loss(self, minibatch):
    (original_aa_string, actual_coords_list, _) = minibatch

    emissions, _backbone_atoms_padded, _batch_sizes = \
        self._get_network_emissions(original_aa_string)
    actual_coords_list_padded = torch.nn.utils.rnn.pad_sequence(actual_coords_list)
    if self.use_gpu:
        actual_coords_list_padded = actual_coords_list_padded.cuda()
    start = time.time()
    if isinstance(_batch_sizes[0], int):
        _batch_sizes = torch.tensor(_batch_sizes)
    emissions_actual, _ = \
        calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                 _batch_sizes,
                                                 self.use_gpu)
    # drmsd_avg = calc_avg_drmsd_over_minibatch(backbone_atoms_padded,
    #                                           actual_coords_list_padded,
    #                                           batch_sizes)
    write_out("Angle calculation time:", time.time() - start)
    if self.use_gpu:
        emissions_actual = emissions_actual.cuda()
        # drmsd_avg = drmsd_avg.cuda()
    angular_loss = calc_angular_difference(emissions, emissions_actual)

    return angular_loss  # + drmsd_avg
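# For context: a minimal sketch of the kind of angular loss that
# calc_angular_difference above is assumed to compute. Differences are
# wrapped onto [-pi, pi] before averaging so that, e.g., -179 deg and
# +179 deg count as 2 deg apart rather than 358 deg. The exact reduction
# in the repository's implementation may differ.
import math
import torch

def angular_difference_sketch(predicted_angles, actual_angles):
    diff = predicted_angles - actual_angles
    # wrap each difference into [-pi, pi] before averaging
    wrapped = torch.remainder(diff + math.pi, 2 * math.pi) - math.pi
    return wrapped.abs().mean()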
def compute_loss(self, minibatch):
    (original_aa_string, actual_coords_list, _) = minibatch
    if any(np.isnan(x.cpu().detach().numpy()).any() for x in original_aa_string) or \
            any(np.isnan(x.cpu().detach().numpy()).any() for x in actual_coords_list):
        return None

    emissions, _backbone_atoms_padded, _batch_sizes = \
        self._get_network_emissions(original_aa_string)
    assert not np.isnan(emissions.cpu().detach().numpy()).any()
    actual_coords_list_padded, batch_sizes_coords = torch.nn.utils.rnn \
        .pad_packed_sequence(
            torch.nn.utils.rnn.pack_sequence(actual_coords_list))
    assert not np.isnan(
        actual_coords_list_padded.cpu().detach().numpy()).any()
    if self.use_gpu:
        actual_coords_list_padded = actual_coords_list_padded.cuda()
    start = time.time()
    emissions_actual, _ = \
        calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                 batch_sizes_coords,
                                                 self.use_gpu)
    # drmsd_avg = calc_avg_drmsd_over_minibatch(backbone_atoms_padded,
    #                                           actual_coords_list_padded,
    #                                           batch_sizes)
    write_out("Angle calculation time:", time.time() - start)
    if self.use_gpu:
        emissions_actual = emissions_actual.cuda()
        # drmsd_avg = drmsd_avg.cuda()
    angular_loss = calc_angular_difference(emissions, emissions_actual)

    return angular_loss  # + drmsd_avg
def compute_loss(self, minibatch, processed_minibatches, minimum_updates):
    (original_aa_string, actual_coords_list, _) = minibatch

    emissions, _backbone_atoms_padded, _batch_sizes = \
        self._get_network_emissions(original_aa_string)
    actual_coords_list_padded = torch.nn.utils.rnn.pad_sequence(actual_coords_list)
    if self.use_gpu:
        actual_coords_list_padded = actual_coords_list_padded.cuda()
    start = time.time()
    if isinstance(_batch_sizes[0], int):
        _batch_sizes = torch.tensor(_batch_sizes)
    emissions_actual, _ = \
        calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                 _batch_sizes,
                                                 self.use_gpu)
    drmsd_avg = calc_avg_drmsd_over_minibatch(_backbone_atoms_padded,
                                              actual_coords_list_padded,
                                              _batch_sizes)
    write_out("Angle calculation time:", time.time() - start)
    if self.use_gpu:
        emissions_actual = emissions_actual.cuda()
        drmsd_avg = drmsd_avg.cuda()
    angular_loss = calc_angular_difference(emissions, emissions_actual)

    # Blend the two losses: the dRMSD weight ramps linearly from 0 up to 0.4
    # over the first 40% of minimum_updates, then stays at 0.4.
    multiplier = 0.4
    if processed_minibatches < minimum_updates * 0.4:
        multiplier = processed_minibatches / minimum_updates
    normalized_angular_loss = angular_loss / 5
    normalized_drmsd_avg = drmsd_avg / 100
    return (normalized_drmsd_avg * multiplier) \
        + (normalized_angular_loss * (1 - multiplier))
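# Worked example of the loss-blending schedule above (numbers are assumed):
# with minimum_updates = 1000, minibatch 200 gives
# multiplier = 200 / 1000 = 0.2, so the returned loss is
# 0.2 * (drmsd_avg / 100) + 0.8 * (angular_loss / 5). From minibatch 400
# (40% of minimum_updates) onward the multiplier is held at 0.4, fixing the
# dRMSD/angular weighting at 0.4 / 0.6.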
def forward(self, seq, batch_sizes, tert):
    # Encode the primary sequences.
    packed_input_sequences = embed(seq, batch_sizes, self.device)
    packed_output, hidden = self.encoder_seq(packed_input_sequences)
    # Unpack so the per-timestep outputs can be concatenated with the
    # tertiary encoder outputs and fed to the meta LSTM below.
    out_padded_seq, lengths = torch.nn.utils.rnn.pad_packed_sequence(packed_output)

    # Now the tertiary structure: convert coordinates to dihedral angles.
    # batch_sizes is None because tert is an unpadded list of structures.
    tert_angles = calculate_dihedral_angles_over_minibatch(tert, None,
                                                           self.device,
                                                           is_padded=False)
    # convert the angles into a packed sequence
    packed_tert_angles = torch.nn.utils.rnn.pack_sequence(tert_angles).to(self.device)
    # the real dihedral angles are also returned, for use in the loss function
    padded_real_angles, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_tert_angles)

    # Encode the tertiary structure.
    packed_output, hidden = self.encoder_tert(packed_tert_angles)
    out_padded_tert, lengths = torch.nn.utils.rnn.pad_packed_sequence(packed_output)

    # Meta encoder LSTM: concatenate the per-timestep hidden states of the
    # sequence and tertiary encoders, run the result through a third LSTM,
    # and average its outputs over time. (An earlier variant instead averaged
    # each encoder's hidden states separately and concatenated the two means.)
    res = torch.cat((out_padded_seq, out_padded_tert), dim=2)
    res = torch.nn.utils.rnn.pack_padded_sequence(res, lengths)
    res, hidden = self.encoder_meta(res)
    res, lengths = torch.nn.utils.rnn.pad_packed_sequence(res)
    res = torch.sum(res, dim=0) / lengths.view(-1, 1).expand(
        -1, self.META_ENCODING_LSTM_OUTPUT * 2).type(torch.float).to(self.device)

    # batchnorm deliberately skipped for now:
    # res = self.batchnorm(res)
    res = self.dense2_enc(F.elu(self.dense1_enc(res)))
    return res, padded_real_angles
def compute_loss(self, minibatch):
    (original_aa_string, actual_coords_list, _, pssms, token) = minibatch

    emissions, _backbone_atoms_padded, _batch_sizes = self._get_network_emissions(
        original_aa_string, pssms, token)
    actual_coords_list_padded, batch_sizes_coords = torch.nn.utils.rnn.pad_packed_sequence(
        torch.nn.utils.rnn.pack_sequence(actual_coords_list))
    if self.use_gpu:
        actual_coords_list_padded = actual_coords_list_padded.cuda()
    start = time.time()
    emissions_actual, _ = calculate_dihedral_angles_over_minibatch(
        actual_coords_list_padded, batch_sizes_coords, self.use_gpu)
    drmsd_avg = calc_avg_drmsd_over_minibatch(_backbone_atoms_padded,
                                              actual_coords_list_padded,
                                              _batch_sizes)
    write_out("Angle calculation time:", time.time() - start)
    if self.use_gpu:
        emissions_actual = emissions_actual.cuda()
        drmsd_avg = drmsd_avg.cuda()
    angular_loss = calc_angular_difference(emissions, emissions_actual)

    return angular_loss, drmsd_avg
def _get_network_emissions(self, original_aa_string):
    initial_aa_pos = initial_pos_from_aa_string(original_aa_string)
    packed_input_sequences = self.embed(original_aa_string)
    backbone_atoms_padded, batch_sizes_backbone \
        = structures_to_backbone_atoms_padded(initial_aa_pos)
    if self.use_gpu:
        backbone_atoms_padded = backbone_atoms_padded.cuda()
    embedding_padded, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
        torch.nn.utils.rnn.PackedSequence(packed_input_sequences))
    for _ in range(self.recurrent_steps):
        combined_features = torch.cat((embedding_padded, backbone_atoms_padded),
                                      dim=2)
        for idx, aa_features in enumerate(combined_features.transpose(0, 1)):
            msg = pass_messages(aa_features,
                                self.apply_message_function,
                                self.use_gpu)  # aa_count x output size
            backbone_atoms_padded[:, idx] = self.linear_transform(
                torch.cat((aa_features, msg), dim=1))
    output, batch_sizes = calculate_dihedral_angles_over_minibatch(
        backbone_atoms_padded, batch_sizes_backbone, self.use_gpu)
    return output, backbone_atoms_padded, batch_sizes
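# A minimal sketch of the message-passing step used above, assuming
# pass_messages computes a message from every ordered pair of residue
# feature vectors and sum-aggregates the incoming messages per residue.
# The pairing scheme and aggregation are assumptions for illustration;
# the repository's pass_messages may differ in detail.
import torch

def pass_messages_sketch(aa_features, message_fn, use_gpu=False):
    # aa_features: (aa_count, feature_size)
    aa_count = aa_features.size(0)
    # build all ordered pairs (i, j) as concatenated feature vectors
    left = aa_features.unsqueeze(1).expand(aa_count, aa_count, -1)
    right = aa_features.unsqueeze(0).expand(aa_count, aa_count, -1)
    pairs = torch.cat((left, right), dim=2)   # (aa_count, aa_count, 2 * feature_size)
    messages = message_fn(pairs)              # (aa_count, aa_count, output_size)
    # zero out self-messages, then sum incoming messages per residue
    eye = torch.eye(aa_count, device=aa_features.device).bool()
    messages = messages.masked_fill(eye.unsqueeze(2), 0)
    return messages.sum(dim=1)                # (aa_count, output_size)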
def prediction():
    list_of_files = glob.glob('output/models/*')
    default_model_path = max(list_of_files, key=os.path.getctime)

    parser = argparse.ArgumentParser(description="OpenProtein - Prediction CLI")
    parser.add_argument('--input_sequence', dest='input_sequence')
    parser.add_argument('--model_path', dest='model_path', default=default_model_path)
    # note: argparse's type=bool treats any non-empty string as True,
    # so only an empty string (--use_gpu '') evaluates to False here
    parser.add_argument('--use_gpu', dest='use_gpu', default=False, type=bool)
    args, _ = parser.parse_known_args()

    print("Using model:", args.model_path)
    model = torch.load(args.model_path)

    input_sequences = [args.input_sequence]
    input_sequences_encoded = list(
        torch.IntTensor(encode_primary_string(aa)) for aa in input_sequences)

    predicted_dihedral_angles, predicted_backbone_atoms, batch_sizes = \
        model(input_sequences_encoded)
    if predicted_dihedral_angles == []:
        predicted_dihedral_angles, _ = calculate_dihedral_angles_over_minibatch(
            predicted_backbone_atoms, batch_sizes, args.use_gpu)
    write_to_pdb(
        get_structure_from_angles(input_sequences_encoded[0],
                                  predicted_dihedral_angles[:, 0]),
        "prediction")

    print("Wrote prediction to output/protein_prediction.pdb")
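# Example invocation; the script name, sequence, and model file name are
# placeholders, not values from the repository:
#   python prediction.py \
#       --input_sequence MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ \
#       --model_path output/models/some_trained_model.model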
def evaluate_model(self, data_loader):
    loss = 0
    data_total = []
    dRMSD_list = []
    RMSD_list = []
    for _, data in enumerate(data_loader, 0):
        primary_sequence, tertiary_positions, _mask = data
        start = time.time()
        predicted_angles, backbone_atoms, batch_sizes = self(primary_sequence)
        write_out("Apply model to validation minibatch:", time.time() - start)

        if predicted_angles == []:
            # model didn't provide angles, so we'll compute them here
            output_angles, _ = calculate_dihedral_angles_over_minibatch(
                backbone_atoms, batch_sizes, self.use_gpu)
        else:
            output_angles = predicted_angles

        cpu_predicted_angles = output_angles.transpose(0, 1).cpu().detach()

        if backbone_atoms == []:
            # model didn't provide backbone atoms, we need to compute those
            output_positions, _ = \
                get_backbone_positions_from_angles(predicted_angles,
                                                   batch_sizes,
                                                   self.use_gpu)
        else:
            output_positions = backbone_atoms

        cpu_predicted_backbone_atoms = output_positions.transpose(0, 1).cpu().detach()

        minibatch_data = list(zip(primary_sequence,
                                  tertiary_positions,
                                  cpu_predicted_angles,
                                  cpu_predicted_backbone_atoms))
        data_total.extend(minibatch_data)

        start = time.time()
        for primary_sequence, tertiary_positions, _predicted_pos, predicted_backbone_atoms \
                in minibatch_data:
            actual_coords = tertiary_positions.transpose(0, 1).contiguous().view(-1, 3)
            predicted_coords = predicted_backbone_atoms[:len(primary_sequence)] \
                .transpose(0, 1).contiguous().view(-1, 3).detach()
            rmsd = calc_rmsd(predicted_coords, actual_coords)
            drmsd = calc_drmsd(predicted_coords, actual_coords)
            RMSD_list.append(rmsd)
            dRMSD_list.append(drmsd)
            error = rmsd
            loss += error
        end = time.time()
        write_out("Calculate validation loss for minibatch took:", end - start)

    loss /= data_loader.dataset.__len__()
    self.historical_rmsd_avg_values.append(float(torch.Tensor(RMSD_list).mean()))
    self.historical_drmsd_avg_values.append(float(torch.Tensor(dRMSD_list).mean()))

    prim = data_total[0][0]
    pos = data_total[0][1]
    pos_pred = data_total[0][3]
    if self.use_gpu:
        pos = pos.cuda()
        pos_pred = pos_pred.cuda()
    angles = calculate_dihedral_angles(pos, self.use_gpu)
    angles_pred = calculate_dihedral_angles(pos_pred, self.use_gpu)
    write_to_pdb(get_structure_from_angles(prim, angles), "test")
    write_to_pdb(get_structure_from_angles(prim, angles_pred), "test_pred")

    data = {}
    data["pdb_data_pred"] = open("output/protein_test_pred.pdb", "r").read()
    data["pdb_data_true"] = open("output/protein_test.pdb", "r").read()
    data["phi_actual"] = list([math.degrees(float(v)) for v in angles[1:, 1]])
    data["psi_actual"] = list([math.degrees(float(v)) for v in angles[:-1, 2]])
    data["phi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[1:, 1]])
    data["psi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[:-1, 2]])
    data["rmsd_avg"] = self.historical_rmsd_avg_values
    data["drmsd_avg"] = self.historical_drmsd_avg_values

    prediction_data = None
    return (loss, data, prediction_data)
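# A minimal sketch of the distance-based RMSD (dRMSD) that calc_drmsd above
# is assumed to compute: the RMS difference between the two structures'
# pairwise distance matrices, which makes the metric invariant to rotation
# and translation. The normalization is an assumption for illustration.
import torch

def drmsd_sketch(coords_a, coords_b):
    # coords_a, coords_b: (N, 3) tensors of backbone atom positions
    dist_a = torch.cdist(coords_a, coords_a)  # (N, N) pairwise distances
    dist_b = torch.cdist(coords_b, coords_b)
    n = coords_a.shape[0]
    # RMS over the N * (N - 1) off-diagonal entries
    return torch.sqrt(torch.sum((dist_a - dist_b) ** 2) / (n * (n - 1)))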
def process_file(input_file, output_file, use_gpu):
    write_out("Processing raw data file", input_file)

    # create output file
    file = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = file.create_dataset('primary',
                                (current_buffer_size, MAX_SEQUENCE_LENGTH),
                                maxshape=(None, MAX_SEQUENCE_LENGTH),
                                dtype='int32')
    dset2 = file.create_dataset('tertiary',
                                (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                                maxshape=(None, MAX_SEQUENCE_LENGTH, 9),
                                dtype='float')
    dset3 = file.create_dataset('mask',
                                (current_buffer_size, MAX_SEQUENCE_LENGTH),
                                maxshape=(None, MAX_SEQUENCE_LENGTH),
                                dtype='uint8')
    dset4 = file.create_dataset('angle',
                                (current_buffer_size, MAX_SEQUENCE_LENGTH, 3),
                                maxshape=(None, MAX_SEQUENCE_LENGTH, 3),
                                dtype='float')
    dset5 = file.create_dataset('secondary',
                                (current_buffer_size, MAX_SEQUENCE_LENGTH),
                                maxshape=(None, MAX_SEQUENCE_LENGTH),
                                dtype='int32')
    dset6 = file.create_dataset('id',
                                (current_buffer_size, 1),
                                maxshape=(None, 1),
                                dtype=h5py.string_dtype())

    input_file_pointer = open(input_file, "r")

    while True:
        # while there's more proteins to process
        next_protein, missing_aa = read_protein_from_file(input_file_pointer)
        if next_protein is None:  # no more proteins to process
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            write_out("Dropping protein as length too long:", sequence_length)
            continue
        if missing_aa is True:
            continue

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset4.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 3))
            dset5.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset6.resize((current_buffer_size, 1))

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        secondary_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # masking and padding here happens so that the stored dataset is of the same size.
        # when the data is loaded in, this padding is removed again.
        primary_padded[:sequence_length] = next_protein['primary']
        secondary_padded[:sequence_length] = next_protein['secondary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T
        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']

        mask = torch.Tensor(mask_padded).type(dtype=torch.bool)
        prim = torch.masked_select(torch.Tensor(primary_padded)
                                   .type(dtype=torch.long), mask)
        sec = torch.masked_select(torch.Tensor(secondary_padded)
                                  .type(dtype=torch.long), mask)
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
            .view(9, -1).transpose(0, 1).unsqueeze(1)
        pos_angstrom = pos / 100

        if use_gpu:
            pos_angstrom = pos_angstrom.cuda()

        # map to angles and back to tertiary
        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(
            pos_angstrom, torch.tensor([len(prim)]), use_gpu=use_gpu)
        tertiary, _ = get_backbone_positions_from_angles(
            angles, batch_sizes, use_gpu=use_gpu)
        tertiary = tertiary.squeeze(1)
        angles = angles.squeeze(1)

        # create variables to store padded sequences in
        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        angle_padded = np.zeros((MAX_SEQUENCE_LENGTH, 3))
        secondary_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # store padded sequences
        length_after_mask_removed = len(prim)
        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        secondary_padded[:length_after_mask_removed] = sec.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
        mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)
        angle_padded[:length_after_mask_removed, :] = angles.data.cpu().numpy()

        # save padded sequences on disk
        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = angle_padded
        dset5[current_buffer_allocation] = secondary_padded
        dset6[current_buffer_allocation] = next_protein['id']
        current_buffer_allocation += 1

    if current_buffer_allocation == 0:
        write_out("Preprocessing was selected but no proteins in the input file "
                  "were accepted. Please check your input.")
        os._exit(1)
    write_out("Wrote output of", current_buffer_allocation, "proteins to", output_file)
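# A hypothetical call to the preprocessing routine above; the file names are
# placeholders, and MAX_SEQUENCE_LENGTH is assumed to be defined at module
# level:
#   process_file("data/raw/training_30",
#                "data/preprocessed/training_30.hdf5",
#                use_gpu=False)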
def evaluate_model(self, data_loader):
    loss = 0
    angular_loss = 0
    data_total = []
    dRMSD_list = []
    RMSD_list = []
    for _, data in enumerate(data_loader, 0):
        primary_sequence, tertiary_positions, _mask, pssm, token = data
        start = time.time()
        predicted_angles, backbone_atoms, _batch_sizes = self(
            primary_sequence, pssm, token)
        write_out("Apply model to validation minibatch:", time.time() - start)

        cpu_predicted_angles = predicted_angles.transpose(0, 1).cpu().detach()
        cpu_predicted_backbone_atoms = backbone_atoms.transpose(0, 1).cpu().detach()
        minibatch_data = list(zip(primary_sequence,
                                  tertiary_positions,
                                  cpu_predicted_angles,
                                  cpu_predicted_backbone_atoms))
        data_total.extend(minibatch_data)

        actual_coords_list_padded, batch_sizes_coords = \
            torch.nn.utils.rnn.pad_packed_sequence(
                torch.nn.utils.rnn.pack_sequence(tertiary_positions))
        if self.use_gpu:
            actual_coords_list_padded = actual_coords_list_padded.cuda()
        emissions_actual, _ = calculate_dihedral_angles_over_minibatch(
            actual_coords_list_padded, batch_sizes_coords, self.use_gpu)

        start = time.time()
        for primary_sequence, tertiary_positions, _predicted_pos, predicted_backbone_atoms \
                in minibatch_data:
            actual_coords = tertiary_positions.transpose(0, 1).contiguous().view(-1, 3)
            predicted_coords = predicted_backbone_atoms[:len(primary_sequence)] \
                .transpose(0, 1).contiguous().view(-1, 3).detach()
            if self.use_gpu:
                emissions_actual = emissions_actual.cuda()
            angular_loss += float(
                calc_angular_difference(predicted_angles, emissions_actual))
            rmsd = calc_rmsd(predicted_coords, actual_coords)
            drmsd = calc_drmsd(predicted_coords, actual_coords)
            RMSD_list.append(rmsd)
            dRMSD_list.append(drmsd)
            error = float(drmsd)
            loss += error
        end = time.time()
        write_out("Calculate validation loss for minibatch took:", end - start)

    loss /= data_loader.dataset.__len__()
    angular_loss /= data_loader.dataset.__len__()
    self.historical_rmsd_avg_values.append(float(torch.Tensor(RMSD_list).mean()))
    self.historical_drmsd_avg_values.append(float(torch.Tensor(dRMSD_list).mean()))

    prim = data_total[0][0]
    pos = data_total[0][1]
    pos_pred = data_total[0][3]
    if self.use_gpu:
        pos = pos.cuda()
        pos_pred = pos_pred.cuda()
    angles = calculate_dihedral_angles(pos, self.use_gpu)
    angles_pred = calculate_dihedral_angles(pos_pred, self.use_gpu)
    write_to_pdb(get_structure_from_angles(prim, angles), "test")
    write_to_pdb(get_structure_from_angles(prim, angles_pred), "test_pred")

    data = {}
    data["pdb_data_pred"] = open("output/protein_test_pred.pdb", "r").read()
    data["pdb_data_true"] = open("output/protein_test.pdb", "r").read()
    data["phi_actual"] = list([math.degrees(float(v)) for v in angles[1:, 1]])
    data["psi_actual"] = list([math.degrees(float(v)) for v in angles[:-1, 2]])
    data["phi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[1:, 1]])
    data["psi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[:-1, 2]])
    data["rmsd_avg"] = self.historical_rmsd_avg_values
    data["drmsd_avg"] = self.historical_drmsd_avg_values

    prediction_data = None
    return loss, data, prediction_data, angular_loss
def process_file(input_file, output_file, use_gpu):
    print("Processing raw data file", input_file)

    # create output file
    f = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = f.create_dataset('primary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='int32')
    dset2 = f.create_dataset('tertiary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                             maxshape=(None, MAX_SEQUENCE_LENGTH, 9),
                             dtype='float')
    dset3 = f.create_dataset('mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')

    input_file_pointer = open("raw_data/casp11/" + input_file, "r")

    while True:
        # while there's more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            print("Dropping protein as length too long:", sequence_length)
            continue

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # masking and padding here happens so that the stored dataset is of the same size.
        # when the data is loaded in, this padding is removed again.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T
        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']

        mask = torch.Tensor(mask_padded).type(dtype=torch.uint8)
        prim = torch.masked_select(
            torch.Tensor(primary_padded).type(dtype=torch.long), mask)
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
            .view(9, -1).transpose(0, 1).unsqueeze(1) / 100
        if use_gpu:
            pos = pos.cuda()

        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(
            pos, [len(prim)], use_gpu=use_gpu)
        tertiary, _ = get_backbone_positions_from_angular_prediction(
            angles, batch_sizes, use_gpu=use_gpu)
        tertiary = tertiary.squeeze(1)

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))
        length_after_mask_removed = len(prim)
        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        current_buffer_allocation += 1

    print("Wrote output of", current_buffer_allocation, "proteins to", output_file)
def process_file(input_file, output_file, use_gpu):
    print("Processing raw data file", input_file)

    # create output file
    f = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    # create_dataset makes an empty dataset of the given shape;
    # axes with None in maxshape are unlimited and can be resized later
    dset1 = f.create_dataset('primary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='int32')
    dset2 = f.create_dataset('tertiary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                             maxshape=(None, MAX_SEQUENCE_LENGTH, 9),
                             dtype='float')
    dset3 = f.create_dataset('mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')

    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there's more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            print("Dropping protein as length too long:", sequence_length)
            continue

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # masking and padding here happens so that the stored dataset is of the same size.
        # when the data is loaded in, this padding is removed again.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)  # flatten to 1-D
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T
        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']

        mask = torch.Tensor(mask_padded).type(dtype=torch.uint8)
        # keep only the residues whose mask entry is set
        prim = torch.masked_select(
            torch.Tensor(primary_padded).type(dtype=torch.long), mask)
        # ProteinNet stores coordinates scaled by 100 (picometers), so dividing
        # by 100 converts them to angstroms; unsqueeze(1) adds a minibatch
        # dimension of size one, as expected by the downstream angle code
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
            .view(9, -1).transpose(0, 1).unsqueeze(1) / 100
        if use_gpu:
            pos = pos.cuda()

        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(
            pos, [len(prim)], use_gpu=use_gpu)
        tertiary, _ = get_backbone_positions_from_angular_prediction(
            angles, batch_sizes, use_gpu=use_gpu)
        tertiary = tertiary.squeeze(1)

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))
        length_after_mask_removed = len(prim)
        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        current_buffer_allocation += 1

    print("Wrote output of", current_buffer_allocation, "proteins to", output_file)
def process_file(input_file, output_file, use_gpu, max_sequence_length,
                 use_mask=True, vocab='iupac'):
    print("Processing raw data file", input_file)

    # set tokenizer
    tokenizer = TAPETokenizer(vocab=vocab)

    # create output file
    file = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = file.create_dataset('primary',
                                (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length),
                                dtype='int32')
    dset2 = file.create_dataset('tertiary',
                                (current_buffer_size, max_sequence_length, 9),
                                maxshape=(None, max_sequence_length, 9),
                                dtype='float')
    dset3 = file.create_dataset('mask',
                                (current_buffer_size, max_sequence_length),
                                maxshape=(None, max_sequence_length),
                                dtype='uint8')
    dset4 = file.create_dataset('pssm',
                                (current_buffer_size, max_sequence_length, 21),
                                maxshape=(None, max_sequence_length, 21),
                                dtype='float')
    dset5 = file.create_dataset('primary_token',
                                (current_buffer_size, 2 * max_sequence_length),
                                maxshape=(None, 2 * max_sequence_length),
                                dtype='int32')

    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there's more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > max_sequence_length:
            # print("Dropping protein as length too long:", sequence_length)
            continue
        print("Process protein with length", sequence_length)

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, max_sequence_length))
            dset2.resize((current_buffer_size, max_sequence_length, 9))
            dset3.resize((current_buffer_size, max_sequence_length))
            dset4.resize((current_buffer_size, max_sequence_length, 21))
            dset5.resize((current_buffer_size, 2 * max_sequence_length))

        primary_padded = np.zeros(max_sequence_length)
        tertiary_padded = np.zeros((9, max_sequence_length))
        mask_padded = np.zeros(max_sequence_length)
        pssm_padded = np.zeros((21, max_sequence_length))
        primary_token_padded = np.zeros(2 * max_sequence_length)

        # masking and padding here happens so that the stored dataset is of the same size.
        # when the data is loaded in, this padding is removed again.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T
        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']
        pssm_padded[:, :sequence_length] = np.array(next_protein['evolutionary'])

        if use_mask:
            mask = torch.Tensor(mask_padded).type(dtype=torch.bool)
            prim = torch.masked_select(torch.Tensor(primary_padded)
                                       .type(dtype=torch.long), mask)
            seq_token = torch.Tensor(tokenization(tokenizer,
                                                  next_protein['seq'],
                                                  next_protein['mask']))
            pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
                .view(9, -1).transpose(0, 1).unsqueeze(1) / 100
            pssm = torch.masked_select(torch.Tensor(pssm_padded), mask) \
                .view(21, -1).transpose(0, 1)

        if use_gpu:
            pos = pos.cuda()

        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(
            pos, [len(prim)], use_gpu=use_gpu)
        tertiary, _ = get_backbone_positions_from_angular_prediction(
            angles, batch_sizes, use_gpu=use_gpu)
        tertiary = tertiary.squeeze(1)

        primary_padded = np.zeros(max_sequence_length)
        tertiary_padded = np.zeros((max_sequence_length, 9))
        pssm_padded = np.zeros((max_sequence_length, 21))

        length_after_mask_removed = len(prim)
        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        primary_token_padded[:len(seq_token)] = seq_token.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
        pssm_padded[:length_after_mask_removed, :] = pssm.data.cpu().numpy()
        mask_padded = np.zeros(max_sequence_length)
        mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = pssm_padded
        dset5[current_buffer_allocation] = primary_token_padded
        current_buffer_allocation += 1

    print("Wrote output of", current_buffer_allocation, "proteins to", output_file)
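# A plausible sketch of the tokenization helper used above: encode only the
# unmasked residues with the TAPE tokenizer. TAPETokenizer and its encode
# method are part of the real tape API; restricting to unmasked residues is
# an assumption about this repository's helper, made so the token sequence
# lines up with the masked primary sequence.
from tape import TAPETokenizer

def tokenization_sketch(tokenizer, seq, mask):
    # keep only residues whose mask entry is 1, then encode to token ids
    masked_seq = "".join(aa for aa, m in zip(seq, mask) if m == 1)
    return tokenizer.encode(masked_seq)  # numpy array incl. start/stop tokens

# usage: tokenization_sketch(TAPETokenizer(vocab='iupac'), "MKTAYIAKQR", [1] * 10)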
def process_file(input_file, output_file, device, want_trimmed, want_pure):
    print("Processing raw data file", input_file)

    # create output file
    f = h5py.File(output_file, 'w')
    current_buffer_size = 1
    current_buffer_allocation = 0
    dset1 = f.create_dataset('primary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='int32')
    dset2 = f.create_dataset('tertiary',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH, 9),
                             maxshape=(None, MAX_SEQUENCE_LENGTH, 9),
                             dtype='float')
    dset3 = f.create_dataset('mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')
    dset4 = f.create_dataset('padding_mask',
                             (current_buffer_size, MAX_SEQUENCE_LENGTH),
                             maxshape=(None, MAX_SEQUENCE_LENGTH),
                             dtype='uint8')

    input_file_pointer = open("data/raw/" + input_file, "r")

    while True:
        # while there's more proteins to process
        next_protein = read_protein_from_file(input_file_pointer)
        if next_protein is None:
            break

        sequence_length = len(next_protein['primary'])

        if sequence_length > MAX_SEQUENCE_LENGTH:
            print("Dropping protein as length too long:", sequence_length)
            continue

        if want_pure:
            unpadded_mask = torch.Tensor(next_protein['mask']).type(dtype=torch.uint8)
            if unpadded_mask.sum() != unpadded_mask.shape[0]:
                print('dropping protein, mask has holes')
                continue
        elif want_trimmed:
            # Join the mask into a string and append a trailing '0' so that a
            # sequence ending in an unmasked stretch still produces a final
            # 1 -> 0 transition. More than one '10' transition means the
            # masked-out residues are not confined to the edges.
            s = [str(i) for i in next_protein['mask']]
            s.append('0')
            res = "".join(s)
            if len(res.split('10')) > 2:
                print("dropping protein, mask isn't just on edges")
                continue

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((9, MAX_SEQUENCE_LENGTH))
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)

        # masking and padding here happens so that the stored dataset is of the same size.
        # when the data is loaded in, this padding is removed again.
        primary_padded[:sequence_length] = next_protein['primary']
        t_transposed = np.ravel(np.array(next_protein['tertiary']).T)
        t_reshaped = np.reshape(t_transposed, (sequence_length, 9)).T
        tertiary_padded[:, :sequence_length] = t_reshaped
        mask_padded[:sequence_length] = next_protein['mask']

        mask = torch.Tensor(mask_padded).type(dtype=torch.uint8)
        prim = torch.masked_select(
            torch.Tensor(primary_padded).type(dtype=torch.long), mask)
        pos = torch.masked_select(torch.Tensor(tertiary_padded), mask) \
            .view(9, -1).transpose(0, 1).unsqueeze(1) / 100
        pos = pos.to(device)

        # the angle -> coordinate round trip below can introduce NaNs;
        # affected proteins are detected and dropped right after
        angles, batch_sizes = calculate_dihedral_angles_over_minibatch(
            pos, [len(prim)], device)
        tertiary, _ = get_backbone_positions_from_angular_prediction(
            angles, batch_sizes, device)
        tertiary = tertiary.squeeze(1)
        if torch.isnan(tertiary).sum() > 0:
            print('there is a NaN in tertiary! Dropping and printing mask')
            print(next_protein['mask'])
            continue

        primary_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        tertiary_padded = np.zeros((MAX_SEQUENCE_LENGTH, 9))
        length_after_mask_removed = len(prim)
        if length_after_mask_removed == 0:
            print('sequence length is zero after mask was applied. Dropping!')
            continue

        primary_padded[:length_after_mask_removed] = prim.data.cpu().numpy()
        tertiary_padded[:length_after_mask_removed, :] = tertiary.data.cpu().numpy()
        mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        mask_padded[:length_after_mask_removed] = np.ones(length_after_mask_removed)
        # this mask covers both the padding and the residues without angle data
        padding_mask_padded = np.zeros(MAX_SEQUENCE_LENGTH)
        # NOTE: since proteins with missing residues are filtered out above,
        # this padding mask ends up identical to the regular mask.
        padding_mask_padded[:length_after_mask_removed] = np.ones(
            length_after_mask_removed)

        if current_buffer_allocation >= current_buffer_size:
            current_buffer_size = current_buffer_size + 1
            dset1.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset2.resize((current_buffer_size, MAX_SEQUENCE_LENGTH, 9))
            dset3.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))
            dset4.resize((current_buffer_size, MAX_SEQUENCE_LENGTH))

        dset1[current_buffer_allocation] = primary_padded
        dset2[current_buffer_allocation] = tertiary_padded
        dset3[current_buffer_allocation] = mask_padded
        dset4[current_buffer_allocation] = padding_mask_padded
        current_buffer_allocation += 1

    print("Wrote output of", current_buffer_allocation, "proteins to", output_file)