def read_protein_from_file(file_pointer):
    """Read the next protein record from an open, bracket-tagged text file.

    A record is a sequence of section headers (``[ID]``, ``[PRIMARY]``,
    ``[EVOLUTIONARY]``, ``[SECONDARY]``, ``[TERTIARY]``, ``[MASK]``), each
    followed by its payload line(s). A blank line terminates the record.
    Lines that are not a recognised header are skipped.

    Args:
        file_pointer: Text-mode file-like object positioned at the start of
            a record; only ``readline()`` is used.

    Returns:
        A dict with whichever of the keys ``id``, ``primary``,
        ``evolutionary``, ``secondary``, ``tertiary`` and ``mask`` were
        present in the record, or ``None`` when the end of the file is
        reached and no section has been read.

    Raises:
        KeyError: If a secondary-structure or mask character is not in the
            lookup tables below.
        ValueError: If a numeric payload cannot be parsed as floats.
    """
    dssp_codes = {'L': 0, 'H': 1, 'B': 2, 'E': 3, 'G': 4, 'I': 5, 'T': 6, 'S': 7}
    mask_codes = {'-': 0, '+': 1}
    record = {}

    def read_payload():
        # Strip only the line terminator. Slicing with [:-1] would also eat
        # the last real character of a final line that has no newline.
        return file_pointer.readline().rstrip('\n')

    def read_float_rows(count):
        # Each payload line is a whitespace-separated row of floats.
        return [
            [float(value) for value in file_pointer.readline().split()]
            for _ in range(count)
        ]

    while True:
        next_line = file_pointer.readline()
        if next_line == '[ID]\n':
            record['id'] = read_payload()
        elif next_line == '[PRIMARY]\n':
            record['primary'] = encode_primary_string(read_payload())
        elif next_line == '[EVOLUTIONARY]\n':
            record['evolutionary'] = read_float_rows(21)
        elif next_line == '[SECONDARY]\n':
            record['secondary'] = [dssp_codes[dssp] for dssp in read_payload()]
        elif next_line == '[TERTIARY]\n':
            # 3 dimension: one row of coordinates per axis.
            record['tertiary'] = read_float_rows(3)
        elif next_line == '[MASK]\n':
            record['mask'] = [mask_codes[aa] for aa in read_payload()]
        elif next_line == '\n':
            return record
        elif next_line == '':
            # End of file. Hand back a record that was not followed by a
            # blank line instead of silently dropping it; None still
            # signals "nothing left to read".
            return record or None
def read_protein_from_file(file_pointer):
    """Parse one protein record from an open text file into a dictionary.

    The file is read line by line. A header line such as ``[ID]`` announces
    a section, and the line(s) right after it hold that section's data. A
    blank line ends the record; unrecognised lines are ignored.

    Args:
        file_pointer: Text-mode file-like object; only ``readline()`` is
            called on it.

    Returns:
        dict with the keys ``id``, ``primary``, ``evolutionary``,
        ``secondary``, ``tertiary`` and ``mask`` for the sections that were
        present, or ``None`` if the file ended before any section was read.

    Raises:
        KeyError: On a secondary-structure or mask symbol that is missing
            from the lookup tables.
        ValueError: On a numeric field that is not a valid float.
    """
    dict_ = {}
    _dssp_dict = {
        'L': 0,
        'H': 1,
        'B': 2,
        'E': 3,
        'G': 4,
        'I': 5,
        'T': 6,
        'S': 7,
    }
    _mask_dict = {'-': 0, '+': 1}

    while True:
        next_line = file_pointer.readline()
        if next_line == '[ID]\n':
            # Store the identifier under the 'id' key. rstrip drops only the
            # trailing newline; [:-1] would cut a real character from a
            # last line that has no newline.
            dict_['id'] = file_pointer.readline().rstrip('\n')
        elif next_line == '[PRIMARY]\n':
            # Store the encoded primary structure under the 'primary' key.
            # NOTE(review): per the original author's comment the helper
            # maps each amino acid to its alphabetical position among all
            # amino acids - confirm against encode_primary_string.
            dict_['primary'] = encode_primary_string(
                file_pointer.readline().rstrip('\n'))
        elif next_line == '[EVOLUTIONARY]\n':
            # 21 lines of whitespace-separated floats, kept as 21 lists
            # under the 'evolutionary' key.
            # NOTE(review): the original comment says each line holds one
            # value per sequence position - confirm against the file format.
            dict_['evolutionary'] = [
                [float(step) for step in file_pointer.readline().split()]
                for _residue in range(21)
            ]
        elif next_line == '[SECONDARY]\n':
            # Store the secondary structure, one integer class per
            # character, under the 'secondary' key.
            dict_['secondary'] = [
                _dssp_dict[dssp]
                for dssp in file_pointer.readline().rstrip('\n')
            ]
        elif next_line == '[TERTIARY]\n':
            # Store the tertiary structure under the 'tertiary' key:
            # 3 dimension, one line of floats per axis.
            # NOTE(review): the original comment describes the rows as
            # per-backbone-atom coordinates rather than per-axis - verify.
            dict_['tertiary'] = [
                [float(coord) for coord in file_pointer.readline().split()]
                for _axis in range(3)
            ]
        elif next_line == '[MASK]\n':
            # '+' becomes 1 and '-' becomes 0, stored under the 'mask' key.
            dict_['mask'] = [
                _mask_dict[aa]
                for aa in file_pointer.readline().rstrip('\n')
            ]
        elif next_line == '\n':
            # A blank line closes the current record.
            return dict_
        elif next_line == '':
            # End of file: return a record that lacked its closing blank
            # line rather than losing it; None means nothing was read.
            return dict_ if dict_ else None
def read_protein_from_file(file_pointer):
    """Read a single protein record from ``file_pointer``.

    Sections are introduced by the header lines ``[ID]``, ``[PRIMARY]``,
    ``[EVOLUTIONARY]`` (21 rows of floats), ``[SECONDARY]``, ``[TERTIARY]``
    (3 rows of floats) and ``[MASK]``; a blank line ends the record. Any
    other line is skipped.

    The secondary structure is given as one DSSP letter per residue. The
    algorithm Defining Secondary Structure of Proteins (DSSP) uses
    information on e.g. the position of atoms and the hydrogen bonds of the
    molecule to determine the secondary structure (helices, sheets...).

    Args:
        file_pointer: Text-mode file-like object exposing ``readline()``.

    Returns:
        A dict holding the sections found (keys ``id``, ``primary``,
        ``evolutionary``, ``secondary``, ``tertiary``, ``mask``), or
        ``None`` once the file is exhausted with nothing parsed.

    Raises:
        KeyError: For an unknown DSSP letter or mask symbol.
        ValueError: For a malformed float in a numeric section.
    """
    dict_ = {}
    _dssp_dict = {
        'L': 0,
        'H': 1,
        'B': 2,
        'E': 3,
        'G': 4,
        'I': 5,
        'T': 6,
        'S': 7,
    }
    _mask_dict = {'-': 0, '+': 1}

    def _payload_line():
        # Remove just the newline; a final line without one stays intact,
        # which the previous [:-1] slice did not guarantee.
        return file_pointer.readline().rstrip('\n')

    def _float_row():
        return [float(token) for token in file_pointer.readline().split()]

    while True:
        next_line = file_pointer.readline()
        if next_line == '[ID]\n':
            dict_['id'] = _payload_line()
        elif next_line == '[PRIMARY]\n':
            dict_['primary'] = encode_primary_string(_payload_line())
        elif next_line == '[EVOLUTIONARY]\n':
            dict_['evolutionary'] = [_float_row() for _residue in range(21)]
        elif next_line == '[SECONDARY]\n':
            dict_['secondary'] = [_dssp_dict[dssp] for dssp in _payload_line()]
        elif next_line == '[TERTIARY]\n':
            # 3 dimension
            dict_['tertiary'] = [_float_row() for _axis in range(3)]
        elif next_line == '[MASK]\n':
            dict_['mask'] = [_mask_dict[aa] for aa in _payload_line()]
        elif next_line == '\n':
            return dict_
        elif next_line == '':
            # EOF: keep a trailing record that has no terminating blank
            # line; fall back to None only when nothing was collected.
            return dict_ or None
def main():
    """Predict a structure for one hard-coded sequence and save it as PDB.

    Loads the pinned model file, encodes the sequence, runs the model and
    passes the structure built from the first set of predicted dihedral
    angles to ``write_to_pdb`` under the name "myprediction".
    """
    # NOTE(review): torch.load unpickles the file - only load trusted models.
    model = torch.load(
        "output/models/2019-01-30_00_38_46-TRAIN-LR0_01-MB1.model")

    input_sequences = [
        "SRSLVISTINQISEDSKEFYFTLDNGKTMFPSNSQAWGGEKFENGQRAFVIFNELEQPVNGYDYNIQVRDITKVLTKEIVTMDDEE"
        "NTEEKIGDDKINATYMWISKDKKYLTIEFQYYSTHSEDKKHFLNLVINNKDNTDDEYINLEFRHNSERDSPDHLGEGYVSFKLDKI"
        "EEQIEGKKGLNIRVRTLYDGIKNYKVQFP"
    ]
    encoded = [
        torch.LongTensor(encode_primary_string(sequence))
        for sequence in input_sequences
    ]

    # Only the dihedral angles are used; the other two outputs are ignored.
    angles, _atoms, _sizes = model(encoded)

    structure = get_structure_from_angles(encoded[0], angles[:, 0])
    write_to_pdb(structure, "myprediction")
    print("Wrote prediction to output/protein_myprediction.pdb")
def predict():
    """Export the most recently created model in output/models to ONNX.

    Picks the entry of ``output/models/`` with the largest
    ``os.path.getctime``, loads it with ``torch.load`` and passes it,
    together with one encoded example sequence, to ``onnx_from_model``.
    The result is written to ``./tests/output/openprotein.onnx``.
    """
    # '*' matches every entry; narrow the pattern (e.g. '*.csv') if only
    # one file format should be considered.
    candidates = glob.glob('output/models/*')
    model_path = max(candidates, key=os.path.getctime)
    print("Generating ONNX from model:", model_path)
    # NOTE(review): torch.load unpickles the file - only load trusted models.
    model = torch.load(model_path)

    input_sequences = [
        "SRSLVISTINQISEDSKEFYFTLDNGKTMFPSNSQAWGGEKFENGQRAFVIFNELEQPVNGYDYNIQVRDITKVLTKEIVTMDDEE"
        "NTEEKIGDDKINATYMWISKDKKYLTIEFQYYSTHSEDKKHFLNLVINNKDNTDDEYINLEFRHNSERDSPDHLGEGYVSFKLDKI"
        "EEQIEGKKGLNIRVRTLYDGIKNYKVQFP"
    ]
    encoded = [
        torch.IntTensor(encode_primary_string(sequence))
        for sequence in input_sequences
    ]

    print("Exporting to ONNX...")
    output_path = "./tests/output/openprotein.onnx"
    onnx_from_model(model, encoded, output_path)
    print("Wrote ONNX to", output_path)
def _parse_bool_flag(value):
    """Convert a command-line string such as 'true' or '0' to a bool."""
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % (value,))


def prediction():
    """Command-line entry point: predict a structure and write it as PDB.

    Recognised arguments (unknown ones are ignored):
        --input_sequence  Amino-acid sequence to predict (required).
        --model_path      Model file to load; defaults to the most recently
                          created entry in ``output/models/``.
        --use_gpu         Boolean flag value, e.g. ``True`` or ``False``.

    The structure built from the predicted dihedral angles is passed to
    ``write_to_pdb`` under the name "prediction".

    Raises:
        ValueError: If no --model_path is given and ``output/models/``
            contains nothing to fall back on.
    """
    parser = argparse.ArgumentParser(
        description="OpenProtein - Prediction CLI")
    # The sequence is always dereferenced below, so fail early with a usage
    # message instead of a traceback from the encoder.
    parser.add_argument('--input_sequence', dest='input_sequence',
                        required=True)
    parser.add_argument('--model_path', dest='model_path', default=None)
    # type=bool would turn every non-empty string, including "False", into
    # True; parse the text explicitly instead.
    parser.add_argument('--use_gpu', dest='use_gpu', default=False,
                        type=_parse_bool_flag)
    args, _ = parser.parse_known_args()

    model_path = args.model_path
    if model_path is None:
        # Resolve the default lazily so that an empty models directory does
        # not break runs that pass --model_path explicitly.
        list_of_files = glob.glob('output/models/*')
        if not list_of_files:
            raise ValueError(
                "no model found in output/models/; pass --model_path")
        model_path = max(list_of_files, key=os.path.getctime)

    print("Using model:", model_path)
    # NOTE(review): torch.load unpickles the file, which can execute
    # arbitrary code - only point --model_path at trusted files.
    model = torch.load(model_path)

    input_sequences = [args.input_sequence]
    input_sequences_encoded = [
        torch.IntTensor(encode_primary_string(aa)) for aa in input_sequences
    ]

    predicted_dihedral_angles, predicted_backbone_atoms, batch_sizes = \
        model(input_sequences_encoded)

    # An empty list means the model returned no angles, so derive them from
    # the backbone atoms. The isinstance check avoids comparing a tensor
    # with [] directly.
    if isinstance(predicted_dihedral_angles, list) \
            and not predicted_dihedral_angles:
        predicted_dihedral_angles, _ = calculate_dihedral_angles_over_minibatch(
            predicted_backbone_atoms, batch_sizes, args.use_gpu)

    write_to_pdb(
        get_structure_from_angles(input_sequences_encoded[0],
                                  predicted_dihedral_angles[:, 0]),
        "prediction")
    print("Wrote prediction to output/protein_prediction.pdb")
def _trim_to_resolved_region(dict_, mask):
    """Cut the record down to the residues that have coordinates.

    Logs progress through ``write_out``. Returns True when the protein must
    be discarded (internal gap in the mask, or no coordinates at all);
    otherwise trims the per-residue entries of ``dict_`` in place and
    returns False.
    """
    mask_str = ''.join(map(str, mask))
    write_out("-------------")
    write_out("Reading the protein " + dict_.get('id', ''))

    # Check for missing AA coordinates
    if re.search(r'1+0+1+', mask_str) is not None:
        # A run of zeros between ones indicates missing internal coordinates.
        write_out("One or more internal coordinates missing. Protein is discarded.")
        return True
    if re.search(r'^0*$', mask_str) is not None:
        # Indicates no coordinates at all.
        write_out("One or more internal coordinates missing. It will be discarded.")
        return True

    if mask[0] == 0:
        write_out("Missing coordinates in the N-terminal end. Truncating protein.")
    # Investigate where the sequence with coordinates starts and finishes.
    sequence_start = re.search(r'1', mask_str).start()
    sequence_end = len(mask)  # for now, assume no C-terminal truncation needed
    c_term_gap = re.search(r'10', mask_str)
    if c_term_gap is not None:
        # Missing coordinates in the C-terminal end.
        sequence_end = c_term_gap.start() + 1
        write_out("Missing coordinates in the C-term end. Truncating protein.")
    write_out("Analyzing amino acids", sequence_start + 1, "-", sequence_end)

    # Split the lists in the dict so that only the stretch with coordinates
    # is kept for analysis.
    if 'secondary' in dict_:
        dict_['secondary'] = dict_['secondary'][sequence_start:sequence_end]
    dict_['primary'] = dict_['primary'][sequence_start:sequence_end]
    dict_['mask'] = mask[sequence_start:sequence_end]
    dict_['evolutionary'] = [
        row[sequence_start:sequence_end] for row in dict_['evolutionary']
    ]
    # Tertiary rows are sliced at three values per residue.
    dict_['tertiary'] = [
        row[sequence_start * 3:sequence_end * 3] for row in dict_['tertiary']
    ]
    return False


def _finalize_record(dict_):
    """Ensure the record has a 'secondary' entry before it is returned."""
    if 'secondary' not in dict_:
        # 8 lies outside the 0-7 codes of the DSSP table; presumably it
        # marks "unknown" - confirm with the consumers of this field.
        dict_['secondary'] = [8] * len(dict_['primary'])
    else:
        # NOTE(review): looks like leftover debug output; kept so that the
        # function's stdout behaviour is unchanged.
        print("*" * 10, dict_['secondary'])
    return dict_


def read_protein_from_file(file_pointer):
    """Read the next protein record and trim it to its resolved residues.

    Sections are introduced by the header lines ``[ID]``, ``[PRIMARY]``,
    ``[EVOLUTIONARY]`` (21 rows of floats), ``[SECONDARY]``, ``[TERTIARY]``
    (3 rows of floats) and ``[MASK]``; a blank line or the end of the file
    closes the record. When the mask is read, the entries parsed so far are
    truncated to the stretch of residues that have coordinates.

    The algorithm Defining Secondary Structure of Proteins (DSSP) uses
    information on e.g. the position of atoms and the hydrogen bonds of the
    molecule to determine the secondary structure (helices, sheets...).

    Args:
        file_pointer: Text-mode file-like object exposing ``readline()``.

    Returns:
        A tuple ``(record, missing_internal_aa)``. ``record`` is a dict of
        the parsed sections and ``missing_internal_aa`` is True when the
        mask shows an internal gap or no coordinates at all. At the end of
        the file with nothing parsed the result is ``(None, False)``.

    Raises:
        KeyError: For an unknown DSSP letter or mask symbol, or when a
            section needed for trimming or finalising is absent.
        ValueError: For a malformed float in a numeric section.
    """
    dict_ = {}
    _dssp_dict = {'L': 0, 'H': 1, 'B': 2, 'E': 3, 'G': 4, 'I': 5, 'T': 6, 'S': 7}
    _mask_dict = {'-': 0, '+': 1}
    # Records without a [MASK] section have nothing flagged as missing;
    # without this default the return statements below would raise
    # UnboundLocalError for such records.
    missing_internal_aa = False

    while True:
        next_line = file_pointer.readline()
        if next_line == '[ID]\n':
            # rstrip removes only the newline; [:-1] would also cut a real
            # character from a final line that has none.
            dict_['id'] = file_pointer.readline().rstrip('\n')
        elif next_line == '[PRIMARY]\n':
            dict_['primary'] = encode_primary_string(
                file_pointer.readline().rstrip('\n'))
        elif next_line == '[EVOLUTIONARY]\n':
            dict_['evolutionary'] = [
                [float(step) for step in file_pointer.readline().split()]
                for _residue in range(21)
            ]
        elif next_line == '[SECONDARY]\n':
            dict_['secondary'] = [
                _dssp_dict[dssp]
                for dssp in file_pointer.readline().rstrip('\n')
            ]
        elif next_line == '[TERTIARY]\n':
            # 3 dimension
            dict_['tertiary'] = [
                [float(coord) for coord in file_pointer.readline().split()]
                for _axis in range(3)
            ]
        elif next_line == '[MASK]\n':
            mask = [
                _mask_dict[aa]
                for aa in file_pointer.readline().rstrip('\n')
            ]
            dict_['mask'] = mask
            missing_internal_aa = _trim_to_resolved_region(dict_, mask)
        elif next_line == '\n':
            return _finalize_record(dict_), missing_internal_aa
        elif next_line == '':
            if dict_:
                return _finalize_record(dict_), missing_internal_aa
            return None, False
# This file is part of the OpenProtein project.
#
# @author Jeppe Hallgren
#
# For license information, please see the LICENSE file in the root directory.

# Script: predict the structure of one hard-coded sequence with a pinned
# model file and write the result out as a PDB file.

import torch

from util import encode_primary_string, get_structure_from_angles, write_to_pdb, \
    calculate_dihedral_angles_over_minibatch

input_sequences = [
    "SRSLVISTINQISEDSKEFYFTLDNGKTMFPSNSQAWGGEKFENGQRAFVIFNELEQPVNGYDYNIQVRDITKVLTKEIVTMDDEENTEEKIGDDKINATYMWISKDKKYLTIEFQYYSTHSEDKKHFLNLVINNKDNTDDEYINLEFRHNSERDSPDHLGEGYVSFKLDKIEEQIEGKKGLNIRVRTLYDGIKNYKVQFP"
]

model_path = "output/models/2019-01-30_00_38_46-TRAIN-LR0_01-MB1.model"
# NOTE(review): torch.load unpickles the file - only load trusted models.
model = torch.load(model_path)

# One LongTensor per input sequence.
input_sequences_encoded = [
    torch.LongTensor(encode_primary_string(aa)) for aa in input_sequences
]

# Only the dihedral angles are used below; the other two outputs are
# unpacked under their original names but not read.
predicted_dihedral_angles, predicted_backbone_atoms, batch_sizes = model(
    input_sequences_encoded)

write_to_pdb(
    get_structure_from_angles(input_sequences_encoded[0],
                              predicted_dihedral_angles[:, 0]),
    "myprediction")
print("Wrote prediction to output/protein_myprediction.pdb")