def test_align_file(self):
    """Check the FASTA text (pdbname=True) for sequences combined from three PDB files.

    A Sequence is built from the first PDB file and the other two are
    folded in with ``+=``; the result of ``fasta_str(pdbname=True)`` must
    equal the reference text below, in which 1D7M.pdb is listed twice.
    """
    pdb_paths = [
        os.path.join(self.testfiles_dir, pdb_name)
        for pdb_name in ('1D7M.pdb', '1GU8.pdb', '2UUI.pdb')
    ]

    s1 = sequence_util.Sequence(pdb=pdb_paths[0])
    for extra_pdb in pdb_paths[1:]:
        s1 += sequence_util.Sequence(pdb=extra_pdb)

    # Every entry is followed by os.linesep; an empty entry is a blank separator line
    expected_lines = [
        ">1D7M.pdb",
        "EMANRLAGLENSLESEKVSREQLIKQKDQLNSLLASLESEGAEREKRLRELEAKLDETLKNLELEKLARMELEARLAKTE",
        "KDRAILELKLAEAIDEKSKLE",
        "",
        ">1D7M.pdb",
        "EMANRLAGLENSLESEKVSREQLIKQKDQLNSLLASLESEGAEREKRLRELEAKLDETLKNLELEKLARMELEARLAKTE",
        "KDRAILELKLAEAIDEKSKLE",
        "",
        ">1GU8.pdb",
        "VGLTTLFWLGAIGMLVGTLAFAWAGRDAGSGERRYYVTLVGISGIAAVAYVVMALGVGWVPVAERTVFAPRYIDWILTTP",
        "LIVYFLGLLAGLDSREFGIVITLNTVVMLAGFAGAMVPGIERYALFGMGAVAFLGLVYYLVGPMTESASQRSSGIKSLYV",
        "RLRNLTVILWAIYPFIWLLGPPGVALLTPTVDVALIVYLDLVTKVGFGFIALDAAATL",
        "",
        ">2UUI.pdb",
        "MHHHHHHKDEVALLAAVTLLGVLLQAYFSLQVISARRAFRVSPPLTTGPPEFERVYRAQVNCSEYFPLFLATLWVAGIFF",
        "HEGAAALCGLVYLFARLRYFQGYARSAQLRLAPLYASARALWLLVALAALGLLAHFLPAALRAALLGRLRTLLPWA",
        "",
    ]
    ref = "".join(entry + os.linesep for entry in expected_lines)

    self.assertEqual(s1.fasta_str(pdbname=True), ref)
def alignment_file(self, models, alignment_file=None):
    """Create an alignment file for the models and return its path.

    This is based on the assumption that the models are all the same
    length but may have different residues.

    Args:
        models: PDB file paths; the first seeds the combined Sequence and
            the rest are added to it in order.
        alignment_file: output path. When falsy, 'homologs.fasta' inside
            ``self.work_dir`` is used.

    Returns:
        The path the FASTA alignment was written to.

    Raises:
        RuntimeError: if the extracted sequences are not all the same length.
    """
    if not alignment_file:
        alignment_file = os.path.join(self.work_dir, 'homologs.fasta')

    combined = sequence_util.Sequence(pdb=models[0])
    for extra_model in models[1:]:
        combined += sequence_util.Sequence(pdb=extra_model)

    # Compare every sequence length against the first one
    lengths = [len(s) for s in combined.sequences]
    if any(n != len(combined.sequences[0]) for n in lengths):
        raise RuntimeError('PDB files are not all of the same length!\n{0}'.format(models))

    combined.write_fasta(alignment_file, pdbname=True)
    return alignment_file
def test_add(self):
    """Adding a FASTA-derived Sequence to a PDB-derived one gives two entries in every per-sequence list."""
    s1 = sequence_util.Sequence(pdb=os.path.join(self.testfiles_dir, '1GU8.pdb'))
    s2 = sequence_util.Sequence(fasta=os.path.join(self.testfiles_dir, '2uui.fasta'))
    s1 += s2

    # assertTrue(len(x), 2) would treat 2 as the failure message and pass for
    # any non-zero length; assertEqual actually compares the count.
    for attr in ('sequences', 'resseqs', 'headers', 'pdbs', 'chains', 'fasta_files'):
        self.assertEqual(len(getattr(s1, attr)), 2, attr)
def test__parse_fasta_2(self):
    """Check that a sequence line containing a space is stored without the space."""
    fasta_lines = [">foo", "AAAAA AA"]

    parsed = sequence_util.Sequence()
    parsed._parse_fasta(fasta_lines)

    self.assertListEqual(parsed.headers, [">foo"])
    self.assertListEqual(parsed.sequences, ["AAAAAAA"])
def model_core_from_theseus(models, alignment_file, var_by_res, work_dir=None):
    """Cut each model down to the core residues reported by theseus.

    Only residues from the first protein are listed in the theseus output, but then not even all of them.

    We assume the output is based on the original alignment so that where each residue in the first protein
    lines up with either another residue in one of the other proteins or a gap.

    SO - we need to go through the theseus data and for each residue that is core find the corresponding
    residues in the other proteins.

    We use the resSeq numbers to match the residues across the alignment.

    Args:
        models: paths of the pdb files; their basenames must match the names in the alignment headers.
        alignment_file: FASTA alignment the theseus run was based on.
        var_by_res: iterable of objects with ``resSeq`` and ``core`` attributes for the first protein.
        work_dir: directory for the truncated models; created if it does not exist. When None no
            directory is created and None is handed on to ``ample_util.filename_append``
            (NOTE(review): confirm that helper accepts ``directory=None``).

    Returns:
        List of paths to the core-only pdb files, in the order of ``models``.

    Raises:
        RuntimeError: if the alignment headers do not match the pdb names, or if ``var_by_res``
            contains no core residues.
    """
    # os.path.isdir(None) raises TypeError, so the default must be guarded
    if work_dir is not None and not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    seqalign = sequence_util.Sequence(fasta=alignment_file)

    # We now need to add the list of pdbs, chains and resSeqs of the other models to the Sequence object
    for m in models:
        seqalign.add_pdb_data(m)

    # Sanity check that the names of the pdb files match those from the fasta header
    # Format is expected to be: '>1ujb.pdb(A)'
    names = [h[1:].split('(')[0] for h in seqalign.headers]
    if seqalign.pdbs != names:
        raise RuntimeError(
            "headers and names of pdb files do not match!\n{0}\n{1}".format(seqalign.pdbs, names))

    # Get the name of the first pdb that the alignment is based on
    first = seqalign.pdbs[0]

    # Dictionary mapping model pdb to resSeqs that are core
    model2core = {}
    for p in seqalign.pdbs:
        model2core[p] = []  # initialise

    # Get list of core resSeqs in the first sequence
    first_core = [x.resSeq for x in var_by_res if x.core]
    if not first_core:
        # Previously this surfaced as an IndexError in the loop below
        raise RuntimeError("Cannot generate core for models: {0}".format(models))
    model2core[first] = first_core

    # Now go through the first sequence and get the resSeqs of the corresponding core for the other models
    pointer = 0  # Tracks where we are in the list of core resSeqs of the first sequence
    for i, resSeq in enumerate(seqalign.resseqs[0]):
        if first_core[pointer] == resSeq:
            # Core residue in first sequence so append the corresponding resSeqs for the other proteins
            for j, pdb in enumerate(seqalign.pdbs[1:]):
                model2core[pdb].append(seqalign.resseqs[j + 1][i])
            pointer += 1
            if pointer >= len(first_core):
                break  # every core residue has been located

    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep=model2core[name])
        core_models.append(pdbout)
    return core_models
def test_resseq(self):
    """Check the entry counts, pdb name and last resSeq of a Sequence read from 1D7M.pdb."""
    pdbin = os.path.join(self.testfiles_dir, '1D7M.pdb')
    s1 = sequence_util.Sequence(pdb=pdbin)

    # assertTrue(value, expected) only checks truthiness of value (the second
    # argument is the failure message), so these must be assertEqual.
    self.assertEqual(len(s1.sequences), 2)
    self.assertEqual(len(s1.headers), 2)
    self.assertEqual(len(s1.pdbs), 2)
    self.assertEqual(s1.pdbs[0], os.path.basename(pdbin))
    self.assertEqual(s1.resseqs[0][-1], 343)
def test_fail_char(self):
    """canonicalise must accept the first sequence and raise RuntimeError for the second.

    The two sequences differ in a single character ('Y' versus 'X').
    """
    accepted = "YFLVKGMGVSDPDAKKFYAITTLVYAIAFTMYLSMLLGYGLTMVP"
    rejected = "YFLVKGMGVSDPDAKKFYAITTLVXAIAFTMYLSMLLGYGLTMVP"
    fp = sequence_util.Sequence()

    # Test case 1 - expected to work
    fp.sequences = [accepted]
    try:
        fp.canonicalise()
    except RuntimeError as msg:
        self.assertTrue(False, msg)

    # Test case 2 - expected to fail
    fp.sequences = [rejected]
    with self.assertRaises(RuntimeError):
        fp.canonicalise()
def test_from_pdb(self):
    """Check name, pdbs, chains and default FASTA output of a Sequence read from 4DZN.pdb."""
    pdb_path = os.path.join(self.testfiles_dir, '4DZN.pdb')
    s1 = sequence_util.Sequence(pdb=pdb_path)

    self.assertEqual(s1.name, '4DZN')
    self.assertEqual(s1.pdbs, ['4DZN.pdb', '4DZN.pdb', '4DZN.pdb'])
    self.assertEqual(s1.chains, ['A', 'B', 'C'])

    # Header, sequence and a blank separator for each chain, each ending in os.linesep
    expected_lines = [
        ">From pdb: 4DZN.pdb chain=A length=31",
        "GEIAALKQEIAALKKEIAALKEIAALKQGYY",
        "",
        ">From pdb: 4DZN.pdb chain=B length=31",
        "GEIAALKQEIAALKKEIAALKEIAALKQGYY",
        "",
        ">From pdb: 4DZN.pdb chain=C length=31",
        "GEIAALKQEIAALKKEIAALKEIAALKQGYY",
        "",
    ]
    outfasta = "".join(entry + os.linesep for entry in expected_lines)

    self.assertEqual(outfasta, "".join(s1.fasta_str()))
def test_OK(self):
    """Parse a FASTA record with uneven line lengths and check the re-wrapped output and total length."""
    input_lines = [
        ">3HAP:A|PDBID|CHAIN|SEQUENCE",
        "QAQITGRPEWIWLALGTALMGLGTLYFLVKGMGVSDPDAKKFYAI",
        "TTLVPAIAFTMYLSMLLGYGLTMVPFGGEQNPIYWARYADWLFTTPLLLLDLALLV",
        "DADQGTILAAVGADGIMIGTGLVGALTKVYSYRFVWWAISTAA",
        "MLYILYVLFFGFTSKAESMRPEVASTFKVL",
        "RNVTVVLWSAYPVVWLIGSEGAGIVPLNIETLLFMVLDVSAKVGFGLILL",
        "RSRAIFGEAEAPEPSAGDGAAATSD",
    ]
    # The input has no trailing line separator
    infasta = os.linesep.join(input_lines)

    fp = sequence_util.Sequence()
    fp._parse_fasta(infasta.split(os.linesep))

    # Each expected entry, including the final blank one, ends in os.linesep
    expected_lines = [
        ">3HAP:A|PDBID|CHAIN|SEQUENCE",
        "QAQITGRPEWIWLALGTALMGLGTLYFLVKGMGVSDPDAKKFYAITTLVPAIAFTMYLSMLLGYGLTMVPFGGEQNPIYW",
        "ARYADWLFTTPLLLLDLALLVDADQGTILAAVGADGIMIGTGLVGALTKVYSYRFVWWAISTAAMLYILYVLFFGFTSKA",
        "ESMRPEVASTFKVLRNVTVVLWSAYPVVWLIGSEGAGIVPLNIETLLFMVLDVSAKVGFGLILLRSRAIFGEAEAPEPSA",
        "GDGAAATSD",
        "",
    ]
    outfasta = "".join(entry + os.linesep for entry in expected_lines)

    self.assertEqual(outfasta, "".join(fp.fasta_str()))
    self.assertEqual(fp.length(), 249)
# Resolve the input to an absolute path and refuse to continue if it is not a file
args.input_file = os.path.abspath(args.input_file)
if not os.path.isfile(args.input_file):
    raise RuntimeError("Cannot find input file: {}".format(args.input_file))

# Without an explicit output file, derive "<input basename without extension>_std.pdb";
# an explicit one is made absolute
if not args.output_file:
    n = os.path.splitext(os.path.basename(args.input_file))[0]
    args.output_file = n + "_std.pdb"
else:
    args.output_file = os.path.abspath(args.output_file)

# Only the first matching option is acted on; results of the query-style
# actions are emitted through logging.debug
if args.ren:
    renumber_residues(args.input_file, args.output_file, start=1)
elif args.std:
    standardise(args.input_file, args.output_file, del_hetatm=True, chain=args.chain)
elif args.seq:
    logging.debug(sequence_util.Sequence(pdb=args.input_file).fasta_str())
elif args.split_models:
    logging.debug(split_pdb(args.input_file))
elif args.split_chains:
    logging.debug(split_into_chains(args.input_file, chain=args.chain))
elif args.chain:
    logging.debug(extract_chain(args.input_file, args.output_file, chainID=args.chain))
def test_canonicalise_4(self):
    """canonicalise must raise RuntimeError for a sequence containing 'X' and ending in '*'."""
    seq = sequence_util.Sequence()
    seq.sequences = ["YFLVKGMGVSDPDAKKFYAITTLVXAIAFTMYLSMLLGYGLTMVP*"]
    self.assertRaises(RuntimeError, seq.canonicalise)
def test_canonicalise_1(self):
    """canonicalise must complete without raising for this sequence."""
    seq = sequence_util.Sequence()
    seq.sequences = ["YFLVKGMGVSDPDAKKFYAITTLVYAIAFTMYLSMLLGYGLTMVP"]

    seq.canonicalise()

    # Reaching this line means canonicalise did not raise
    self.assertTrue(True)
def test_addPdb_data(self):
    """Check pdbs, chains and resseqs after adding three PDB files to a FASTA-derived Sequence.

    The Sequence is read from the alignment file and ``add_pdb_data`` is
    called once per model; each model must then report its basename,
    chain 'A' and the expected list of resSeq values (None entries
    presumably mark alignment gaps - confirm against add_pdb_data).
    """
    fasta1 = os.path.join(self.testfiles_dir, '1ujb_2a6pA_3c7tA.afasta')
    pdb_paths = [
        os.path.join(self.ample_share, 'examples', 'homologs', 'input', pdb_name)
        for pdb_name in ('1ujbA.pdb', '2a6pA.pdb', '3c7tA.pdb')
    ]

    s1 = sequence_util.Sequence(fasta=fasta1)
    for pdb_path in pdb_paths:
        s1.add_pdb_data(pdb_path)

    for idx, pdb_path in enumerate(pdb_paths):
        self.assertEqual(s1.pdbs[idx], os.path.basename(pdb_path))
        self.assertEqual(s1.chains[idx], 'A')

    p1r = [
        None, None,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        None, None, None, None, None, None, None, None, None, None,
        None, None, None, None, None, None, None, None, None, None,
        None, None, None, None, None, None, None, None, None, None,
        None, None, None, None,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44,
        None,
        45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
        55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
        65, 66, 67, 68, 69,
        None, None,
        70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
        None, None, None, None, None, None, None, None, None, None,
        None, None, None, None, None, None, None, None, None, None,
        None, None, None, None, None, None, None, None, None, None,
        None, None, None, None, None, None, None, None, None, None,
        None, None, None,
        83,
        None, None,
        84, 85, 86, 87, 88, 89, 90, 91,
        None,
        92, 93, 94, 95, 96, 97, 98, 99, 100, 101,
        102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120,
        None, None, None, None, None, None, None, None,
        121, 122, 123,
        None, None, None,
        124, 125,
        None, None,
        126,
        None, None,
        127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
        None, None,
        139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
        None, None,
        150, 151,
        None,
        152, 153, 154, 155, 156,
        None, None, None, None, None, None, None, None, None, None,
    ]
    p2r = [
        4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
        14, 15, 16, 17, 18, 19, 20, 21, 22,
        None, None, None, None, None, None, None, None, None, None,
        None, None, None, None, None, None, None, None, None, None,
        None, None, None, None, None, None, None, None, None,
        23, 24, 25, 26, 27, 28,
        None,
        29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
        39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
        49, 50,
        None,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
        61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
        71, 72, 73,
        None, None, None, None,
        74,
        None, None,
        75,
        None,
        76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
        86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
        96, 97, 98, 99, 100, 101, 102, 103, 104,
        None, None, None, None, None, None, None, None,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
        115, 116, 117, 118, 119, 120, 121, 122, 123, 124,
        125, 126, 127, 128, 129, 130, 131, 132,
        None,
        133, 134, 135, 136, 137,
        None, None,
        138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
        148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
        158, 159,
        None, None, None, None, None, None, None, None,
        160, 161, 162, 163,
        None,
        164, 165, 166, 167, 168, 169,
        None, None,
        170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
        180, 181, 182, 183, 184,
        None, None, None,
        185, 186, 187, 188, 189, 190, 191, 192,
        None,
        193, 194, 195, 196,
        None, None, None, None, None, None, None, None, None, None,
        None, None, None, None,
    ]
    p3r = [
        None,
        72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
        82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
        92, 93, 94, 95, 96, 97, 98, 99, 100, 101,
        102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
        122, 123, 124,
        None,
        125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
        135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
        145, 146, 147, 148, 149,
        None,
        150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
        160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
        170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
        180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
        190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
        200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
        210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
        220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
        230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
        240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
        250,
        None,
        251, 252, 253, 254, 255, 256, 257, 258, 259, 260,
        261, 262, 263, 264, 265, 266, 267, 268, 269, 270,
        271, 272, 273, 274, 275, 276, 277, 278, 279, 280,
        281, 282, 283, 284, 285, 286, 287, 288,
        None, None,
        289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
        299, 300, 301, 302, 303,
        None, None,
        304,
        None, None, None,
        305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315,
        None,
        316, 317, 318, 319, 320, 321, 322, 323, 324, 325,
        326, 327, 328, 329, 330,
    ]

    for idx, expected_resseqs in enumerate((p1r, p2r, p3r)):
        self.assertEqual(s1.resseqs[idx], expected_resseqs)
def model_core_from_fasta(models, alignment_file, work_dir=None, case_sensitive=False):
    """Cut each model down to the positions that are core in a FASTA alignment.

    A column of the alignment is core when no sequence has a gap there or,
    with ``case_sensitive``, when every character in the column is a key of
    ``ample_util.one2three``. For each sequence the indices of its core
    residues (counted over non-gap characters) are passed to
    ``pdb_edit.select_residues`` as ``tokeep_idx``.

    Args:
        models: paths of the pdb files; their basenames must match the names in the alignment headers.
        alignment_file: FASTA alignment of the models.
        work_dir: directory for the truncated models; created if it does not exist. When None no
            directory is created and None is handed on to ``ample_util.filename_append``
            (NOTE(review): confirm that helper accepts ``directory=None``).
        case_sensitive: select the ``one2three``-based core test instead of the gap-only test.

    Returns:
        List of paths to the core-only pdb files, in the order of ``models``.

    Raises:
        RuntimeError: if no alignment column is core.
    """
    # os.path.isdir(None) raises TypeError, so the default must be guarded
    if work_dir is not None and not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    # Read in alignment
    align_seq = sequence_util.Sequence(fasta=alignment_file)

    # NOTE(review): the sequences are not checked to be of equal length; zip() below
    # stops at the shortest one.

    # Get pdb names from alignment headers
    seq_names = [h[1:].strip() for h in align_seq.headers]

    # Need to check if the alignment file is from gesamt, in which case, the names have the
    # chain names in brackets appended. Raw string: "\(" is an invalid escape in a normal literal.
    chain_suffix = re.compile(r"\([a-zA-Z]*\)$")
    for i, s in enumerate(seq_names):
        x = chain_suffix.search(s)
        if x:
            # Drop only the trailing match; str.replace would also remove identical
            # text occurring earlier in the name
            seq_names[i] = s[:x.start()]

    # Get array specifying which positions are core. If the positions all align, then there
    # will be a capital letter for the residue. Gaps are signified by "-" and non-structurally-
    # aligned residues by lower-case letters
    GAP = '-'
    # Can't use below as Theseus ignores lower-case letters in the alignment
    if case_sensitive:
        core = [all(x in ample_util.one2three for x in t) for t in zip(*align_seq.sequences)]
    else:
        core = [all(x != GAP for x in t) for t in zip(*align_seq.sequences)]

    if not any(core):
        raise RuntimeError("Cannot generate core for models: {0}".format(models))

    # For each sequence, get a list of which positions are core
    core_positions = []
    for seq in align_seq.sequences:
        p = []
        count = 0  # index of the current residue, gaps excluded
        for i, pos in enumerate(seq):
            if pos == GAP:
                continue
            if core[i]:
                p.append(count)
            count += 1
        core_positions.append(p)

    # Should check lengths of sequences match the length of the aa in the pdbs

    # Create dict mapping seq_names to core positions
    core_dict = dict((s, core_positions[i]) for i, s in enumerate(seq_names))

    # Cut the models down to core
    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep_idx=core_dict[name])
        core_models.append(pdbout)
    return core_models