def pdb_mapgaps(mapgaps_profile, pdb_fname):
    """Align a PDB structure to a MAPGAPS profile.

    Runs the external ``run_gaps`` program on the sequence file supplied by
    ``read_pdb_seq``, reads the ``<seqfile>_aln.cma`` alignment, and picks
    one aligned sequence with ``choose_best_aligned``.

    Returns a 3-tuple ``(record, resnums, inserts)``: the SeqRecord whose id
    matches the selected hit, followed by the two values returned by
    ``aln_resnums_inserts`` for that record.
    """
    from biocma import cma

    with read_pdb_seq(pdb_fname) as (seqfname, seqs):
        call_quiet('run_gaps', mapgaps_profile, seqfname)
        # The alignment is read from a file named after the input sequence
        # file (presumably written there by run_gaps).
        alignment = cma.read(seqfname + '_aln.cma')

        # Index the aligned sequence and its head length by sequence id.
        aligned_by_id = {}
        head_len_by_id = {}
        for entry in alignment['sequences']:
            entry_id = entry['id']
            aligned_by_id[entry_id] = entry['seq']
            head_len_by_id[entry_id] = entry['head_len']

        best_id, best_aln = choose_best_aligned(aligned_by_id)
        best_record = SeqIO.to_dict(seqs)[best_id]

    # NOTE(review): presumably shifts alignment positions into the record's
    # own residue numbering (start annotation plus the head length, minus
    # one) -- confirm against aln_resnums_inserts.
    start_resnum = best_record.annotations['start']
    offset = start_resnum + head_len_by_id[best_id] - 1
    resnums, inserts = aln_resnums_inserts(best_record, best_aln, offset)
    return best_record, resnums, inserts
def test_get_inserts(self): block = cma.read(EX_CMA) inserts = utils.get_inserts(block) self.assertEqual(len(inserts), len(block['sequences'])) fullseqs = SeqIO.to_dict(SeqIO.parse(EX_FASTA, 'fasta')) for sequence, targets in ( (block['sequences'][1], ['n', 't', 'fyklyllkkydsntlfnv']), (block['sequences'][-1], ['altkl', 'nkl', 'siptvgfskdgdrlqemykasvcsyteecqg', 'ndndgeylldge', 'eh', 'p', 'epecancneedknmsennhkkdskhkgdsnhksdsnhksdsnhksdsnhksgsnhksdcnhksgsnhksdsnhqsdcnhmsdhnhksdnnhksdsshksdsshksdsshksgsnhksdnnhksdsshksgsnhksdhnhksdsnhksdsnhknesnhknesnhknesnhknesnhknesnhkndsnhksdsnhmsdhnhksdnnhksdhnhmsdhnhksdnnhksdnnhmsdhnhksdnnhksdnnhksdnnhksdhnhmsdhnhksdnnhksdhnhksdsnhmsdhnhmsdhnhksdhnhksdhnhksdnnhksdsnhksdsnhksdhnhksdsnhmsdhnhmsdhnhksdhnhksdnnhksdsnhksdsnhksdhnhksdsnhmsdhnhmsdhnhmsdhnhksdhnhksdnnhksdsnhksdsnhksdsnhksdhnhksdhkhmsdnnhksdnnhksdhnhksdnnhksdhnhksdsnhksdsnhksdsnhksdsnhksdnnhksdhnhnsdsnhmsdhnhksdhnhksdhnhksdnnhksdnnhksdhnhksdhkknnnnnkdnknddnddsdasdavhediellesysdlnkfnemlteqln', 'vt', 'edtrv', 'pmythnl', 'g', 'sfqscqpcv', 'iirehiklkidnpfehlstitdqee', 'yfd', 'ra', 'fqlak'])): full = fullseqs[sequence['id']] ins_ranges = [str(full.seq)[start-1:end] for start, end in inserts[sequence['id']]] print sequence['id'], ins_ranges self.assertEqual(len(ins_ranges), len(targets)) for ins, tgt in zip(ins_ranges, targets): self.assertEqual(ins.lower(), tgt)
def pdb_mapgaps(mapgaps_profile, pdb_fname):
    """Align a PDB structure to a MAPGAPS profile.

    Invokes the external ``run_gaps`` program (raising if it exits with a
    non-zero status) on the sequence file supplied by ``read_pdb_seq``,
    reads the ``<seqfile>_aln.cma`` alignment, and picks one aligned
    sequence with ``choose_best_aligned``.

    Returns a 3-tuple ``(record, resnums, inserts)``: the SeqRecord whose id
    matches the selected hit, followed by the two values returned by
    ``aln_resnums_inserts`` for that record.
    """
    from biocma import cma

    with read_pdb_seq(pdb_fname) as (seqfname, seqs):
        command = ['run_gaps', mapgaps_profile, seqfname]
        subprocess.check_call(command)
        # The alignment is read from a file named after the input sequence
        # file (presumably written there by run_gaps).
        alignment = cma.read(seqfname + '_aln.cma')

        # Index the aligned sequence and its head length by sequence id.
        aligned_by_id = {}
        head_len_by_id = {}
        for entry in alignment['sequences']:
            entry_id = entry['id']
            aligned_by_id[entry_id] = entry['seq']
            head_len_by_id[entry_id] = entry['head_len']

        best_id, best_aln = choose_best_aligned(aligned_by_id)
        best_record = SeqIO.to_dict(seqs)[best_id]

    # NOTE(review): presumably shifts alignment positions into the record's
    # own residue numbering (start annotation plus the head length, minus
    # one) -- confirm against aln_resnums_inserts.
    start_resnum = best_record.annotations['start']
    offset = start_resnum + head_len_by_id[best_id] - 1
    resnums, inserts = aln_resnums_inserts(best_record, best_aln, offset)
    return best_record, resnums, inserts
def test_read(self):
    """Check that the example CMA file parses to the expected shape.

    The parsed block must hold 24 sequences, and its ``query_length`` must
    equal the ``length`` recorded on the first sequence.
    """
    parsed = cma.read(EX_CMA)
    sequence_count = len(parsed['sequences'])
    self.assertEqual(sequence_count, 24)

    query_length = parsed['query_length']
    first_length = parsed['sequences'][0]['length']
    self.assertEqual(query_length, first_length)
graph.add((ruri, MSA.deleted_aln_pos, Literal(i))) else: graph.add((ruri, RDF.type, MSA.aligned_residue)) graph.add((ruri, MSA.aln_pos, Literal(i))) if unquote(acc) in dedup_eqv[i]: graph.add((ruri, MSA.native_pos, Literal(dedup_eqv[i][unquote(acc)]))) else: print "shouldn't happen" #deletions taken care of above graph.add((ruri, MSA.native_residue, Literal(r))) bar.next() bar.finish() graph.serialize(destination=outfile, format='pretty-xml') return if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description = 'Populate the Multiple Sequence Alignment Ontology') parser.add_argument('infile', metavar='infile', type=str, help='input aligned sequences') parser.add_argument('name', metavar='name', type=str, help='Graph name') parser.add_argument('--namespace', metavar='namespace', type=str, default='http://localhost/msaont#', help='Graph namespace') parser.add_argument('-p', dest='prop', action='store', default=0.25, help='proportion of inserts allowed in an aligned residue') parser.add_argument('-o', dest='outfile', action='store', default='out.rdf', help='outfile') args = parser.parse_args() ext = args.infile.split('.')[-1] if ext != 'cma': seqs = manipulate_fasta(args.infile, args.prop) else: seqs = cma.read(args.infile) populate(seqs, args.name, args.outfile, args.namespace)