#! /usr/bin/env python
"""Load a Bio.Fasta dictionary index and group its keys by 4-character prefix.

The elapsed time of each stage is reported on stderr.  Arguments are taken
from the command line, or read line by line from stdin when none are given.
"""
import os
import sys
import time

from Bio import Fasta

DEFAULT_DICT_FILE = '/project1/structure/mliang/pdb/derived_data/pdb_seqres.idx'
DEFAULT_OUTFH = sys.stdout

dict_file = DEFAULT_DICT_FILE
outfh = DEFAULT_OUTFH

# Stage 1: open the index-backed dictionary.
start_time = time.time()
fdict = Fasta.Dictionary(dict_file)
elapse_time = time.time() - start_time
# sys.stderr.write emits the same text as the old ``print >> sys.stderr``
# statement while remaining valid syntax on both Python 2 and Python 3.
sys.stderr.write("Time to load dictionary: %s\n" % (elapse_time,))

# Stage 2: map each 4-character key prefix to the list of full keys that
# share it (presumably PDB id -> chain keys, judging by the index file
# name -- TODO confirm).
start_time = time.time()
chainmap = {}
for key in fdict.keys():
    chainmap.setdefault(key[:4], []).append(key)
elapse_time = time.time() - start_time
sys.stderr.write("Time to build chain map: %s\n" % (elapse_time,))

# Stage 3: process the query arguments.
start_time = time.time()
args = sys.argv[1:]
if not args:
    # No command-line arguments: fall back to iterating over stdin lines.
    args = sys.stdin
for field in args:
    # NOTE(review): only this first statement of the loop body is visible
    # here; the remainder of the per-argument handling appears to be missing.
    fields = field.strip().split()
# Note that the alphabet is explicitly defined for the sequences.
import os

from Bio import Fasta
from Bio.Alphabet import IUPAC


def get_accession_num(fasta_record):
    """Return the accession of a FASTA record without its version suffix.

    The first whitespace-delimited token of ``fasta_record.title`` is split
    on '|' and the fourth field is taken as the versioned accession; for a
    title starting with 'gi|2765658|emb|Z78533.1|CIZ78533' this yields
    'Z78533'.

    Raises IndexError if the title is empty or its first token has fewer
    than four '|'-separated fields.
    """
    title_atoms = fasta_record.title.split()
    accession_atoms = title_atoms[0].split('|')
    gb_name = accession_atoms[3]
    # Strip the version info before returning.  Cut at the first '.' rather
    # than dropping a fixed two characters, so multi-digit versions
    # ('X.10') and accessions without any version are handled correctly.
    return gb_name.split('.', 1)[0]


if not os.path.isdir("my_orchid_dict.idx"):
    # Build a new index, keyed by the accession extracted above.
    Fasta.index_file("ls_orchid.fasta", "my_orchid_dict.idx", get_accession_num)
else:
    print("Reusing existing index")

dna_parser = Fasta.SequenceParser(IUPAC.ambiguous_dna)
orchid_dict = Fasta.Dictionary("my_orchid_dict.idx", dna_parser)

for id_num in orchid_dict.keys():
    # Look the record up once per key instead of once per printed field.
    record = orchid_dict[id_num]
    # Single-argument print with %-formatting produces the same output as
    # the Python 2 ``print 'label:', value`` statement and also parses on
    # Python 3.
    print('id number: %s' % (id_num,))
    print('description: %s' % (record.description,))
    print('sequence: %s' % (record.seq,))