def __init__(self, dir_path=None, version=None, scop=None,
             astral_file=None, db_handle=None):
    """Set up access to ASTRAL data from one of three possible sources.

    The source is selected by the arguments that are supplied:

     - dir_path and version - dir_path is the directory that *contains*
       the scopseq-x.xx directory (not that directory itself); version
       is used to build both the "scopseq-<version>" directory name and
       the name of the FASTA file inside it.
     - astral_file - path to a FASTA file, which is parsed completely
       into memory.
     - db_handle - handle for a MYSQL database containing an 'astral'
       table holding the astral data (it can be created using
       writeToSQL).

    scop is always required and is stored on the instance unchanged.

    Raises RuntimeError if none of astral_file, dir_path and db_handle
    is given, if scop is missing, or if the directory form is selected
    without both dir_path and version.
    """
    nothing_given = all(
        source is None for source in (astral_file, dir_path, db_handle))
    if nothing_given:
        raise RuntimeError(
            "Need either file handle, or (dir_path + " +
            "version) or database handle to construct Astral")
    if not scop:
        raise RuntimeError("Must provide a Scop instance to construct")

    self.scop, self.db_handle = scop, db_handle

    # With neither an explicit FASTA file nor a database handle, the FASTA
    # file is located inside the versioned scopseq directory.
    locate_in_directory = not (astral_file or db_handle)
    if locate_in_directory:
        if dir_path is None or version is None:
            raise RuntimeError("must provide dir_path and version")
        self.version = version
        self.path = os.path.join(dir_path, "scopseq-%s" % version)
        fasta_name = "astral-scopdom-seqres-all-%s.fa" % self.version
        astral_file = os.path.join(self.path, fasta_name)

    if astral_file:
        # Every record of the FASTA file is kept IN MEMORY as a SeqRecord.
        records = SeqIO.parse(astral_file, "fasta")
        self.fasta_dict = SeqIO.to_dict(records)

    self.astral_file = astral_file
    # Four initially empty dictionaries.
    self.EvDatasets = {}
    self.EvDatahash = {}
    self.IdDatasets = {}
    self.IdDatahash = {}
def __init__(self, dir_path=None, version=None, scop=None,
             astral_file=None, db_handle=None):
    """Initialise the astral database wrapper.

    One of the following data sources has to be supplied:

     - a directory of SCOP files: dir_path (string, the location that
       holds the scopseq-x.xx directory, not the directory itself)
       together with version (the version number used in the
       "scopseq-<version>" directory name and in the FASTA file name);
     - a FASTA file: astral_file (string, path to a fasta file that is
       loaded into memory);
     - a MYSQL database: db_handle (a database handle for a MYSQL
       database with an 'astral' table containing the astral data;
       this can be created using writeToSQL).

    A Scop instance must be passed as scop in every case.

    RuntimeError is raised when no source is supplied, when scop is
    missing, or when dir_path/version are needed but incomplete.
    """
    supplied = [arg for arg in (astral_file, dir_path, db_handle)
                if arg is not None]
    if not supplied:
        message = ("Need either file handle, or (dir_path + " +
                   "version) or database handle to construct Astral")
        raise RuntimeError(message)

    if not scop:
        raise RuntimeError("Must provide a Scop instance to construct")

    self.scop = scop
    self.db_handle = db_handle

    if astral_file or db_handle:
        # An explicit FASTA file or database handle was given, so there is
        # no need to derive the file location from dir_path/version.
        pass
    else:
        if version is None or dir_path is None:
            raise RuntimeError("must provide dir_path and version")
        self.version = version
        self.path = os.path.join(dir_path, "scopseq-%s" % version)
        astral_file = os.path.join(
            self.path, "astral-scopdom-seqres-all-%s.fa" % self.version)

    if astral_file:
        # Build a dictionary of SeqRecord objects in the FASTA file,
        # IN MEMORY
        self.fasta_dict = SeqIO.to_dict(SeqIO.parse(astral_file, "fasta"))

    self.astral_file = astral_file

    # Four dictionaries, all starting out empty.
    self.EvDatasets = {}
    self.EvDatahash = {}
    self.IdDatasets = {}
    self.IdDatahash = {}
def build(pro_align, nucl_seqs, corr_dict=None, gap_char='-', unknown='X',
          codon_table=default_codon_table, alphabet=None,
          complete_protein=False, anchor_len=10, max_score=10):
    """Build a codon alignment from a protein alignment and nucleotide seqs.

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_seqs  - an object returned by SeqIO.parse or SeqIO.index
       or a collection (list, tuple or dict) of SeqRecord.
     - corr_dict  - a dict that maps protein id to nucleotide id
     - gap_char   - gap character; passed to _get_codon_alphabet (when
       alphabet is None) and to _check_corr
     - unknown    - accepted for interface compatibility; not referenced
       by this function
     - codon_table - codon table; passed to _get_codon_alphabet (when
       alphabet is None) and to _check_corr
     - alphabet   - alphabet for the returned codon alignment; built
       from codon_table and gap_char when None
     - complete_protein - whether the sequence begins with a start codon
     - anchor_len - passed to _check_corr
     - max_score  - passed to _get_codon_rec

    Protein and nucleotide records are paired as follows:
     - corr_dict is None and nucl_seqs is a list, a tuple or a generator
       (e.g. from SeqIO.parse): records are paired by position.
     - corr_dict is None and nucl_seqs is a dict or the result of
       SeqIO.index: records are paired by identical id.
     - corr_dict is given: the nucleotide id is looked up in corr_dict
       using the protein id.

    Raises:
     - TypeError if pro_align is not a MultipleSeqAlignment, if any of
       its records does not use a protein alphabet, if corr_dict is not
       a dict, or if nucl_seqs is of an unsupported type.
     - ValueError if there are more protein records than nucleotide
       records, if a protein record has no nucleotide counterpart (by
       id or through corr_dict), or if a protein record and its
       nucleotide record do not match.
     - RuntimeError if corr_dict has fewer items than there are protein
       records.

    Return a CodonAlignment object

    >>> from SAP.Bio.Alphabet import IUPAC
    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.SeqRecord import SeqRecord
    >>> from SAP.Bio.Align import MultipleSeqAlignment
    >>> seq1 = SeqRecord(Seq('TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro1')
    >>> seq2 = SeqRecord(Seq('TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro2')
    >>> pro1 = SeqRecord(Seq('SGTARTKLLLLLAALCAAGGALE', alphabet=IUPAC.protein),id='pro1')
    >>> pro2 = SeqRecord(Seq('SGTSRTKRLLLLAALGAAGGALE', alphabet=IUPAC.protein),id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlphabet(Standard) CodonAlignment with 2 rows and 69 columns (23 codons)
    TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGT...GAG pro1
    TCAGGGACTTCGAGAACCAAGCG-CTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGT...GAG pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from SAP.Bio.Alphabet import ProteinAlphabet
    from SAP.Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError("the first argument should be a MultipleSeqAlignment "
                        "object")
    # check the alphabet of pro_align
    for pro in pro_align:
        if not isinstance(pro.seq.alphabet, ProteinAlphabet):
            raise TypeError("Alphabet Error!\nThe input alignment should be "
                            "a *PROTEIN* alignment")
    if alphabet is None:
        alphabet = _get_codon_alphabet(codon_table, gap_char=gap_char)

    # check whether the number of seqs in pro_align and nucl_seqs is
    # the same
    pro_num = len(pro_align)
    if corr_dict is None:
        if nucl_seqs.__class__.__name__ == "generator":
            # a generator (e.g. from SeqIO.parse()) has no len(), so it
            # is materialised as a tuple first
            nucl_seqs = tuple(nucl_seqs)
        nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError("More Number of SeqRecords in Protein Alignment "
                             "({0}) than the Number of Nucleotide SeqRecords "
                             "({1}) are found!".format(pro_num, nucl_num))

        # Determine the protein sequences and nucl sequences
        # correspondence. If nucl_seqs is a list, tuple or read by
        # SeqIO.parse(), we assume the order of sequences in pro_align
        # and nucl_seqs are the same. If nucl_seqs is a dict or read by
        # SeqIO.index(), we match seqs in pro_align and those in
        # nucl_seq by their id.
        nucl_kind = nucl_seqs.__class__.__name__
        if nucl_kind in ("_IndexedSeqFileDict", "dict"):
            corr_method = 1
        elif nucl_kind in ("list", "tuple"):
            corr_method = 0
        else:
            raise TypeError("Nucl Sequences Error, Unknown type to assign "
                            "correspondance method")
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError("corr_dict should be a dict that corresponds "
                            "protein id to nucleotide id!")
        if len(corr_dict) < pro_num:
            raise RuntimeError("Number of items in corr_dict ({0}) is less "
                               "than number of protein records "
                               "({1})".format(len(corr_dict), pro_num))
        # Nucleotide records are looked up by id below, so every supported
        # input type is normalised to a mapping of id -> record.
        nucl_kind = nucl_seqs.__class__.__name__
        if nucl_kind == "generator":
            # read by SeqIO.parse()
            from SAP.Bio import SeqIO
            nucl_seqs = SeqIO.to_dict(nucl_seqs)
        elif nucl_kind in ("list", "tuple"):
            nucl_seqs = dict((i.id, i) for i in nucl_seqs)
        elif nucl_kind in ("_IndexedSeqFileDict", "dict"):
            pass
        else:
            raise TypeError("Nucl Sequences Error, Unknown type of "
                            "Nucleotide Records!")
        corr_method = 2

    # set up pro-nucl correspondence based on corr_method
    if corr_method == 0:
        # consecutive pairing
        pro_nucl_pair = izip(pro_align, nucl_seqs)
    elif corr_method == 1:
        # keyword pairing
        nucl_id = set(nucl_seqs.keys())
        pro_id = set(i.id for i in pro_align)
        # check if there is pro_id that does not have a nucleotide match
        missing = pro_id - nucl_id
        if missing:
            # sorted() keeps the error message deterministic
            raise ValueError("Protein Record {0} cannot find a nucleotide "
                             "sequence match, please check the "
                             "id".format(', '.join(sorted(missing))))
        pro_nucl_pair = [(pro_rec, nucl_seqs[pro_rec.id])
                         for pro_rec in pro_align]
    else:
        # corr_method == 2, dict pairing
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                # Library code must not terminate the interpreter; report
                # the problem to the caller as an exception instead.
                raise ValueError("Protein record (%s) is not in corr_dict!"
                                 % pro_rec.id)
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    codon_aln = []
    shift = False
    for pro_rec, nucl_rec in pro_nucl_pair:
        # Beware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(pro_rec, nucl_rec, gap_char=gap_char,
                                codon_table=codon_table,
                                complete_protein=complete_protein,
                                anchor_len=anchor_len)
        if not corr_span:
            raise ValueError("Protein Record {0} and Nucleotide Record {1} do"
                             " not match!".format(pro_rec.id, nucl_rec.id))
        # NOTE(review): complete_protein is forwarded to _check_corr above
        # but is hard-coded to False here -- confirm this is intentional.
        codon_rec = _get_codon_rec(pro_rec, nucl_rec, corr_span,
                                   alphabet=alphabet,
                                   complete_protein=False,
                                   max_score=max_score)
        codon_aln.append(codon_rec)
        # A second span element equal to 2 routes the records through
        # _align_shift_recs before the alignment is built
        # (presumably it marks a detected frameshift -- TODO confirm).
        if corr_span[1] == 2:
            shift = True

    if shift:
        return CodonAlignment(_align_shift_recs(codon_aln), alphabet=alphabet)
    return CodonAlignment(codon_aln, alphabet=alphabet)