def doReverseComplement(inputSeqRecord, id=False, name=False, description=False, features=True, annotations=False, letter_annotations=True, dbxrefs=False):
    """Return a new SeqRecord holding the reverse complement of a record.

    Each optional argument controls one attribute of the new record:

    - ``id``, ``name``, ``description``: a string is used as-is; any other
      true value copies the attribute from ``inputSeqRecord``; a false value
      leaves the new record's default.
    - ``dbxrefs``: a list is used as-is; any other true value copies the
      old list.
    - ``features``: a list is used as-is; any other true value flips every
      old feature onto the new strand and re-sorts them by start position.
    - ``annotations``: a dict is used as-is; any other true value takes a
      shallow copy of the old annotations.
    - ``letter_annotations``: a dict is used as-is; any other true value
      copies the old per-letter annotations with each value reversed.
    """
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import MutableSeq  # Lazy to avoid circular imports

    source_seq = inputSeqRecord.seq
    if isinstance(source_seq, MutableSeq):
        # MutableSeq reverse-complements in situ, so go through toseq() first.
        source_seq = source_seq.toseq()
    answer = SeqRecord(source_seq.reverse_complement())

    # id / name / description all follow the same "string, truthy, falsy" rule.
    for attr, requested in (("id", id), ("name", name),
                            ("description", description)):
        if isinstance(requested, basestring):
            setattr(answer, attr, requested)
        elif requested:
            setattr(answer, attr, getattr(inputSeqRecord, attr))

    if isinstance(dbxrefs, list):
        answer.dbxrefs = dbxrefs
    elif dbxrefs:
        # Copy the old dbxrefs
        answer.dbxrefs = inputSeqRecord.dbxrefs[:]

    if isinstance(features, list):
        answer.features = features
    elif features:
        # Copy the old features, adjusting location and strand.
        length = len(answer)
        flipped = [feature._flip(length)
                   for feature in inputSeqRecord.features]
        # The old list should have been sorted by start location; flipping
        # leaves it ordered by what is now the end position, so re-sort in
        # case of overlapping features. The sort is stable, so features with
        # identical locations (e.g. gene before CDS) keep their order.
        flipped.sort(key=lambda feature: feature.location.start.position)
        answer.features = flipped

    if isinstance(annotations, dict):
        answer.annotations = annotations
    elif annotations:
        # Shallow copy of the old annotations
        answer.annotations = inputSeqRecord.annotations.copy()

    if isinstance(letter_annotations, dict):
        answer.letter_annotations = letter_annotations
    elif letter_annotations:
        # Copy the old per-letter annotations, reversing each one
        for key, value in inputSeqRecord.letter_annotations.iteritems():
            answer._per_letter_annotations[key] = value[::-1]

    return answer
def export(self, gene, outputFileName=None, keys=None):
    """Assemble a gene's parts into one SeqRecord and optionally write it.

    The attributes of ``gene`` named by ``keys`` are read in order. Every
    part that is a ``PartMixIn`` contributes its sequence to the growing
    record plus one feature covering it; ``ExonMixIn`` parts get a location
    built from their coordinates and shifted to their offset in the record.

    :param gene: object exposing ``dbid``, ``locusStrand`` and the part
        attributes named in ``keys``.
    :param outputFileName: when given, the record is written to this path
        in GenBank format and nothing is returned.
    :param keys: ordered part attribute names; defaults to promoter, utr5,
        cds, utr3, terminator. The list is also stored on ``self.keys``.
    :returns: the assembled SeqRecord when ``outputFileName`` is not given,
        otherwise ``None``.
    """
    if keys is None:
        # A fresh list per call: the value is stored on self, so a shared
        # default list could be mutated through one instance and leak.
        keys = ['promoter', 'utr5', 'cds', 'utr3', 'terminator']
    self.keys = keys
    parts = [getattr(gene, key) for key in self.keys]
    strand = gene.locusStrand

    if outputFileName:
        # Dots are stripped from the id only when writing to a file.
        gene = SeqRecord(id=str(gene.dbid).replace('.', ''),
                         name=str(gene.dbid), seq='')
    else:
        gene = SeqRecord(id=gene.dbid, name=str(gene.dbid), seq='')
    gene.annotations = {'strand': strand}

    for partType, part in zip(self.keys, parts):
        offset = len(gene)
        # NOTE(review): assumes only PartMixIn parts contribute sequence and
        # features (block structure reconstructed) -- confirm.
        if isinstance(part, PartMixIn):
            if isinstance(part, ExonMixIn):
                location = self.coordinatesToLocation(
                    part.coordinates)._shift(offset)
            else:
                location = FeatureLocation(offset, offset + len(part.seq))
            feature = SeqFeature(type=partType, location=location,
                                 id=part.dbid)
            gene.seq += Seq(part.seq, generic_dna)
            gene.features.append(feature)

    # Single-argument print calls behave the same on Python 2 and 3.
    if outputFileName:
        print('outputFileName IS')
    else:
        print('outputFileName IS NONE')

    if outputFileName:
        # The context manager guarantees the file is flushed and closed.
        with open(outputFileName, 'w') as outputFile:
            SeqIO.write(gene, outputFile, "gb")
    else:
        return gene
def PhdIterator(handle):
    """Yield one SeqRecord per record in a PHD file.

    Parsing is delegated to the Bio.Sequencing.Phd module.
    """
    for phd_record in Phd.parse(handle):
        # The PHD "filename" can contain spaces, e.g.
        # 'HWI-EAS94_4_1_1_602_99 1' from the unit test file phd_solexa.
        # That would break formats such as FASTQ if used as the identifier,
        # so only the first whitespace-delimited token becomes id/name and
        # the full text is kept as the description.
        short_name = phd_record.file_name.split(None, 1)[0]
        seq_record = SeqRecord(phd_record.seq,
                               id=short_name,
                               name=short_name,
                               description=phd_record.file_name)

        # The comments dictionary is reused directly as the annotations.
        seq_record.annotations = phd_record.comments

        # Qualities and peak locations become per-letter annotations.
        per_letter = seq_record.letter_annotations
        per_letter["phred_quality"] = [int(site[1])
                                       for site in phd_record.sites]
        try:
            per_letter["peak_location"] = [int(site[2])
                                           for site in phd_record.sites]
        except IndexError:
            # Peak locations are not always there according to
            # David Gordon (the Consed author).
            pass

        yield seq_record
def genome_to_seqrecord(phage_genome):
    """Creates a SeqRecord object from a pdm_utils Genome object.

    :param phage_genome: A pdm_utils Genome object.
    :type phage_genome: Genome
    :returns: A BioPython SeqRecord object
    :rtype: SeqRecord
    :raises AssertionError: if ``phage_genome`` is None.
    :raises AttributeError: if the genome has no usable ``seq`` attribute.
    """
    # Raised explicitly (rather than via ``assert``) so the check survives
    # ``python -O``; ``is None`` avoids invoking a custom ``__ne__``.
    if phage_genome is None:
        raise AssertionError(
            "Genome object passed is None and not initialized")

    try:
        record = SeqRecord(phage_genome.seq)
        record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    except AttributeError:
        print("Genome object failed to be converted to SeqRecord.",
              "Genome valid attribute 'seq' is required to",
              "convert to SeqRecord object.")
        raise

    record.name = phage_genome.name
    # With an empty accession the SeqRecord's own default id is kept.
    if phage_genome.accession != "":
        record.id = phage_genome.accession

    record.features = get_seqrecord_features(phage_genome)
    record.description = get_seqrecord_description(phage_genome)
    record.annotations = get_seqrecord_annotations(phage_genome)

    return record
def _as_key_list(keys):
    """Wrap a lone key (including a string) in a list; pass other iterables through."""
    if isinstance(keys, (str, bytes)) or not hasattr(keys, "__iter__"):
        return [keys]
    return keys


def proteins(cursor, experiment=None, filter_experiments=True, sequence_key=None):
    """
    Yield the selected proteins as SeqRecord objects.

    :param cursor: DB-API cursor used to execute the query.
    :param experiment: one experiment id or an iterable of ids.
    :param filter_experiments: when true, rows whose experiment has
        ``taxonomy_id`` 0 are excluded.
    :param sequence_key: one sequence id or an iterable of ids.
    :raises AssertionError: if both ``experiment`` and ``sequence_key``
        are None.
    :raises ValueError: if no selection criterion ends up in the query.

    Each record carries ``taxonomy_id``, ``experiment_key`` and ``organism``
    annotations; the experiment short name is also the description.
    """
    if experiment is None and sequence_key is None:
        raise AssertionError(
            "either experiment or sequence_key must be given")

    query = """SELECT s.id,s.sequence, e.id, e.short_name, e.taxonomy_id
        from hpf.experiment e
        join bddb.protein p on e.id=p.experiment_key
        join ddbCommon.sequence s on p.sequence_key=s.id
        """

    # NOTE(review): keys are interpolated into the SQL text with str(); only
    # pass trusted ids, or switch to the driver's parameter binding.
    conditions = []
    if experiment:
        ids = ",".join(str(key) for key in _as_key_list(experiment))
        conditions.append("e.id in (%s)" % ids)
    if filter_experiments:
        conditions.append("e.taxonomy_id!=0")
    if sequence_key:
        ids = ",".join(str(key) for key in _as_key_list(sequence_key))
        conditions.append("s.id in (%s)" % ids)

    if not conditions:
        # Refuse to emit a dangling "where" or an unfiltered full-table scan.
        raise ValueError("no selection criteria for protein query")
    query += " where " + " and ".join(conditions)

    runtime().debug(query)
    cursor.execute(query)
    runtime().debug("Fetching")
    for seq_id, sequence, e_id, e_name, taxonomy_id in cursor.fetchall():
        record = SeqRecord(Seq(sequence), str(seq_id), description=e_name)
        record.annotations = {"taxonomy_id": taxonomy_id,
                              "experiment_key": e_id,
                              "organism": e_name}
        yield record
def desanitize_fasta_names_in_seqrec_list(seqrec_list, used_dict):
    """Rebuild records under the ids that ``used_dict`` maps their ids to.

    For every record a new SeqRecord is created from the same sequence with
    ``used_dict[rec.id]`` as its id. Non-empty letter annotations and
    annotations are carried over; no other attribute is copied.

    Returns the list of new records, in input order.
    """
    restored = []
    for original in seqrec_list:
        renamed = SeqRecord(original.seq, id=used_dict[original.id])
        if len(original.letter_annotations) > 0:
            renamed.letter_annotations = original.letter_annotations
        if len(original.annotations) > 0:
            renamed.annotations = original.annotations
        restored.append(renamed)
    return restored
def PhdIterator(handle):
    """Yield one SeqRecord per record in a PHD file.

    Parsing is delegated to the Bio.Sequencing.Phd module.
    """
    for phd_record in Phd.parse(handle):
        # The PHD file name serves as both identifier and name.
        sequence = phd_record.seq
        label = phd_record.file_name
        seq_record = SeqRecord(sequence, id=label, name=label)
        # The comments dictionary is reused directly as the annotations.
        seq_record.annotations = phd_record.comments
        yield seq_record
def cds_to_seqrecord(cds):
    """Create a single-gene SeqRecord from a Cds object.

    :param cds: Cds object exposing ``seq``, ``id``, ``locus_tag``,
        ``set_seqfeature()`` and ``seqfeature``.
    :returns: SeqRecord whose only feature is the Cds's own seqfeature.
    :raises AttributeError: if the Cds has no usable ``seq`` attribute.
    """
    try:
        record = SeqRecord(cds.seq)
        record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    except AttributeError:
        print("Cds object failed to be converted to SeqRecord.\n"
              "Cds valid attribute 'seq' is required to "
              "convert to SeqRecord object.")
        # Re-raise: continuing would fail below with an UnboundLocalError
        # on ``record`` that hides the real cause.
        raise

    record.name = cds.id
    # With an empty locus tag the SeqRecord's own default id is kept.
    if cds.locus_tag != "":
        record.id = cds.locus_tag

    cds.set_seqfeature()
    record.features = [cds.seqfeature]
    record.description = f"Single gene {cds.id}"
    record.annotations = get_cds_seqrecord_annotations(cds)

    return record
def sanitize_fasta_names_in_seqrec_list(seqrec_list, used_dict=None):
    """Rebuild records under freshly generated names.

    Each record gets a name from ``generate_random_name(8, <names so far>)``
    and the mapping ``new name -> original id`` is recorded. A non-empty
    ``used_dict`` is extended in place; a missing or empty one is replaced
    by a new dict. Non-empty letter annotations and annotations are carried
    over; no other attribute is copied.

    Returns ``(list of new records, mapping dict)``.
    """
    name_map = used_dict if used_dict else {}
    renamed_records = []
    for original in seqrec_list:
        alias = generate_random_name(8, list(name_map.keys()))
        name_map[alias] = original.id
        renamed = SeqRecord(original.seq, id=alias)
        if len(original.letter_annotations) > 0:
            renamed.letter_annotations = original.letter_annotations
        if len(original.annotations) > 0:
            renamed.annotations = original.annotations
        renamed_records.append(renamed)
    return renamed_records, name_map
def cds_to_seqrecord(cds, parent_genome, gene_domains=None):
    """Creates a protein SeqRecord from a Cds and its parent Genome.

    :param cds: A populated Cds object.
    :type cds: Cds
    :param parent_genome: Populated parent Genome object of the Cds object.
    :param gene_domains: Domain objects handed to
        ``get_cds_seqrecord_regions``; defaults to no domains.
    :type gene_domains: list
    :returns: Filled Biopython SeqRecord object.
    :rtype: SeqRecord
    """
    if gene_domains is None:
        # Fresh list per call instead of a shared mutable default.
        gene_domains = []

    record = SeqRecord(cds.translation)
    record.seq.alphabet = IUPAC.IUPACProtein()
    record.name = cds.id

    # A Cds without a locus tag is labelled as a draft under its own id.
    if cds.locus_tag == "" or cds.locus_tag is None:
        record.id = "DRAFT " + cds.id
    else:
        record.id = cds.locus_tag

    cds.set_seqfeature()

    source = f"{parent_genome.host_genus} phage {cds.genome_id}"
    source_feature = cds.create_seqfeature("source", 0,
                                           cds.translation_length, 1)
    source_feature.qualifiers["organism"] = [source]

    protein_feature = cds.create_seqfeature("Protein", 0,
                                            cds.translation_length, 1)

    cds_feature = cds.create_seqfeature("CDS", 0,
                                        cds.translation_length, 1)
    format_cds_seqrecord_CDS_feature(cds_feature, cds, parent_genome)

    # Order: source, Protein, CDS, then one feature per domain region.
    record.features = [source_feature, protein_feature, cds_feature]
    record.features.extend(get_cds_seqrecord_regions(gene_domains, cds))

    record.description = (f"{cds.seqfeature.qualifiers['product'][0]} "
                          f"[{source}]")
    record.annotations = get_cds_seqrecord_annotations(cds, parent_genome)

    return record
def return_seqrec(self, **kwargs) -> SeqRecord:
    """Return the assembled construct as a new SeqRecord.

    The record is built by concatenating ``basic_slice()`` of every entry
    in ``self.parts_linkers``, in order.

    Args:
        kwargs: attribute names and values set on the record after the
            defaults (id, name, description, annotations), so they
            override them.

    Returns:
        seqrec: assembled construct as a new seqrecord.
    """
    import copy

    seqrec = SeqRecord(Seq(""))
    for part_linker in self.parts_linkers:
        seqrec += part_linker.basic_slice()
    seqrec.id = self.id
    seqrec.name = "BASIC_construct_" + self.id
    seqrec.description = f"BASIC DNA Assembly of {[part_linker.name for part_linker in self.parts_linkers]}"
    # Copy so that edits to one record's annotations cannot leak into the
    # module-level defaults or into other returned records.
    seqrec.annotations = copy.deepcopy(DEFAULT_ANNOTATIONS)
    for key, value in kwargs.items():
        setattr(seqrec, key, value)
    return seqrec
def getSeqRecord(self):
    """Build a Biopython SeqRecord from this object.

    Populated attributes:
        id          - from ``self.Accession()``
        seq         - ``self.Sequence()`` wrapped in a Seq with an instance
                      of ``self.alphabetClass()``
        name        - from ``self.Name()``
        description - from ``self.Description()``
        features    - ``self.features`` (same object, not a copy)
        annotations - ``self.annotations`` (same object, not a copy)
    """
    accession = self.Accession()
    sequence = BioSeq(self.Sequence(), self.alphabetClass()())
    name = self.Name()
    description = self.Description()

    record = BioSeqRecord(id=accession,
                          seq=sequence,
                          name=name,
                          description=description)
    record.features = self.features
    record.annotations = self.annotations
    return record
def cds_to_seqrecord(cds, parent_genome):
    """Creates a SeqRecord object from a Cds and its parent Genome.

    :param cds: A populated Cds object.
    :type cds: Cds
    :param parent_genome: Populated parent Genome object of the Cds object.
    :returns: Filled Biopython SeqRecord object.
    :rtype: SeqRecord
    """
    record = SeqRecord(cds.translation)
    # The record holds the translation, i.e. a protein sequence, so it is
    # tagged with the protein alphabet rather than a DNA one.
    record.seq.alphabet = IUPAC.IUPACProtein()

    record.name = cds.id
    # With an empty locus tag the SeqRecord's own default id is kept.
    if cds.locus_tag != "":
        record.id = cds.locus_tag

    cds.set_seqfeature()
    record.features = [cds.seqfeature]
    record.description = (
        f"{cds.description} "
        f"[{parent_genome.host_genus} phage {cds.genome_id}]")
    record.annotations = get_cds_seqrecord_annotations(cds, parent_genome)

    return record
def align_multiple_sequences(seqs):
    """Align sequence records with the external ``muscle`` program.

    The records are written to a FASTA file under the placeholder ids
    ``seq0``, ``seq1``, ... so each aligned row can be matched back to its
    input by index. Each aligned row is then rebuilt with the original
    record's name, description and annotations, and an id of
    ``<annotations['OrganismCommon']>-<original id>``.

    :param seqs: sequence of SeqRecord objects; each must have an
        ``'OrganismCommon'`` annotation.
    :returns: a MultipleSeqAlignment of the realigned records.
    :raises subprocess.CalledProcessError: if ``muscle`` exits non-zero.
    """
    import os
    import subprocess as sp
    import tempfile
    from Bio import SeqIO, AlignIO
    from Bio.SeqRecord import SeqRecord
    from Bio.Align import MultipleSeqAlignment

    seqs_muscle = []
    for i, seq in enumerate(seqs):
        name = 'seq{:}'.format(i)
        seqs_muscle.append(
            SeqRecord(seq.seq, id=name, name=name, description=''))

    # A private temporary directory avoids clashes between concurrent runs
    # and is removed even when muscle or the parser fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        fn_tmp = os.path.join(tmp_dir, 'tmp.fasta')
        fn_tmp2 = os.path.join(tmp_dir, 'tmp_aligned.fasta')

        SeqIO.write(seqs_muscle, fn_tmp, 'fasta')
        # Argument list, no shell: paths are never parsed by a shell, and
        # check=True surfaces a muscle failure where it happens.
        sp.run(
            ['muscle', '-in', fn_tmp, '-out', fn_tmp2, '-diags'],
            check=True,
        )
        ali_muscle = AlignIO.read(fn_tmp2, 'fasta')

    ali = []
    for seq_muscle in ali_muscle:
        # Strip the 'seq' prefix to recover the index of the input record.
        rec = seqs[int(seq_muscle.id[3:])]
        aligned = SeqRecord(
            seq_muscle.seq,
            id=rec.annotations['OrganismCommon'] + '-' + rec.id,
            name=rec.name,
            description=rec.description,
        )
        aligned.annotations = rec.annotations
        ali.append(aligned)

    return MultipleSeqAlignment(ali)
def write_genbank_output(seq_record: SeqRecord, topology: str, organism: str, outfpath: str) -> None:
    """Append an annotated sequence record to a GenBank file.

    The record's annotations are replaced and its features re-sorted before
    writing, so ``seq_record`` is modified in place.

    :param seq_record: sequence record to output;
    :param topology: string for `topology` annotation;
    :param organism: string for `organism` annotation;
    :param outfpath: path to output file (opened in append mode);
    """

    def _feature_start(feature):
        return feature.location.start

    # Replace the annotations wholesale
    seq_record.annotations = dict(
        molecule_type='DNA',
        organism=organism,
        date=_get_date(),
        topology=topology,
    )

    # Order features by start location, ascending
    seq_record.features = sorted(seq_record.features, key=_feature_start)

    # Append the record to the output file
    with open(outfpath, 'a') as outfile:
        SeqIO.write(seq_record, outfile, 'genbank')
def PdbAtomIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Residues whose three letter code does not map to a one letter code
    (i.e. would be "X") are discarded before the sequence is built.

    In addition to information from the PDB header (which is the same for
    all records), the following chain specific information is placed in the
    annotation:

    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["model"] = Model ID (typically zero)
    record.annotations["start"] = Number of the first kept residue
    record.annotations["end"] = Number of the last kept residue

    Where amino acids are missing from the structure, as indicated by
    residue numbering, the sequence is filled in with 'X' characters to
    match the size of the missing region. If the numbering goes backwards
    after a gap, a warning is issued and the rest of the chain is dropped.

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should
    be done in parse_pdb_header, not this module.
    """
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB import PDBParser
    from Bio.SeqUtils import seq1
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return seq1(residue.resname, custom_map=to_one_letter_code)

    # Deduce the PDB ID from the PDB header
    # ENH: or filename?
    from Bio.File import UndoHandle
    undo_handle = UndoHandle(handle)
    firstline = undo_handle.peekline()
    if firstline.startswith("HEADER"):
        pdb_id = firstline[62:66]
    else:
        warnings.warn("First line is not a 'HEADER'; can't determine PDB ID")
        pdb_id = '????'

    struct = PDBParser().get_structure(pdb_id, undo_handle)
    model = struct[0]
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for chn_id, chain in sorted(model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = [res for res in chain.get_unpacked_list()
                    if seq1(res.get_resname().upper(),
                            custom_map=to_one_letter_code) != "X"]
        if not residues:
            continue

        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions)
        gaps = []
        rnumbers = [r.id[1] for r in residues]
        for i, rnum in enumerate(rnumbers[:-1]):
            if rnumbers[i + 1] != rnum + 1:
                # It's a gap!
                gaps.append((i + 1, rnum, rnumbers[i + 1]))

        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    gapsize = postgap - pregap - 1
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    prev_idx = i
                    res_out.append('X' * gapsize)
                else:
                    warnings.warn(
                        "Ignoring out-of-order residues after a gap",
                        UserWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    break
            else:
                # Last segment: added exactly once, after every gap has been
                # handled. Adding it inside the loop would repeat the
                # trailing residues once per gap.
                res_out.extend(restype(x) for x in residues[prev_idx:])
        else:
            # No gaps
            res_out = [restype(x) for x in residues]

        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(Seq(''.join(res_out), generic_protein),
                           id=record_id,
                           description=record_id,
                           )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
def PdbSeqresIterator(handle):
    """Return SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the SEQRES lines in the PDB file header,
    not the atoms of the 3D structure.

    Only SEQRES (residues) and DBREF (database cross-references) records
    are read; SEQADV and MODRES are not handled yet.

    See: http://www.wwpdb.org/documentation/format23/sect3.html

    This gets called internally via Bio.SeqIO for the SEQRES based
    interpretation of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.pdb") as handle:
    ...     for record in PdbSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Note the chain is recorded in the annotations dictionary, and any PDB
    DBREF lines are recorded in the database cross-references list.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1

    chains = collections.defaultdict(list)    # chain id -> one-letter codes
    metadata = collections.defaultdict(list)  # chain id -> DBREF entries

    for line in handle:
        rec_name = line[0:6].strip()

        if rec_name == 'SEQRES':
            # Only the chain id (column 11) and the residue names (from
            # column 19 on) are needed. The serial number (8:10) and the
            # residue count (13:17) defined by the wwPDB spec are ignored.
            # The chain id may be any single legal character, including a
            # blank when there is only one chain.
            chn_id = line[11]
            for res in line[19:].split():
                chains[chn_id].append(
                    seq1(res, custom_map=protein_letters_3to1))

        elif rec_name == 'DBREF':
            # Fixed-column fields kept here: PDB id (7:11), chain id (12),
            # database name (26:32), accession (33:41) and identification
            # code (42:54). The begin/end sequence numbers and insertion
            # codes, for both the PDB and the database segment, are ignored.
            chn_id = line[12]
            entry = {'pdb_id': line[7:11],
                     'database': line[26:32].strip(),
                     'db_acc': line[33:41].strip(),
                     'db_id_code': line[42:54].strip()}
            metadata[chn_id].append(entry)
        # ENH: 'SEQADV' 'MODRES'

    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id not in metadata:
            # No DBREF for this chain: the bare chain id is the record id.
            record.id = chn_id
        else:
            # The first DBREF names the record; every DBREF adds two xrefs.
            first = metadata[chn_id][0]
            full_id = "%s:%s" % (first['pdb_id'], chn_id)
            record.id = record.name = full_id
            record.description = ("%s:%s %s" % (first['database'],
                                                first['db_acc'],
                                                first['db_id_code']))
            for melem in metadata[chn_id]:
                for key in ('db_acc', 'db_id_code'):
                    record.dbxrefs.append(
                        "%s:%s" % (melem['database'], melem[key]))
        yield record
def CifSeqresIterator(handle):
    """Return SeqRecord objects for each chain in an mmCIF file.

    The sequences are read from the _pdbx_poly_seq_scheme entries of the
    mmCIF file, not from the atoms of the 3D structure. Database
    cross-references come from the _struct_ref_seq entries, resolved
    through the _struct_ref entries they point to.

    See:
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v40.dic/Categories/pdbx_poly_seq_scheme.html
    and
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref_seq.html

    This gets called internally via Bio.SeqIO for the sequence-based
    interpretation of the mmCIF file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Note the chain is recorded in the annotations dictionary, and any mmCIF
    _struct_ref_seq entries are recorded in the database cross-references
    list.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict

    chains = collections.defaultdict(list)    # chain id -> one-letter codes
    metadata = collections.defaultdict(list)  # chain id -> reference entries
    records = MMCIF2Dict(handle)

    # Normalise every field used below to a list (See #1533): a missing
    # field becomes an empty list, a lone value a one-element list.
    wanted_fields = (PDBX_POLY_SEQ_SCHEME_FIELDS
                     + STRUCT_REF_SEQ_FIELDS
                     + STRUCT_REF_FIELDS)
    for field in wanted_fields:
        if field not in records:
            records[field] = []
        elif not isinstance(records[field], list):
            records[field] = [records[field]]

    # Collect the residues of every chain as one-letter codes.
    for asym_id, mon_id in zip(records["_pdbx_poly_seq_scheme.asym_id"],
                               records["_pdbx_poly_seq_scheme.mon_id"]):
        chains[asym_id].append(seq1(mon_id, custom_map=protein_letters_3to1))

    # Index the _struct_ref records by their id field.
    struct_refs = {}
    for ref_id, db_name, db_code, db_acc in zip(
            records["_struct_ref.id"],
            records["_struct_ref.db_name"],
            records["_struct_ref.db_code"],
            records["_struct_ref.pdbx_db_accession"]):
        struct_refs[ref_id] = {"database": db_name,
                               "db_id_code": db_code,
                               "db_acc": db_acc}

    # For each _struct_ref_seq record, look up the _struct_ref it points to
    # and store the combined entry under its chain.
    for ref_id, pdb_id, chain_id in zip(
            records["_struct_ref_seq.ref_id"],
            records["_struct_ref_seq.pdbx_PDB_id_code"],
            records["_struct_ref_seq.pdbx_strand_id"]):
        struct_ref = struct_refs[ref_id]
        # The names here mirror those in PdbIO
        entry = {'pdb_id': pdb_id}
        entry.update(struct_ref)
        metadata[chain_id].append(entry)

    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id not in metadata:
            # No reference for this chain: the bare chain id is the id.
            record.id = chn_id
        else:
            # The first entry names the record; every entry adds two xrefs.
            first = metadata[chn_id][0]
            full_id = "%s:%s" % (first['pdb_id'], chn_id)
            record.id = record.name = full_id
            record.description = ("%s:%s %s" % (first['database'],
                                                first['db_acc'],
                                                first['db_id_code']))
            for melem in metadata[chn_id]:
                for key in ('db_acc', 'db_id_code'):
                    record.dbxrefs.append(
                        "%s:%s" % (melem['database'], melem[key]))
        yield record
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
    """Yield one SeqRecord per BLAST XML record, with hits as features.

    Each HSP becomes a top-level feature (typed by the BLAST application)
    whose ``sub_features`` are ``match_part`` features, one per part
    returned by ``generate_parts``.

    :param blastxml: input handed to ``NCBIXML.parse``.
    :param min_gap: passed to ``generate_parts`` as ``ignore_under``.
    :param trim: when true, a parent start below 1 is clamped to 0 and the
        parent end is clamped as for ``trim_end``.
    :param trim_end: when true, a parent end beyond ``hsp.query_end`` is
        clamped to ``hsp.query_end + 1``.
    :param include_seq: when true, the query, subject and match strings of
        the HSP are added as qualifiers.
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    blast_records = NCBIXML.parse(blastxml)
    for idx_record, record in enumerate(blast_records):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        match_type = {  # Currently we can only handle BLASTN, BLASTP
            "BLASTN": "nucleotide_match",
            "BLASTP": "protein_match",
        }.get(record.application, "match")

        # Only the part of the query name before the first space is the id.
        recid = record.query
        if " " in recid:
            recid = recid[0:recid.index(" ")]

        # The sequence is a fixed placeholder; only the features matter.
        rec = SeqRecord(Seq("ACTG"), id=recid)
        for idx_hit, hit in enumerate(record.alignments):
            for idx_hsp, hsp in enumerate(hit.hsps):
                # Note that the "score" qualifier holds the HSP's expect
                # value; the raw score is stored below as "blast_score".
                qualifiers = {
                    "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit.title.split(" >"),
                }
                if include_seq:
                    qualifiers.update({
                        "blast_qseq": hsp.query,
                        "blast_sseq": hsp.sbjct,
                        "blast_mseq": hsp.match,
                    })

                # Copy the HSP statistics; a missing attribute becomes None.
                for prop in (
                        "score",
                        "bits",
                        "identities",
                        "positives",
                        "gaps",
                        "align_length",
                        "strand",
                        "frame",
                        "query_start",
                        "query_end",
                        "sbjct_start",
                        "sbjct_end",
                ):
                    qualifiers["blast_" + prop] = getattr(hsp, prop, None)

                # The description is the first title from its first space
                # onward (leading space kept).
                # NOTE(review): str.index raises ValueError when the title
                # contains no space -- confirm titles always have one.
                desc = hit.title.split(" >")[0]
                qualifiers["description"] = desc[desc.index(" "):]

                # This required a fair bit of sketching out/match to figure out
                # the first time.
                #
                # the match_start location must account for queries and
                # subjects that start at locations other than 1
                parent_match_start = hsp.query_start - hsp.sbjct_start
                # The end is the start + hit.length because the match itself
                # may be longer than the parent feature, so we use the supplied
                # subject/hit length to calculate the real ending of the target
                # protein.
                parent_match_end = hsp.query_start + hit.length + hsp.query.count(
                    "-")

                # If we trim the left end, we need to trim without losing information.
                # The untrimmed start is kept because the match_part
                # coordinates below are computed from it.
                used_parent_match_start = parent_match_start
                if trim:
                    if parent_match_start < 1:
                        used_parent_match_start = 0

                if trim or trim_end:
                    if parent_match_end > hsp.query_end:
                        parent_match_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(used_parent_match_start, parent_match_end),
                    type=match_type,
                    strand=0,
                    qualifiers=qualifiers,
                )

                # Unlike the parent feature, ``match_part``s have sources.
                part_qualifiers = {"source": "blast"}
                top_feature.sub_features = []
                for idx_part, (start, end, cigar) in enumerate(
                        generate_parts(hsp.query,
                                       hsp.match,
                                       hsp.sbjct,
                                       ignore_under=min_gap)):
                    part_qualifiers["Gap"] = cigar
                    part_qualifiers["ID"] = qualifiers["ID"] + (".%s" % idx_part)

                    # Otherwise, we have to account for the subject start's location
                    match_part_start = parent_match_start + hsp.sbjct_start + start - 1

                    # We used to use hsp.align_length here, but that includes
                    # gaps in the parent sequence
                    #
                    # Furthermore align_length will give calculation errors in weird places
                    # So we just use (end-start) for simplicity
                    match_part_end = match_part_start + (end - start)

                    # part_qualifiers is reused across iterations, so each
                    # part gets its own deep copy.
                    top_feature.sub_features.append(
                        SeqFeature(
                            FeatureLocation(match_part_start, match_part_end),
                            type="match_part",
                            strand=0,
                            qualifiers=copy.deepcopy(part_qualifiers),
                        ))

                rec.features.append(top_feature)
        rec.annotations = {}
        yield rec
def CifSeqresIterator(handle):
    """Return SeqRecord objects for each chain in an mmCIF file.

    The sequences are read from the _pdbx_poly_seq_scheme entries of the
    mmCIF file, not from the atoms of the 3D structure. Database
    cross-references come from the _struct_ref_seq entries, resolved
    through the _struct_ref entries they point to.

    See:
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v40.dic/Categories/pdbx_poly_seq_scheme.html
    and
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref_seq.html

    This gets called internally via Bio.SeqIO for the sequence-based
    interpretation of the mmCIF file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Note the chain is recorded in the annotations dictionary, and any mmCIF
    _struct_ref_seq entries are recorded in the database cross-references
    list.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict

    chains = collections.defaultdict(list)    # chain id -> one-letter codes
    metadata = collections.defaultdict(list)  # chain id -> reference entries
    records = MMCIF2Dict(handle)

    # Normalise every field used below to a list (See #1533): a missing
    # field becomes an empty list, a lone value a one-element list.
    wanted_fields = (PDBX_POLY_SEQ_SCHEME_FIELDS
                     + STRUCT_REF_SEQ_FIELDS
                     + STRUCT_REF_FIELDS)
    for field in wanted_fields:
        if field not in records:
            records[field] = []
        elif not isinstance(records[field], list):
            records[field] = [records[field]]

    # Collect the residues of every chain as one-letter codes.
    for asym_id, mon_id in zip(records["_pdbx_poly_seq_scheme.asym_id"],
                               records["_pdbx_poly_seq_scheme.mon_id"]):
        chains[asym_id].append(seq1(mon_id, custom_map=protein_letters_3to1))

    # Index the _struct_ref records by their id field.
    struct_refs = {}
    for ref_id, db_name, db_code, db_acc in zip(
            records["_struct_ref.id"],
            records["_struct_ref.db_name"],
            records["_struct_ref.db_code"],
            records["_struct_ref.pdbx_db_accession"]):
        struct_refs[ref_id] = {"database": db_name,
                               "db_id_code": db_code,
                               "db_acc": db_acc}

    # For each _struct_ref_seq record, look up the _struct_ref it points to
    # and store the combined entry under its chain.
    for ref_id, pdb_id, chain_id in zip(
            records["_struct_ref_seq.ref_id"],
            records["_struct_ref_seq.pdbx_PDB_id_code"],
            records["_struct_ref_seq.pdbx_strand_id"]):
        struct_ref = struct_refs[ref_id]
        # The names here mirror those in PdbIO
        entry = {"pdb_id": pdb_id}
        entry.update(struct_ref)
        metadata[chain_id].append(entry)

    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq("".join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id not in metadata:
            # No reference for this chain: the bare chain id is the id.
            record.id = chn_id
        else:
            # The first entry names the record; every entry adds two xrefs.
            first = metadata[chn_id][0]
            full_id = "%s:%s" % (first["pdb_id"], chn_id)
            record.id = record.name = full_id
            record.description = ("%s:%s %s" % (first["database"],
                                                first["db_acc"],
                                                first["db_id_code"]))
            for melem in metadata[chn_id]:
                for key in ("db_acc", "db_id_code"):
                    record.dbxrefs.append(
                        "%s:%s" % (melem["database"], melem[key]))
        yield record
def score_mutants(mut_list, ref, min_length):
    """
    Score every mutant of one reference sequence against that reference.

    The function
    - skips mutants shorter than ``min_length``
    - attaches intron/exon/intron features, derived from the reference
      boundaries, to the reference record and to every mutant record
    - scores the reference once and each remaining mutant with
      ``score_sequence``

    :param mut_list: list of tuples whose first item is the mutant header and
        whose second item is the mutant sequence; all mutants are expected to
        belong to the same reference, because the reference name is taken
        from the first header only
    :param ref: dictionary mapping a reference name to a dictionary from
        which the keys 'seq', 'len' (upstream intron size, exon size,
        downstream intron size) and 'strand' are read
    :param min_length: minimum length of a mutant sequence
    :return: list of tab-delimited, newline-terminated strings holding
        header, sequence, mutant score and reference score. ``['']`` is
        returned when there are no mutants; when the reference is unknown
        every mutant is reported with both scores set to 'None'.
    """
    # convert mutant list to dictionary (a repeated header keeps its last
    # sequence)
    mutants = {x[0]: x[1] for x in mut_list}
    if not mutants:
        return ['']

    # The reference name is the first whitespace-delimited token of a header,
    # cut at the first '.'. next(iter(...)) is used rather than keys()[0],
    # which only works on Python 2 where keys() returns a list.
    name = next(iter(mutants)).split()[0].split('.')[0]
    ref_features = ref.get(name)
    if not ref_features:
        return ['\t'.join([header, seq, 'None', 'None']) + '\n'
                for header, seq in mutants.items()]

    # create SeqRecord for original feature; its annotations are the
    # reference entry itself (not a copy)
    orig_record = SeqRecord(Seq(ref_features['seq']), id=name)
    orig_record.annotations = ref_features
    strand = orig_record.annotations["strand"]

    # feature boundaries: upstream intron, exon, downstream intron laid out
    # back to back starting at position 0
    upstr_intron_size, exon_size, downstr_intron_size = ref_features['len']
    upstr_loc = (0, upstr_intron_size)
    exon_loc = (upstr_loc[1], upstr_loc[1] + exon_size)
    downstr_loc = (exon_loc[1], exon_loc[1] + downstr_intron_size)

    # NOTE(review): the same three SeqFeature objects are attached to the
    # reference record and to every mutant record, as in the original code.
    # This is only safe if score_sequence does not modify features - confirm.
    shared_features = [
        SeqFeature(FeatureLocation(*upstr_loc), type="intron", strand=strand),
        SeqFeature(FeatureLocation(*exon_loc), type="exon", strand=strand),
        SeqFeature(FeatureLocation(*downstr_loc), type="intron",
                   strand=strand),
    ]
    orig_record.features.extend(shared_features)
    orig_record, orig_score = score_sequence(orig_record)

    scores = []
    annot_list = ['unknown', 'cigar', 'md', 'alignment']
    for header, seq in mutants.items():
        if len(seq) < min_length:
            continue
        # The record id is the first header token without its first
        # character.
        record = SeqRecord(Seq(seq.strip()), id=header.split()[0][1:])
        # Remaining header tokens are stored positionally under annot_list.
        for k, v in zip(annot_list, header.strip().split()[1:]):
            record.annotations[k] = v
        record.annotations['strand'] = ref_features['strand']
        record.features.extend(shared_features)
        record, mut_score = score_sequence(record)
        scores.append(
            '\t'.join([header, seq, str(mut_score), str(orig_score)]) + '\n')
    return scores
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):
    """Convert BLAST XML into SeqRecords carrying match / match_part features.

    One SeqRecord is built per BLAST query. Every HSP of every hit becomes a
    top-level feature on it, with ``sub_features`` holding one ``match_part``
    per piece returned by ``generate_parts``.

    :param blastxml: BLAST XML input, handed to ``NCBIXML.parse``
    :param min_gap: passed to ``generate_parts`` as ``ignore_under``
    :param trim: clamp a parent start below 1 to 0, clamp the parent end as
        for ``trim_end``, and position match_parts relative to the parent
        start
    :param trim_end: clamp a parent end beyond ``hsp.query_end`` to
        ``hsp.query_end + 1``
    :return: list of SeqRecord objects, one per query
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Currently we can only handle BLASTN, BLASTP
    so_terms = {
        'BLASTN': 'nucleotide_match',
        'BLASTP': 'protein_match',
    }

    records = []
    for blast_record in NCBIXML.parse(blastxml):
        match_type = so_terms.get(blast_record.application, 'match')
        rec = SeqRecord(Seq("ACTG"), id=blast_record.query)

        for hit in blast_record.alignments:
            for hsp in hit.hsps:
                titles = hit.title.split(' >')
                desc = titles[0]
                qualifiers = {
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": titles,
                }
                # Everything from the first space of the first title onwards.
                qualifiers['description'] = desc[desc.index(' '):]

                # The parent start must account for queries and subjects
                # that start at locations other than 1.
                parent_start = hsp.query_start - hsp.sbjct_start
                # The parent end uses the full subject length (plus the gaps
                # in the aligned query), since the match itself may be
                # longer than the parent feature.
                parent_end = (hsp.query_start + hit.length
                              + hsp.query.count('-'))

                # When trimming, cut the parent back to the query: start at
                # 0 and end at the query end. (It maybe should be the
                # feature end, but that data is not available here.)
                if trim and parent_start < 1:
                    parent_start = 0
                if (trim or trim_end) and parent_end > hsp.query_end:
                    parent_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(parent_start, parent_end),
                    type=match_type,
                    strand=0,
                    qualifiers=qualifiers,
                )
                top_feature.sub_features = []

                parts = generate_parts(hsp.query, hsp.match, hsp.sbjct,
                                       ignore_under=min_gap)
                for start, end, cigar in parts:
                    # Unlike the parent feature, ``match_part``s have
                    # sources.
                    part_qualifiers = copy.deepcopy({
                        "source": "blast",
                        "Gap": cigar,
                        "ID": hit.hit_id,
                    })
                    # If trimming, parts are placed relative to the match
                    # start; otherwise the subject start is accounted for.
                    part_start = (
                        parent_start + start
                        if trim
                        else parent_start + hsp.sbjct_start + start - 1
                    )
                    # (end - start) is used rather than hsp.align_length,
                    # which includes gaps in the parent sequence.
                    part_end = part_start + (end - start)
                    top_feature.sub_features.append(
                        SeqFeature(
                            FeatureLocation(part_start, part_end),
                            type="match_part",
                            strand=0,
                            qualifiers=part_qualifiers,
                        ))

                rec.features.append(top_feature)

        rec.annotations = {}
        records.append(rec)
    return records
def shinefind(
    genbank_file,
    gff3_output=None,
    table_output=None,
    lookahead_min=5,
    lookahead_max=15,
    top_only=False,
    add=False,
):
    """Report Shine-Dalgarno (SD) candidates upstream of every CDS feature.

    For each GenBank record, each CDS with a not-yet-seen start terminus is
    passed to ``NaiveSDCaller.testFeatureUpstream``. Every SD found is
    written as one row of the tab-separated table; a CDS without any SD gets
    a single row with ``None`` / ``-1`` in the SD and spacing columns. The
    (possibly augmented) record is written to ``sys.stdout`` as GenBank and
    the SD features are written through ``GFF.write``.

    :param genbank_file: GenBank input, handed to ``SeqIO.parse``
    :param gff3_output: destination handed to ``GFF.write``
    :param table_output: object with a ``write`` method receiving the table.
        It is written to unconditionally, so it must be supplied even though
        the default is ``None``.
    :param lookahead_min: passed as ``sd_min``; also added to the reported
        spacing
    :param lookahead_max: passed as ``sd_max``
    :param top_only: report only the first SD of each CDS
    :param add: also append the reported SD features to the GenBank record
    """
    def write_row(fields):
        table_output.write("\t".join(map(str, fields)) + "\n")

    write_row([
        "ID",
        "Name",
        "Terminus",
        "Terminus",
        "Strand",
        "Upstream Sequence",
        "SD",
        "Spacing",
    ])

    sd_finder = NaiveSDCaller()
    for record in list(SeqIO.parse(genbank_file, "genbank")):
        # Sometimes TWO CDS features have the same start. Only handle ONE.
        seen = set()
        # Shinefind's "gff3_output".
        gff3_output_record = SeqRecord(record.seq, record.id)

        # Iterate over a snapshot: with add=True the loop body appends SD
        # features to record.features, and a list must not grow while it is
        # being iterated.
        for feature in list(record.features):
            if feature.type != "CDS":
                continue

            seen_loc = (feature.location.start if feature.strand > 0
                        else feature.location.end)
            if seen_loc in seen:
                continue
            seen.add(seen_loc)

            sds, start, end, seq = sd_finder.testFeatureUpstream(
                feature, record, sd_min=lookahead_min, sd_max=lookahead_max)
            feature_id = get_id(feature)
            sd_features = sd_finder.to_features(
                sds, feature.location.strand, start, end,
                feature_id=feature.id)
            human_strand = "+" if feature.location.strand == 1 else "-"
            log.debug("Found %s SDs", len(sds))

            # No SD at all: emit the placeholder row. This used to live in
            # the ``else`` arm of the for loop below, which also ran after
            # every loop that finished without ``break`` (i.e. whenever
            # top_only was False) and was skipped there with a confusing
            # "should not reach here" debug message.
            if len(sds) == 0:
                write_row([
                    feature.id,
                    feature_id,
                    feature.location.start,
                    feature.location.end,
                    human_strand,
                    seq,
                    None,
                    -1,
                ])
                continue

            for sd, sd_feature in zip(sds, sd_features):
                write_row([
                    feature.id,
                    feature_id,
                    feature.location.start,
                    feature.location.end,
                    human_strand,
                    sd_finder.highlight_sd(seq, sd["start"], sd["end"]),
                    sd["hit"],
                    int(sd["spacing"]) + lookahead_min,
                ])
                if add:
                    # Append the RBS to the record's own feature list
                    record.features.append(sd_feature)
                # Also register the feature with the separate GFF3 output
                gff3_output_record.features.append(sd_feature)
                if top_only:
                    break

        record.features = sorted(record.features,
                                 key=lambda x: x.location.start)
        SeqIO.write([record], sys.stdout, "genbank")

        gff3_output_record.features = sorted(gff3_output_record.features,
                                             key=lambda x: x.location.start)
        gff3_output_record.annotations = {}
        GFF.write([gff3_output_record], gff3_output)
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
    """Yield one SeqRecord per BLAST query, with a ``match`` feature per hit.

    Every hit becomes a ``match`` feature whose ``sub_features`` hold one
    ``match_part`` per HSP. The parent location is set, after all HSPs are
    seen, to the smallest ``sbjct_start`` and largest ``sbjct_end`` among
    them, while each ``match_part`` is located at the HSP's query
    coordinates.

    :param blastxml: BLAST XML input, handed to ``NCBIXML.parse``
    :param min_gap: accepted but not used by this implementation
    :param trim: accepted but not used by this implementation
    :param trim_end: accepted but not used by this implementation
    :param include_seq: also store the aligned query, subject and match
        strings as ``blast_qseq`` / ``blast_sseq`` / ``blast_mseq``
    :return: generator of SeqRecord objects
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    hsp_props = (
        'score', 'bits', 'identities', 'positives', 'gaps', 'align_length',
        'strand', 'frame', 'query_start', 'query_end', 'sbjct_start',
        'sbjct_end',
    )

    for idx_record, record in enumerate(NCBIXML.parse(blastxml)):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        # The looked-up value is not used; features are typed 'match' below.
        MATCH_TYPE.get(record.application, 'match')

        # The record id is the query name up to its first space.
        recid = record.query.split(' ', 1)[0]
        rec = SeqRecord(Seq("ACTG"), id=recid)

        for idx_hit, hit in enumerate(record.alignments):
            qualifiers = {
                "ID": 'b2g.%s.%s' % (idx_record, idx_hit),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": hit.hit_id,
                "length": hit.length,
                "hit_titles": hit.title.split(' >'),
            }
            # Placeholder location; overwritten once the HSPs are known.
            top_feature = SeqFeature(
                FeatureLocation(1, 1000000000),  # TODO.
                type='match',
                strand=0,
                qualifiers=qualifiers,
            )
            top_feature.sub_features = []

            feat_min = feat_max = None
            for idx_hsp, hsp in enumerate(hit.hsps):
                # Start from "blastn" and layer the parent qualifiers on
                # top, which replaces the source with the parent's "blast".
                part_qualifiers = dict({"source": "blastn"}, **qualifiers)
                part_qualifiers['ID'] = '%s.%s' % (qualifiers['ID'], idx_hsp)

                if include_seq:
                    part_qualifiers['blast_qseq'] = hsp.query
                    part_qualifiers['blast_sseq'] = hsp.sbjct
                    part_qualifiers['blast_mseq'] = hsp.match

                for prop in hsp_props:
                    part_qualifiers['blast_' + prop] = getattr(hsp, prop, None)

                first_title = hit.title.split(' >')[0]
                part_qualifiers['description'] = (
                    first_title[first_title.index(' '):])
                part_qualifiers['score'] = hsp.expect

                # Track the subject extent covered by all HSPs of this hit.
                if feat_min is None:
                    feat_min, feat_max = hsp.sbjct_start, hsp.sbjct_end
                else:
                    feat_min = min(feat_min, hsp.sbjct_start)
                    feat_max = max(feat_max, hsp.sbjct_end)

                top_feature.sub_features.append(
                    SeqFeature(
                        FeatureLocation(hsp.query_start, hsp.query_end),
                        type="match_part",
                        strand=0,
                        qualifiers=copy.deepcopy(part_qualifiers),
                    ))

            # Both stay None when the hit has no HSPs.
            top_feature.location._start = feat_min
            top_feature.location._end = feat_max
            rec.features.append(top_feature)

        rec.annotations = {}
        yield rec
def to_seqrecord(self):
    """Create a SeqRecord object from this Sequence instance.

    The record's id, name and description come from ``accession``,
    ``symbol`` and ``name``; its features come from the domain architecture
    when one is present. Entries whose value is None are left out of every
    dictionary described below.

    The seqrecord.annotations dictionary is packed like so::

        { # Sequence attributes with no SeqRecord equivalent:
          'id_ref':     self.id_ref,
          'id_source':  self.id_source,
          'location':   self.location,
          'uri':        { 'value': self.uri.value,
                          'desc': self.uri.desc,
                          'type': self.uri.type },
          # 'DNA', 'RNA' or 'protein', derived from self.type
          'molecule_type': ...,
          # Sequence.annotations attribute (list of Annotations)
          'annotations': [{'ref': ann.ref,
                           'source': ann.source,
                           'evidence': ann.evidence,
                           'type': ann.type,
                           'confidence': [ann.confidence.value,
                                          ann.confidence.type],
                           'properties': [{'value': prop.value,
                                           'ref': prop.ref,
                                           'applies_to': prop.applies_to,
                                           'datatype': prop.datatype,
                                           'unit': prop.unit,
                                           'id_ref': prop.id_ref}
                                          for prop in ann.properties],
                           } for ann in self.annotations],
        }

    """
    def without_nones(mapping):
        """Return a copy of ``mapping`` without its None-valued items."""
        return dict(
            (key, val) for key, val in mapping.items() if val is not None)

    def pack_property(prop):
        """Pack one annotation property into a dictionary."""
        return without_nones({
            "value": prop.value,
            "ref": prop.ref,
            "applies_to": prop.applies_to,
            "datatype": prop.datatype,
            "unit": prop.unit,
            "id_ref": prop.id_ref,
        })

    def pack_annotation(ann):
        """Pack one Annotation into a dictionary."""
        return without_nones({
            "ref": ann.ref,
            "source": ann.source,
            "evidence": ann.evidence,
            "type": ann.type,
            "confidence": ann.confidence
            and [ann.confidence.value, ann.confidence.type],
            "properties": [pack_property(prop) for prop in ann.properties],
        })

    record_kwargs = without_nones({
        "id": str(self.accession),
        "name": self.symbol,
        "description": self.name,
        # 'dbxrefs': None,
    })
    seqrec = SeqRecord(Seq(self.mol_seq.value), **record_kwargs)

    if self.domain_architecture:
        seqrec.features = [
            domain.to_seqfeature()
            for domain in self.domain_architecture.domains
        ]

    # Sequence attributes with no SeqRecord equivalent.
    # Any type other than these three yields no molecule_type entry.
    molecule_type = {
        "dna": "DNA",
        "rna": "RNA",
        "protein": "protein",
    }.get(self.type)

    seqrec.annotations = without_nones({
        "id_ref": self.id_ref,
        "id_source": self.id_source,
        "location": self.location,
        "uri": self.uri and without_nones({
            "value": self.uri.value,
            "desc": self.uri.desc,
            "type": self.uri.type,
        }),
        "molecule_type": molecule_type,
        "annotations": self.annotations
        and [pack_annotation(ann) for ann in self.annotations],
    })
    return seqrec
def iterate(self, handle):
    """Iterate over the records in the PDB file.

    SEQRES lines supply the residues of each chain and DBREF lines supply
    the database cross-references. One SeqRecord is yielded per chain, in
    sorted chain-id order. ValueError is raised if the handle yields no
    lines at all.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    # Not sure if this is really needed; Python can handle circular dependencies.
    from Bio.SeqUtils import seq1

    chain_residues = collections.defaultdict(list)
    chain_dbrefs = collections.defaultdict(list)

    rec_name = None
    for line in handle:
        rec_name = line[0:6].strip()

        if rec_name == "SEQRES":
            # NB: Only the chain ID and the residues are needed here.
            # Columns of the wwPDB spec that are not read:
            #   line[8:10]   serial number of the SEQRES record within the
            #                chain (starts at 1, resets for each chain)
            #   line[13:17]  number of residues in the chain (repeated on
            #                every record)
            # Chain identifier: any single legal character, including a
            # blank which is used if there is only one chain.
            chn_id = line[11]
            one_letter = [
                seq1(three_letter, custom_map=protein_letters_3to1)
                for three_letter in line[19:].split()
            ]
            chain_residues[chn_id].extend(one_letter)

        elif rec_name == "DBREF":
            # Columns of the wwPDB spec that are not read:
            #   line[14:18] / line[18]  initial sequence number / insertion
            #                           code of the PDB sequence segment
            #   line[20:24] / line[24]  ending sequence number / insertion
            #                           code of the PDB sequence segment
            #   line[55:60] / line[60]  initial sequence number / insertion
            #                           code of the database segment
            #   line[62:67] / line[67]  ending sequence number / insertion
            #                           code of the database segment
            chain_dbrefs[line[12]].append({
                # ID code of this entry (PDB ID)
                "pdb_id": line[7:11],
                # Sequence database name.
                "database": line[26:32].strip(),
                # Sequence database accession code.
                "db_acc": line[33:41].strip(),
                # Sequence database identification code.
                "db_id_code": line[42:54].strip(),
            })
        # ENH: 'SEQADV' 'MODRES'

    if rec_name is None:
        raise ValueError("Empty file.")

    for chn_id, residues in sorted(chain_residues.items()):
        record = SeqRecord(Seq("".join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id not in chain_dbrefs:
            record.id = chn_id
        else:
            refs = chain_dbrefs[chn_id]
            first = refs[0]
            record.id = record.name = "%s:%s" % (first["pdb_id"], chn_id)
            record.description = "%s:%s %s" % (
                first["database"], first["db_acc"], first["db_id_code"])
            for ref in refs:
                record.dbxrefs.append(
                    "%s:%s" % (ref["database"], ref["db_acc"]))
                record.dbxrefs.append(
                    "%s:%s" % (ref["database"], ref["db_id_code"]))
        yield record
def PdbAtomIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Residues whose three letter code does not map to a one-letter amino acid
    code (e.g. "CSD") are left out before the sequence is built.

    In addition to information from the PDB header (which is the same for all
    records), the following chain specific information is placed in the
    annotation:

    record.annotations["model"] = Model ID (typically zero)
    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["start"] = Number of the first residue kept
    record.annotations["end"] = Number of the last residue kept

    Where amino acids are missing from the structure, as indicated by residue
    numbering, the sequence is filled in with 'X' characters to match the
    size of the missing region. Consecutive residues sharing one residue
    number (i.e. differing only by insertion code) are not treated as a gap.
    If the numbering goes backwards, a warning is issued and the rest of the
    chain is dropped.

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.

    This gets called internally via Bio.SeqIO for the atom based
    interpretation of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-atom"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    Equivalently,

    >>> with open("PDB/1A8O.pdb") as handle:
    ...     for record in PdbAtomIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    """
    # TODO - Add record.annotations to the doctest
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB import PDBParser
    from Bio.SeqUtils import seq1

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return seq1(residue.resname, custom_map=protein_letters_3to1)

    # Deduce the PDB ID from the PDB header
    # ENH: or filename?
    from Bio.File import UndoHandle
    undo_handle = UndoHandle(handle)
    firstline = undo_handle.peekline()
    if firstline.startswith("HEADER"):
        pdb_id = firstline[62:66]
    else:
        warnings.warn(
            "First line is not a 'HEADER'; can't determine PDB ID. "
            "Line: %r" % firstline, BiopythonWarning)
        pdb_id = '????'

    struct = PDBParser().get_structure(pdb_id, undo_handle)
    # Only the first model is used.
    model = struct[0]
    for chn_id, chain in sorted(model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = [
            res for res in chain.get_unpacked_list()
            if seq1(res.get_resname().upper(),
                    custom_map=protein_letters_3to1) != "X"
        ]
        if not residues:
            continue

        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions).
        # A gap is any step in the residue numbers other than +1. A step of
        # 0 is NOT a gap: residues that differ only by insertion code share
        # one number, and treating them as a gap made the code below take
        # the "out-of-order" branch and drop the rest of the chain.
        rnumbers = [r.id[1] for r in residues]
        gaps = [
            (idx, prev_num, next_num)
            for idx, (prev_num, next_num)
            in enumerate(zip(rnumbers, rnumbers[1:]), 1)
            if next_num not in (prev_num, prev_num + 1)
        ]

        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    gapsize = postgap - pregap - 1
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    prev_idx = i
                    res_out.append('X' * gapsize)
                else:
                    warnings.warn(
                        "Ignoring out-of-order residues after a gap",
                        BiopythonWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    break
            else:
                # Last segment (only reached when no break happened)
                res_out.extend(restype(x) for x in residues[prev_idx:])
        else:
            # No gaps
            res_out = [restype(x) for x in residues]

        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        record = SeqRecord(
            Seq(''.join(res_out), generic_protein),
            id=record_id,
            description=record_id,
        )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])
        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):
    """Build SeqRecords with match / match_part features from BLAST XML.

    A SeqRecord is created for each BLAST query. Each HSP of each hit is
    turned into a parent feature on that record, and the pieces returned by
    ``generate_parts`` for the HSP become its ``match_part`` sub-features.

    :param blastxml: BLAST XML input, handed to ``NCBIXML.parse``
    :param min_gap: forwarded to ``generate_parts`` as ``ignore_under``
    :param trim: a parent start below 1 becomes 0, the parent end is limited
        as for ``trim_end``, and match_parts are positioned relative to the
        parent start
    :param trim_end: a parent end past ``hsp.query_end`` becomes
        ``hsp.query_end + 1``
    :return: list with one SeqRecord per query
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    clamp_end = trim or trim_end
    output = []

    for query_result in NCBIXML.parse(blastxml):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        # Currently we can only handle BLASTN, BLASTP
        application = query_result.application
        if application == "BLASTN":
            match_type = "nucleotide_match"
        elif application == "BLASTP":
            match_type = "protein_match"
        else:
            match_type = "match"

        rec = SeqRecord(Seq("ACTG"), id=query_result.query)

        for hit in query_result.alignments:
            for hsp in hit.hsps:
                hit_titles = hit.title.split(" >")
                leading_title = hit_titles[0]
                qualifiers = {
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit_titles,
                }
                qualifiers["description"] = leading_title[
                    leading_title.index(" "):]

                # The start must account for queries and subjects that
                # start at locations other than 1.
                span_start = hsp.query_start - hsp.sbjct_start
                # The end is based on hit.length because the match itself
                # may be longer than the parent feature; gaps in the aligned
                # query are added on top.
                span_end = hsp.query_start + hit.length + hsp.query.count("-")

                # If trimming is requested, cut the start to 0 and the end
                # to the query's end. (The feature end would maybe be
                # better, but that data is not available here.)
                if trim:
                    span_start = 0 if span_start < 1 else span_start
                if clamp_end:
                    if span_end > hsp.query_end:
                        span_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                parent = SeqFeature(
                    FeatureLocation(span_start, span_end),
                    type=match_type,
                    strand=0,
                    qualifiers=qualifiers,
                )
                parent.sub_features = []

                # When trimming, parts start relative to the match's start;
                # otherwise the subject start has to be accounted for.
                shift = 0 if trim else hsp.sbjct_start - 1

                # Unlike the parent feature, ``match_part``s have sources.
                child_qualifiers = {"source": "blast"}
                for start, end, cigar in generate_parts(
                        hsp.query, hsp.match, hsp.sbjct,
                        ignore_under=min_gap):
                    child_qualifiers["Gap"] = cigar
                    child_qualifiers["ID"] = hit.hit_id

                    child_start = span_start + shift + start
                    # hsp.align_length is not used: it includes gaps in the
                    # parent sequence, so (end - start) is used instead.
                    child_end = child_start + (end - start)

                    child = SeqFeature(
                        FeatureLocation(child_start, child_end),
                        type="match_part",
                        strand=0,
                        qualifiers=copy.deepcopy(child_qualifiers),
                    )
                    parent.sub_features.append(child)

                rec.features.append(parent)

        rec.annotations = {}
        output.append(rec)

    return output
def prodigal_parser(seq_file, sco_file, prefix, output_folder):
    """Turn Prodigal's .sco gene coordinates into .ffn, .faa and .gbk files.

    The sequences are read from ``seq_file`` (FASTA) and the gene
    coordinates from ``sco_file``. For every gene its nucleotide sequence is
    exported with ``export_dna_record``, its translation with
    ``export_aa_record``, and a CDS feature is added to the GenBank record
    of the sequence it lies on. Genes are numbered consecutively across all
    sequences and named ``<prefix>_<5-digit index>``.

    :param seq_file: FASTA file holding the sequences genes were called on
    :param sco_file: coordinate file; lines starting with
        '# Sequence Data' and '# Model Data' are read as per-sequence
        headers and every other line as one gene
    :param prefix: base name of the output files and of the locus tags; also
        used as GenBank source and organism
    :param output_folder: folder the three output files are written to
    """
    pwd_bin_ffn_file = '%s/%s.ffn' % (output_folder, prefix)
    pwd_bin_faa_file = '%s/%s.faa' % (output_folder, prefix)
    pwd_bin_gbk_file = '%s/%s.gbk' % (output_folder, prefix)

    # get sequence id list (kept in file order) and id -> sequence string
    id_to_sequence_dict = {}
    sequence_id_list = []
    for each_seq in SeqIO.parse(seq_file, 'fasta'):
        id_to_sequence_dict[each_seq.id] = str(each_seq.seq)
        sequence_id_list.append(each_seq.id)

    # get sequence to cds dict and sequence to transl_table dict
    current_seq_id = ''
    current_transl_table = ''
    current_seq_csd_list = []
    seq_to_cds_dict = {}
    seq_to_transl_table_dict = {}
    # The handle is closed as soon as parsing is done.
    with open(sco_file) as sco_handle:
        for each_cds in sco_handle:
            if each_cds.startswith('# Sequence Data'):
                # a new sequence starts: store what was collected so far
                if current_seq_id != '':
                    seq_to_cds_dict[current_seq_id] = current_seq_csd_list
                    seq_to_transl_table_dict[current_seq_id] = \
                        current_transl_table
                # The id is the first space-delimited word of the seqhdr
                # value, without the surrounding quote characters.
                seq_header = each_cds.strip().split(';seqhdr=')[1][1:-1]
                current_seq_id = seq_header.split(' ')[0]
                current_transl_table = ''
                current_seq_csd_list = []
            elif each_cds.startswith('# Model Data'):
                # value of the second-to-last ';'-separated field
                current_transl_table = \
                    each_cds.strip().split(';')[-2].split('=')[-1]
            else:
                # drop the leading '_'-separated field, keeping
                # start_end_strand
                current_seq_csd_list.append(
                    '_'.join(each_cds.strip().split('_')[1:]))
    # store the last sequence
    seq_to_cds_dict[current_seq_id] = current_seq_csd_list
    seq_to_transl_table_dict[current_seq_id] = current_transl_table

    # The with statement closes all three outputs even when an error occurs
    # part-way through.
    with open(pwd_bin_gbk_file, 'w') as bin_gbk_file_handle, \
            open(pwd_bin_ffn_file, 'w') as bin_ffn_file_handle, \
            open(pwd_bin_faa_file, 'w') as bin_faa_file_handle:
        gene_index = 1
        for seq_id in sequence_id_list:
            sequence_str = id_to_sequence_dict[seq_id]

            # create SeqRecord
            current_SeqRecord = SeqRecord(Seq(sequence_str), id=seq_id)
            current_SeqRecord.seq.alphabet = generic_dna
            transl_table = seq_to_transl_table_dict[seq_id]

            # add SeqRecord annotations
            current_SeqRecord.annotations = {
                'date': datetime.now().strftime('%d-%b-%Y').upper(),
                'accession': '',
                'version': '',
                'keywords': ['.'],
                'source': prefix,
                'organism': prefix,
                'taxonomy': ['Unclassified'],
                'comment': '.',
            }

            # add SeqFeature to SeqRecord
            for cds in seq_to_cds_dict[seq_id]:
                # define locus_tag id
                locus_tag_id = '%s_%s' % (prefix,
                                          "{:0>5}".format(gene_index))

                # define FeatureLocation
                cds_split = cds.split('_')
                cds_start = SF.ExactPosition(int(cds_split[0]))
                cds_end = SF.ExactPosition(int(cds_split[1]))
                cds_strand = cds_split[2]
                # any strand symbol other than '+' / '-' gives None
                current_strand = {'+': 1, '-': -1}.get(cds_strand)
                # NOTE(review): the slice below treats the coordinates as
                # 1-based inclusive (cds_start - 1), while cds_start is
                # passed unshifted here. If FeatureLocation expects a
                # 0-based start, the feature begins one base late - confirm
                # before changing, since existing outputs depend on it.
                current_feature_location = FeatureLocation(
                    cds_start, cds_end, strand=current_strand)

                # get nc sequence
                sequence_nc = ''
                if cds_strand == '+':
                    sequence_nc = sequence_str[cds_start - 1:cds_end]
                if cds_strand == '-':
                    sequence_nc = str(
                        Seq(sequence_str[cds_start - 1:cds_end],
                            generic_dna).reverse_complement())

                # translate to aa sequence
                sequence_aa = str(
                    Seq(sequence_nc).translate(table=transl_table))
                # remove * at the end - only if it is there, so that a gene
                # without a stop codon keeps its last amino acid
                if sequence_aa.endswith('*'):
                    sequence_aa = sequence_aa[:-1]

                # export nc and aa sequences
                export_dna_record(sequence_nc, locus_tag_id, '',
                                  bin_ffn_file_handle)
                export_aa_record(sequence_aa, locus_tag_id, '',
                                 bin_faa_file_handle)

                # Create a CDS SeqFeature and append it to the SeqRecord
                current_feature = SeqFeature(
                    current_feature_location,
                    type='CDS',
                    qualifiers={
                        'locus_tag': locus_tag_id,
                        'transl_table': transl_table,
                        'translation': sequence_aa,
                    })
                current_SeqRecord.features.append(current_feature)
                gene_index += 1

            # export to gbk file
            SeqIO.write(current_SeqRecord, bin_gbk_file_handle, 'genbank')
def blastxml2gff3(blastxml):
    """Convert BLAST XML into SeqRecords keyed by subject (hit) accession.

    One SeqRecord is built per hit accession. Every HSP against that
    accession, from any query, adds a ``match`` feature located at the
    HSP's subject coordinates, with a single ``match_part`` sub-feature
    covering the same span. The query name is stored in the ``hit_name``
    and ``Name`` qualifiers.

    :param blastxml: BLAST XML input, handed to ``NCBIXML.parse``
    :return: list of SeqRecord objects, one per hit accession, in order of
        first appearance where the dictionary preserves insertion order
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    records = {}
    for record in NCBIXML.parse(blastxml):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        match_type = 'match'
        for hit in record.alignments:
            # Look up and store under the SAME key. Records used to be
            # looked up by accession but stored under hit_id, so whenever
            # the two differed a fresh record replaced the stored one and
            # the features collected from earlier queries were lost.
            rec = records.get(hit.accession)
            if rec is None:
                rec = SeqRecord(Seq("ACTG"), id=hit.accession)
                records[hit.accession] = rec

            for hsp in hit.hsps:
                # The sign of the second frame value gives the strand.
                if hsp.frame[1] < 0:
                    strand = -1
                elif hsp.frame[1] == 0:
                    strand = 0
                else:
                    strand = 1

                qualifiers = {
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_name": record.query,
                    "Name": record.query,
                }
                # Description: first title, from its first space onwards,
                # unless it is the placeholder for a missing definition.
                desc = hit.title.split(' >')[0]
                desc = desc[desc.index(' '):]
                if desc != ' No definition line':
                    qualifiers['description'] = desc

                # Subject coordinates may be given high-to-low; order them.
                match_start = min(hsp.sbjct_start, hsp.sbjct_end)
                match_end = max(hsp.sbjct_start, hsp.sbjct_end)

                # The ``match`` feature holds a single ``match_part`` that
                # covers the same span.
                top_feature = SeqFeature(
                    FeatureLocation(match_start, match_end),
                    type=match_type,
                    strand=strand,
                    qualifiers=qualifiers)
                top_feature.sub_features = [
                    SeqFeature(
                        FeatureLocation(match_start, match_end),
                        type="match_part",
                        strand=strand,
                        qualifiers={"source": "blast"}),
                ]
                rec.features.append(top_feature)

            rec.annotations = {}
    return list(records.values())
def PdbSeqresIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the SEQRES lines in the PDB file header,
    not the atoms of the 3D structure.

    SEQRES lines supply the residues and DBREF lines supply the database
    cross-references. SEQADV and MODRES lines are not interpreted yet.

    See: http://www.wwpdb.org/documentation/format23/sect3.html

    This gets called internally via Bio.SeqIO for the SEQRES based
    interpretation of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.pdb") as handle:
    ...     for record in PdbSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Note the chain is recorded in the annotations dictionary, and any PDB
    DBREF lines are recorded in the database cross-references list.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1

    sequences = collections.defaultdict(list)
    dbrefs = collections.defaultdict(list)

    for line in handle:
        kind = line[0:6].strip()

        if kind == 'SEQRES':
            # NB: Only the chain ID and the residues are needed here.
            # Unread columns of the wwPDB spec:
            #   line[8:10]   serial number of this SEQRES record within
            #                the chain (starts at 1, resets per chain)
            #   line[13:17]  number of residues in the chain (repeated on
            #                every record)
            # Chain identifier: any single legal character, including a
            # blank which is used if there is only one chain.
            chn_id = line[11]
            codes = [
                seq1(name, custom_map=protein_letters_3to1)
                for name in line[19:].split()
            ]
            sequences[chn_id].extend(codes)

        elif kind == 'DBREF':
            # Unread columns of the wwPDB spec:
            #   line[14:18], line[18]  initial sequence number and insertion
            #                          code of the PDB sequence segment
            #   line[20:24], line[24]  ending sequence number and insertion
            #                          code of the PDB sequence segment
            #   line[55:60], line[60]  initial sequence number and insertion
            #                          code of the database segment
            #   line[62:67], line[67]  ending sequence number and insertion
            #                          code of the database segment
            pdb_id = line[7:11]                # ID code of this entry
            chn_id = line[12]                  # chain identifier
            database = line[26:32].strip()     # sequence database name
            db_acc = line[33:41].strip()       # database accession code
            db_id_code = line[42:54].strip()   # database identification code
            dbrefs[chn_id].append({
                'pdb_id': pdb_id,
                'database': database,
                'db_acc': db_acc,
                'db_id_code': db_id_code,
            })
        # ENH: 'SEQADV' 'MODRES'

    for chn_id, residues in sorted(sequences.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id not in dbrefs:
            record.id = chn_id
        else:
            refs = dbrefs[chn_id]
            head = refs[0]
            record.id = record.name = "%s:%s" % (head['pdb_id'], chn_id)
            record.description = "%s:%s %s" % (
                head['database'], head['db_acc'], head['db_id_code'])
            for ref in refs:
                record.dbxrefs.append(
                    "%s:%s" % (ref['database'], ref['db_acc']))
                record.dbxrefs.append(
                    "%s:%s" % (ref['database'], ref['db_id_code']))
        yield record
def PdbSeqresIterator(handle):
    """Return SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the SEQRES lines in the
    PDB file header, not the atoms of the 3D structure.

    Only SEQRES and DBREF records are parsed. SEQRES lines supply the
    residues of each chain (three-letter codes missing from the lookup
    table become 'X'); DBREF lines supply the record id, name,
    description and database cross-references. SEQADV and MODRES records
    are not handled yet.

    See: http://www.wwpdb.org/documentation/format23/sect3.html

    Records are yielded in sorted chain-identifier order. A chain without
    any DBREF line uses the chain identifier alone as its id.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SCOP
    # TODO - swap in Bow's SeqUtils.seq1 once that's merged
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    chains = collections.defaultdict(list)
    metadata = collections.defaultdict(list)
    for line in handle:
        rec_name = line[0:6].strip()
        if rec_name == 'SEQRES':
            # NB: We only actually need chain ID and the residues here;
            # commented bits are placeholders from the wwPDB spec.
            # Serial number of the SEQRES record for the current chain.
            # Starts at 1 and increments by one each line.
            # Reset to 1 for each chain.
            # ser_num = int(line[8:10])
            # Chain identifier. This may be any single legal character,
            # including a blank which is used if there is only one chain.
            chn_id = line[11]
            # Number of residues in the chain (repeated on every record)
            # num_res = int(line[13:17])
            residues = [to_one_letter_code.get(res, 'X')
                        for res in line[19:].split()]
            chains[chn_id].extend(residues)
        elif rec_name == 'DBREF':
            # ID code of this entry (PDB ID)
            pdb_id = line[7:11]
            # Chain identifier.
            chn_id = line[12]
            # Initial sequence number of the PDB sequence segment.
            # seq_begin = int(line[14:18])
            # Initial insertion code of the PDB sequence segment.
            # icode_begin = line[18]
            # Ending sequence number of the PDB sequence segment.
            # seq_end = int(line[20:24])
            # Ending insertion code of the PDB sequence segment.
            # icode_end = line[24]
            # Sequence database name.
            database = line[26:32].strip()
            # Sequence database accession code.
            db_acc = line[33:41].strip()
            # Sequence database identification code.
            db_id_code = line[42:54].strip()
            # Initial sequence number of the database segment.
            # db_seq_begin = int(line[55:60])
            # Insertion code of initial residue of the segment, if PDB is the
            # reference.
            # db_icode_begin = line[60]
            # Ending sequence number of the database segment.
            # db_seq_end = int(line[62:67])
            # Insertion code of the ending residue of the segment, if PDB is
            # the reference.
            # db_icode_end = line[67]
            metadata[chn_id].append({'pdb_id': pdb_id,
                                     'database': database,
                                     'db_acc': db_acc,
                                     'db_id_code': db_id_code})
        # ENH: 'SEQADV' 'MODRES'

    # items() rather than iteritems(): the latter exists only on Python 2
    # dicts, while items() gives the same sorted result on Python 2 and 3.
    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id in metadata:
            # The first DBREF line of the chain names the record.
            m = metadata[chn_id][0]
            record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id)
            record.description = ("%s:%s %s" % (m['database'],
                                                m['db_acc'],
                                                m['db_id_code']))
            # Each DBREF line adds two cross-references.
            for melem in metadata[chn_id]:
                record.dbxrefs.extend([
                    "%s:%s" % (melem['database'], melem['db_acc']),
                    "%s:%s" % (melem['database'], melem['db_id_code'])])
        else:
            record.id = chn_id
        yield record
def to_seqrecord(self):
    """Create a SeqRecord object from this Sequence instance.

    The record's sequence is built from ``self.mol_seq.value`` and
    ``self.get_alphabet()``; its id, name and description come from
    ``self.accession``, ``self.symbol`` and ``self.name``. When
    ``self.domain_architecture`` is set, each of its domains is converted
    with ``to_seqfeature()`` and stored as a feature.

    The seqrecord.annotations dictionary is packed like so::

        { # Sequence attributes with no SeqRecord equivalent:
          'id_ref':     self.id_ref,
          'id_source':  self.id_source,
          'location':   self.location,
          'uri':        { 'value': self.uri.value,
                          'desc': self.uri.desc,
                          'type': self.uri.type },
          # Sequence.annotations attribute (list of Annotations)
          'annotations': [{ 'ref':      ann.ref,
                            'source':   ann.source,
                            'evidence': ann.evidence,
                            'type':     ann.type,
                            'confidence': [ ann.confidence.value,
                                            ann.confidence.type ],
                            'properties': [{ 'value': prop.value,
                                             'ref': prop.ref,
                                             'applies_to': prop.applies_to,
                                             'datatype':   prop.datatype,
                                             'unit':       prop.unit,
                                             'id_ref':     prop.id_ref }
                                           for prop in ann.properties],
                          } for ann in self.annotations],
        }

    Dictionary entries whose value is None are omitted at every level of
    that structure.

    Returns the new SeqRecord.
    """
    def clean_dict(dct):
        """Remove None-valued items from a dictionary."""
        # items() rather than iteritems(): iteritems() exists only on
        # Python 2 dicts, items() behaves the same here on 2 and 3.
        return dict((key, val) for key, val in dct.items()
                    if val is not None)

    # NOTE(review): str() turns a None accession into the string 'None',
    # so the 'id' entry is never dropped by clean_dict.
    seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
                       **clean_dict({
                           'id': str(self.accession),
                           'name': self.symbol,
                           'description': self.name,
                           # 'dbxrefs': None,
                       }))
    if self.domain_architecture:
        seqrec.features = [dom.to_seqfeature()
                           for dom in self.domain_architecture.domains]
    # Sequence attributes with no SeqRecord equivalent
    seqrec.annotations = clean_dict({
        'id_ref': self.id_ref,
        'id_source': self.id_source,
        'location': self.location,
        # ``x and f(x)`` leaves a falsy x untouched; a None result is then
        # removed by the enclosing clean_dict.
        'uri': self.uri and clean_dict({
            'value': self.uri.value,
            'desc': self.uri.desc,
            'type': self.uri.type,
        }),
        'annotations': self.annotations and [
            clean_dict({
                'ref': ann.ref,
                'source': ann.source,
                'evidence': ann.evidence,
                'type': ann.type,
                'confidence': ann.confidence and [
                    ann.confidence.value,
                    ann.confidence.type],
                'properties': [clean_dict({
                    'value': prop.value,
                    'ref': prop.ref,
                    'applies_to': prop.applies_to,
                    'datatype': prop.datatype,
                    'unit': prop.unit,
                    'id_ref': prop.id_ref})
                    for prop in ann.properties],
            }) for ann in self.annotations],
    })
    return seqrec
def blasttsv2gff3(blasttsv, include_seq=False):
    """Turn 25-column BLAST tabular lines into GFF3-style SeqRecords.

    ``blasttsv`` is an iterable of text lines; lines starting with ``#``
    are skipped. Every other line is split on tabs, paired with the
    column names listed below, and becomes one SeqRecord (placeholder
    sequence "ACTG", id taken from ``qseqid``) holding a single "match"
    feature with one "match_part" sub-feature. Both features span the
    query coordinates ``qstart``..``qend``.

    The per-line records are passed through ``combine_records`` before
    being yielded. ``include_seq`` is accepted but not used by this
    function.
    """
    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Only the generic "match" term is emitted; the BLASTN/BLASTP specific
    # terms ("nucleotide_match", "protein_match") are not selected here.
    match_type = "match"

    columns = [
        "qseqid",  # 01 Query Seq-id (ID of your sequence)
        "sseqid",  # 02 Subject Seq-id (ID of the database hit)
        "pident",  # 03 Percentage of identical matches
        "length",  # 04 Alignment length
        "mismatch",  # 05 Number of mismatches
        "gapopen",  # 06 Number of gap openings
        "qstart",  # 07 Start of alignment in query
        "qend",  # 08 End of alignment in query
        "sstart",  # 09 Start of alignment in subject (database hit)
        "send",  # 10 End of alignment in subject (database hit)
        "evalue",  # 11 Expectation value (E-value)
        "bitscore",  # 12 Bit score
        "sallseqid",  # 13 All subject Seq-id(s), separated by a ';'
        "score",  # 14 Raw score
        "nident",  # 15 Number of identical matches
        "positive",  # 16 Number of positive-scoring matches
        "gaps",  # 17 Total number of gaps
        "ppos",  # 18 Percentage of positive-scoring matches
        "qframe",  # 19 Query frame
        "sframe",  # 20 Subject frame
        "qseq",  # 21 Aligned part of query sequence
        "sseq",  # 22 Aligned part of subject sequence
        "qlen",  # 23 Query sequence length
        "slen",  # 24 Subject sequence length
        "salltitles",  # 25 All subject title(s), separated by a '<>'
    ]
    # Columns that are not copied into the ``blast_*`` HSP qualifiers.
    unexported_columns = (
        "salltitles",
        "sallseqid",
        "sseqid",
        "qseqid",
        "qseq",
        "sseq",
    )
    integer_columns = "gapopen gaps length mismatch nident positive qend qframe qlen qstart score send sframe slen sstart".split(
        " ")
    float_columns = "bitscore evalue pident ppos".split(" ")

    collected_records = []
    for record_idx, record in enumerate(blasttsv):
        if record.startswith("#"):
            continue

        fields = [field.strip() for field in record.split("\t")]
        dc = dict(zip(columns, fields))

        rec = SeqRecord(Seq("ACTG"), id=dc["qseqid"])

        # Qualifiers are filled while every value is still the raw string.
        titles = dc["salltitles"].split("<>")
        first_title = titles[0]
        hit_qualifiers = {
            "ID": "b2g.%s" % (record_idx),
            "Name": first_title,
            "description": "Hit to {sstart}..{send} of {x}".format(
                x=first_title, **dc),
            "source": "blast",
            "score": dc["evalue"],
            "accession": clean_string(dc["sseqid"]),
            "length": dc["qlen"],
            "hit_titles": clean_slist(titles),
            "target": clean_string(dc["qseqid"]),
        }

        # Add the remaining BLAST info to the GFF qualifiers
        hsp_qualifiers = {"source": "blast"}
        for column, raw_value in dc.items():
            if column not in unexported_columns:
                hsp_qualifiers["blast_%s" % column] = clean_string(raw_value)

        # Numbers were read as strings; convert them to proper form now.
        for column in integer_columns:
            dc[column] = int(dc[column])
        for column in float_columns:
            dc[column] = float(dc[column])

        query_start = dc["qstart"]
        query_end = dc["qend"]
        parent_match_start, parent_match_end = check_bounds(
            query_start, query_end, query_start, query_end)

        # The ``match`` feature will hold one or more ``match_part``s
        parent_location = FeatureLocation(
            min(parent_match_start, parent_match_end) - 1,
            max(parent_match_start, parent_match_end),
        )
        top_feature = SeqFeature(
            parent_location,
            type=match_type,
            strand=0,
            qualifiers=hit_qualifiers,
        )

        # There is a possibility of multiple lines containing the HSPS
        # for the same hit.
        # Unlike the parent feature, ``match_part``s have sources.
        hsp_qualifiers["ID"] = clean_string(dc["sseqid"])
        part_location = FeatureLocation(
            min(query_start, query_end) - 1,
            max(query_start, query_end),
        )
        match_part = SeqFeature(
            part_location,
            type="match_part",
            strand=0,
            qualifiers=copy.deepcopy(hsp_qualifiers),
        )
        top_feature.sub_features = sorted(
            [match_part], key=lambda x: int(x.location.start))

        rec.features = [top_feature]
        rec.annotations = {}
        collected_records.append(rec)

    for rec in combine_records(collected_records):
        yield rec
def blastxml2gff3(blastxml, include_seq=False):
    """Convert BLAST XML output into GFF3-style SeqRecords, one per hit.

    ``blastxml`` is handed to ``NCBIXML.parse``. For every alignment (hit)
    of every BLAST record a SeqRecord is yielded with the placeholder
    sequence "ACTG" and an id equal to the query name up to its first
    space (passed through ``clean_string`` when a space is present).

    Each record carries one "match" feature whose ``sub_features`` are
    "match_part" features, one per HSP, sorted by start position. The hit
    level "score" qualifier is the smallest ``hsp.expect`` of the hit.

    When ``include_seq`` is true, the query, subject and match strings of
    all HSPs of a hit are concatenated, in HSP order, into the hit
    qualifiers ``blast_qseq``, ``blast_sseq`` and ``blast_mseq``.
    """
    seq_keys = ("blast_qseq", "blast_sseq", "blast_mseq")
    hsp_props = (
        "score",
        "bits",
        "identities",
        "positives",
        "gaps",
        "align_length",
        "strand",
        "frame",
        "query_start",
        "query_end",
        "sbjct_start",
        "sbjct_end",
    )

    blast_records = NCBIXML.parse(blastxml)
    for idx_record, record in enumerate(blast_records):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        # Only the generic "match" term is emitted; the application
        # specific terms ("nucleotide_match", "protein_match") are not
        # selected here.
        match_type = "match"
        collected_records = []

        recid = record.query
        if " " in recid:
            recid = clean_string(recid[0:recid.index(" ")])

        for idx_hit, hit in enumerate(record.alignments):
            # gotta check all hsps in a hit to see boundaries
            rec = SeqRecord(Seq("ACTG"), id=recid)
            parent_match_start = 0
            parent_match_end = 0
            hit_qualifiers = {
                "ID": "b2g.%s.%s" % (idx_record, idx_hit),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": clean_string(hit.hit_id),
                "score": None,
                "length": hit.length,
                "hit_titles": clean_slist(hit.title.split(" >")),
                "hsp_count": len(hit.hsps),
            }
            desc = hit.title.split(" >")[0]
            hit_qualifiers["Name"] = desc

            sub_features = []
            for idx_hsp, hsp in enumerate(hit.hsps):
                if idx_hsp == 0:
                    # Seed the parent span and score from the first HSP.
                    parent_match_start = hsp.query_start
                    parent_match_end = hsp.query_end
                    hit_qualifiers["score"] = hsp.expect
                # Keep the lowest expect value seen across the hit's HSPs.
                hit_qualifiers["score"] = min(hit_qualifiers["score"],
                                              hsp.expect)

                # generate qualifiers to be added to gff3 feature
                hsp_qualifiers = {
                    "ID": "b2g.%s.%s.hsp%s" % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": clean_string(hit.hit_id),
                    "length": hit.length,
                    "hit_titles": clean_slist(hit.title.split(" >")),
                }

                if include_seq:
                    hsp_seqs = dict(
                        zip(seq_keys, (hsp.query, hsp.sbjct, hsp.match)))
                    # Each key must be tested on its own: asking whether
                    # the whole tuple is a key is always False, which used
                    # to make every HSP overwrite the previous sequences.
                    if all(key in hit_qualifiers for key in seq_keys):
                        for key in seq_keys:
                            hit_qualifiers[key] = (
                                hit_qualifiers[key] + hsp_seqs[key])
                    else:
                        hit_qualifiers.update(hsp_seqs)

                # Attributes missing from the HSP are recorded as None.
                for prop in hsp_props:
                    hsp_qualifiers["blast_" + prop] = getattr(hsp, prop, None)

                # check_bounds is defined elsewhere; presumably it widens
                # the parent span to envelope this HSP - TODO confirm.
                parent_match_start, parent_match_end = check_bounds(
                    parent_match_start,
                    parent_match_end,
                    hsp.query_start,
                    hsp.query_end,
                )

                # add hsp to the gff3 feature as a "match_part"
                sub_features.append(
                    SeqFeature(
                        FeatureLocation(hsp.query_start - 1, hsp.query_end),
                        type="match_part",
                        strand=0,
                        qualifiers=copy.deepcopy(hsp_qualifiers),
                    ))

            # Build the top level seq feature for the hit
            hit_qualifiers["description"] = "Hit to %s..%s of %s" % (
                parent_match_start,
                parent_match_end,
                desc,
            )
            top_feature = SeqFeature(
                FeatureLocation(parent_match_start - 1, parent_match_end),
                type=match_type,
                strand=0,
                qualifiers=hit_qualifiers,
            )
            # add the generated subfeature hsp match_parts to the hit feature
            top_feature.sub_features = copy.deepcopy(
                sorted(sub_features, key=lambda x: int(x.location.start)))
            # Add the hit feature to the record
            rec.features.append(top_feature)
            rec.annotations = {}
            collected_records.append(rec)

        for rec in collected_records:
            yield rec
def to_seqrecord(self):
    """Create a SeqRecord object from this Sequence instance.

    The record's sequence is built from ``self.mol_seq.value`` and
    ``self.get_alphabet()``; its id, name and description come from
    ``self.accession``, ``self.symbol`` and ``self.name``. When
    ``self.domain_architecture`` is set, each of its domains is converted
    with ``to_seqfeature()`` and stored as a feature.

    The seqrecord.annotations dictionary is packed like so::

        { # Sequence attributes with no SeqRecord equivalent:
          'id_ref':     self.id_ref,
          'id_source':  self.id_source,
          'location':   self.location,
          'uri':        { 'value': self.uri.value,
                          'desc': self.uri.desc,
                          'type': self.uri.type },
          # Sequence.annotations attribute (list of Annotations)
          'annotations': [{ 'ref':      ann.ref,
                            'source':   ann.source,
                            'evidence': ann.evidence,
                            'type':     ann.type,
                            'confidence': [ ann.confidence.value,
                                            ann.confidence.type ],
                            'properties': [{ 'value': prop.value,
                                             'ref': prop.ref,
                                             'applies_to': prop.applies_to,
                                             'datatype':   prop.datatype,
                                             'unit':       prop.unit,
                                             'id_ref':     prop.id_ref }
                                           for prop in ann.properties],
                          } for ann in self.annotations],
        }

    Dictionary entries whose value is None are omitted at every level of
    that structure.

    Returns the new SeqRecord.
    """
    def clean_dict(dct):
        """Remove None-valued items from a dictionary."""
        # items() rather than iteritems(): iteritems() exists only on
        # Python 2 dicts, items() behaves the same here on 2 and 3.
        return dict(
            (key, val) for key, val in dct.items() if val is not None)

    # NOTE(review): str() turns a None accession into the string 'None',
    # so the 'id' entry is never dropped by clean_dict.
    seqrec = SeqRecord(
        Seq(self.mol_seq.value, self.get_alphabet()),
        **clean_dict({
            'id': str(self.accession),
            'name': self.symbol,
            'description': self.name,
            # 'dbxrefs': None,
        }))
    if self.domain_architecture:
        seqrec.features = [
            dom.to_seqfeature() for dom in self.domain_architecture.domains
        ]
    # Sequence attributes with no SeqRecord equivalent
    seqrec.annotations = clean_dict({
        'id_ref': self.id_ref,
        'id_source': self.id_source,
        'location': self.location,
        # ``x and f(x)`` leaves a falsy x untouched; a None result is then
        # removed by the enclosing clean_dict.
        'uri': self.uri and clean_dict({
            'value': self.uri.value,
            'desc': self.uri.desc,
            'type': self.uri.type,
        }),
        'annotations': self.annotations and [
            clean_dict({
                'ref': ann.ref,
                'source': ann.source,
                'evidence': ann.evidence,
                'type': ann.type,
                'confidence': ann.confidence
                and [ann.confidence.value, ann.confidence.type],
                'properties': [
                    clean_dict({
                        'value': prop.value,
                        'ref': prop.ref,
                        'applies_to': prop.applies_to,
                        'datatype': prop.datatype,
                        'unit': prop.unit,
                        'id_ref': prop.id_ref
                    }) for prop in ann.properties
                ],
            }) for ann in self.annotations
        ],
    })
    return seqrec
def blastxml2gff3(blastxml, include_seq=False):
    """Convert BLAST XML output into GFF3-style SeqRecords, one per query.

    ``blastxml`` is handed to ``NCBIXML.parse``. For every BLAST record a
    SeqRecord is yielded with the placeholder sequence "ACTG" and an id
    equal to the query name up to its first space. Each alignment (hit)
    adds one top-level feature whose type depends on the BLAST
    application ("nucleotide_match" for BLASTN, "protein_match" for
    BLASTP, "match" otherwise).

    Every HSP of a hit is split by ``generate_parts`` (called with
    ``ignore_under=10``) and each part becomes a "match_part" sub-feature
    carrying the part's CIGAR string in its "Gap" qualifier.

    When ``include_seq`` is true, the HSP's query, subject and match
    strings are added to the HSP qualifiers as ``blast_qseq``,
    ``blast_sseq`` and ``blast_mseq``.
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Currently we can only handle BLASTN, BLASTP
    match_types = {
        "BLASTN": "nucleotide_match",
        "BLASTP": "protein_match",
    }
    copied_hsp_attributes = (
        "score",
        "bits",
        "identities",
        "positives",
        "gaps",
        "align_length",
        "strand",
        "frame",
        "query_start",
        "query_end",
        "sbjct_start",
        "sbjct_end",
    )

    for idx_record, record in enumerate(NCBIXML.parse(blastxml)):
        match_type = match_types.get(record.application, "match")

        recid = record.query
        if " " in recid:
            recid = recid[0:recid.index(" ")]
        rec = SeqRecord(Seq("ACTG"), id=recid)

        for idx_hit, hit in enumerate(record.alignments):
            # All HSPs of the hit are inspected to settle the boundaries.
            # The start begins at 0 and is only replaced by an HSP whose
            # query_start is lower than the current value.
            parent_match_start = 0
            parent_match_end = 0

            hit_qualifiers = {
                "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, "0"),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": hit.hit_id,
                "length": hit.length,
                "hit_titles": hit.title.split(" >"),
                "hsp_count": len(hit.hsps),
            }
            hit_id_prefix = hit_qualifiers["ID"]

            sub_features = []
            for idx_hsp, hsp in enumerate(hit.hsps):
                hsp_qualifiers = {
                    "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit.title.split(" >"),
                }
                if include_seq:
                    hsp_qualifiers["blast_qseq"] = hsp.query
                    hsp_qualifiers["blast_sseq"] = hsp.sbjct
                    hsp_qualifiers["blast_mseq"] = hsp.match

                # Attributes missing from the HSP are recorded as None.
                for attribute in copied_hsp_attributes:
                    hsp_qualifiers["blast_" + attribute] = getattr(
                        hsp, attribute, None)

                # The description is the first title from its first space
                # onwards; a title without a space raises ValueError here.
                first_title = hit.title.split(" >")[0]
                hsp_qualifiers["description"] = first_title[
                    first_title.index(" "):]

                # check if parent boundary needs to increase
                if hsp.query_start < parent_match_start:
                    parent_match_start = hsp.query_start
                if hsp.query_end > parent_match_end:
                    parent_match_end = hsp.query_end + 1

                # Build out the match_part features for each HSP
                parts = generate_parts(
                    hsp.query, hsp.match, hsp.sbjct, ignore_under=10)
                for idx_part, (start, end, cigar) in enumerate(parts):
                    hsp_qualifiers["Gap"] = cigar
                    hsp_qualifiers["ID"] = hit_id_prefix + (".%s" % idx_part)

                    # We used to use hsp.align_length here, but that
                    # includes gaps in the parent sequence
                    #
                    # Furthermore align_length will give calculation errors
                    # in weird places
                    # So we just use (end-start) for simplicity
                    match_part_start = hsp.query_start
                    match_part_end = match_part_start + (end - start)

                    # The qualifiers dict is reused across parts, so each
                    # feature gets its own deep copy.
                    part_feature = SeqFeature(
                        FeatureLocation(match_part_start, match_part_end),
                        type="match_part",
                        strand=0,
                        qualifiers=copy.deepcopy(hsp_qualifiers),
                    )
                    sub_features.append(part_feature)

            # Build the top level seq feature for the hit
            top_feature = SeqFeature(
                FeatureLocation(parent_match_start, parent_match_end),
                type=match_type,
                strand=0,
                qualifiers=hit_qualifiers,
            )
            # add the generated subfeature hsp match_parts to the hit feature
            top_feature.sub_features = copy.deepcopy(sub_features)
            # Add the hit feature to the record
            rec.features.append(top_feature)

        rec.annotations = {}
        yield rec