def doReverseComplement(inputSeqRecord, id=False, name=False, description=False, features=True, annotations=False, letter_annotations=True, dbxrefs=False) :
    """Return a new SeqRecord containing the reverse complement of a record.

    The sequence of ``inputSeqRecord`` is reverse complemented into a fresh
    SeqRecord. Each remaining argument selects how one attribute of the new
    record is filled in:

    * ``id``, ``name``, ``description`` - a string is used as the new value;
      any other true value copies the attribute from ``inputSeqRecord``;
      a false value leaves the new record's default untouched.
    * ``dbxrefs`` - a list is used as given; any other true value takes a
      shallow copy of the old list.
    * ``features`` - a list is used as given; any other true value passes
      every old feature through its ``_flip`` method (with the new sequence
      length) and re-sorts the result by start position.
    * ``annotations`` - a dict is used as given; any other true value takes
      a shallow copy of the old dictionary.
    * ``letter_annotations`` - a dict is used as given; any other true value
      copies the old per-letter annotations with each value reversed.

    The function runs on both Python 2 and Python 3: it does not require
    ``basestring`` or ``dict.iteritems`` to exist.
    """
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import MutableSeq #Lazy to avoid circular imports

    try:
        string_types = basestring #Python 2: matches both str and unicode
    except NameError:
        string_types = str #Python 3 has no basestring

    if isinstance(inputSeqRecord.seq, MutableSeq):
        #Currently the MutableSeq reverse complement is in situ
        answer = SeqRecord(inputSeqRecord.seq.toseq().reverse_complement())
    else:
        answer = SeqRecord(inputSeqRecord.seq.reverse_complement())
    if isinstance(id, string_types):
        answer.id = id
    elif id:
        answer.id = inputSeqRecord.id
    if isinstance(name, string_types):
        answer.name = name
    elif name:
        answer.name = inputSeqRecord.name
    if isinstance(description, string_types):
        answer.description = description
    elif description:
        answer.description = inputSeqRecord.description
    if isinstance(dbxrefs, list):
        answer.dbxrefs = dbxrefs
    elif dbxrefs:
        #Copy the old dbxrefs
        answer.dbxrefs = inputSeqRecord.dbxrefs[:]
    if isinstance(features, list):
        answer.features = features
    elif features:
        #Copy the old features, adjusting location and strand
        length = len(answer)
        answer.features = [f._flip(length) for f in inputSeqRecord.features]
        #The old list should have been sorted by start location,
        #reversing it will leave it sorted by what is now the end position,
        #so we need to resort in case of overlapping features.
        #NOTE - In the common case of gene before CDS (and similar) with
        #the exact same locations, this will still maintain gene before CDS
        answer.features.sort(key=lambda x : x.location.start.position)
    if isinstance(annotations, dict):
        answer.annotations = annotations
    elif annotations:
        #Copy the old annotations,
        answer.annotations = inputSeqRecord.annotations.copy()
    if isinstance(letter_annotations, dict):
        answer.letter_annotations = letter_annotations
    elif letter_annotations:
        #Copy the old per letter annotations, reversing them
        #(.items() exists on both Python 2 and 3, unlike .iteritems())
        for key, value in inputSeqRecord.letter_annotations.items():
            answer._per_letter_annotations[key] = value[::-1]
    return answer
Example #2
0
	def export(self, gene, outputFileName=None, keys=None):
		"""Assemble the parts of ``gene`` into a SeqRecord and write or return it.

		For every name in ``keys`` the matching attribute of ``gene`` is looked
		up. Each attribute that is a ``PartMixIn`` instance has its sequence
		appended to the record and gets one feature: parts that are also
		``ExonMixIn`` instances use ``self.coordinatesToLocation`` shifted by
		the current record length, all other parts span exactly the appended
		sequence.

		:param gene: object providing ``dbid``, ``locusStrand`` and the part
			attributes named in ``keys``.
		:param outputFileName: when true, the record is written to this path in
			"gb" format (dots are stripped from the record id) and nothing is
			returned; otherwise the record is returned.
		:param keys: part attribute names, in assembly order. Defaults to
			promoter, utr5, cds, utr3, terminator. The list is stored on
			``self.keys``.
		:returns: the assembled SeqRecord, or None when a file was written.
		"""
		if keys is None:
			# Build the default per call so that later changes to self.keys
			# cannot leak into the next call's default.
			keys = ['promoter' , 'utr5', 'cds', 'utr3', 'terminator']
		self.keys = keys
		parts = [  getattr(gene, key) for key in self.keys ]

		strand = gene.locusStrand

		if outputFileName:
			gene = SeqRecord(id = str(gene.dbid).replace('.',''), name = str(gene.dbid), seq = '' )
		else:
			gene = SeqRecord(id = gene.dbid, name = str(gene.dbid), seq = '' )

		gene.annotations = {'strand' : strand}

		for partType, part in zip( self.keys, parts):
			# Offset of this part within the sequence assembled so far.
			offset = len(gene)
			if isinstance(part, PartMixIn):
				if isinstance(part, ExonMixIn):
					feature = SeqFeature( type = partType, location = self.coordinatesToLocation(part.coordinates)._shift( offset ), id=part.dbid )
				else:
					feature = SeqFeature( type = partType, location = FeatureLocation( offset, offset + len(part.seq) ), id=part.dbid )

				gene.seq += Seq(part.seq, generic_dna)
				gene.features.append(feature)

		if outputFileName:
			# Single-argument print() gives the same output on Python 2 and 3.
			print('outputFileName IS')
			# The context manager closes (and flushes) the file even if
			# SeqIO.write raises.
			with open(outputFileName, 'w') as outputFile:
				SeqIO.write(gene, outputFile, "gb")
		else:
			print('outputFileName IS NONE')
			return gene
Example #3
0
def PhdIterator(handle):
    """Yield one SeqRecord for every record found in a PHD file.

    The parsing itself is delegated to ``Phd.parse``; this generator only
    repackages each parsed record.
    """
    for parsed in Phd.parse(handle):
        # Only the first whitespace-delimited token of the file name is used
        # as identifier: names such as 'HWI-EAS94_4_1_1_602_99 1' (seen in
        # the phd_solexa unit test file) contain spaces, which would cause
        # problems in the record id (e.g. when writing FASTQ).
        short_name = parsed.file_name.split(None, 1)[0]
        result = SeqRecord(
            parsed.seq,
            id=short_name,
            name=short_name,
            description=parsed.file_name,
        )
        # The comments dictionary doubles as the record's annotations.
        result.annotations = parsed.comments
        # Qualities and peak locations become per-letter annotations.
        qualities = [int(site[1]) for site in parsed.sites]
        result.letter_annotations["phred_quality"] = qualities
        try:
            peaks = [int(site[2]) for site in parsed.sites]
            result.letter_annotations["peak_location"] = peaks
        except IndexError:
            # Peak locations are optional according to David Gordon
            # (the Consed author), so their absence is not an error.
            pass
        yield result
Example #4
0
def genome_to_seqrecord(phage_genome):
    """Creates a SeqRecord object from a pdm_utils Genome object.

    The record's name, features, description and annotations are always set;
    its id is only set when the genome's accession is not an empty string.

    :param phage_genome: A pdm_utils Genome object.
    :type phage_genome: Genome
    :returns: A BioPython SeqRecord object
    :rtype: SeqRecord
    :raises AssertionError: if ``phage_genome`` is None.
    :raises AttributeError: re-raised (after printing a hint) when the
        record cannot be built from the genome's ``seq`` attribute.
    """

    # Identity test: "!= None" would dispatch to a user-defined __ne__.
    assert phage_genome is not None,\
    "Genome object passed is None and not initialized"
    try:
        record = SeqRecord(phage_genome.seq)
        record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    except AttributeError:
        print("Genome object failed to be converted to SeqRecord.",
              "Genome valid attribute 'seq' is required to",
              "convert to SeqRecord object.")
        raise
    record.name = phage_genome.name
    if phage_genome.accession != "":
        record.id = phage_genome.accession
    record.features = get_seqrecord_features(phage_genome)
    record.description = get_seqrecord_description(phage_genome)
    record.annotations=\
            get_seqrecord_annotations(phage_genome)

    return record
Example #5
0
def PhdIterator(handle):
    """Generate SeqRecord objects from the records of a PHD file.

    ``Phd.parse`` does the actual parsing; each parsed record is converted
    to a SeqRecord carrying its comments and per-base data.
    """
    for phd in Phd.parse(handle):
        # The raw file name may contain spaces (for example
        # 'HWI-EAS94_4_1_1_602_99 1' in the phd_solexa unit test file),
        # which is troublesome in a record identifier (e.g. FASTQ output),
        # so only the part before the first whitespace is kept.
        identifier = phd.file_name.split(None, 1)[0]
        record = SeqRecord(phd.seq, id=identifier, name=identifier,
                           description=phd.file_name)
        # Re-use the comments dictionary as the annotations.
        record.annotations = phd.comments
        # Per-letter annotations: qualities first, then peak locations.
        record.letter_annotations["phred_quality"] = [
            int(site[1]) for site in phd.sites
        ]
        try:
            record.letter_annotations["peak_location"] = [
                int(site[2]) for site in phd.sites
            ]
        except IndexError:
            # Not every file has peak locations (per David Gordon, the
            # Consed author); skip them silently when missing.
            pass
        yield record
Example #6
0
def proteins(cursor, experiment=None, filter_experiments=True, sequence_key=None):
    """
    Return the selected proteins as SeqRecord objects

    This is a generator. At least one of ``experiment`` and ``sequence_key``
    must be given (not None).

    ``experiment`` and ``sequence_key`` may each be a single key or an
    iterable of keys. When ``filter_experiments`` is true, rows whose
    experiment has taxonomy_id 0 are excluded.

    Each yielded record has the sequence id as its id, the experiment short
    name as its description, and annotations "taxonomy_id",
    "experiment_key" and "organism".
    """
    query = """SELECT s.id,s.sequence, e.id, e.short_name, e.taxonomy_id
        from hpf.experiment e 
        join bddb.protein p on e.id=p.experiment_key
        join ddbCommon.sequence s on p.sequence_key=s.id 
        """
    assert experiment is not None or sequence_key is not None

    # NOTE(review): key values are interpolated into the SQL text via str();
    # callers must only pass trusted (numeric) keys.
    conditions = []
    if experiment:
        if not hasattr(experiment, "__iter__"):
            experiment = [experiment]
        conditions.append(
            " e.id in (%s)" % (",".join([str(key) for key in experiment])))
    if filter_experiments:
        conditions.append(" e.taxonomy_id!=0")
    if sequence_key:
        # Accept a single key as well, mirroring the handling of experiment.
        if not hasattr(sequence_key, "__iter__"):
            sequence_key = [sequence_key]
        conditions.append(
            " s.id in (%s)" % (",".join([str(key) for key in sequence_key])))
    # Only emit a WHERE clause when there is something to filter on, so the
    # statement never ends in a dangling " where ".
    if conditions:
        query += " where " + " and ".join(conditions)

    runtime().debug(query)
    cursor.execute(query)
    runtime().debug("Fetching")
    for seq_id, sequence, e_id, e_name, taxonomy_id in cursor.fetchall():
        record = SeqRecord(Seq(sequence), str(seq_id), description=e_name)
        record.annotations = {"taxonomy_id":taxonomy_id,
                              "experiment_key":e_id,
                              "organism":e_name}
        yield record
Example #7
0
def desanitize_fasta_names_in_seqrec_list(seqrec_list, used_dict):
    """Return new records whose ids are looked up in ``used_dict``.

    For every record in ``seqrec_list`` a new SeqRecord is created with the
    same sequence and the id ``used_dict[record.id]``. Non-empty
    ``letter_annotations`` and ``annotations`` are carried over by
    reference; nothing else is copied.
    """
    def _restore(sanitized):
        restored = SeqRecord(sanitized.seq, id=used_dict[sanitized.id])
        if len(sanitized.letter_annotations) > 0:
            restored.letter_annotations = sanitized.letter_annotations
        if len(sanitized.annotations) > 0:
            restored.annotations = sanitized.annotations
        return restored

    return [_restore(rec) for rec in seqrec_list]
Example #8
0
def PhdIterator(handle) :
    """Yield a SeqRecord for each record in a PHD file.

    The records are parsed by ``Phd.parse``; each one is wrapped in a
    SeqRecord that uses the PHD file name as both id and name.
    """
    for parsed in Phd.parse(handle):
        file_name = parsed.file_name
        wrapped = SeqRecord(parsed.seq, id=file_name, name=file_name)
        #The comments dictionary is re-used as the annotations
        wrapped.annotations = parsed.comments
        yield wrapped
Example #9
0
def cds_to_seqrecord(cds):
    """Create a SeqRecord describing a single Cds object.

    The record is built from ``cds.seq``; its name is ``cds.id``, its id is
    ``cds.locus_tag`` when that is not an empty string, and its only feature
    is ``cds.seqfeature`` (refreshed through ``cds.set_seqfeature()``).

    :param cds: object providing ``seq``, ``id``, ``locus_tag``,
        ``set_seqfeature()`` and ``seqfeature``.
    :returns: the populated SeqRecord.
    :raises AttributeError: if the record cannot be created at all (for
        example because ``cds`` has no ``seq`` attribute).
    """
    record = None
    try:
        record = SeqRecord(cds.seq)
        record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    except AttributeError:
        print("Genome object failed to be converted to SeqRecord\n."
              "Genome valid attribute 'seq' is required to "
              "convert to SeqRecord object.")
        # Without a record there is nothing to fill in; re-raise the real
        # error instead of failing below with an UnboundLocalError. If only
        # the alphabet assignment failed, carry on as before.
        if record is None:
            raise

    record.name = cds.id
    if cds.locus_tag != "":
        record.id = cds.locus_tag
    cds.set_seqfeature()
    record.features = [cds.seqfeature]
    record.description = f"Single gene {cds.id}"
    record.annotations = get_cds_seqrecord_annotations(cds)

    return record
Example #10
0
def sanitize_fasta_names_in_seqrec_list(seqrec_list, used_dict=None):
    """Give every record a generated name and remember the original id.

    A new SeqRecord is made for each input record, with the same sequence
    and an id obtained from ``generate_random_name``. The mapping
    ``generated name -> original id`` is accumulated in ``used_dict`` when a
    non-empty one is supplied, otherwise in a fresh dictionary. Non-empty
    ``letter_annotations`` and ``annotations`` are carried over by
    reference.

    Returns a ``(records, mapping)`` tuple.
    """
    name_map = used_dict if used_dict else {}
    renamed = []

    for original in seqrec_list:
        # The names already handed out are passed to the generator.
        alias = generate_random_name(8, list(name_map.keys()))
        name_map[alias] = original.id

        replacement = SeqRecord(original.seq, id=alias)
        if len(original.letter_annotations) > 0:
            replacement.letter_annotations = original.letter_annotations
        if len(original.annotations) > 0:
            replacement.annotations = original.annotations
        renamed.append(replacement)

    return renamed, name_map
Example #11
0
def cds_to_seqrecord(cds, parent_genome, gene_domains=None):
    """Creates a SeqRecord object from a Cds and its parent Genome.

    The record holds the Cds translation and carries, in order, a "source"
    feature, a "Protein" feature, a "CDS" feature and any region features
    derived from ``gene_domains``; the first three span the whole
    translation.

    :param cds: A populated Cds object.
    :type cds: Cds
    :param parent_genome: Populated parent Genome object of the Cds object.
    :param gene_domains: List of domain objects populated with column
        attributes. Defaults to an empty list.
    :type gene_domains: list
    :returns: Filled Biopython SeqRecord object.
    :rtype: SeqRecord
    """
    if gene_domains is None:
        # Fresh list per call rather than one shared mutable default.
        gene_domains = []

    record = SeqRecord(cds.translation)
    record.seq.alphabet = IUPAC.IUPACProtein()
    record.name = cds.id
    if cds.locus_tag == "" or cds.locus_tag is None:
        # No locus tag available: fall back to an id derived from cds.id.
        record.id = "".join(["DRAFT ", cds.id])
    else:
        record.id = cds.locus_tag

    cds.set_seqfeature()

    source = f"{parent_genome.host_genus} phage {cds.genome_id}"
    source_feature = cds.create_seqfeature("source", 0, cds.translation_length,
                                           1)
    source_feature.qualifiers["organism"] = [source]

    record.features = [source_feature]
    record.features.append(
        cds.create_seqfeature("Protein", 0, cds.translation_length, 1))

    cds_feature = cds.create_seqfeature("CDS", 0, cds.translation_length, 1)
    format_cds_seqrecord_CDS_feature(cds_feature, cds, parent_genome)
    record.features.append(cds_feature)

    record.features.extend(get_cds_seqrecord_regions(gene_domains, cds))

    record.description = (f"{cds.seqfeature.qualifiers['product'][0]} "
                          f"[{source}]")
    record.annotations = get_cds_seqrecord_annotations(cds, parent_genome)

    return record
Example #12
0
    def return_seqrec(self, **kwargs) -> SeqRecord:
        """A function to return the assembled construct as a seqrecord.

        The construct is the concatenation of ``basic_slice()`` of every
        entry in ``self.parts_linkers``. Its id, name, description and
        annotations are then set, and finally every keyword argument is
        applied as an attribute of the record (overriding those values).

        Args:
            kwargs: attribute names and values to set on the returned
                seqrecord.

        Returns:
            seqrec: assembled construct as a new seqrecord.
        """
        seqrec = SeqRecord(Seq(str()))
        for part_linker in self.parts_linkers:
            seqrec += part_linker.basic_slice()
        seqrec.id = self.id
        seqrec.name = "BASIC_construct_" + self.id
        seqrec.description = f"BASIC DNA Assembly of {[part_linker.name for part_linker in self.parts_linkers]}"
        # Give each record its own copy: assigning the shared module-level
        # mapping would let edits to one record's annotations change
        # DEFAULT_ANNOTATIONS (and every other record) as well.
        # assumes DEFAULT_ANNOTATIONS is a dict (provides .copy()) - TODO confirm
        seqrec.annotations = DEFAULT_ANNOTATIONS.copy()
        for key, value in kwargs.items():
            setattr(seqrec, key, value)
        return seqrec
Example #13
0
 def getSeqRecord(self):
     """
         Build a BioSeqRecord from this object.

         The record is populated as follows:
         id          - self.Accession()
         seq         - BioSeq of self.Sequence() using an instance of
                       self.alphabetClass()
         name        - self.Name()
         description - self.Description()
         features    - self.features (same object, not copied)
         annotations - self.annotations (same object, not copied)
     """
     accession = self.Accession()
     sequence = BioSeq(self.Sequence(), self.alphabetClass()())
     name = self.Name()
     description = self.Description()

     record = BioSeqRecord(id=accession,
                           seq=sequence,
                           name=name,
                           description=description)
     record.features = self.features
     record.annotations = self.annotations
     return record
Example #14
0
 def getSeqRecord(self):
     """
         Return this object converted to a BioSeqRecord.

         Constructor arguments:
         id          - result of self.Accession()
         seq         - BioSeq wrapping self.Sequence(), with an alphabet
                       instance created from self.alphabetClass()
         name        - result of self.Name()
         description - result of self.Description()

         Attributes assigned afterwards (shared, not copied):
         features    - self.features
         annotations - self.annotations
     """
     # Keyword values are evaluated in the order they are written here.
     fields = dict(
         id=self.Accession(),
         seq=BioSeq(self.Sequence(), self.alphabetClass()()),
         name=self.Name(),
         description=self.Description(),
     )
     result = BioSeqRecord(**fields)
     result.features = self.features
     result.annotations = self.annotations
     return result
Example #15
0
def cds_to_seqrecord(cds, parent_genome):
    """Creates a SeqRecord object from a Cds and its parent Genome.

    The record holds the Cds translation, is named after ``cds.id`` and
    carries ``cds.seqfeature`` as its only feature. Its id is set to
    ``cds.locus_tag`` when that is not an empty string.

    :param cds: A populated Cds object.
    :type cds: Cds
    :param parent_genome: Populated parent Genome object of the Cds object.
    :returns: Filled Biopython SeqRecord object.
    :rtype: SeqRecord
    """
    record = SeqRecord(cds.translation)
    # The record is built from cds.translation, so it is tagged with the
    # protein alphabet rather than a DNA one.
    record.seq.alphabet = IUPAC.IUPACProtein()
    record.name = cds.id
    if cds.locus_tag != "":
        record.id = cds.locus_tag

    cds.set_seqfeature()
    record.features = [cds.seqfeature]

    record.description = (
        f"{cds.description} "
        f"[{parent_genome.host_genus} phage {cds.genome_id}]")
    record.annotations = get_cds_seqrecord_annotations(cds, parent_genome)

    return record
Example #16
0
def align_multiple_sequences(seqs):
    """Align sequence records with the external ``muscle`` program.

    The records are written to a temporary FASTA file under positional
    names (``seq0``, ``seq1``, ...), aligned by running
    ``muscle -in <input> -out <output> -diags``, and the aligned rows are
    matched back to the input records through those names.

    :param seqs: indexable collection of SeqRecord-like objects; each needs
        ``seq``, ``id``, ``name``, ``description`` and an ``annotations``
        mapping containing the key 'OrganismCommon'.
    :returns: a MultipleSeqAlignment whose rows have the id
        ``<OrganismCommon>-<original id>`` and share the ``annotations``
        object of the matching input record.
    """
    import os
    import subprocess as sp
    import tempfile
    from Bio import SeqIO, AlignIO
    from Bio.SeqRecord import SeqRecord
    from Bio.Align import MultipleSeqAlignment

    seqs_muscle = []
    for i, seq in enumerate(seqs):
        name = 'seq{:}'.format(i)
        seqs_muscle.append(
            SeqRecord(seq.seq, id=name, name=name, description=''))

    # A private temporary directory avoids collisions between concurrent
    # runs and is removed even when muscle or the parser fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        fn_tmp = os.path.join(tmp_dir, 'tmp.fasta')
        fn_tmp2 = os.path.join(tmp_dir, 'tmp_aligned.fasta')
        SeqIO.write(seqs_muscle, fn_tmp, 'fasta')
        # Argument list instead of a shell string: no quoting problems with
        # the file names and no shell involved.
        sp.run(['muscle', '-in', fn_tmp, '-out', fn_tmp2, '-diags'])
        ali_muscle = AlignIO.read(fn_tmp2, 'fasta')

    ali = []
    for seq_muscle in ali_muscle:
        # Recover the position in seqs from the 'seq<N>' id.
        i = int(seq_muscle.id[3:])
        rec = seqs[i]
        seq = SeqRecord(
            seq_muscle.seq,
            id=rec.annotations['OrganismCommon'] + '-' + rec.id,
            name=rec.name,
            description=rec.description,
        )
        seq.annotations = rec.annotations
        ali.append(seq)
    return MultipleSeqAlignment(ali)
Example #17
0
def write_genbank_output(seq_record: SeqRecord, topology: str, organism: str, outfpath: str) -> None:
    """Append an annotated sequence record to a GenBank file.

    The record's annotations are replaced, its features are re-ordered by
    ascending start location, and the record is then written in 'genbank'
    format to ``outfpath``, which is opened in append mode.

    :param seq_record: sequence record to output (modified in place);
    :param topology: string for `topology` annotation;
    :param organism: string for `organism` annotation;
    :param outfpath: path to output file;
    """

    def _feature_start(feature):
        return feature.location.start

    # Replace the annotations wholesale
    seq_record.annotations = dict(
        molecule_type='DNA',
        organism=organism,
        date=_get_date(),
        topology=topology,
    )

    # Features in ascending order of their start location
    seq_record.features = sorted(seq_record.features, key=_feature_start)

    # Append the record to the output file
    with open(outfpath, 'a') as outfile:
        SeqIO.write(seq_record, outfile, 'genbank')
0
def PdbAtomIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Residues whose three letter code cannot be translated to a one letter
    amino acid code (e.g. "CSD" from HETATM entries) are left out of the
    sequence.

    In addition to information from the PDB header (which is the same for all
    records), the following chain specific information is placed in the
    annotation:

    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["model"] = Model ID (typically zero)
    record.annotations["start"] = Residue number of the first residue
    record.annotations["end"] = Residue number of the last residue

    Where amino acids are missing from the structure, as indicated by residue
    numbering, the sequence is filled in with 'X' characters to match the size
    of the missing region. If the numbering goes backwards after a gap, a
    warning is issued and the rest of the chain is dropped.

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.
    """
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB import PDBParser
    from Bio.SeqUtils import seq1
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return seq1(residue.resname, custom_map=to_one_letter_code)

    # Deduce the PDB ID from the PDB header
    # ENH: or filename?
    from Bio.File import UndoHandle
    undo_handle = UndoHandle(handle)
    firstline = undo_handle.peekline()
    if firstline.startswith("HEADER"):
        pdb_id = firstline[62:66]
    else:
        warnings.warn("First line is not a 'HEADER'; can't determine PDB ID")
        pdb_id = '????'

    struct = PDBParser().get_structure(pdb_id, undo_handle)
    model = struct[0]
    # .items() is available on both Python 2 and 3 (unlike .iteritems())
    for chn_id, chain in sorted(model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = [res for res in chain.get_unpacked_list()
                    if seq1(res.get_resname().upper(),
                        custom_map=to_one_letter_code) != "X"]
        if not residues:
            continue
        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions)
        gaps = []
        rnumbers = [r.id[1] for r in residues]
        for i, rnum in enumerate(rnumbers[:-1]):
            if rnumbers[i+1] != rnum + 1:
                # It's a gap!
                gaps.append((i+1, rnum, rnumbers[i+1]))
        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    gapsize = postgap - pregap - 1
                    res_out.extend(map(restype, residues[prev_idx:i]))
                    prev_idx = i
                    res_out.append('X'*gapsize)
                else:
                    warnings.warn("Ignoring out-of-order residues after a gap",
                                  UserWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(map(restype, residues[prev_idx:i]))
                    # Everything after this point is discarded, so stop here
                    break
            else:
                # Last segment: added exactly once, after the final gap.
                # Doing this inside the loop would repeat the tail of the
                # chain for every gap.
                res_out.extend(map(restype, residues[prev_idx:]))
        else:
            # No gaps
            res_out = map(restype, residues)
        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(Seq(''.join(res_out), generic_protein),
                id=record_id,
                description=record_id,
                )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
Example #19
0
def PdbSeqresIterator(handle):
    """Yield a SeqRecord for every chain listed in a PDB file header.

    Sequences come from the SEQRES lines of the header rather than from the
    atoms of the 3D structure. Only SEQRES and DBREF lines are read; all
    other lines (including SEQADV and MODRES) are ignored.

    See: http://www.wwpdb.org/documentation/format23/sect3.html

    This gets called internally via Bio.SeqIO for the SEQRES based interpretation
    of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.pdb") as handle:
    ...     for record in PdbSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    The chain identifier is stored in the annotations dictionary and every
    DBREF line contributes two entries to the database cross-references.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1

    chains = collections.defaultdict(list)
    metadata = collections.defaultdict(list)
    for line in handle:
        rec_name = line[0:6].strip()
        if rec_name == 'SEQRES':
            # Fixed-column layout (wwPDB spec): the chain identifier is in
            # column 11 and the residue names start at column 19. The
            # serial number (8:10) and residue count (13:17) are not needed.
            chain_key = line[11]
            one_letter = [seq1(code, custom_map=protein_letters_3to1)
                          for code in line[19:].split()]
            chains[chain_key].extend(one_letter)
        elif rec_name == 'DBREF':
            # Fixed-column layout: PDB ID (7:11), chain (12), database name
            # (26:32), accession code (33:41), identification code (42:54).
            # The sequence numbers and insertion codes are not used.
            xref = {
                'pdb_id': line[7:11],
                'database': line[26:32].strip(),
                'db_acc': line[33:41].strip(),
                'db_id_code': line[42:54].strip(),
            }
            metadata[line[12]].append(xref)
        # ENH: 'SEQADV' 'MODRES'

    for chn_id, letters in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(letters), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id not in metadata:
            # No DBREF line for this chain: the bare chain id is the id
            record.id = chn_id
            yield record
            continue

        # The first DBREF entry supplies the id, name and description
        first = metadata[chn_id][0]
        record.id = record.name = "%s:%s" % (first['pdb_id'], chn_id)
        record.description = "%s:%s %s" % (
            first['database'], first['db_acc'], first['db_id_code'])
        # Every DBREF entry supplies two cross-references
        for entry in metadata[chn_id]:
            record.dbxrefs.append(
                "%s:%s" % (entry['database'], entry['db_acc']))
            record.dbxrefs.append(
                "%s:%s" % (entry['database'], entry['db_id_code']))
        yield record
Example #20
0
def CifSeqresIterator(handle):
    """Yield a SeqRecord for every chain described in an mmCIF file.

    Sequences are read from the _pdbx_poly_seq_scheme records of the file,
    not from the atoms of the 3D structure. Database cross-references are
    taken from the _struct_ref_seq records together with the _struct_ref
    records they point to.

    See:
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v40.dic/Categories/pdbx_poly_seq_scheme.html
    and
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref_seq.html

    This gets called internally via Bio.SeqIO for the sequence-based
    interpretation of the mmCIF file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    The chain identifier is stored in the annotations dictionary and the
    _struct_ref_seq entries end up in the database cross-references list.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1

    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict

    chains = collections.defaultdict(list)
    metadata = collections.defaultdict(list)
    records = MMCIF2Dict(handle)

    # Make sure every field used below maps to a list (See #1533): absent
    # fields become an empty list, single values a one-element list.
    wanted_fields = (PDBX_POLY_SEQ_SCHEME_FIELDS
                     + STRUCT_REF_SEQ_FIELDS
                     + STRUCT_REF_FIELDS)
    for field in wanted_fields:
        if field in records:
            value = records[field]
            if not isinstance(value, list):
                records[field] = [value]
        else:
            records[field] = []

    # Collect the one-letter residue codes of each chain.
    residue_rows = zip(records["_pdbx_poly_seq_scheme.asym_id"],
                       records["_pdbx_poly_seq_scheme.mon_id"])
    for asym_id, mon_id in residue_rows:
        chains[asym_id].append(seq1(mon_id, custom_map=protein_letters_3to1))

    # Index the _struct_ref records by their id field.
    struct_refs = {
        ref_id: {"database": db_name,
                 "db_id_code": db_code,
                 "db_acc": db_acc}
        for ref_id, db_name, db_code, db_acc in zip(
            records["_struct_ref.id"],
            records["_struct_ref.db_name"],
            records["_struct_ref.db_code"],
            records["_struct_ref.pdbx_db_accession"])
    }

    # Each _struct_ref_seq record adds one metadata entry to its chain,
    # combining the PDB id with the referenced _struct_ref record.
    # (The key names mirror those in PdbIO.)
    ref_seq_rows = zip(records["_struct_ref_seq.ref_id"],
                       records["_struct_ref_seq.pdbx_PDB_id_code"],
                       records["_struct_ref_seq.pdbx_strand_id"])
    for ref_id, pdb_id, chain_id in ref_seq_rows:
        entry = {'pdb_id': pdb_id}
        entry.update(struct_refs[ref_id])
        metadata[chain_id].append(entry)

    for chn_id, letters in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(letters), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id not in metadata:
            # No cross-reference for this chain: the bare chain id is the id
            record.id = chn_id
            yield record
            continue

        # The first metadata entry supplies the id, name and description
        first = metadata[chn_id][0]
        record.id = record.name = "%s:%s" % (first['pdb_id'], chn_id)
        record.description = "%s:%s %s" % (
            first['database'], first['db_acc'], first['db_id_code'])
        # Every metadata entry supplies two cross-references
        for item in metadata[chn_id]:
            record.dbxrefs.append(
                "%s:%s" % (item['database'], item['db_acc']))
            record.dbxrefs.append(
                "%s:%s" % (item['database'], item['db_id_code']))
        yield record
def blastxml2gff3(blastxml,
                  min_gap=3,
                  trim=False,
                  trim_end=False,
                  include_seq=False):
    """Turn BLAST XML output into SeqRecords carrying match features.

    One SeqRecord is yielded per BLAST query.  Each HSP of each hit becomes
    a top-level match feature; its ``sub_features`` list holds one
    ``match_part`` feature per segment yielded by ``generate_parts``.

    :param blastxml: BLAST XML input, passed straight to ``NCBIXML.parse``
    :param min_gap: forwarded to ``generate_parts`` as ``ignore_under``
    :param trim: clamp a match start below 1 to 0; also enables the end
        clamping described for ``trim_end``
    :param trim_end: clamp the match end to ``hsp.query_end + 1`` when it
        would otherwise exceed ``hsp.query_end``
    :param include_seq: also store the aligned query, subject and match
        strings as ``blast_qseq`` / ``blast_sseq`` / ``blast_mseq``
    :return: generator of SeqRecord objects
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # Sequence Ontology terms for the programs handled explicitly, see
    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Anything else falls back to the generic "match".
    so_terms = {
        "BLASTN": "nucleotide_match",
        "BLASTP": "protein_match",
    }
    # HSP attributes copied verbatim into "blast_<name>" qualifiers.
    hsp_props = (
        "score",
        "bits",
        "identities",
        "positives",
        "gaps",
        "align_length",
        "strand",
        "frame",
        "query_start",
        "query_end",
        "sbjct_start",
        "sbjct_end",
    )

    for record_no, blast_record in enumerate(NCBIXML.parse(blastxml)):
        match_type = so_terms.get(blast_record.application, "match")

        # Only the text before the first space of the query is used as id.
        query_id = blast_record.query.partition(" ")[0]

        # The sequence is a fixed placeholder; only the features matter.
        rec = SeqRecord(Seq("ACTG"), id=query_id)

        for hit_no, hit in enumerate(blast_record.alignments):
            for hsp_no, hsp in enumerate(hit.hsps):
                titles = hit.title.split(" >")
                qualifiers = {
                    "ID": "b2g.%s.%s.%s" % (record_no, hit_no, hsp_no),
                    "source": "blast",
                    # NB: the "score" qualifier carries the e-value.
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": titles,
                }
                if include_seq:
                    qualifiers["blast_qseq"] = hsp.query
                    qualifiers["blast_sseq"] = hsp.sbjct
                    qualifiers["blast_mseq"] = hsp.match

                for prop in hsp_props:
                    qualifiers["blast_" + prop] = getattr(hsp, prop, None)

                # Description = first title from its first space onwards
                # (the leading space is kept).
                first_title = titles[0]
                qualifiers["description"] = first_title[first_title.index(" "):]

                # The start must account for queries and subjects that begin
                # at positions other than 1.
                raw_start = hsp.query_start - hsp.sbjct_start
                # The end is derived from the full hit length (plus gaps in
                # the aligned query) because the match may be longer than
                # the aligned region.
                feature_end = (hsp.query_start + hit.length
                               + hsp.query.count("-"))

                # Trimming the left end only affects the parent feature;
                # the parts below keep using the untrimmed start.
                feature_start = 0 if (trim and raw_start < 1) else raw_start
                if (trim or trim_end) and feature_end > hsp.query_end:
                    feature_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(feature_start, feature_end),
                    type=match_type,
                    strand=0,
                    qualifiers=qualifiers,
                )

                # Constant part of every match_part start for this HSP.
                part_offset = raw_start + hsp.sbjct_start - 1

                top_feature.sub_features = []
                segments = generate_parts(hsp.query,
                                          hsp.match,
                                          hsp.sbjct,
                                          ignore_under=min_gap)
                for part_no, (start, end, cigar) in enumerate(segments):
                    part_start = part_offset + start
                    # (end - start) is used rather than hsp.align_length,
                    # which includes gaps in the parent sequence.
                    part_end = part_start + (end - start)

                    # Each part owns an independent qualifier dict.
                    part_quals = {
                        "source": "blast",
                        "Gap": copy.deepcopy(cigar),
                        "ID": "%s.%s" % (qualifiers["ID"], part_no),
                    }
                    top_feature.sub_features.append(
                        SeqFeature(
                            FeatureLocation(part_start, part_end),
                            type="match_part",
                            strand=0,
                            qualifiers=part_quals,
                        ))

                rec.features.append(top_feature)
        rec.annotations = {}
        yield rec
Example #22
0
def CifSeqresIterator(handle):
    """Return SeqRecord objects for each chain in an mmCIF file.

    The sequences are built from the _pdbx_poly_seq_scheme entries of the
    mmCIF file, not from the atoms of the 3D structure.

    Specifically, these mmCIF categories are read: _pdbx_poly_seq_scheme
    (residues per chain), _struct_ref_seq (chain to reference mapping) and
    _struct_ref (the database cross-reference details).

    See:
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v40.dic/Categories/pdbx_poly_seq_scheme.html
    and
    http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref_seq.html

    This gets called internally via Bio.SeqIO for the sequence-based
    interpretation of the mmCIF file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.cif", "cif-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.cif") as handle:
    ...     for record in CifSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Note the chain is recorded in the annotations dictionary, and any mmCIF
    _struct_ref_seq entries are recorded in the database cross-references list.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1

    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict

    records = MMCIF2Dict(handle)

    # Normalise every field we read to a list (See #1533): a missing field
    # becomes an empty list, a scalar becomes a one-element list.
    wanted_fields = (PDBX_POLY_SEQ_SCHEME_FIELDS
                     + STRUCT_REF_SEQ_FIELDS
                     + STRUCT_REF_FIELDS)
    for field in wanted_fields:
        try:
            value = records[field]
        except KeyError:
            records[field] = []
        else:
            if not isinstance(value, list):
                records[field] = [value]

    # Residues per chain, as one-letter codes, in file order.
    chains = collections.defaultdict(list)
    residue_rows = zip(records["_pdbx_poly_seq_scheme.asym_id"],
                       records["_pdbx_poly_seq_scheme.mon_id"])
    for asym_id, mon_id in residue_rows:
        chains[asym_id].append(seq1(mon_id, custom_map=protein_letters_3to1))

    # _struct_ref rows indexed by their id field.
    struct_refs = {
        ref_id: {"database": db_name,
                 "db_id_code": db_code,
                 "db_acc": db_acc}
        for ref_id, db_name, db_code, db_acc in zip(
            records["_struct_ref.id"],
            records["_struct_ref.db_name"],
            records["_struct_ref.db_code"],
            records["_struct_ref.pdbx_db_accession"])
    }

    # For each _struct_ref_seq row, attach the matching _struct_ref details
    # to the chain it names.  The key names mirror those in PdbIO.
    metadata = collections.defaultdict(list)
    ref_seq_rows = zip(records["_struct_ref_seq.ref_id"],
                       records["_struct_ref_seq.pdbx_PDB_id_code"],
                       records["_struct_ref_seq.pdbx_strand_id"])
    for ref_id, pdb_id, chain_id in ref_seq_rows:
        entry = {"pdb_id": pdb_id}
        entry.update(struct_refs[ref_id])
        metadata[chain_id].append(entry)

    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq("".join(residues), generic_protein))
        record.annotations = {"chain": chn_id}

        if chn_id not in metadata:
            # No cross-reference for this chain: the bare chain id is used.
            record.id = chn_id
            yield record
            continue

        chain_refs = metadata[chn_id]
        # Id, name and description come from the first reference only.
        first = chain_refs[0]
        record.id = record.name = "%s:%s" % (first["pdb_id"], chn_id)
        record.description = "%s:%s %s" % (first["database"],
                                           first["db_acc"],
                                           first["db_id_code"])
        # Every reference contributes two dbxrefs: accession, then id code.
        for xref in chain_refs:
            for key in ("db_acc", "db_id_code"):
                record.dbxrefs.append("%s:%s" % (xref["database"], xref[key]))
        yield record
def score_mutants(mut_list, ref, min_length):
    """
    Score a group of mutants (all of the same reference sequence) together
    with that reference sequence. The function
    - filters out mutants that are too short
    - attaches intron/exon/intron features, sized from the reference entry,
      to the reference and to every mutant
    - scores the reference and each mutant with ``score_sequence``

    The reference name is taken from the first mutant header: its first
    whitespace-delimited token, cut at the first '.'. If that name has no
    (non-empty) entry in ``ref``, every mutant is reported with 'None' for
    both scores and no length filtering is applied.

    :param mut_list: list of (header, sequence) tuples of mutants of the same
        reference sequence; a repeated header keeps its last sequence
    :param ref: dictionary of reference entries keyed by name; each entry is
        read for 'seq', 'len' (upstream intron, exon and downstream intron
        sizes) and 'strand'
    :param min_length: Minimum length of mutant
    :return: list of tab-delimited, newline-terminated result lines holding
        mutant header, sequence, mutant score and original sequence score;
        ``['']`` when ``mut_list`` is empty
    """

    scores = []

    # convert mutant list to dictionary
    mutants = {x[0]: x[1] for x in mut_list}
    if not mutants:
        return ['']

    # get corresponding reference sequence
    # next(iter(...)) instead of keys()[0]: dict views cannot be indexed on
    # Python 3, and this selects the same key on Python 2.
    first_header = next(iter(mutants))
    name = first_header.split()[0].split('.')[0]
    ref_features = ref.get(name, None)
    if not ref_features:
        for header, seq in mutants.items():
            scores.append('\t'.join([header, seq, 'None', 'None']) + '\n')
        return scores

    # create SeqRecord for original feature
    orig_record = SeqRecord(Seq(ref_features['seq']), id=name)
    # NB: the reference entry itself (not a copy) becomes the annotations.
    orig_record.annotations = ref[name]

    # initialize with intron/exon features
    # get corresponding boundaries from reference
    upstr_intron_size, exon_size, downstr_intron_size = ref_features['len']

    # calculate the feature start/ends: the three regions are contiguous
    upstr_loc = (0, upstr_intron_size)
    exon_loc = (upstr_loc[1], upstr_loc[1] + exon_size)
    downstr_loc = (exon_loc[1], exon_loc[1] + downstr_intron_size)

    # create features; the same three objects are attached to the original
    # record and to every mutant record
    strand = orig_record.annotations["strand"]
    upstr_intron_feature = SeqFeature(
        FeatureLocation(*upstr_loc),
        type="intron",
        strand=strand)

    exon_feature = SeqFeature(
        FeatureLocation(*exon_loc),
        type="exon",
        strand=strand)

    downstr_intron_feature = SeqFeature(
        FeatureLocation(*downstr_loc),
        type="intron",
        strand=strand)

    orig_record.features.extend([upstr_intron_feature, exon_feature, downstr_intron_feature])

    orig_record, orig_score = score_sequence(orig_record)

    for header, seq in mutants.items():
        if len(seq) < min_length:
            continue

        # create SeqRecord object; the id is the first header token without
        # its first character
        record = SeqRecord(Seq(seq.strip()), id=header.split()[0][1:])
        # remaining header tokens are stored positionally under these names
        annot_list = ['unknown', 'cigar', 'md', 'alignment']
        for k, v in zip(annot_list, header.strip().split()[1:]):
            record.annotations[k] = v

        # initialize strand
        record.annotations['strand'] = ref_features['strand']

        record.features.extend([upstr_intron_feature, exon_feature, downstr_intron_feature])

        record, mut_score = score_sequence(record)

        scores.append('\t'.join([header, seq, str(mut_score), str(orig_score)]) + '\n')

    return scores
Example #24
0
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):
    """Turn BLAST XML output into a list of SeqRecords with match features.

    One SeqRecord is built per BLAST query.  Each HSP of each hit becomes a
    top-level match feature whose ``sub_features`` list holds one
    ``match_part`` feature per segment yielded by ``generate_parts``.

    :param blastxml: BLAST XML input, passed straight to ``NCBIXML.parse``
    :param min_gap: forwarded to ``generate_parts`` as ``ignore_under``
    :param trim: clamp a match start below 1 to 0 and position the parts
        relative to that start; also enables the end clamping described for
        ``trim_end``
    :param trim_end: clamp the match end to ``hsp.query_end + 1`` when it
        would otherwise exceed ``hsp.query_end``
    :return: list of SeqRecord objects
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # Sequence Ontology terms for the programs handled explicitly, see
    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Anything else falls back to the generic 'match'.
    so_terms = {
        'BLASTN': 'nucleotide_match',
        'BLASTP': 'protein_match',
    }

    converted = []
    for blast_record in NCBIXML.parse(blastxml):
        match_type = so_terms.get(blast_record.application, 'match')

        # The sequence is a fixed placeholder; only the features matter.
        rec = SeqRecord(Seq("ACTG"), id=blast_record.query)

        for hit in blast_record.alignments:
            for hsp in hit.hsps:
                titles = hit.title.split(' >')
                first_title = titles[0]
                qualifiers = {
                    "source": "blast",
                    # NB: the "score" qualifier carries the e-value.
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": titles,
                    # First title from its first space onwards (the leading
                    # space is kept).
                    "description": first_title[first_title.index(' '):],
                }

                # The start must account for queries and subjects that begin
                # at positions other than 1.
                feature_start = hsp.query_start - hsp.sbjct_start
                # The end is derived from the full hit length (plus gaps in
                # the aligned query) because the match may be longer than
                # the aligned region.
                feature_end = (hsp.query_start + hit.length
                               + hsp.query.count('-'))

                # When trimming, the start is cut to 0 and the end to the
                # end of the query alignment; the real feature end is not
                # available here, so the query end stands in for it.
                if trim and feature_start < 1:
                    feature_start = 0
                if (trim or trim_end) and feature_end > hsp.query_end:
                    feature_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(feature_start, feature_end),
                    type=match_type,
                    strand=0,
                    qualifiers=qualifiers)

                if trim:
                    # Parts start relative to the (possibly clamped) match
                    # start.
                    part_base = feature_start
                else:
                    # Otherwise the subject start's location is included.
                    part_base = feature_start + hsp.sbjct_start - 1

                top_feature.sub_features = []
                segments = generate_parts(hsp.query,
                                          hsp.match,
                                          hsp.sbjct,
                                          ignore_under=min_gap)
                for start, end, cigar in segments:
                    part_start = part_base + start
                    # (end - start) is used rather than hsp.align_length,
                    # which includes gaps in the parent sequence.
                    part_end = part_start + (end - start)

                    # Unlike the parent, each part gets its own small,
                    # independent qualifier dict.
                    part_quals = {
                        "source": "blast",
                        'Gap': copy.deepcopy(cigar),
                        'ID': hit.hit_id,
                    }
                    top_feature.sub_features.append(
                        SeqFeature(FeatureLocation(part_start, part_end),
                                   type="match_part",
                                   strand=0,
                                   qualifiers=part_quals))

                rec.features.append(top_feature)
        rec.annotations = {}
        converted.append(rec)
    return converted
Example #25
0
def shinefind(
    genbank_file,
    gff3_output=None,
    table_output=None,
    lookahead_min=5,
    lookahead_max=15,
    top_only=False,
    add=False,
):
    """Search upstream of every CDS in a GenBank file for SD sites.

    Each record is parsed from ``genbank_file``; every CDS feature is handed
    to ``NaiveSDCaller.testFeatureUpstream`` and the hits are reported in up
    to three places:

    - a tab-separated table written to ``table_output`` (one row per hit, or
      one row with ``None`` / ``-1`` when a CDS has no hit),
    - the record itself, written as GenBank to ``sys.stdout`` (always),
    - a GFF3 record holding only the hit features, written to
      ``gff3_output``.

    :param genbank_file: GenBank input, passed to ``SeqIO.parse``
    :param gff3_output: writable handle for the GFF3 output; ``None`` skips it
    :param table_output: writable handle for the table; ``None`` skips it
    :param lookahead_min: passed to the caller as ``sd_min``; also added to
        each hit's spacing in the table
    :param lookahead_max: passed to the caller as ``sd_max``
    :param top_only: report only the first hit of each CDS
    :param add: also append the reported hit features to the GenBank record
    """

    def write_row(fields):
        # Both output handles default to None; treat None as "not wanted"
        # instead of failing on the first write.
        if table_output is not None:
            table_output.write("\t".join(map(str, fields)) + "\n")

    # The two "Terminus" columns hold the feature start and end.
    write_row([
        "ID",
        "Name",
        "Terminus",
        "Terminus",
        "Strand",
        "Upstream Sequence",
        "SD",
        "Spacing",
    ])

    sd_finder = NaiveSDCaller()
    # Parse GenBank records
    for record in list(SeqIO.parse(genbank_file, "genbank")):
        # Sometimes TWO CDS features have the same start. Only handle ONE.
        seen = set()
        # Shinefind's "gff3_output".
        gff3_output_record = SeqRecord(record.seq, record.id)
        # Loop over a snapshot of the features: with ``add`` set, hits are
        # appended to record.features while this loop is running.
        for feature in list(record.features):
            if feature.type != "CDS":
                continue

            # Key on location.start for strand > 0, location.end otherwise.
            seen_loc = (feature.location.start
                        if feature.strand > 0 else feature.location.end)
            if seen_loc in seen:
                continue
            seen.add(seen_loc)

            sds, start, end, seq = sd_finder.testFeatureUpstream(
                feature, record, sd_min=lookahead_min, sd_max=lookahead_max)

            feature_id = get_id(feature)
            sd_features = sd_finder.to_features(sds,
                                                feature.location.strand,
                                                start,
                                                end,
                                                feature_id=feature.id)

            human_strand = "+" if feature.location.strand == 1 else "-"

            # Columns shared by hit rows and the "no hit" row.
            common = [
                feature.id,
                feature_id,
                feature.location.start,
                feature.location.end,
                human_strand,
            ]

            log.debug("Found %s SDs", len(sds))
            if len(sds) == 0:
                # No hit: report the plain upstream sequence once.
                write_row(common + [seq, None, -1])
                continue

            for sd, sd_feature in zip(sds, sd_features):
                write_row(common + [
                    sd_finder.highlight_sd(seq, sd["start"], sd["end"]),
                    sd["hit"],
                    int(sd["spacing"]) + lookahead_min,
                ])

                if add:
                    # Append the RBS to the GenBank record
                    record.features.append(sd_feature)
                # Also register the feature with the separate GFF3 output
                gff3_output_record.features.append(sd_feature)

                if top_only:
                    break

        record.features = sorted(record.features,
                                 key=lambda x: x.location.start)
        SeqIO.write([record], sys.stdout, "genbank")

        gff3_output_record.features = sorted(gff3_output_record.features,
                                             key=lambda x: x.location.start)
        gff3_output_record.annotations = {}
        if gff3_output is not None:
            GFF.write([gff3_output_record], gff3_output)
def blastxml2gff3(blastxml,
                  min_gap=3,
                  trim=False,
                  trim_end=False,
                  include_seq=False):
    """Turn BLAST XML output into SeqRecords with one match per hit.

    One SeqRecord is yielded per BLAST query.  Each hit becomes a ``match``
    feature; each of its HSPs becomes a ``match_part`` sub-feature located
    at the HSP's query coordinates.  The parent feature spans the smallest
    ``sbjct_start`` to the largest ``sbjct_end`` over the hit's HSPs.

    :param blastxml: BLAST XML input, passed straight to ``NCBIXML.parse``
    :param min_gap: accepted but not used by this implementation
    :param trim: accepted but not used by this implementation
    :param trim_end: accepted but not used by this implementation
    :param include_seq: also store the aligned query, subject and match
        strings as ``blast_qseq`` / ``blast_sseq`` / ``blast_mseq``
    :return: generator of SeqRecord objects
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # HSP attributes copied verbatim into 'blast_<name>' qualifiers.
    hsp_props = ('score', 'bits', 'identities', 'positives',
                 'gaps', 'align_length', 'strand', 'frame',
                 'query_start', 'query_end', 'sbjct_start',
                 'sbjct_end')

    for record_no, blast_record in enumerate(NCBIXML.parse(blastxml)):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        # NOTE(review): the looked-up value is discarded and every parent
        # feature below is typed 'match' - confirm whether that is intended.
        MATCH_TYPE.get(blast_record.application, 'match')

        # Only the text before the first space of the query is used as id.
        query_id = blast_record.query.partition(' ')[0]

        # The sequence is a fixed placeholder; only the features matter.
        rec = SeqRecord(Seq("ACTG"), id=query_id)
        for hit_no, hit in enumerate(blast_record.alignments):
            hit_quals = {
                "ID": 'b2g.%s.%s' % (record_no, hit_no),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": hit.hit_id,
                "length": hit.length,
                "hit_titles": hit.title.split(' >'),
            }
            # The location is a placeholder; the real bounds are written
            # after the HSP loop.
            top_feature = SeqFeature(
                FeatureLocation(1, 1000000000),  # TODO.
                type='match',
                strand=0,
                qualifiers=hit_quals,
            )
            top_feature.sub_features = []

            # Running extremes of the subject coordinates over all HSPs.
            feat_min = None
            feat_max = None

            for hsp_no, hsp in enumerate(hit.hsps):
                # Start from the hit's qualifiers.  NOTE(review): the
                # "blastn" source is overwritten by the hit's "blast".
                hsp_quals = {
                    "source": "blastn",
                }
                hsp_quals.update(hit_quals)
                hsp_quals['ID'] = '%s.%s' % (hit_quals['ID'], hsp_no)

                if include_seq:
                    hsp_quals['blast_qseq'] = hsp.query
                    hsp_quals['blast_sseq'] = hsp.sbjct
                    hsp_quals['blast_mseq'] = hsp.match

                for prop in hsp_props:
                    hsp_quals['blast_' + prop] = getattr(hsp, prop, None)

                # Description = first title from its first space onwards
                # (the leading space is kept).
                first_title = hit.title.split(' >')[0]
                hsp_quals['description'] = first_title[first_title.index(' '):]
                # NB: the 'score' qualifier carries the e-value.
                hsp_quals['score'] = hsp.expect

                if feat_min is None:
                    feat_min, feat_max = hsp.sbjct_start, hsp.sbjct_end
                feat_min = min(feat_min, hsp.sbjct_start)
                feat_max = max(feat_max, hsp.sbjct_end)

                # Deep copy so the part does not share the hit_titles list
                # with the parent's qualifiers.
                part = SeqFeature(
                    FeatureLocation(hsp.query_start, hsp.query_end),
                    type="match_part",
                    strand=0,
                    qualifiers=copy.deepcopy(hsp_quals))
                top_feature.sub_features.append(part)

            # Overwrite the placeholder bounds through the private fields.
            top_feature.location._start = feat_min
            top_feature.location._end = feat_max
            rec.features.append(top_feature)
        rec.annotations = {}
        yield rec
Example #27
0
    def to_seqrecord(self):
        """Build a SeqRecord from this Sequence instance.

        ``accession`` (as a string), ``symbol`` and ``name`` become the
        record's id, name and description; domains of the domain
        architecture, if any, become the record's features.  Everything else
        is packed into the seqrecord.annotations dictionary like so::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref': self.id_ref,
              'id_source': self.id_source,
              'location': self.location,
              'uri': { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # 'DNA', 'RNA' or 'protein', derived from self.type
              'molecule_type': ...,
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{'ref': ann.ref,
                               'source': ann.source,
                               'evidence': ann.evidence,
                               'type': ann.type,
                               'confidence': [ann.confidence.value,
                                              ann.confidence.type],
                               'properties': [{'value': prop.value,
                                                'ref': prop.ref,
                                                'applies_to': prop.applies_to,
                                                'datatype': prop.datatype,
                                                'unit': prop.unit,
                                                'id_ref': prop.id_ref}
                                               for prop in ann.properties],
                              } for ann in self.annotations],
            }

        Entries whose value is None are left out at every level.

        """
        def drop_none(mapping):
            """Return a copy of mapping without its None-valued items."""
            return {k: v for k, v in mapping.items() if v is not None}

        def pack_property(prop):
            """Flatten one annotation property into a plain dict."""
            return drop_none({
                "value": prop.value,
                "ref": prop.ref,
                "applies_to": prop.applies_to,
                "datatype": prop.datatype,
                "unit": prop.unit,
                "id_ref": prop.id_ref,
            })

        def pack_annotation(ann):
            """Flatten one Annotation into a plain dict."""
            confidence = ann.confidence
            if confidence:
                confidence = [ann.confidence.value, ann.confidence.type]
            return drop_none({
                "ref": ann.ref,
                "source": ann.source,
                "evidence": ann.evidence,
                "type": ann.type,
                "confidence": confidence,
                "properties": [pack_property(p) for p in ann.properties],
            })

        # Only pass the keyword arguments that actually have a value.
        record_kwargs = drop_none({
            "id": str(self.accession),
            "name": self.symbol,
            "description": self.name,
            # 'dbxrefs': None,
        })
        seqrec = SeqRecord(Seq(self.mol_seq.value), **record_kwargs)

        if self.domain_architecture:
            domains = self.domain_architecture.domains
            seqrec.features = [dom.to_seqfeature() for dom in domains]

        # Sequence attributes with no SeqRecord equivalent
        type_labels = (("dna", "DNA"), ("rna", "RNA"), ("protein", "protein"))
        molecule_type = next(
            (label for code, label in type_labels if self.type == code),
            None)

        # A falsy uri / annotations value is passed through untouched.
        uri_info = self.uri
        if uri_info:
            uri_info = drop_none({
                "value": self.uri.value,
                "desc": self.uri.desc,
                "type": self.uri.type,
            })

        packed_annotations = self.annotations
        if packed_annotations:
            packed_annotations = [
                pack_annotation(ann) for ann in self.annotations
            ]

        seqrec.annotations = drop_none({
            "id_ref": self.id_ref,
            "id_source": self.id_source,
            "location": self.location,
            "uri": uri_info,
            "molecule_type": molecule_type,
            "annotations": packed_annotations,
        })
        return seqrec
Example #28
0
    def iterate(self, handle):
        """Yield one SeqRecord per chain found in the PDB file header.

        Residues come from the SEQRES lines and cross-references from the
        DBREF lines; records are yielded in sorted chain-id order.  Raises
        ValueError if the handle yields no lines at all.
        """
        # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
        # Not sure if this is really needed; Python can handle circular dependencies.
        from Bio.SeqUtils import seq1

        chains = collections.defaultdict(list)
        metadata = collections.defaultdict(list)

        tag = None
        for line in handle:
            tag = line[0:6].strip()
            if tag == "SEQRES":
                # Only the chain ID (column 11) and the residue names (from
                # column 19 on) are needed.  Per the wwPDB spec the line
                # also carries a serial number (line[8:10]) and the residue
                # count of the chain (line[13:17]).
                # The chain ID may be any single legal character, including
                # a blank which is used if there is only one chain.
                chains[line[11]].extend([
                    seq1(res, custom_map=protein_letters_3to1)
                    for res in line[19:].split()
                ])
            elif tag == "DBREF":
                # Fields used here: PDB ID code, chain identifier, sequence
                # database name, accession code and identification code.
                # Not read (wwPDB spec placeholders): begin/end sequence
                # numbers and insertion codes of the PDB segment
                # (line[14:18], line[18], line[20:24], line[24]) and of the
                # database segment (line[55:60], line[60], line[62:67],
                # line[67]).
                metadata[line[12]].append({
                    "pdb_id": line[7:11],
                    "database": line[26:32].strip(),
                    "db_acc": line[33:41].strip(),
                    "db_id_code": line[42:54].strip(),
                })
            # ENH: 'SEQADV' 'MODRES'

        if tag is None:
            raise ValueError("Empty file.")

        for chn_id, residues in sorted(chains.items()):
            record = SeqRecord(Seq("".join(residues), generic_protein))
            record.annotations = {"chain": chn_id}

            if chn_id not in metadata:
                # No DBREF line for this chain: the bare chain id is used.
                record.id = chn_id
                yield record
                continue

            chain_refs = metadata[chn_id]
            # Id, name and description come from the first DBREF only.
            first = chain_refs[0]
            record.id = record.name = "%s:%s" % (first["pdb_id"], chn_id)
            record.description = "%s:%s %s" % (
                first["database"],
                first["db_acc"],
                first["db_id_code"],
            )
            # Every DBREF contributes two dbxrefs: accession, then id code.
            for xref in chain_refs:
                for key in ("db_acc", "db_id_code"):
                    record.dbxrefs.append(
                        "%s:%s" % (xref["database"], xref[key]))
            yield record
Example #29
0
def PdbAtomIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Residues whose three letter code is not recognised (e.g. "CSD" from HETATM
    entries) are left out of the residue list; where that leaves a hole in the
    residue numbering they show up as "X" in the sequence.

    In addition to information from the PDB header (which is the same for all
    records), the following chain specific information is placed in the
    annotation:

    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["model"] = Model ID (typically zero)
    record.annotations["start"] = Number of the first recognised residue
    record.annotations["end"] = Number of the last recognised residue

    Where amino acids are missing from the structure, as indicated by residue
    numbering, the sequence is filled in with 'X' characters to match the size
    of the missing region. Consecutive residues sharing one residue number
    (i.e. differing only by insertion code) are not treated as a gap. If the
    numbering goes backwards after a gap, a BiopythonWarning is issued and the
    remainder of the chain is dropped from the sequence.

    Only the first model of the structure is used.

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.

    This gets called internally via Bio.SeqIO for the atom based interpretation
    of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-atom"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    Equivalently,

    >>> with open("PDB/1A8O.pdb") as handle:
    ...     for record in PdbAtomIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    """
    # TODO - Add record.annotations to the doctest, esp the residues (not working?)

    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.File import UndoHandle
    from Bio.PDB import PDBParser
    from Bio.SeqUtils import seq1

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return seq1(residue.resname, custom_map=protein_letters_3to1)

    # Deduce the PDB ID from the PDB header
    # ENH: or filename?
    # The UndoHandle lets us peek at the first line and still hand the
    # complete file to the PDB parser afterwards.
    undo_handle = UndoHandle(handle)
    firstline = undo_handle.peekline()
    if firstline.startswith("HEADER"):
        pdb_id = firstline[62:66]
    else:
        warnings.warn(
            "First line is not a 'HEADER'; can't determine PDB ID. "
            "Line: %r" % firstline, BiopythonWarning)
        pdb_id = '????'

    struct = PDBParser().get_structure(pdb_id, undo_handle)
    model = struct[0]
    for chn_id, chain in sorted(model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = [
            res for res in chain.get_unpacked_list()
            if seq1(res.get_resname().upper(), custom_map=protein_letters_3to1)
            != "X"
        ]
        if not residues:
            continue
        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions)
        gaps = []
        rnumbers = [r.id[1] for r in residues]
        for i, rnum in enumerate(rnumbers[:-1]):
            next_rnum = rnumbers[i + 1]
            # A repeated number means the residues differ only by insertion
            # code; they are adjacent, so that is neither a gap nor
            # out-of-order numbering.
            if next_rnum != rnum + 1 and next_rnum != rnum:
                # It's a gap!
                gaps.append((i + 1, rnum, next_rnum))
        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    gapsize = postgap - pregap - 1
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    prev_idx = i
                    res_out.append('X' * gapsize)
                else:
                    warnings.warn("Ignoring out-of-order residues after a gap",
                                  BiopythonWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    break
            else:
                # Last segment (only reached when no out-of-order break)
                res_out.extend(restype(x) for x in residues[prev_idx:])
        else:
            # No gaps
            res_out = [restype(x) for x in residues]
        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(
            Seq(''.join(res_out), generic_protein),
            id=record_id,
            description=record_id,
        )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):
    """Convert BLAST XML output into SeqRecords carrying match features.

    One SeqRecord (placeholder sequence, id set to the query name) is built
    per BLAST record. Every HSP becomes a parent feature located on the query,
    and its ``sub_features`` list receives one ``match_part`` feature per
    segment yielded by ``generate_parts``.

    :param blastxml: handle handed to ``NCBIXML.parse``.
    :param min_gap: forwarded to ``generate_parts`` as ``ignore_under``.
    :param trim: clamp the parent start to 0 and the parent end to the query
        end; match parts are then positioned relative to the parent start.
    :param trim_end: clamp only the parent end to the query end.
    :return: list of SeqRecord objects, one per BLAST record, in parse order.
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Currently we can only handle BLASTN, BLASTP; anything else is a
    # plain "match".
    so_terms = {
        "BLASTN": "nucleotide_match",
        "BLASTP": "protein_match",
    }
    clamp_end = trim or trim_end

    output = []
    for blast_record in NCBIXML.parse(blastxml):
        feature_type = so_terms.get(blast_record.application, "match")
        seqrec = SeqRecord(Seq("ACTG"), id=blast_record.query)

        for hit in blast_record.alignments:
            for hsp in hit.hsps:
                titles = hit.title.split(" >")
                first_title = titles[0]
                parent_quals = {
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": titles,
                    # Everything from the first space of the first title on.
                    "description": first_title[first_title.index(" "):],
                }

                # The parent start has to allow for queries and subjects that
                # begin somewhere other than position 1.
                parent_start = hsp.query_start - hsp.sbjct_start
                # The alignment may be longer than the parent feature, so the
                # end is derived from the full hit length plus the gaps in
                # the aligned query.
                parent_end = hsp.query_start + hit.length + hsp.query.count("-")

                # When trimming, the parent may not start before 0 ...
                if trim and parent_start < 1:
                    parent_start = 0
                # ... and with either trim flag it may not run past the
                # query end (the feature end is not available here).
                if clamp_end and parent_end > hsp.query_end:
                    parent_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                parent = SeqFeature(
                    FeatureLocation(parent_start, parent_end),
                    type=feature_type,
                    strand=0,
                    qualifiers=parent_quals,
                )

                parts = []
                segments = generate_parts(
                    hsp.query, hsp.match, hsp.sbjct, ignore_under=min_gap
                )
                for seg_start, seg_end, cigar in segments:
                    if trim:
                        # Relative to the (possibly clamped) parent start.
                        part_start = parent_start + seg_start
                    else:
                        # Otherwise account for where the subject starts.
                        part_start = (
                            parent_start + hsp.sbjct_start + seg_start - 1
                        )
                    # hsp.align_length includes gaps in the parent sequence,
                    # so the segment's own span is used for the end.
                    part_end = part_start + (seg_end - seg_start)

                    part_quals = {
                        "source": "blast",
                        "Gap": cigar,
                        "ID": hit.hit_id,
                    }
                    parts.append(
                        SeqFeature(
                            FeatureLocation(part_start, part_end),
                            type="match_part",
                            strand=0,
                            qualifiers=copy.deepcopy(part_quals),
                        )
                    )

                parent.sub_features = parts
                seqrec.features.append(parent)

        seqrec.annotations = {}
        output.append(seqrec)
    return output
Example #31
0
def prodigal_parser(seq_file, sco_file, prefix, output_folder):
    """Write .ffn, .faa and .gbk files from a FASTA file plus gene coordinates.

    The coordinate file is read line by line: a line starting with
    '# Sequence Data' opens a new sequence (its id is taken from the
    ';seqhdr=' field), a line starting with '# Model Data' supplies that
    sequence's translation table, and every other line is treated as one gene
    whose '_'-separated fields after the first are start, end and strand.

    For every gene the nucleotide and protein sequences are exported through
    ``export_dna_record`` / ``export_aa_record`` and a CDS feature is added to
    the sequence's SeqRecord, which is then written in GenBank format.

    Args:
        seq_file: FASTA file holding the sequences the genes were called on.
        sco_file: coordinate file in the layout described above
            (presumably Prodigal's "sco" output -- TODO confirm).
        prefix: base name of the output files; also used for locus tags and
            for the 'source' / 'organism' annotations.
        output_folder: directory the three output files are written to.

    Raises:
        KeyError: if a sequence of ``seq_file`` has no entry in ``sco_file``.
    """
    pwd_bin_ffn_file = '%s/%s.ffn' % (output_folder, prefix)
    pwd_bin_faa_file = '%s/%s.faa' % (output_folder, prefix)
    pwd_bin_gbk_file = '%s/%s.gbk' % (output_folder, prefix)

    # get sequence id list (keeps the FASTA order for the output)
    id_to_sequence_dict = {}
    sequence_id_list = []
    for each_seq in SeqIO.parse(seq_file, 'fasta'):
        id_to_sequence_dict[each_seq.id] = str(each_seq.seq)
        sequence_id_list.append(each_seq.id)

    # get sequence to cds dict and sequence to transl_table dict
    current_seq_id = ''
    current_transl_table = ''
    current_seq_csd_list = []
    seq_to_cds_dict = {}
    seq_to_transl_table_dict = {}
    with open(sco_file) as sco_handle:
        for each_cds in sco_handle:
            if each_cds.startswith('# Sequence Data'):

                # store what was collected for the previous sequence
                if current_seq_id != '':
                    seq_to_cds_dict[current_seq_id] = current_seq_csd_list
                    seq_to_transl_table_dict[current_seq_id] = current_transl_table

                # reset value; the header is quoted, and only its first
                # whitespace-separated word is the sequence id
                current_seq_id = each_cds.strip().split(
                    ';seqhdr=')[1][1:-1].split(' ')[0]
                current_transl_table = ''
                current_seq_csd_list = []

            elif each_cds.startswith('# Model Data'):
                current_transl_table = each_cds.strip().split(
                    ';')[-2].split('=')[-1]

            else:
                # drop the leading field, keep "start_end_strand"
                current_seq_csd_list.append('_'.join(
                    each_cds.strip().split('_')[1:]))

    # the last sequence is not followed by another header line
    seq_to_cds_dict[current_seq_id] = current_seq_csd_list
    seq_to_transl_table_dict[current_seq_id] = current_transl_table

    gene_index = 1
    # context managers make sure all three outputs are closed, even when
    # parsing or translation fails half way through
    with open(pwd_bin_gbk_file, 'w') as bin_gbk_file_handle, \
            open(pwd_bin_ffn_file, 'w') as bin_ffn_file_handle, \
            open(pwd_bin_faa_file, 'w') as bin_faa_file_handle:
        for seq_id in sequence_id_list:

            # create SeqRecord
            current_sequence = Seq(id_to_sequence_dict[seq_id])
            current_SeqRecord = SeqRecord(current_sequence, id=seq_id)
            current_SeqRecord.seq.alphabet = generic_dna
            transl_table = seq_to_transl_table_dict[seq_id]

            # add SeqRecord annotations
            current_SeqRecord.annotations = {
                'date': (datetime.now().strftime('%d-%b-%Y')).upper(),
                'accession': '',
                'version': '',
                'keywords': ['.'],
                'source': prefix,
                'organism': prefix,
                'taxonomy': ['Unclassified'],
                'comment': '.',
            }

            # add SeqFeature to SeqRecord
            for cds in seq_to_cds_dict[seq_id]:

                # define locus_tag id
                locus_tag_id = '%s_%s' % (prefix, "{:0>5}".format(gene_index))

                # define FeatureLocation
                cds_split = cds.split('_')
                cds_start = SF.ExactPosition(int(cds_split[0]))
                cds_end = SF.ExactPosition(int(cds_split[1]))
                cds_strand = cds_split[2]
                current_strand = None
                if cds_strand == '+':
                    current_strand = 1
                if cds_strand == '-':
                    current_strand = -1
                # NOTE(review): the sequence slice below uses cds_start - 1
                # while the location receives cds_start unchanged -- confirm
                # whether the feature start should be shifted as well.
                current_feature_location = FeatureLocation(
                    cds_start, cds_end, strand=current_strand)

                # get nc sequence (stays empty for an unknown strand)
                sequence_nc = ''
                if cds_strand == '+':
                    sequence_nc = id_to_sequence_dict[seq_id][cds_start -
                                                              1:cds_end]
                if cds_strand == '-':
                    sequence_nc = str(
                        Seq(id_to_sequence_dict[seq_id][cds_start - 1:cds_end],
                            generic_dna).reverse_complement())

                # translate to aa sequence
                sequence_aa = str(
                    Seq(sequence_nc).translate(table=transl_table))

                # remove * at the end; a translation without a terminal stop
                # must keep its last residue
                if sequence_aa.endswith('*'):
                    sequence_aa = sequence_aa[:-1]

                # export nc and aa sequences
                export_dna_record(sequence_nc, locus_tag_id, '',
                                  bin_ffn_file_handle)
                export_aa_record(sequence_aa, locus_tag_id, '',
                                 bin_faa_file_handle)

                # Define feature qualifiers
                current_qualifiers_dict = {
                    'locus_tag': locus_tag_id,
                    'transl_table': transl_table,
                    'translation': sequence_aa,
                }

                # Create a SeqFeature and append it to the SeqRecord
                current_feature = SeqFeature(current_feature_location,
                                             type='CDS',
                                             qualifiers=current_qualifiers_dict)
                current_SeqRecord.features.append(current_feature)
                gene_index += 1

            # export to gbk file
            SeqIO.write(current_SeqRecord, bin_gbk_file_handle, 'genbank')
def blastxml2gff3(blastxml):
    """Convert BLAST XML output into SeqRecords keyed on the subject (hit).

    One SeqRecord (placeholder sequence, id set to the hit accession) is kept
    per hit accession. Every HSP of every query adds one ``match`` feature,
    located on the subject coordinates, whose ``sub_features`` list holds a
    single ``match_part`` with the same location and strand.

    Records are registered and looked up under the same key
    (``hit.accession``), so a subject hit by several queries accumulates the
    features of all of them instead of being replaced by the last query.

    :param blastxml: handle handed to ``NCBIXML.parse``.
    :return: the values view of the accession -> SeqRecord mapping.
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    blast_records = NCBIXML.parse(blastxml)
    records = {}
    for record in blast_records:
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        match_type = 'match'

        for hit in record.alignments:
            rec = records.get(hit.accession)
            if rec is None:
                rec = SeqRecord(Seq("ACTG"), id=hit.accession)
                # Store under the key used for the lookup above.
                records[hit.accession] = rec

            for hsp in hit.hsps:
                # Strand follows the sign of the second frame value.
                if hsp.frame[1] < 0:
                    strand = -1
                elif hsp.frame[1] == 0:
                    strand = 0
                else:
                    strand = 1
                qualifiers = {
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_name": record.query,
                    "Name": record.query
                }
                # Keep everything after the id of the first title, unless it
                # is only the placeholder text.
                desc = hit.title.split(' >')[0]
                desc = desc[desc.index(' '):]
                if desc != ' No definition line':
                    qualifiers['description'] = desc

                # Subject coordinates may be given in either order; the
                # location always runs from the smaller to the larger one.
                match_start = min(hsp.sbjct_start, hsp.sbjct_end)
                match_end = max(hsp.sbjct_start, hsp.sbjct_end)

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(match_start, match_end),
                    type=match_type,
                    strand=strand,
                    qualifiers=qualifiers)

                part_qualifiers = {"source": "blast"}
                top_feature.sub_features = [
                    SeqFeature(FeatureLocation(match_start, match_end),
                               type="match_part",
                               strand=strand,
                               qualifiers=copy.deepcopy(part_qualifiers))
                ]

                rec.features.append(top_feature)
            rec.annotations = {}
    return records.values()
Example #33
0
def PdbSeqresIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file.

    Sequences come from the SEQRES lines of the PDB header, not from the
    atoms of the 3D structure. DBREF lines supply the record id, name,
    description and database cross-references. SEQADV and MODRES lines are
    not parsed yet.

    See: http://www.wwpdb.org/documentation/format23/sect3.html

    This gets called internally via Bio.SeqIO for the SEQRES based interpretation
    of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.pdb") as handle:
    ...     for record in PdbSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    The chain id ends up in the annotations dictionary; chains without any
    DBREF line simply use the chain id as record id.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1

    chains = collections.defaultdict(list)    # chain id -> one-letter codes
    metadata = collections.defaultdict(list)  # chain id -> DBREF entries

    for line in handle:
        rec_name = line[0:6].strip()
        if rec_name == 'SEQRES':
            # Only the chain id (column 12) and the residue names (column 20
            # onwards) are needed; the serial number in line[8:10] and the
            # residue count in line[13:17] are ignored. The chain id may be
            # a blank when there is a single chain.
            chn_id = line[11]
            one_letter = [
                seq1(three_letter, custom_map=protein_letters_3to1)
                for three_letter in line[19:].split()
            ]
            chains[chn_id].extend(one_letter)
        elif rec_name == 'DBREF':
            # Fixed-width fields used here: PDB id, chain id, database name,
            # database accession and database identification code. The
            # segment start/end numbers and insertion codes (line[14:25],
            # line[55:68]) are not read.
            chn_id = line[12]
            entry = {
                'pdb_id': line[7:11],
                'database': line[26:32].strip(),
                'db_acc': line[33:41].strip(),
                'db_id_code': line[42:54].strip(),
            }
            metadata[chn_id].append(entry)
        # ENH: 'SEQADV' 'MODRES'

    for chn_id in sorted(chains):
        record = SeqRecord(Seq(''.join(chains[chn_id]), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id not in metadata:
            record.id = chn_id
        else:
            dbrefs = metadata[chn_id]
            # The first DBREF of the chain names and describes the record.
            first = dbrefs[0]
            record.id = record.name = "%s:%s" % (first['pdb_id'], chn_id)
            record.description = "%s:%s %s" % (
                first['database'], first['db_acc'], first['db_id_code'])
            # Every DBREF contributes two cross-references.
            for ref in dbrefs:
                record.dbxrefs.append(
                    "%s:%s" % (ref['database'], ref['db_acc']))
                record.dbxrefs.append(
                    "%s:%s" % (ref['database'], ref['db_id_code']))
        yield record
Example #34
0
def PdbSeqresIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the SEQRES lines in the
    PDB file header, not the atoms of the 3D structure.

    Only SEQRES and DBREF records are parsed; SEQADV and MODRES are not
    handled yet. Residue names missing from the three-to-one table become 'X'.

    Each yielded record has the chain id in ``annotations["chain"]``. When the
    chain has DBREF lines, the first one provides id, name and description and
    every one adds two entries to ``dbxrefs``; otherwise the id is the chain id.

    See: http://www.wwpdb.org/documentation/format23/sect3.html
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SCOP
    # TODO - swap in Bow's SeqUtils.seq1 once that's merged
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    chains = collections.defaultdict(list)
    metadata = collections.defaultdict(list)
    for line in handle:
        rec_name = line[0:6].strip()
        if rec_name == 'SEQRES':
            # NB: We only actually need chain ID and the residues here;
            # commented bits are placeholders from the wwPDB spec.
            # Serial number of the SEQRES record for the current chain.
            # Starts at 1 and increments by one each line.
            # Reset to 1 for each chain.
            # ser_num = int(line[8:10])
            # Chain identifier. This may be any single legal character,
            # including a blank which is used if there is only one chain.
            chn_id = line[11]
            # Number of residues in the chain (repeated on every record)
            # num_res = int(line[13:17])
            residues = [to_one_letter_code.get(res, 'X')
                        for res in line[19:].split()]
            chains[chn_id].extend(residues)
        elif rec_name == 'DBREF':
            #  ID code of this entry (PDB ID)
            pdb_id = line[7:11]
            # Chain identifier.
            chn_id = line[12]
            # Initial sequence number of the PDB sequence segment.
            # seq_begin = int(line[14:18])
            # Initial insertion code of the PDB sequence segment.
            # icode_begin = line[18]
            # Ending sequence number of the PDB sequence segment.
            # seq_end = int(line[20:24])
            # Ending insertion code of the PDB sequence segment.
            # icode_end = line[24]
            # Sequence database name.
            database = line[26:32].strip()
            # Sequence database accession code.
            db_acc = line[33:41].strip()
            # Sequence database identification code.
            db_id_code = line[42:54].strip()
            # Initial sequence number of the database segment.
            # db_seq_begin = int(line[55:60])
            # Insertion code of initial residue of the segment, if PDB is the
            # reference.
            # db_icode_begin = line[60]
            # Ending sequence number of the database segment.
            # db_seq_end = int(line[62:67])
            # Insertion code of the ending residue of the segment, if PDB is the
            # reference.
            # db_icode_end = line[67]
            metadata[chn_id].append({'pdb_id': pdb_id, 'database': database,
                                    'db_acc': db_acc, 'db_id_code': db_id_code})
        # ENH: 'SEQADV' 'MODRES'

    # dict.items() rather than the Python-2-only iteritems(), so the iterator
    # runs on both Python 2 and Python 3.
    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id in metadata:
            m = metadata[chn_id][0]
            record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id)
            record.description = ("%s:%s %s" % (m['database'],
                                                m['db_acc'],
                                                m['db_id_code']))
            for melem in metadata[chn_id]:
                record.dbxrefs.extend([
                    "%s:%s" % (melem['database'], melem['db_acc']),
                    "%s:%s" % (melem['database'], melem['db_id_code'])])
        else:
            record.id = chn_id
        yield record
Example #35
0
    def to_seqrecord(self):
        """Create a SeqRecord object from this Sequence instance.

        The SeqRecord's id, name and description are taken from
        ``self.accession`` (converted with ``str``), ``self.symbol`` and
        ``self.name``; name and description are omitted when they are None.
        Features are filled from ``self.domain_architecture.domains`` when a
        domain architecture is present.

        The seqrecord.annotations dictionary is packed like so (entries whose
        value is None are left out at every level)::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{ 'ref':      ann.ref,
                                'source':   ann.source,
                                'evidence': ann.evidence,
                                'type':     ann.type,
                                'confidence': [ ann.confidence.value,
                                                ann.confidence.type ],
                                'properties': [{ 'value': prop.value,
                                                 'ref': prop.ref,
                                                 'applies_to': prop.applies_to,
                                                 'datatype':   prop.datatype,
                                                 'unit':       prop.unit,
                                                 'id_ref':     prop.id_ref }
                                               for prop in ann.properties],
                              } for ann in self.annotations],
            }
        """
        def clean_dict(dct):
            """Remove None-valued items from a dictionary."""
            # items() instead of the Python-2-only iteritems(), so this runs
            # on both Python 2 and Python 3.
            return dict((key, val) for key, val in dct.items()
                        if val is not None)

        seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
                           **clean_dict({
                               'id':    str(self.accession),
                               'name':  self.symbol,
                               'description': self.name,
                               # 'dbxrefs': None,
                               }))
        if self.domain_architecture:
            seqrec.features = [dom.to_seqfeature()
                               for dom in self.domain_architecture.domains]
        # Sequence attributes with no SeqRecord equivalent.
        # The "x and ..." forms pass a falsy x (e.g. None) through unchanged,
        # so absent parts are dropped by clean_dict instead of being
        # dereferenced.
        seqrec.annotations = clean_dict({
                'id_ref':       self.id_ref,
                'id_source':    self.id_source,
                'location':     self.location,
                'uri':          self.uri and clean_dict({
                                    'value': self.uri.value,
                                    'desc': self.uri.desc,
                                    'type': self.uri.type,
                                    }),
                'annotations':  self.annotations and [
                    clean_dict({
                        'ref':          ann.ref,
                        'source':       ann.source,
                        'evidence':     ann.evidence,
                        'type':         ann.type,
                        'confidence':   ann.confidence and [
                                            ann.confidence.value,
                                            ann.confidence.type],
                        'properties':   [clean_dict({
                                            'value':      prop.value,
                                            'ref':        prop.ref,
                                            'applies_to': prop.applies_to,
                                            'datatype':   prop.datatype,
                                            'unit':       prop.unit,
                                            'id_ref':     prop.id_ref })
                                         for prop in ann.properties],
                        }) for ann in self.annotations],
                })
        return seqrec
def blasttsv2gff3(blasttsv, include_seq=False):
    """Convert 25-column BLAST tabular output into GFF3-ready SeqRecords.

    Every data line of ``blasttsv`` is split on tabs and mapped, in order,
    onto the 25 column names listed in ``columns``. For each line a
    ``SeqRecord`` with a placeholder sequence is created; it carries a single
    ``match`` feature spanning the query coordinates, which in turn holds one
    ``match_part`` sub-feature whose qualifiers contain the remaining BLAST
    fields under ``blast_<column>`` keys. All per-line records are handed to
    ``combine_records`` (defined elsewhere) and whatever it returns is
    yielded one record at a time.

    Lines starting with ``#`` and blank / whitespace-only lines are skipped.
    Skipped lines still consume an index, so feature IDs (``b2g.<index>``)
    reflect the position of the line in the input.

    Args:
        blasttsv: Iterable of text lines, e.g. an open file handle.
        include_seq: Accepted for signature compatibility; this function
            does not read it.

    Yields:
        The records produced by ``combine_records``.

    Raises:
        KeyError: If a data line has fewer than 25 tab-separated fields.
        ValueError: If a numeric column cannot be parsed by ``int``/``float``.
    """
    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # The generic SO term is always used; no program-specific term
    # (nucleotide_match / protein_match) is selected here.
    match_type = "match"

    columns = [
        "qseqid",  # 01 Query Seq-id (ID of your sequence)
        "sseqid",  # 02 Subject Seq-id (ID of the database hit)
        "pident",  # 03 Percentage of identical matches
        "length",  # 04 Alignment length
        "mismatch",  # 05 Number of mismatches
        "gapopen",  # 06 Number of gap openings
        "qstart",  # 07 Start of alignment in query
        "qend",  # 08 End of alignment in query
        "sstart",  # 09 Start of alignment in subject (database hit)
        "send",  # 10 End of alignment in subject (database hit)
        "evalue",  # 11 Expectation value (E-value)
        "bitscore",  # 12 Bit score
        "sallseqid",  # 13 All subject Seq-id(s), separated by a ';'
        "score",  # 14 Raw score
        "nident",  # 15 Number of identical matches
        "positive",  # 16 Number of positive-scoring matches
        "gaps",  # 17 Total number of gaps
        "ppos",  # 18 Percentage of positive-scoring matches
        "qframe",  # 19 Query frame
        "sframe",  # 20 Subject frame
        "qseq",  # 21 Aligned part of query sequence
        "sseq",  # 22 Aligned part of subject sequence
        "qlen",  # 23 Query sequence length
        "slen",  # 24 Subject sequence length
        "salltitles",  # 25 All subject title(s), separated by a '<>'
    ]
    # Identifier and sequence columns are not copied into blast_* qualifiers.
    unexported_keys = frozenset((
        "salltitles",
        "sallseqid",
        "sseqid",
        "qseqid",
        "qseq",
        "sseq",
    ))
    # Columns stored as text in the file that are converted to numbers.
    integer_keys = (
        "gapopen", "gaps", "length", "mismatch", "nident", "positive",
        "qend", "qframe", "qlen", "qstart", "score", "send", "sframe",
        "slen", "sstart",
    )
    float_keys = ("bitscore", "evalue", "pident", "ppos")

    collected_records = []
    for record_idx, record in enumerate(blasttsv):
        # Comment lines carry no data; blank lines would otherwise fail
        # with a KeyError on the first missing column below.
        if record.startswith("#") or not record.strip():
            continue

        dc = dict(zip(columns, (x.strip() for x in record.split("\t"))))

        rec = SeqRecord(Seq("ACTG"), id=dc["qseqid"])

        # Qualifiers are built while every value is still the raw string
        # read from the file; numeric conversion happens afterwards.
        first_title = dc["salltitles"].split("<>")[0]
        hit_qualifiers = {
            "ID": "b2g.%s" % record_idx,
            "Name": first_title,
            "description": "Hit to {sstart}..{send} of {x}".format(
                x=first_title, **dc),
            "source": "blast",
            "score": dc["evalue"],
            "accession": clean_string(dc["sseqid"]),
            "length": dc["qlen"],
            "hit_titles": clean_slist(dc["salltitles"].split("<>")),
            "target": clean_string(dc["qseqid"]),
        }

        # Add the remaining BLAST info to the GFF qualifiers
        hsp_qualifiers = {"source": "blast"}
        for key, value in dc.items():
            if key not in unexported_keys:
                hsp_qualifiers["blast_%s" % key] = clean_string(value)

        # Below numbers stored as strings, convert to proper form
        for integer_numerical_key in integer_keys:
            dc[integer_numerical_key] = int(dc[integer_numerical_key])
        for float_numerical_key in float_keys:
            dc[float_numerical_key] = float(dc[float_numerical_key])

        parent_match_start, parent_match_end = check_bounds(
            dc["qstart"], dc["qend"], dc["qstart"], dc["qend"])

        # The ``match`` feature will hold one or more ``match_part``s.
        # The lower coordinate is shifted by one for the feature start.
        top_feature = SeqFeature(
            FeatureLocation(
                min(parent_match_start, parent_match_end) - 1,
                max(parent_match_start, parent_match_end),
            ),
            type=match_type,
            strand=0,
            qualifiers=hit_qualifiers,
        )

        # There is a possibility of multiple lines containing the HSPS
        # for the same hit.
        # Unlike the parent feature, ``match_part``s have sources.
        hsp_qualifiers["ID"] = clean_string(dc["sseqid"])
        match_part_start = dc["qstart"]
        match_part_end = dc["qend"]

        # Exactly one match_part is created per input line, so no sorting
        # of sub-features is required here.
        top_feature.sub_features = [
            SeqFeature(
                FeatureLocation(
                    min(match_part_start, match_part_end) - 1,
                    max(match_part_start, match_part_end),
                ),
                type="match_part",
                strand=0,
                qualifiers=copy.deepcopy(hsp_qualifiers),
            )
        ]
        rec.features = [top_feature]
        rec.annotations = {}
        collected_records.append(rec)

    for rec in combine_records(collected_records):
        yield rec
Example #37
0
def PdbSeqresIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the SEQRES lines in the
    PDB file header, not the atoms of the 3D structure.

    Only SEQRES and DBREF records are parsed at present; SEQADV and MODRES
    lines are ignored (see the ENH note in the body).

    For each chain found in the SEQRES lines one record is yielded, in
    sorted chain-identifier order:

    - ``annotations["chain"]`` holds the chain identifier;
    - residue names missing from the three-to-one lookup become ``X``;
    - if at least one DBREF line names the chain, ``id`` and ``name`` are
      ``<pdb_id>:<chain>``, the description is built from the first DBREF
      line, and every DBREF line contributes two ``dbxrefs`` entries
      (``<database>:<accession>`` and ``<database>:<id code>``);
    - otherwise ``id`` is the bare chain identifier.

    See: http://www.wwpdb.org/documentation/format23/sect3.html
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SCOP
    # TODO - swap in Bow's SeqUtils.seq1 once that's merged
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    chains = collections.defaultdict(list)
    metadata = collections.defaultdict(list)
    for line in handle:
        rec_name = line[0:6].strip()
        if rec_name == 'SEQRES':
            # NB: We only actually need chain ID and the residues here;
            # commented bits are placeholders from the wwPDB spec.
            # Serial number of the SEQRES record for the current chain.
            # Starts at 1 and increments by one each line.
            # Reset to 1 for each chain.
            # ser_num = int(line[8:10])
            # Chain identifier. This may be any single legal character,
            # including a blank which is used if there is only one chain.
            chn_id = line[11]
            # Number of residues in the chain (repeated on every record)
            # num_res = int(line[13:17])
            residues = [to_one_letter_code.get(res, 'X')
                        for res in line[19:].split()]
            chains[chn_id].extend(residues)
        elif rec_name == 'DBREF':
            #  ID code of this entry (PDB ID)
            pdb_id = line[7:11]
            # Chain identifier.
            chn_id = line[12]
            # Initial sequence number of the PDB sequence segment.
            # seq_begin = int(line[14:18])
            # Initial insertion code of the PDB sequence segment.
            # icode_begin = line[18]
            # Ending sequence number of the PDB sequence segment.
            # seq_end = int(line[20:24])
            # Ending insertion code of the PDB sequence segment.
            # icode_end = line[24]
            # Sequence database name.
            database = line[26:32].strip()
            # Sequence database accession code.
            db_acc = line[33:41].strip()
            # Sequence database identification code.
            db_id_code = line[42:54].strip()
            # Initial sequence number of the database segment.
            # db_seq_begin = int(line[55:60])
            # Insertion code of initial residue of the segment, if PDB is the
            # reference.
            # db_icode_begin = line[60]
            # Ending sequence number of the database segment.
            # db_seq_end = int(line[62:67])
            # Insertion code of the ending residue of the segment, if PDB is the
            # reference.
            # db_icode_end = line[67]
            metadata[chn_id].append({'pdb_id': pdb_id, 'database': database,
                                    'db_acc': db_acc, 'db_id_code': db_id_code})
        # ENH: 'SEQADV' 'MODRES'

    # items() rather than iteritems() so the loop runs on Python 2 and 3.
    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        # Membership test (not indexing) so the defaultdict does not grow
        # an empty entry for chains without DBREF lines.
        if chn_id in metadata:
            # The first DBREF line for the chain supplies id/description.
            m = metadata[chn_id][0]
            record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id)
            record.description = ("%s:%s %s" % (m['database'],
                                                m['db_acc'],
                                                m['db_id_code']))
            for melem in metadata[chn_id]:
                record.dbxrefs.extend([
                    "%s:%s" % (melem['database'], melem['db_acc']),
                    "%s:%s" % (melem['database'], melem['db_id_code'])])
        else:
            record.id = chn_id
        yield record
def blastxml2gff3(blastxml, include_seq=False):
    """Convert BLAST XML output into one GFF3-ready SeqRecord per hit.

    ``blastxml`` is parsed with ``NCBIXML.parse``. For every alignment (hit)
    of every query a ``SeqRecord`` with a placeholder sequence is built. It
    carries a single ``match`` feature whose bounds are maintained through
    ``check_bounds`` (defined elsewhere) across the hit's HSPs, and one
    ``match_part`` sub-feature per HSP, sorted by start position.

    The record id is the query title; when the title contains a space only
    the part before the first space is kept and passed through
    ``clean_string``.

    Args:
        blastxml: Handle or source accepted by ``NCBIXML.parse``.
        include_seq: When true, the aligned query, subject and match strings
            of the HSPs are stored on the hit qualifiers as ``blast_qseq``,
            ``blast_sseq`` and ``blast_mseq``; strings from successive HSPs
            of the same hit are concatenated in HSP order.

    Yields:
        One ``SeqRecord`` per hit, grouped by query in parser order.
    """
    seq_keys = ("blast_qseq", "blast_sseq", "blast_mseq")
    hsp_props = (
        "score",
        "bits",
        "identities",
        "positives",
        "gaps",
        "align_length",
        "strand",
        "frame",
        "query_start",
        "query_end",
        "sbjct_start",
        "sbjct_end",
    )
    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # The generic SO term is always used; no program-specific term
    # (nucleotide_match / protein_match) is selected here.
    match_type = "match"

    for idx_record, record in enumerate(NCBIXML.parse(blastxml)):
        collected_records = []

        recid = record.query
        if " " in recid:
            recid = clean_string(recid[0:recid.index(" ")])

        for idx_hit, hit in enumerate(record.alignments):
            # gotta check all hsps in a hit to see boundaries
            rec = SeqRecord(Seq("ACTG"), id=recid)
            parent_match_start = 0
            parent_match_end = 0
            hit_qualifiers = {
                "ID": "b2g.%s.%s" % (idx_record, idx_hit),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": clean_string(hit.hit_id),
                "score": None,
                "length": hit.length,
                "hit_titles": clean_slist(hit.title.split(" >")),
                "hsp_count": len(hit.hsps),
            }
            desc = hit.title.split(" >")[0]
            hit_qualifiers["Name"] = desc
            sub_features = []
            for idx_hsp, hsp in enumerate(hit.hsps):
                if idx_hsp == 0:
                    # Seed the envelope and the best e-value from the
                    # first HSP so the comparisons below have a baseline.
                    parent_match_start = hsp.query_start
                    parent_match_end = hsp.query_end
                    hit_qualifiers["score"] = hsp.expect
                # The hit-level score is the smallest e-value of its HSPs.
                hit_qualifiers["score"] = min(hit_qualifiers["score"],
                                              hsp.expect)
                # generate qualifiers to be added to gff3 feature
                hsp_qualifiers = {
                    "ID": "b2g.%s.%s.hsp%s" % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": clean_string(hit.hit_id),
                    "length": hit.length,
                    "hit_titles": clean_slist(hit.title.split(" >")),
                }
                if include_seq:
                    hsp_seqs = (hsp.query, hsp.sbjct, hsp.match)
                    # Each key must be tested individually: testing the
                    # tuple itself against the dict is always False and
                    # would overwrite the sequences on every HSP.
                    if all(key in hit_qualifiers for key in seq_keys):
                        for key, seq in zip(seq_keys, hsp_seqs):
                            hit_qualifiers[key] = hit_qualifiers[key] + seq
                    else:
                        hit_qualifiers.update(zip(seq_keys, hsp_seqs))
                # Attributes missing on the HSP object are stored as None.
                for prop in hsp_props:
                    hsp_qualifiers["blast_" + prop] = getattr(hsp, prop, None)

                # Let the helper decide whether the parent boundary needs
                # to grow to envelope this HSP.
                parent_match_start, parent_match_end = check_bounds(
                    parent_match_start, parent_match_end, hsp.query_start,
                    hsp.query_end)

                # add hsp to the gff3 feature as a "match_part"
                sub_features.append(
                    SeqFeature(
                        FeatureLocation(hsp.query_start - 1, hsp.query_end),
                        type="match_part",
                        strand=0,
                        qualifiers=copy.deepcopy(hsp_qualifiers),
                    ))

            # Build the top level seq feature for the hit
            hit_qualifiers["description"] = "Hit to %s..%s of %s" % (
                parent_match_start,
                parent_match_end,
                desc,
            )
            top_feature = SeqFeature(
                FeatureLocation(parent_match_start - 1, parent_match_end),
                type=match_type,
                strand=0,
                qualifiers=hit_qualifiers,
            )
            # add the generated subfeature hsp match_parts to the hit feature
            top_feature.sub_features = copy.deepcopy(
                sorted(sub_features, key=lambda x: int(x.location.start)))
            # Add the hit feature to the record
            rec.features.append(top_feature)
            rec.annotations = {}
            collected_records.append(rec)
        for rec in collected_records:
            yield rec
Example #39
0
    def to_seqrecord(self):
        """Create a SeqRecord object from this Sequence instance.

        The record's sequence comes from ``self.mol_seq.value`` with the
        alphabet returned by ``self.get_alphabet()``; ``id``, ``name`` and
        ``description`` come from ``self.accession`` (converted with
        ``str``), ``self.symbol`` and ``self.name``. If a domain
        architecture is present, each domain is converted with its
        ``to_seqfeature()`` method and stored in ``features``.

        Entries whose value is None are dropped at every dictionary level.

        The seqrecord.annotations dictionary is packed like so::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{ 'ref':      ann.ref,
                                'source':   ann.source,
                                'evidence': ann.evidence,
                                'type':     ann.type,
                                'confidence': [ ann.confidence.value,
                                                ann.confidence.type ],
                                'properties': [{ 'value': prop.value,
                                                 'ref': prop.ref,
                                                 'applies_to': prop.applies_to,
                                                 'datatype':   prop.datatype,
                                                 'unit':       prop.unit,
                                                 'id_ref':     prop.id_ref }
                                               for prop in ann.properties],
                              } for ann in self.annotations],
            }
        """
        def clean_dict(dct):
            """Remove None-valued items from a dictionary."""
            # items() rather than iteritems() so this runs on Python 2 and 3.
            return dict(
                (key, val) for key, val in dct.items() if val is not None)

        def pack_property(prop):
            """Pack one annotation property into a plain dictionary."""
            return clean_dict({
                'value': prop.value,
                'ref': prop.ref,
                'applies_to': prop.applies_to,
                'datatype': prop.datatype,
                'unit': prop.unit,
                'id_ref': prop.id_ref
            })

        def pack_annotation(ann):
            """Pack one annotation and its properties into a dictionary."""
            return clean_dict({
                'ref': ann.ref,
                'source': ann.source,
                'evidence': ann.evidence,
                'type': ann.type,
                # ``x and y`` keeps a falsy ``x`` as-is; a None confidence
                # is then removed by clean_dict.
                'confidence': ann.confidence
                and [ann.confidence.value, ann.confidence.type],
                'properties': [pack_property(prop)
                               for prop in ann.properties],
            })

        seqrec = SeqRecord(
            Seq(self.mol_seq.value, self.get_alphabet()),
            **clean_dict({
                'id': str(self.accession),
                'name': self.symbol,
                'description': self.name,
                # 'dbxrefs': None,
            }))
        if self.domain_architecture:
            seqrec.features = [
                dom.to_seqfeature() for dom in self.domain_architecture.domains
            ]
        # Sequence attributes with no SeqRecord equivalent
        seqrec.annotations = clean_dict({
            'id_ref': self.id_ref,
            'id_source': self.id_source,
            'location': self.location,
            'uri': self.uri and clean_dict({
                'value': self.uri.value,
                'desc': self.uri.desc,
                'type': self.uri.type,
            }),
            'annotations': self.annotations and [
                pack_annotation(ann) for ann in self.annotations
            ],
        })
        return seqrec
def blastxml2gff3(blastxml, include_seq=False):
    """Convert BLAST XML output into one GFF3-ready SeqRecord per query.

    ``blastxml`` is parsed with ``NCBIXML.parse``. Each query yields one
    ``SeqRecord`` (placeholder sequence, id = query title up to the first
    space). Every hit becomes a top-level feature whose type is
    ``nucleotide_match`` for BLASTN, ``protein_match`` for BLASTP and
    ``match`` otherwise. Each HSP is split by ``generate_parts`` (defined
    elsewhere) and every part becomes a ``match_part`` sub-feature carrying
    the part's CIGAR string in its ``Gap`` qualifier.

    The parent feature starts at the smallest ``query_start`` among the
    hit's HSPs. A hit without HSPs gets the location ``0..0``.

    Args:
        blastxml: Handle or source accepted by ``NCBIXML.parse``.
        include_seq: When true, the aligned query, subject and match strings
            of each HSP are added to its ``match_part`` qualifiers.

    Yields:
        One ``SeqRecord`` per BLAST query, in parser order.
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # Currently we can only handle BLASTN, BLASTP
    match_types = {
        "BLASTN": "nucleotide_match",
        "BLASTP": "protein_match",
    }
    hsp_props = (
        "score",
        "bits",
        "identities",
        "positives",
        "gaps",
        "align_length",
        "strand",
        "frame",
        "query_start",
        "query_end",
        "sbjct_start",
        "sbjct_end",
    )

    for idx_record, record in enumerate(NCBIXML.parse(blastxml)):
        match_type = match_types.get(record.application, "match")

        # Keep only the first whitespace-delimited token of the query title.
        recid = record.query
        if " " in recid:
            recid = recid[0:recid.index(" ")]

        rec = SeqRecord(Seq("ACTG"), id=recid)
        for idx_hit, hit in enumerate(record.alignments):
            # gotta check all hsps in a hit to see boundaries.
            # The start uses None as "not seen yet": starting from 0 could
            # never be lowered by a positive query_start.
            parent_match_start = None
            parent_match_end = 0
            hit_qualifiers = {
                "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, "0"),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": hit.hit_id,
                "length": hit.length,
                "hit_titles": hit.title.split(" >"),
                "hsp_count": len(hit.hsps),
            }

            # Description is the first title minus its leading token, with
            # the separating space kept. partition() yields an empty
            # description when the title has no space instead of raising.
            desc = hit.title.split(" >")[0]
            _, separator, remainder = desc.partition(" ")
            description = separator + remainder

            sub_features = []
            for idx_hsp, hsp in enumerate(hit.hsps):
                hsp_qualifiers = {
                    "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit.title.split(" >"),
                }
                if include_seq:
                    hsp_qualifiers.update({
                        "blast_qseq": hsp.query,
                        "blast_sseq": hsp.sbjct,
                        "blast_mseq": hsp.match,
                    })

                # Attributes missing on the HSP object are stored as None.
                for prop in hsp_props:
                    hsp_qualifiers["blast_" + prop] = getattr(hsp, prop, None)

                hsp_qualifiers["description"] = description

                # check if parent boundary needs to increase
                if (parent_match_start is None
                        or hsp.query_start < parent_match_start):
                    parent_match_start = hsp.query_start
                if hsp.query_end > parent_match_end:
                    parent_match_end = hsp.query_end + 1

                # Build out the match_part features for each HSP
                for idx_part, (start, end, cigar) in enumerate(
                        generate_parts(hsp.query,
                                       hsp.match,
                                       hsp.sbjct,
                                       ignore_under=10)):
                    hsp_qualifiers["Gap"] = cigar
                    # NOTE(review): part IDs derive from the hit ID, so
                    # parts of different HSPs can share an ID - confirm
                    # whether that is intended.
                    hsp_qualifiers["ID"] = hit_qualifiers["ID"] + (".%s" %
                                                                   idx_part)

                    # NOTE(review): every part starts at hsp.query_start;
                    # the part's own ``start`` only affects the length -
                    # confirm against generate_parts.
                    match_part_start = hsp.query_start

                    # We used to use hsp.align_length here, but that includes
                    # gaps in the parent sequence
                    #
                    # Furthermore align_length will give calculation errors in weird places
                    # So we just use (end-start) for simplicity
                    match_part_end = match_part_start + (end - start)

                    # Deep copy because hsp_qualifiers is reused and
                    # mutated for the next part.
                    sub_features.append(
                        SeqFeature(
                            FeatureLocation(match_part_start, match_part_end),
                            type="match_part",
                            strand=0,
                            qualifiers=copy.deepcopy(hsp_qualifiers),
                        ))

            # A hit without HSPs keeps the historical 0..0 location.
            if parent_match_start is None:
                parent_match_start = 0

            # Build the top level seq feature for the hit
            top_feature = SeqFeature(
                FeatureLocation(parent_match_start, parent_match_end),
                type=match_type,
                strand=0,
                qualifiers=hit_qualifiers,
            )
            # add the generated subfeature hsp match_parts to the hit feature
            top_feature.sub_features = copy.deepcopy(sub_features)
            # Add the hit feature to the record
            rec.features.append(top_feature)
        rec.annotations = {}
        yield rec