def make_genbank_recs(rec):
    """Attach a CDS feature to ``rec`` for each of its protein records.

    Protein records are taken from ``protein_recs`` (a name resolved from the
    enclosing scope) and selected when their id starts with ``rec.id + '_'``.
    For each one, the 2nd, 3rd and 4th `` # ``-separated fields of its
    description are read as start, end and strand.

    Args:
        rec: Record to annotate.  No copy is made: the features are appended
            to this very object.

    Returns:
        The same ``rec`` object, with one ``CDS`` feature appended per
        matching protein record.

    Raises:
        IndexError: If a description has fewer than four `` # `` fields.
        ValueError: If start, end or strand is not an integer, or the strand
            is rejected by ``FeatureLocation``.
    """
    new_rec = rec
    scaffold_prefix = new_rec.id + '_'

    scaffold_recs = [
        protein_rec for protein_rec in protein_recs
        if protein_rec.id.startswith(scaffold_prefix)
    ]

    for protein_rec in scaffold_recs:
        # Split the description once instead of once per field.
        fields = protein_rec.description.split(' # ')
        start = int(fields[1])
        end = int(fields[2])
        strand = int(fields[3])

        # NOTE(review): the coordinates are used verbatim.  If the header
        # values are 1-based inclusive, the start needs ``- 1`` to become a
        # 0-based Biopython start -- confirm against the tool that wrote the
        # protein headers.
        #
        # The strand is set on the location rather than passed to
        # SeqFeature(): recent Biopython releases no longer accept a
        # ``strand`` argument on SeqFeature, whereas the location form gives
        # the same result on older releases as well.
        rec_location = FeatureLocation(SeqFeature.ExactPosition(start),
                                       end,
                                       strand=strand)
        rec_feature = SeqFeature.SeqFeature(rec_location, type="CDS")

        # Add ORF name without genome ID
        rec_feature.qualifiers['protein_id'] = protein_rec.id
        rec_feature.qualifiers['translation'] = protein_rec.seq
        rec_feature.qualifiers['locus_tag'] = protein_rec.description

        new_rec.features.append(rec_feature)
    return new_rec
예제 #2
0
def create_feature_annot(loc_range, featuretype, s):
    """Create a new feature annotation at loc_range with featuretype on strand s.

    Args:
        loc_range: Indexable pair; item 0 is used as the start position and
            item 1 as the end position (both wrapped in ``ExactPosition``).
        featuretype: Value stored as the feature's ``type``.
        s: Strand assigned to the feature's location.

    Returns:
        The newly created ``SeqFeature``.
    """
    # The strand is given to the location instead of to SeqFeature(): recent
    # Biopython releases no longer accept ``strand`` on the SeqFeature
    # constructor, and on older releases both spellings end up setting
    # ``location.strand``.
    location = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(loc_range[0]),
        SeqFeature.ExactPosition(loc_range[1]),
        strand=s)
    return SeqFeature.SeqFeature(location, type=featuretype)
예제 #3
0
def modify_genbank(gb_file, fasta_file):
    """Write a copy of a GenBank file with updated gene/CDS annotations.

    The genome sequence is read from ``fasta_file`` and handed to
    ``get_final_annotations``; the result is used as a mapping from locus tag
    to a dict with ``"start"``, ``"function"`` and ``"translation"`` entries.

    For every ``gene`` or ``CDS`` feature of every record in ``gb_file``:

    * if its first ``locus_tag`` is not in the mapping, the feature is
      dropped;
    * otherwise its location is rebuilt from ``start - 1`` to the feature's
      current end on the same strand, and for ``CDS`` features the first
      ``product`` and ``translation`` qualifier values are replaced.

    Features of any other type are kept untouched.

    Args:
        gb_file: Path of the input GenBank file.  It must match
            ``.../users/.../uploads/<name>.<ext>``.
        fasta_file: Path of a FASTA file holding exactly one sequence.

    Returns:
        Path of the written file, ``<name>_modified.<ext>`` next to the input.
        The file is empty when the input contains no records.

    Raises:
        ValueError: If ``gb_file`` does not match the expected path layout.
        KeyError: If a gene/CDS feature has no ``locus_tag`` qualifier, or an
            updated CDS has no ``product``/``translation`` qualifier.
    """
    # The dot in front of the extension has to be escaped.  Unescaped, it
    # matches any character, so the greedy first group swallowed the
    # extension ("x.gb" became "x.g_modified.").
    gb_filename = re.search(r'(.*/users/.*/uploads/.*)\.(\w*)', gb_file)
    if gb_filename is None:
        raise ValueError(
            "Unexpected GenBank file path (need .../users/.../uploads/"
            "<name>.<ext>): %r" % (gb_file, ))
    out_file = gb_filename.group(1) + '_modified.' + gb_filename.group(2)

    genome = SeqIO.read(fasta_file, "fasta").seq
    final_annotations = get_final_annotations(genome)

    records = []
    # The context manager closes the input handle, which used to be leaked.
    with open(gb_file, "r") as gb_handle:
        for record in SeqIO.parse(gb_handle, "genbank"):
            # A fresh list per record, so features of one record never leak
            # into the following ones.
            kept_features = []
            for feature in record.features:
                if feature.type in ("gene", "CDS"):
                    locus_tag = feature.qualifiers["locus_tag"][0]
                    if locus_tag not in final_annotations:
                        continue  # no final annotation: drop the feature
                    annotation = final_annotations[locus_tag]
                    # NOTE(review): the "- 1" presumably converts a 1-based
                    # start into a 0-based one -- confirm against
                    # get_final_annotations.
                    feature.location = SeqFeature.FeatureLocation(
                        SeqFeature.ExactPosition(annotation["start"] - 1),
                        SeqFeature.ExactPosition(int(feature.location.end)),
                        feature.location.strand)
                    if feature.type == "CDS":
                        feature.qualifiers["product"][0] = annotation[
                            "function"]
                        feature.qualifiers["translation"][0] = annotation[
                            "translation"]
                kept_features.append(feature)
            record.features = kept_features
            records.append(record)

    # Write all records in one go; reopening the output with "w" for each
    # record kept only the last one.
    with open(out_file, "w") as new_gb:
        SeqIO.write(records, new_gb, "genbank")

    return out_file
예제 #4
0
def createFEATUREannot(loc_range, featuretype, s):
    """Creates a new SeqFeature with ExactPositions based on range.

    Args:
        loc_range: Indexable pair; item 0 becomes the exact start position
            and item 1 the exact end position.
        featuretype: Value stored as the feature's ``type``.
        s: Strand assigned to the feature's location.

    Returns:
        The newly created ``SeqFeature``.
    """
    # Pass the strand to the location, not to SeqFeature(): recent Biopython
    # releases reject a ``strand`` argument on the SeqFeature constructor,
    # while older ones simply forwarded it to ``location.strand`` anyway.
    location = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(loc_range[0]),
        SeqFeature.ExactPosition(loc_range[1]),
        strand=s)
    return SeqFeature.SeqFeature(location, type=featuretype)
def _extract_regions(gff_iterator):
    """Trim each record to its annotated region and re-base its features.

    Function added by KC Jan 2020.  For every record, the region runs from
    the smallest start to the largest end of its top-level features.  All
    feature locations (including nested ``sub_features`` at any depth) are
    shifted so that they refer to the trimmed sequence, and ``rec.seq`` is
    sliced to that region.

    Shifted locations are rebuilt from exact start/end positions plus the
    original strand.

    Args:
        gff_iterator: Iterable of records exposing ``features`` and ``seq``.

    Yields:
        Each input record, modified in place.

    Raises:
        ValueError: If a record has no features, since no region can be
            derived from it.
    """
    for rec in gff_iterator:
        if not rec.features:
            raise ValueError(
                "record %r has no features; cannot determine the annotated "
                "region" % (getattr(rec, "id", None), ))
        loc = min(feature.location.start for feature in rec.features)
        endloc = max(feature.location.end for feature in rec.features)
        for feature in rec.features:
            _shift_feature(feature, loc)
        rec.seq = rec.seq[loc:endloc]
        yield rec


def _shift_feature(feature, offset):
    """Move ``feature`` and all of its nested sub-features left by ``offset``."""
    old_location = feature.location
    # The strand is read from the location: that is where it is stored, and
    # the ``feature.strand`` shortcut is not available on every Biopython
    # release.
    feature.location = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(old_location.start - offset),
        SeqFeature.ExactPosition(old_location.end - offset),
        strand=old_location.strand)
    # Recurse so that deeper levels are re-based as well; shifting only the
    # first level of children would leave grandchildren pointing at
    # coordinates of the untrimmed sequence.
    for sub_feature in getattr(feature, "sub_features", None) or ():
        _shift_feature(sub_feature, offset)
예제 #6
0
 def test_GenerateFeatLoc__make_start_fuzzy__1(self):
     """Check `GenerateFeatLoc.make_start_fuzzy` on an exact location.

     An exact 5..9 FeatureLocation is handed to the method; the result has
     to be a FeatureLocation whose start is a BeforePosition.
     """
     from Bio import SeqFeature

     exact_location = SeqFeature.FeatureLocation(
         SeqFeature.ExactPosition(5), SeqFeature.ExactPosition(9))
     fuzzy_location = GnOps.GenerateFeatLoc().make_start_fuzzy(
         exact_location)

     # Still a FeatureLocation ...
     self.assertIsInstance(fuzzy_location, Bio.SeqFeature.FeatureLocation)
     # ... but with a fuzzy start.
     self.assertIsInstance(fuzzy_location.start,
                           Bio.SeqFeature.BeforePosition)
예제 #7
0
 def test_GenerateFeatLoc__make_start_fuzzy__3(self):
     """Check that the end of a FeatureLocation is made fuzzy.

     Despite the test's name, the method exercised here is
     `GenerateFeatLoc.make_end_fuzzy`.  An exact 5..9 FeatureLocation is
     handed to it; the result has to be a FeatureLocation whose end is an
     AfterPosition.
     """
     from Bio import SeqFeature

     exact_location = SeqFeature.FeatureLocation(
         SeqFeature.ExactPosition(5), SeqFeature.ExactPosition(9))
     fuzzy_location = GnOps.GenerateFeatLoc().make_end_fuzzy(exact_location)

     # Still a FeatureLocation ...
     self.assertIsInstance(fuzzy_location, Bio.SeqFeature.FeatureLocation)
     # ... but with a fuzzy end.
     self.assertIsInstance(fuzzy_location.end, Bio.SeqFeature.AfterPosition)
예제 #8
0
def _make_position(location_string, offset=0):
    """Convert a Swiss location string into a SeqFeature position (PRIVATE).

    Recognised forms:

    * ``"?"``    -> ``UnknownPosition``
    * ``"13"``   -> ``ExactPosition``
    * ``"<13"``  -> ``BeforePosition``
    * ``">13"``  -> ``AfterPosition``
    * ``"?13"``  -> ``UncertainPosition``

    ``offset`` is added to the number before the position is built; an
    offset of -1 is used with a start location to make it pythonic.  The
    result is never allowed to drop below zero.

    Raises:
        NotImplementedError: If the string has none of the forms above.
    """
    if location_string == "?":
        return SeqFeature.UnknownPosition()

    try:
        # Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0.
        return SeqFeature.ExactPosition(max(0, offset + int(location_string)))
    except ValueError:
        pass

    fuzzy_kinds = (
        ("<", SeqFeature.BeforePosition),  # e.g. "<13"
        (">", SeqFeature.AfterPosition),  # e.g. ">13"
        ("?", SeqFeature.UncertainPosition),  # e.g. "?22"
    )
    for prefix, position_class in fuzzy_kinds:
        if not location_string.startswith(prefix):
            continue
        try:
            return position_class(max(0, offset + int(location_string[1:])))
        except ValueError:
            # Only one prefix can match, so give up right away.
            break

    raise NotImplementedError("Cannot parse location '%s'" % location_string)
예제 #9
0
def write_gb(main_record_file, add="", destination=""):
    """Build the "ePet-cre" record and write it as GenBank and FASTA.

    The single sequence in ``main_record_file`` is combined with the single
    sequence in ``add`` through ``concatenate_overlapping_sequences``.  Two
    features are attached to the resulting record:

    * a ``CDS`` (gene ``Cre``) covering the length of the original main
      sequence, and
    * a ``PolyA`` feature (``SV40-PolyA``) covering the 118 positions after
      it.

    Both features lie on strand 1.

    Args:
        main_record_file: Path of the FASTA file with the main sequence.
        add: Path of the FASTA file with the sequence to add.
        destination: Output path without extension; ``.gb`` and ``.fastas``
            are appended for the two output files.

    All three paths are passed through ``expanduser``.
    """
    destination = expanduser(destination)
    main_record = SeqIO.read(expanduser(main_record_file), "fasta")
    add = SeqIO.read(expanduser(add), "fasta")

    # NOTE(review): ``alphabet`` and ``Seq.tostring()`` are legacy Biopython
    # APIs -- confirm the targeted Biopython release still provides them.
    main_record.seq.alphabet = IUPACAmbiguousDNA()
    # Length of the main sequence before anything is added to it.
    cre_end = len(main_record.seq)
    combined_sequence = concatenate_overlapping_sequences(
        main_record.seq.tostring(), add.seq.tostring())
    main_record.seq = Seq(combined_sequence, IUPACAmbiguousDNA())
    main_record.name = main_record.id = "ePet-cre"
    main_record.description = "ePet-cre construct from doi:10.1038/nn.2623"

    # (start, end, feature type, qualifier key, qualifier value)
    feature_specs = (
        (0, cre_end, "CDS", "gene", "Cre"),
        # Add SV40 PolyA
        (cre_end, cre_end + 118, "PolyA", "PolyA", "SV40-PolyA"),
    )
    for start, end, feature_type, qualifier_key, qualifier_value in feature_specs:
        feature_location = SeqFeature.FeatureLocation(
            SeqFeature.ExactPosition(start),
            SeqFeature.ExactPosition(end),
            strand=1)
        feature = SeqFeature.SeqFeature(feature_location, type=feature_type)
        feature.qualifiers[qualifier_key] = qualifier_value
        main_record.features.append(feature)

    SeqIO.write(main_record, destination + ".gb", "genbank")
    SeqIO.write(main_record, destination + ".fastas", "fasta")
예제 #10
0
def get_newends(location, length, strand, end, distance, code, subject_start,
                feat_start, feat_finish, truncated):
    """Compute a new end position and apply it to one end of a feature.

    The new position is ``subject_start`` plus a change derived from
    ``location``:

    * unless ``strand`` is 1, ``location`` is first replaced by
      ``abs(location - (2 * distance + 1))``;
    * if ``end`` is ``"finish"``, ``strand * length`` is added;
    * if ``code`` is ``'A'``, the change is multiplied by 3.

    The position replaces ``feat_start`` when ``end`` is ``"start"`` on
    strand 1 or ``"finish"`` on strand -1; in every other case it replaces
    ``feat_finish``.  With ``truncated`` set it is a ``BeforePosition``
    (start) or ``AfterPosition`` (finish), otherwise an ``ExactPosition``.

    Returns:
        The tuple ``(feat_start, feat_finish)`` with one of the two replaced.
    """
    # Convert location if on reverse strand
    if strand != 1:
        location = abs(location - (2 * distance + 1))

    # Correct by length of the match if at the finish end
    change = location
    if end == "finish":
        change = location + strand * length
    # Multiply by 3 if AA
    if code == 'A':
        change = change * 3
    newend = subject_start + change

    # Work out which end of the feature receives the new position
    replaces_start = ((end == "start" and strand == 1)
                      or (end == "finish" and strand == -1))
    if replaces_start:
        if truncated:
            feat_start = SeqFeature.BeforePosition(newend)
        else:
            feat_start = SeqFeature.ExactPosition(newend)
    else:
        if truncated:
            feat_finish = SeqFeature.AfterPosition(newend)
        else:
            feat_finish = SeqFeature.ExactPosition(newend)

    return (feat_start, feat_finish)
예제 #11
0
 def _parse_position(element, offset=0):
     try:
         position = int(element.attrib["position"]) + offset
     except KeyError as err:
         position = None
     status = element.attrib.get("status", "")
     if status == "unknown":
         assert position is None
         return SeqFeature.UnknownPosition()
     elif not status:
         return SeqFeature.ExactPosition(position)
     elif status == "greater than":
         return SeqFeature.AfterPosition(position)
     elif status == "less than":
         return SeqFeature.BeforePosition(position)
     elif status == "uncertain":
         return SeqFeature.UncertainPosition(position)
     else:
         raise NotImplementedError("Position status %r" % status)
예제 #12
0
#!/usr/bin/env python
"""Test the Location code located in SeqFeature.py

This checks to be sure fuzzy and non-fuzzy representations of locations
are working properly.
"""
from Bio import SeqFeature

# --- test fuzzy representations
print("Testing fuzzy representations...")

# check the positions alone
exact_pos = SeqFeature.ExactPosition(5)
within_pos_s = SeqFeature.WithinPosition(10, left=10, right=13)
within_pos_e = SeqFeature.WithinPosition(13, left=10, right=13)
between_pos_e = SeqFeature.BetweenPosition(24, left=20, right=24)
before_pos = SeqFeature.BeforePosition(15)
after_pos = SeqFeature.AfterPosition(40)

# Every print is a single-argument call with %-formatting.  That form is
# valid on Python 3 and prints exactly the same text on Python 2, where a
# multi-argument print(...) would print a tuple instead.
print("Exact: %s" % exact_pos)
print("Within (as start, %i): %s" % (int(within_pos_s), within_pos_s))
print("Within (as end, %i): %s" % (int(within_pos_e), within_pos_e))
print("Between (as end, %i): %s" % (int(between_pos_e), between_pos_e))
print("Before: %s" % before_pos)
print("After: %s" % after_pos)

# put these into Locations
location1 = SeqFeature.FeatureLocation(exact_pos, within_pos_e)
location2 = SeqFeature.FeatureLocation(before_pos, between_pos_e)
location3 = SeqFeature.FeatureLocation(within_pos_s, after_pos)
예제 #13
0
def genbankOutput(resultGbFile,
                  resultFile,
                  listOfFeaturesToOutput,
                  buildCloroplast=False,
                  dLoopSize=800,
                  nWalk=20):
    '''
	Creates a genbank file based on a fasta file given (resultfile) and a list of features that the genbank
	file should present (listoffeaturestooutput)
	'''
    #creating the genbank file, not annotated, to be opened afterwards and have the features inserted
    with open(resultGbFile, "w") as outputResult:
        finalResults = SeqIO.read(open(resultFile, 'rU'), "fasta", generic_dna)
        finalResults.seq = finalResults.seq.upper()
        finalResults.name = finalResults.name[0:10] + '_draft'
        finalResults.id = finalResults.name[0:10] + '_draft'
        if len(
                finalResults.name
        ) > 16:  #has to 16 characters long at max, or else genbank file throws error
            finalResults.name = finalResults.name[0:16]
            finalResults.id = finalResults.id[0:16]
        count = SeqIO.write(finalResults, outputResult, "genbank")

    dico_intron = {}
    for thisFeatureAlignment in listOfFeaturesToOutput:
        if not ('trn' in thisFeatureAlignment.seq2.lower() or 'rrn' in thisFeatureAlignment.seq2.lower() \
         or 'ribosomal' in thisFeatureAlignment.seq2.lower() or 'rnr' in thisFeatureAlignment.seq2.lower()):
            if dico_intron.has_key(thisFeatureAlignment.seq2.split("_")[0]):
                dico_intron[thisFeatureAlignment.seq2.split("_")[0]] += 1
            else:
                dico_intron[thisFeatureAlignment.seq2.split("_")[0]] = 1

    dico_gene = {}
    with open(
            resultGbFile, "rU"
    ) as outputResult:  #opening the output file, this time to insert the features
        finalResults = SeqIO.read(outputResult, "genbank", generic_dna)
        #lastFeatureAlignment = None
        dLoopFound = False
        for thisFeatureAlignment in listOfFeaturesToOutput:
            # 1. Define a feature type as a text string
            main_feature_qualifiers = {
            }  #create qualifiers dict where the name will be stored

            if 'trn' in thisFeatureAlignment.seq2.lower() or 'rrn' in thisFeatureAlignment.seq2.lower() \
            or 'ribosomal' in thisFeatureAlignment.seq2.lower() or 'rnr' in thisFeatureAlignment.seq2.lower():
                main_feature_qualifiers['product'] = thisFeatureAlignment.seq2
                if 'trn' in thisFeatureAlignment.seq2.lower():
                    main_feature_type = "tRNA"
                else:
                    main_feature_type = "rRNA"
            else:
                main_feature_qualifiers['gene'] = thisFeatureAlignment.seq2
                main_feature_type = "gene"
            gene = thisFeatureAlignment.seq2.split("_")[0]
            if dico_gene.has_key(gene):
                dico_gene[gene] += 1
            else:
                dico_gene[gene] = 1

            main_start_pos = SeqFeature.ExactPosition(
                thisFeatureAlignment.startBase)
            main_end_pos = SeqFeature.ExactPosition(
                thisFeatureAlignment.endBase)

            if main_feature_type == "gene":
                codonDiff = ((main_end_pos - main_start_pos + 1) % 3)
                if codonDiff == 2:
                    main_end_pos += 1
                elif codonDiff == 1:
                    main_end_pos -= 1
            #print main_start_pos
            #print main_end_pos
            #print thisFeatureAlignment.frame

            # 2. Use the locations do define a FeatureLocation
            if thisFeatureAlignment.frame < 0:
                strandToOutput = -1
            else:
                strandToOutput = 1
            main_feature_location = SeqFeature.FeatureLocation(
                main_start_pos - 1, main_end_pos, strand=strandToOutput)
            # 3. Create a SeqFeature
            main_feature = SeqFeature.SeqFeature(
                main_feature_location,
                type=main_feature_type,
                qualifiers=main_feature_qualifiers)
            '''
			#find d-loop part
			#basically just look for a big gap between last feature and this current feature, if there is a gap that
			#is about the size of a d-loop, it most likely is a dloop, since nothing aligned with it and it has that size
			#ignore this check if a cloroplast was built
			if lastFeatureAlignment != None and dLoopFound == False and buildCloroplast == False and dLoopSize > 0:
				if thisFeatureAlignment.startBase > lastFeatureAlignment.endBase + dLoopSize \
				 and thisFeatureAlignment.startBase < lastFeatureAlignment.endBase + 3200:
					dLoopFound = True
					dLoopStartPos = SeqFeature.ExactPosition(lastFeatureAlignment.endBase)
					dLoopEndPos = SeqFeature.ExactPosition(thisFeatureAlignment.startBase)
					dLoopLocation = SeqFeature.FeatureLocation(dLoopStartPos,dLoopEndPos,strand=-1)
					dLoopType = "D-loop"
					dLoopFeature = SeqFeature.SeqFeature(dLoopLocation,type=dLoopType)
					finalResults.features.append(dLoopFeature)

			lastFeatureAlignment = thisFeatureAlignment
			'''
            # 4. Append your newly created SeqFeature to your SeqRecord
            if main_feature_type == "gene":
                cds_qualifiers = dict(main_feature_qualifiers)
                coding_dna = Seq(
                    str(finalResults.seq[thisFeatureAlignment.startBase -
                                         1:thisFeatureAlignment.endBase]),
                    IUPAC.unambiguous_dna)
                if strandToOutput == -1:
                    coding_dna = coding_dna.reverse_complement()
                translationTable = thisFeatureAlignment.translationTable
                tableToUse = CodonTable.unambiguous_dna_by_id[translationTable]
                listOfStartCodons = []
                listOfStopCodons = []
                for startCodon in tableToUse.start_codons:
                    """startCodonSeq = Seq(startCodon, IUPAC.unambiguous_dna)
					startCodonTranslation = str(startCodonSeq.translate(table=translationTable))
					if startCodonTranslation not in listOfStartCodons:
						listOfStartCodons.append(startCodonTranslation)"""
                    if startCodon not in listOfStartCodons:
                        listOfStartCodons.append(startCodon)
                    startCodons = tuple(
                        listOfStartCodons
                    )  #need to make it a tuple so that startswith works with it
                for stopCodon in tableToUse.stop_codons:
                    if stopCodon not in listOfStopCodons:
                        listOfStopCodons.append(stopCodon)
                    stopCodons = tuple(listOfStopCodons)

                nWalkStart = int(nWalk)
                nWalkStop = int(nWalk)
                '''
				For genes in the -1 strand, we look for the stop codons at the start and the start codons at the end!
				'''
                """	if strandToOutput == -1:
					tempStartCodons = startCodons
					tempStopCodons = stopCodons
					startCodons = tempStopCodons
					stopCodons = tempStartCodons
					nWalkStart = nWalk
					nWalkStop = nWalk	"""

                try:
                    '''
					Making sure it starts with startCodons
					'''
                    try:
                        coding_dna_Forward = coding_dna
                        coding_dna_Backward = coding_dna
                        startBase = int(thisFeatureAlignment.startBase)
                        endBase = int(thisFeatureAlignment.endBase)
                        n = 0
                        if strandToOutput == 1:
                            while not coding_dna_Forward.startswith(startCodons) \
                            and not coding_dna_Backward.startswith(startCodons) \
                            and not coding_dna_Backward.startswith(stopCodons) \
                            and dico_gene.get(gene) == 1 and n < nWalkStart and startBase - (3*(n+1)) >= 0:
                                try:
                                    n += 1
                                    coding_dna_Backward = Seq(
                                        str(finalResults.seq[startBase - 1 -
                                                             (3 * n):endBase]),
                                        IUPAC.unambiguous_dna)
                                    coding_dna_Forward = Seq(
                                        str(finalResults.seq[startBase - 1 -
                                                             (3 * n):endBase]),
                                        IUPAC.unambiguous_dna)
                                    '''print str(strandToOutput)
									print "looking for start ="+str(startCodons)
									print "coding_dna_Forward"
									print coding_dna_Forward
									print coding_dna_Forward.startswith(startCodons)
									print "coding_dna_Backward"
									print coding_dna_Backward
									print coding_dna_Backward.startswith(startCodons)'''
                                except:
                                    pass
                            else:
                                if coding_dna_Forward.startswith(startCodons):
                                    main_start_pos = SeqFeature.ExactPosition(
                                        startBase - (3 * n))
                                    thisFeatureAlignment.startBase = main_start_pos
                                    main_feature_location = SeqFeature.FeatureLocation(
                                        main_start_pos - 1,
                                        main_end_pos,
                                        strand=strandToOutput)
                                elif coding_dna_Backward.startswith(
                                        startCodons):
                                    main_start_pos = SeqFeature.ExactPosition(
                                        startBase - (3 * n))
                                    thisFeatureAlignment.startBase = main_start_pos
                                    main_feature_location = SeqFeature.FeatureLocation(
                                        main_start_pos - 1,
                                        main_end_pos,
                                        strand=strandToOutput)
                                elif coding_dna_Backward.startswith(
                                        stopCodons
                                ):  # we look for a start inside the hit
                                    n = 0
                                    while not coding_dna_Forward.startswith(
                                            startCodons
                                    ) and n < nWalkStart and startBase + (
                                            3 * (n + 1)) <= endBase:
                                        try:
                                            n += 1
                                            coding_dna_Forward = Seq(
                                                str(finalResults.
                                                    seq[startBase - 1 +
                                                        (3 * n):endBase]),
                                                IUPAC.unambiguous_dna)
                                        except:
                                            pass
                                    else:
                                        if coding_dna_Forward.startswith(
                                                startCodons):
                                            main_start_pos = SeqFeature.ExactPosition(
                                                startBase + (3 * n))
                                            thisFeatureAlignment.startBase = main_start_pos
                                            main_feature_location = SeqFeature.FeatureLocation(
                                                main_start_pos - 1,
                                                main_end_pos,
                                                strand=strandToOutput)

                        if strandToOutput == -1:
                            while not coding_dna_Forward.endswith(stopCodons) \
                            and not coding_dna_Backward.endswith(stopCodons) \
                            and dico_gene.get(gene) == 1 and n < nWalkStart and startBase - (3*(n+1)) >= 0 and startBase + (3*(n+1)) <= endBase:
                                try:
                                    n += 1
                                    coding_dna_Backward = Seq(
                                        str(finalResults.seq[startBase - 1 -
                                                             (3 * n):endBase]),
                                        IUPAC.unambiguous_dna)
                                    coding_dna_Backward = coding_dna_Backward.reverse_complement(
                                    )
                                    coding_dna_Forward = Seq(
                                        str(finalResults.seq[startBase - 1 +
                                                             (3 * n):endBase]),
                                        IUPAC.unambiguous_dna)
                                    coding_dna_Forward = coding_dna_Forward.reverse_complement(
                                    )
                                    '''print str(strandToOutput)
									print "looking for stop ="+str(stopCodons)
									print "coding_dna_Forward"
									print coding_dna_Forward
									print coding_dna_Forward.endswith(stopCodons)
									print "coding_dna_Backward"
									print coding_dna_Backward
									print coding_dna_Backward.endswith(stopCodons)'''
                                except:
                                    pass
                            else:
                                if coding_dna_Forward.endswith(stopCodons):
                                    main_start_pos = SeqFeature.ExactPosition(
                                        startBase + (3 * n))
                                    thisFeatureAlignment.startBase = main_start_pos
                                    main_feature_location = SeqFeature.FeatureLocation(
                                        main_start_pos - 1,
                                        main_end_pos,
                                        strand=strandToOutput)
                                elif coding_dna_Backward.endswith(stopCodons):
                                    main_start_pos = SeqFeature.ExactPosition(
                                        startBase - (3 * n))
                                    thisFeatureAlignment.startBase = main_start_pos
                                    main_feature_location = SeqFeature.FeatureLocation(
                                        main_start_pos - 1,
                                        main_end_pos,
                                        strand=strandToOutput)
                    except:
                        pass
                    '''
					Updating coding_dna with (new) coordinates
					'''
                    coding_dna = Seq(
                        str(finalResults.seq[thisFeatureAlignment.startBase -
                                             1:thisFeatureAlignment.endBase]),
                        IUPAC.unambiguous_dna)
                    if strandToOutput == -1:
                        coding_dna = coding_dna.reverse_complement()
                    '''
					Making sure it ends with * (stop codon)
					'''
                    try:
                        coding_dna_Forward = coding_dna
                        coding_dna_Backward = coding_dna
                        startBase = int(thisFeatureAlignment.startBase)
                        endBase = int(thisFeatureAlignment.endBase)
                        n = 0
                        if strandToOutput == 1:
                            while not coding_dna_Forward.endswith(stopCodons) \
                            and not coding_dna_Backward.endswith(stopCodons) \
                            and dico_gene.get(gene) == dico_intron.get(gene) and n < nWalkStop and endBase + (3*(n+1)) <= len(finalResults):
                                try:
                                    n += 1
                                    coding_dna_Backward = Seq(
                                        str(finalResults.seq[startBase -
                                                             1:endBase -
                                                             (3 * n)]),
                                        IUPAC.unambiguous_dna)
                                    coding_dna_Forward = Seq(
                                        str(finalResults.seq[startBase -
                                                             1:endBase +
                                                             (3 * n)]),
                                        IUPAC.unambiguous_dna)
                                    '''print str(strandToOutput)
									print "looking for stop ="+str(stopCodons)
									print "coding_dna_Forward"
									print coding_dna_Forward
									print coding_dna_Forward.endswith(stopCodons)
									print "coding_dna_Backward"
									print coding_dna_Backward	
									print coding_dna_Backward.endswith(stopCodons)'''

                                except:
                                    pass
                            else:
                                if coding_dna_Backward.endswith(stopCodons):
                                    main_end_pos = SeqFeature.ExactPosition(
                                        endBase - (3 * n))
                                    thisFeatureAlignment.endBase = main_end_pos
                                    main_feature_location = SeqFeature.FeatureLocation(
                                        main_start_pos - 1,
                                        main_end_pos,
                                        strand=strandToOutput)
                                elif coding_dna_Forward.endswith(stopCodons):
                                    main_end_pos = SeqFeature.ExactPosition(
                                        endBase + (3 * n))
                                    thisFeatureAlignment.endBase = main_end_pos
                                    main_feature_location = SeqFeature.FeatureLocation(
                                        main_start_pos - 1,
                                        main_end_pos,
                                        strand=strandToOutput)

                        if strandToOutput == -1:
                            while not coding_dna_Forward.startswith(startCodons) \
                            and not coding_dna_Backward.startswith(startCodons) \
                            and not coding_dna_Forward.startswith(stopCodons) \
                            and dico_gene.get(gene) == dico_intron.get(gene) and n < nWalkStop and endBase + (3*(n+1)) <= len(finalResults):
                                try:
                                    n += 1
                                    coding_dna_Backward = Seq(
                                        str(finalResults.seq[startBase -
                                                             1:endBase +
                                                             (3 * n)]),
                                        IUPAC.unambiguous_dna)
                                    coding_dna_Backward = coding_dna_Backward.reverse_complement(
                                    )
                                    coding_dna_Forward = Seq(
                                        str(finalResults.seq[startBase -
                                                             1:endBase +
                                                             (3 * n)]),
                                        IUPAC.unambiguous_dna)
                                    coding_dna_Forward = coding_dna_Forward.reverse_complement(
                                    )
                                    '''print str(strandToOutput)
									print "looking for start ="+str(startCodons)
									print "coding_dna_Forward"
									print coding_dna_Forward
									print coding_dna_Forward.startswith(startCodons)
									print "coding_dna_Backward"
									print coding_dna_Backward	
									print coding_dna_Backward.startswith(startCodons)'''

                                except:
                                    pass
                            else:
                                if coding_dna_Backward.startswith(startCodons):
                                    main_end_pos = SeqFeature.ExactPosition(
                                        endBase + (3 * n))
                                    thisFeatureAlignment.endBase = main_end_pos
                                    main_feature_location = SeqFeature.FeatureLocation(
                                        main_start_pos - 1,
                                        main_end_pos,
                                        strand=strandToOutput)
                                elif coding_dna_Forward.startswith(
                                        startCodons):
                                    main_end_pos = SeqFeature.ExactPosition(
                                        endBase + (3 * n))
                                    thisFeatureAlignment.endBase = main_end_pos
                                    main_feature_location = SeqFeature.FeatureLocation(
                                        main_start_pos - 1,
                                        main_end_pos,
                                        strand=strandToOutput)
                                elif coding_dna_Forward.startswith(
                                        stopCodons
                                ):  # we look for a start inside the hit
                                    n = 0
                                    while not coding_dna_Forward.startswith(
                                            startCodons
                                    ) and n < nWalkStop and endBase - (
                                            3 * (n + 1)) >= startBase:
                                        try:
                                            n += 1
                                            coding_dna_Forward = Seq(
                                                str(finalResults.
                                                    seq[startBase - 1:endBase -
                                                        (3 * n)]),
                                                IUPAC.unambiguous_dna)
                                            coding_dna_Forward = coding_dna_Forward.reverse_complement(
                                            )
                                        except:
                                            pass
                                    else:
                                        if coding_dna_Forward.startswith(
                                                startCodons):
                                            main_end_pos = SeqFeature.ExactPosition(
                                                endBase - (3 * n))
                                            thisFeatureAlignment.endBase = main_end_pos
                                            main_feature_location = SeqFeature.FeatureLocation(
                                                main_start_pos - 1,
                                                main_end_pos,
                                                strand=strandToOutput)
                    except:
                        pass

                    coding_dna = Seq(
                        str(finalResults.seq[thisFeatureAlignment.startBase -
                                             1:thisFeatureAlignment.endBase]),
                        IUPAC.unambiguous_dna)
                    '''print "\n\nFINAL SEQUENCE IS:"
					if strandToOutput == 1:
						print coding_dna+"\n"
					else:
						print coding_dna.reverse_complement()+"\n"'''
                    if strandToOutput == 1:
                        coding_dna_Translation = coding_dna.translate(
                            table=translationTable)
                    else:
                        coding_dna_Translation = coding_dna.reverse_complement(
                        ).translate(table=translationTable)
                    cds_qualifiers['translation'] = coding_dna_Translation
                except:
                    cds_qualifiers['translation'] = 'ERROR'
                cds_feature = SeqFeature.SeqFeature(main_feature_location,
                                                    type='CDS',
                                                    qualifiers=cds_qualifiers)
                main_feature = SeqFeature.SeqFeature(
                    main_feature_location,
                    type=main_feature_type,
                    qualifiers=main_feature_qualifiers)
                finalResults.features.append(main_feature)
                finalResults.features.append(cds_feature)
            else:  #if it's a tRNA or rRNA
                gene_feature = SeqFeature.SeqFeature(
                    main_feature_location,
                    type='gene',
                    qualifiers=main_feature_qualifiers)
                finalResults.features.append(gene_feature)
                finalResults.features.append(main_feature)

    #returns the final SeqRecord object, with all features, so that the script that called genbankOutput can output this result whatever way
    #it wants
    return finalResults
예제 #14
0
def prodigal_parser(seq_file, sco_file, prefix, output_folder):
    """Turn Prodigal ``sco`` gene calls into ffn, faa and GenBank files.

    Reads the nucleotide sequences from ``seq_file`` (FASTA) and the gene
    coordinates from ``sco_file``, then writes three files into
    ``output_folder``: ``<prefix>.ffn`` (gene nucleotide sequences),
    ``<prefix>.faa`` (translations) and ``<prefix>.gbk`` (one GenBank record
    per input sequence, carrying one CDS feature per gene).

    Gene coordinates are treated as 1-based and inclusive, i.e. a gene
    ``start_end`` covers ``sequence[start - 1:end]``.

    Args:
        seq_file: Path to the FASTA file that the gene calls refer to.
        sco_file: Path to the Prodigal sco-format output.
        prefix: Base name of the output files and of the locus tags
            (``<prefix>_00001``, ``<prefix>_00002``, ...).
        output_folder: Existing directory that receives the output files.

    Raises:
        KeyError: If a sequence in ``seq_file`` has no entry in ``sco_file``.
    """
    pwd_bin_ffn_file = '%s/%s.ffn' % (output_folder, prefix)
    pwd_bin_faa_file = '%s/%s.faa' % (output_folder, prefix)
    pwd_bin_gbk_file = '%s/%s.gbk' % (output_folder, prefix)

    # Sequence ids in input order, plus id -> nucleotide string.
    id_to_sequence_dict = {}
    sequence_id_list = []
    for each_seq in SeqIO.parse(seq_file, 'fasta'):
        id_to_sequence_dict[each_seq.id] = str(each_seq.seq)
        sequence_id_list.append(each_seq.id)

    # Sequence id -> list of 'start_end_strand' strings, and
    # sequence id -> translation table (as the string found in the file).
    seq_to_cds_dict = {}
    seq_to_transl_table_dict = {}
    current_seq_id = ''
    with open(sco_file) as sco_handle:
        for raw_line in sco_handle:
            line = raw_line.strip()
            if not line:
                # A blank line carries no gene call; skipping it avoids a
                # crash in int('') further down.
                continue

            if line.startswith('# Sequence Data'):
                # The id is the first word of the quoted seqhdr value.
                current_seq_id = line.split('=')[-1][1:-1].split(' ')[0]
                seq_to_cds_dict[current_seq_id] = []
                seq_to_transl_table_dict[current_seq_id] = ''
            elif line.startswith('# Model Data'):
                # The second to last ';' separated field holds the table.
                seq_to_transl_table_dict[current_seq_id] = line.split(
                    ';')[-2].split('=')[-1]
            else:
                # '>index_start_end_strand' -> 'start_end_strand'
                seq_to_cds_dict.setdefault(current_seq_id, []).append(
                    '_'.join(line.split('_')[1:]))

    strand_by_symbol = {'+': 1, '-': -1}

    # Context managers guarantee the outputs are flushed and closed even
    # when a record fails half way through.
    with open(pwd_bin_gbk_file, 'w') as bin_gbk_file_handle, \
            open(pwd_bin_ffn_file, 'w') as bin_ffn_file_handle, \
            open(pwd_bin_faa_file, 'w') as bin_faa_file_handle:

        gene_index = 1
        for seq_id in sequence_id_list:

            # create SeqRecord
            current_SeqRecord = SeqRecord(Seq(id_to_sequence_dict[seq_id]),
                                          id=seq_id)
            current_SeqRecord.seq.alphabet = generic_dna
            transl_table = seq_to_transl_table_dict[seq_id]

            # add one CDS SeqFeature per gene call
            for cds in seq_to_cds_dict[seq_id]:

                locus_tag_id = '%s_%s' % (prefix,
                                          "{:0>5}".format(gene_index))

                cds_split = cds.split('_')
                cds_start = int(cds_split[0])
                cds_end = int(cds_split[1])
                cds_strand = cds_split[2]

                # FeatureLocation is 0-based and end-exclusive, so the
                # 1-based inclusive start has to be shifted by one. This
                # keeps the GenBank feature identical to the slice that is
                # exported to the ffn file below.
                current_feature_location = FeatureLocation(
                    SF.ExactPosition(cds_start - 1),
                    SF.ExactPosition(cds_end),
                    strand=strand_by_symbol.get(cds_strand))

                # nucleotide sequence of the gene, in coding orientation
                forward_nc = id_to_sequence_dict[seq_id][cds_start -
                                                         1:cds_end]
                if cds_strand == '+':
                    sequence_nc = forward_nc
                elif cds_strand == '-':
                    sequence_nc = str(
                        Seq(forward_nc, generic_dna).reverse_complement())
                else:
                    sequence_nc = ''

                sequence_aa = str(
                    Seq(sequence_nc).translate(table=transl_table))

                # Drop the stop symbol only when there is one; a
                # translation without a trailing '*' must keep its last
                # residue.
                if sequence_aa.endswith('*'):
                    sequence_aa = sequence_aa[:-1]

                export_dna_record(sequence_nc, locus_tag_id, '',
                                  bin_ffn_file_handle)
                export_aa_record(sequence_aa, locus_tag_id, '',
                                 bin_faa_file_handle)

                current_qualifiers_dict = {
                    'locus_tag': locus_tag_id,
                    'transl_table': transl_table,
                    'translation': sequence_aa,
                }
                current_feature = SeqFeature(
                    current_feature_location,
                    type='CDS',
                    qualifiers=current_qualifiers_dict)

                current_SeqRecord.features.append(current_feature)
                gene_index += 1

            # export to gbk file
            SeqIO.write(current_SeqRecord, bin_gbk_file_handle, 'genbank')
예제 #15
0
def main():
    """Concatenate every sequence of a multi fasta file into one record.

    Command line entry point. The sequences of ``--fasta`` are joined in
    file order, separated by ``--spacer`` ``N`` characters, and written to
    ``--outfile`` as fasta. A GenBank file named ``<outfile>.gb`` is written
    as well, with one ``misc_feature`` per input sequence marking where that
    sequence sits inside the concatenated record.
    """
    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 concat_seq_multi_fasta.py"')

    parser = argparse.ArgumentParser(prog='concat_seq_multi_fasta.py',
                                     description='Concatenate all sequences found in a fasta file into a single'
                                                 ' sequence',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-f', '--fasta', nargs=1, type=argparse.FileType('r'), required=True,
                                 metavar='/path/to/multi/fasta/file.fasta',
                                 help='Path to the input multi fasta file')

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-o', '--outfile', type=str, metavar='/path/to/output/concatenated/file.fasta',
                                         help='Path to the output concatenated fasta file',
                                         required=False, default='concatenated.fasta')
    parser_optional_general.add_argument('-s', '--spacer', type=int, metavar='10',
                                         help='Number of "N"s to be added between sequences',
                                         required=False, default=100)

    args = parser.parse_args()

    print('\n'
          '===>  RUNNING  concat_seq_multi_fasta.py  <===')

    # argparse.FileType already opened the input, but only its path is used
    # below, so release the handle instead of leaking it.
    fasta_handle = args.fasta[0]
    args.fasta = os.path.abspath(fasta_handle.name)
    fasta_handle.close()

    args.outfile = os.path.abspath(args.outfile)
    os.makedirs(os.path.dirname(args.outfile), exist_ok=True)

    # A non-positive spacer yields an empty separator.
    spacer = 'N' * args.spacer

    # (id, start, end) of every input sequence inside the concatenation.
    # A list rather than a dict keyed by id, so that sequences sharing an id
    # each keep their own feature.
    features = []
    chunks = []
    offset = 0
    for seq in SeqIO.parse(args.fasta, 'fasta'):
        features.append((seq.id, offset, offset + len(seq)))
        chunks.append(str(seq.seq))
        offset += len(seq) + len(spacer)

    # join() places the spacer between sequences only, so there is no
    # trailing spacer to trim afterwards.
    concatenated = SeqRecord(Seq(spacer.join(chunks), generic_dna),
                             id=os.path.splitext(os.path.basename(args.outfile))[0],
                             description='')

    with open(args.outfile, 'wt', newline='\n') as writer:
        SeqIO.write(concatenated, writer, "fasta")

    for id_seq, start, end in features:
        my_feature_location = SeqFeature.FeatureLocation(SeqFeature.ExactPosition(start),
                                                         SeqFeature.ExactPosition(end))
        qualifiers = {'label': id_seq, 'note': 'Geneious type: Concatenated sequence'}

        my_feature = SeqFeature.SeqFeature(location=my_feature_location, type="misc_feature",
                                           qualifiers=qualifiers, strand=0)

        concatenated.features.append(my_feature)

    with open('{base}.{new_type}'.format(base=args.outfile, new_type='gb'), 'wt', newline='\n') as writer:
        SeqIO.write(concatenated, writer, 'genbank')
from Bio import SeqFeature

# Exact positions: both boundaries of the location are known precisely.
start_pos, end_pos = (SeqFeature.ExactPosition(15),
                      SeqFeature.ExactPosition(30))
location = SeqFeature.FeatureLocation(start_pos, end_pos)
print(start_pos, end_pos, location)

# Fuzzy positions: the boundaries are only known to lie after / before a
# given coordinate.
start_pos2, end_pos2 = (SeqFeature.AfterPosition(1),
                        SeqFeature.BeforePosition(8))
# Alternative fuzzy end:
# end_pos2 = SeqFeature.BetweenPosition(9, left=8, right=9)
my_location = SeqFeature.FeatureLocation(start_pos2, end_pos2)
print(start_pos2, end_pos2, my_location)
예제 #17
0
def construct_scaffold_genbank(protein_recs,
                               protein_file,
                               scaffold_id,
                               outdir=outdir):
    """Write a GenBank file for the region of one scaffold spanned by ORFs.

    The scaffold ``scaffold_id`` is looked up in the contigs fasta that
    belongs to ``protein_file``, trimmed to the span from the start of the
    first record in ``protein_recs`` to the end of the last one, and given
    one CDS feature per protein record. The result is written to
    ``<outdir>/<genome_id>_<scaffold_id>.gbk``.

    Coordinates are read from the ``' # '`` separated record descriptions
    (``id # start # end # strand``) and treated as 1-based and inclusive.

    Optional annotations are controlled by module-level settings:
    ``pfam_dir`` (qualifier ``name``), ``kofam_dir`` (qualifier
    ``locus_tag``), ``goososfile`` (qualifier ``label``) and
    ``annotation_dict`` / ``unlabeled_annotation`` (qualifier ``id``).

    Args:
        protein_recs: Protein records of the scaffold, ordered by position.
        protein_file: Name of the protein fasta (``<genome>.faa``, or a name
            containing ``|`` whose first part is the genome id).
        scaffold_id: Id of the scaffold to export.
        outdir: Directory that receives the GenBank file.

    Raises:
        IndexError: If ``scaffold_id`` is not present in the contigs file or
            ``protein_recs`` is empty.
    """
    if '|' in protein_file:
        genome_id = protein_file.split('|')[0]
        contigs_file = genome_id + '.fna'
    else:
        genome_id = protein_file.split('.faa')[0]
        contigs_file = protein_file.replace('.faa', '.fna')

    #Get the scaffold rec
    contigs = [
        contig for contig in SeqIO.parse(
            os.path.join(contigs_dir, contigs_file), 'fasta')
        if contig.id == scaffold_id
    ]
    genbank_rec = contigs[0]

    #Pfamscan annotations; stay None when unavailable
    pfam_df = None
    if pfam_dir is not None:
        pfam_path = os.path.join(pfam_dir, protein_file + '.out')
        if os.path.isfile(pfam_path):
            try:
                pfam_df = parse_pfam_outfile(pfam_path)
            except Exception:
                # Pfam annotation is best effort: report and carry on.
                print(genome_id + ' pfam output could not be parsed')
                pfam_df = None

    #Scaffold-only KOFAM annotations; stay None when unavailable
    kofam_scafdf = None
    if kofam_dir is not None:
        kofam_path = os.path.join(kofam_dir,
                                  protein_file + '.out.parsed.good')
        if os.path.isfile(kofam_path):
            kofam_df = pd.read_csv(kofam_path, sep='\t', names=kofam_header)

            # orf id minus its trailing '_<n>' and any 'genome|' prefix
            kofam_df['scaffold_id'] = kofam_df.orf_id.apply(
                lambda x: '_'.join(x.split('_')[:-1]))
            kofam_df['scaffold_id'] = kofam_df.scaffold_id.apply(
                lambda x: x.split('|')[1] if '|' in x else x)
            kofam_scafdf = kofam_df[kofam_df.scaffold_id == scaffold_id]
        else:
            print(genome_id + ' doesnt have KOFAM')

    if goososfile is not None:
        goosos_df = pd.read_csv(goososfile, sep='\t')
    else:
        goosos_df = None

    #Start the nucleotide sequence of the genbank file at the start position of the first CDS
    total_start = int(protein_recs[0].description.split(' # ')[1])
    total_end = int(protein_recs[-1].description.split(' # ')[2])
    # 1-based inclusive [total_start, total_end] is the Python slice
    # [total_start - 1:total_end].
    genbank_rec.seq = genbank_rec.seq[total_start - 1:total_end]
    genbank_rec.seq.alphabet = generic_dna

    for protein_rec in protein_recs:
        #Prepare location info and construct SeqFeature object
        fields = protein_rec.description.split(' # ')
        # Shift into the trimmed record: 0-based start, exclusive end.
        start = int(fields[1]) - total_start
        end = int(fields[2]) - total_start + 1
        strand = int(fields[3])
        rec_location = FeatureLocation(SeqFeature.ExactPosition(start),
                                       SeqFeature.ExactPosition(end))
        rec_feature = SeqFeature.SeqFeature(rec_location,
                                            type="CDS",
                                            strand=strand)

        #ORF name without genome ID
        if '|' in protein_rec.id:
            orf_id = protein_rec.id.split('|')[1]
        else:
            orf_id = protein_rec.id
        rec_feature.qualifiers['protein_id'] = orf_id
        rec_feature.qualifiers['translation'] = protein_rec.seq

        #Get pfam info
        if pfam_df is not None:
            red_pfam_df = pfam_df[pfam_df.orf_id == orf_id]
            rec_feature.qualifiers['name'] = '+'.join(
                red_pfam_df.pfam_name.tolist())

        #Get kofam info (skipped when the genome has no KOFAM file)
        if kofam_scafdf is not None:
            red_kofam_df = kofam_scafdf[kofam_scafdf.orf_id == protein_rec.id]
            kofam_annotations = '+'.join(red_kofam_df.description.tolist())
            if kofam_annotations != '':
                rec_feature.qualifiers['locus_tag'] = kofam_annotations

        # 'is not None' is required here: comparing a DataFrame with None
        # yields a DataFrame, which has no truth value.
        if goosos_df is not None:
            red_goosos_df = goosos_df[goosos_df.orf_id == protein_rec.id]
            # Only the rows of this ORF, not the whole table.
            rec_feature.qualifiers['label'] = '+'.join(
                red_goosos_df.family_hmm.tolist())

        if annotation_dict is not None:
            if protein_rec.id in annotation_dict:
                rec_feature.qualifiers['id'] = annotation_dict[protein_rec.id]
            elif '|' in protein_rec.id and unlabeled_annotation:
                # Look up the ORF name without its genome prefix.
                if orf_id in annotation_dict:
                    rec_feature.qualifiers['id'] = annotation_dict[orf_id]
            elif '|' not in protein_rec.id and not unlabeled_annotation:
                labeled_id = genome_id + '|' + protein_rec.id
                if labeled_id in annotation_dict:
                    rec_feature.qualifiers['id'] = annotation_dict[labeled_id]
        genbank_rec.features.append(rec_feature)

    SeqIO.write(genbank_rec,
                os.path.join(outdir, genome_id + '_' + scaffold_id + '.gbk'),
                'genbank')
예제 #18
0
    def parse(self):
        """Parse the input."""
        assert self.entry.tag == NS + 'entry'

        def append_to_annotations(key, value):
            if key not in self.ParsedSeqRecord.annotations:
                self.ParsedSeqRecord.annotations[key] = []
            if value not in self.ParsedSeqRecord.annotations[key]:
                self.ParsedSeqRecord.annotations[key].append(value)

        def _parse_name(element):
            self.ParsedSeqRecord.name = element.text
            self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' +
                                                element.text)

        def _parse_accession(element):
            append_to_annotations(
                'accessions',
                element.text)  # to cope with SwissProt plain text parser
            self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' +
                                                element.text)

        def _parse_protein(element):
            """Parse protein names (PRIVATE)."""
            descr_set = False
            for protein_element in element.getchildren():
                if protein_element.tag in [
                        NS + 'recommendedName', NS + 'alternativeName'
                ]:  #recommendedName tag are parsed before
                    #use protein fields for name and description
                    for rec_name in protein_element.getchildren():
                        ann_key = '%s_%s' % (protein_element.tag.replace(
                            NS, ''), rec_name.tag.replace(NS, ''))
                        append_to_annotations(ann_key, rec_name.text)
                        if (rec_name.tag == NS + 'fullName') and not descr_set:
                            self.ParsedSeqRecord.description = rec_name.text
                            descr_set = True
                elif protein_element.tag == NS + 'component':
                    pass  #not parsed
                elif protein_element.tag == NS + 'domain':
                    pass  #not parsed

        def _parse_gene(element):
            for genename_element in element.getchildren():
                if 'type' in genename_element.attrib:
                    ann_key = 'gene_%s_%s' % (genename_element.tag.replace(
                        NS, ''), genename_element.attrib['type'])
                    if genename_element.attrib['type'] == 'primary':
                        self.ParsedSeqRecord.annotations[
                            ann_key] = genename_element.text
                    else:
                        append_to_annotations(ann_key, genename_element.text)

        def _parse_geneLocation(element):
            append_to_annotations('geneLocation', element.attrib['type'])

        def _parse_organism(element):
            organism_name = com_name = sci_name = ''
            for organism_element in element.getchildren():
                if organism_element.tag == NS + 'name':
                    if organism_element.text:
                        if organism_element.attrib['type'] == 'scientific':
                            sci_name = organism_element.text
                        elif organism_element.attrib['type'] == 'common':
                            com_name = organism_element.text
                        else:
                            #e.g. synonym
                            append_to_annotations("organism_name",
                                                  organism_element.text)
                elif organism_element.tag == NS + 'dbReference':
                    self.ParsedSeqRecord.dbxrefs.append(
                        organism_element.attrib['type'] + ':' +
                        organism_element.attrib['id'])
                elif organism_element.tag == NS + 'lineage':
                    for taxon_element in organism_element.getchildren():
                        if taxon_element.tag == NS + 'taxon':
                            append_to_annotations('taxonomy',
                                                  taxon_element.text)
            if sci_name and com_name:
                organism_name = '%s (%s)' % (sci_name, com_name)
            elif sci_name:
                organism_name = sci_name
            elif com_name:
                organism_name = com_name
            self.ParsedSeqRecord.annotations['organism'] = organism_name

        def _parse_organismHost(element):
            for organism_element in element.getchildren():
                if organism_element.tag == NS + 'name':
                    append_to_annotations("organism_host",
                                          organism_element.text)

        def _parse_keyword(element):
            append_to_annotations('keywords', element.text)

        def _parse_comment(element):
            """Parse comments (PRIVATE).

            Comment fields are very heterogeneus. each type has his own (frequently mutated) schema.
            To store all the contained data, more complex data structures are needed, such as
            annidated dictionaries. This is left to end user, by optionally setting:

            return_raw_comments=True

            the orginal XMLs is returned in the annotation fields.

            available comment types at december 2009:
                "allergen"
                "alternative products"
                "biotechnology"
                "biophysicochemical properties"
                "catalytic activity"
                "caution"
                "cofactor"
                "developmental stage"
                "disease"
                "domain"
                "disruption phenotype"
                "enzyme regulation"
                "function"
                "induction"
                "miscellaneous"
                "pathway"
                "pharmaceutical"
                "polymorphism"
                "PTM"
                "RNA editing"
                "similarity"
                "subcellular location"
                "sequence caution"
                "subunit"
                "tissue specificity"
                "toxic dose"
                "online information"
                "mass spectrometry"
                "interaction"
            """

            simple_comments = [
                "allergen",
                "biotechnology",
                "biophysicochemical properties",
                "catalytic activity",
                "caution",
                "cofactor",
                "developmental stage",
                "disease",
                "domain",
                "disruption phenotype",
                "enzyme regulation",
                "function",
                "induction",
                "miscellaneous",
                "pathway",
                "pharmaceutical",
                "polymorphism",
                "PTM",
                "RNA editing",  #positions not parsed
                "similarity",
                "subunit",
                "tissue specificity",
                "toxic dose",
            ]

            if element.attrib['type'] in simple_comments:
                ann_key = 'comment_%s' % element.attrib['type'].replace(
                    ' ', '')
                for text_element in element.getiterator(NS + 'text'):
                    if text_element.text:
                        append_to_annotations(ann_key, text_element.text)
            elif element.attrib['type'] == 'subcellular location':
                for subloc_element in element.getiterator(
                        NS + 'subcellularLocation'):
                    for el in subloc_element.getchildren():
                        if el.text:
                            ann_key = 'comment_%s_%s' % (
                                element.attrib['type'].replace(
                                    ' ', ''), el.tag.replace(NS, ''))
                            append_to_annotations(ann_key, el.text)
            elif element.attrib['type'] == 'interaction':
                for interact_element in element.getiterator(NS +
                                                            'interactant'):
                    ann_key = 'comment_%s_intactId' % element.attrib['type']
                    append_to_annotations(ann_key,
                                          interact_element.attrib['intactId'])
            elif element.attrib['type'] == 'alternative products':
                for alt_element in element.getiterator(NS + 'isoform'):
                    ann_key = 'comment_%s_isoform' % element.attrib[
                        'type'].replace(' ', '')
                    for id_element in alt_element.getiterator(NS + 'id'):
                        append_to_annotations(ann_key, id_element.text)
            elif element.attrib['type'] == 'mass spectrometry':
                ann_key = 'comment_%s' % element.attrib['type'].replace(
                    ' ', '')
                start = end = 0
                for loc_element in element.getiterator(NS + 'location'):
                    pos_els = loc_element.getiterator(NS + 'position')
                    pos_els = list(pos_els)
                    # this try should be avoided, maybe it is safer to skip postion parsing for mass spectrometry
                    try:
                        if pos_els:
                            end = int(pos_els[0].attrib['position'])
                            start = end - 1
                        else:
                            start = int(
                                loc_element.getiterator(NS + 'begin')
                                [0].attrib['position']) - 1
                            end = int(
                                loc_element.getiterator(NS + 'end')
                                [0].attrib['position'])
                    except:  #undefined positions or erroneusly mapped
                        pass
                mass = element.attrib['mass']
                method = element.attrib[
                    'mass']  #TODO - Check this, looks wrong!
                if start == end == 0:
                    append_to_annotations(ann_key,
                                          'undefined:%s|%s' % (mass, method))
                else:
                    append_to_annotations(
                        ann_key, '%s..%s:%s|%s' % (start, end, mass, method))
            elif element.attrib['type'] == 'sequence caution':
                pass  #not parsed: few information, complex structure
            elif element.attrib['type'] == 'online information':
                for link_element in element.getiterator(NS + 'link'):
                    ann_key = 'comment_%s' % element.attrib['type'].replace(
                        ' ', '')
                    for id_element in link_element.getiterator(NS + 'link'):
                        append_to_annotations(
                            ann_key, '%s@%s' % (element.attrib['name'],
                                                link_element.attrib['uri']))

            #return raw XML comments if needed
            if self.return_raw_comments:
                ann_key = 'comment_%s_xml' % element.attrib['type'].replace(
                    ' ', '')
                append_to_annotations(ann_key, ElementTree.tostring(element))

        def _parse_dbReference(element):
            """Record a dbReference element as a 'type:id' cross-reference.

            The element's 'type' and 'id' attributes are joined with ':' and
            appended to self.ParsedSeqRecord.dbxrefs; a KeyError is raised if
            either attribute is missing. For references of type 'PDB' the
            nested property elements are additionally turned into SeqFeature
            objects, which are currently built but NOT stored on the record
            (see the TODO below).
            """
            db_type = element.attrib['type']
            db_id = element.attrib['id']
            self.ParsedSeqRecord.dbxrefs.append(db_type + ':' + db_id)
            #e.g.
            # <dbReference type="PDB" key="11" id="2GEZ">
            #   <property value="X-ray" type="method"/>
            #   <property value="2.60 A" type="resolution"/>
            #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
            # </dbReference>

            # Property children of non-PDB references are not parsed: this data
            # cannot be fitted in a seqrecord object with a simple list. However
            # at least ensembl and EMBL parsing can be improved to add entries
            # in dbxrefs.
            if db_type != 'PDB':
                return

            method = ""
            resolution = ""
            # Iterate the element directly; Element.getchildren() is deprecated.
            for ref_element in element:
                if ref_element.tag != NS + 'property':
                    continue
                dat_type = ref_element.attrib['type']
                if dat_type == 'method':
                    method = ref_element.attrib['value']
                elif dat_type == 'resolution':
                    resolution = ref_element.attrib['value']
                elif dat_type == 'chains':
                    # Value looks like "A/C/E/G=1-192, B/D/F/H=193-325".
                    # method/resolution hold whatever was seen before this
                    # property, so document order matters here.
                    for elem in ref_element.attrib['value'].split(','):
                        pair = elem.strip().split('=')
                        if pair[1] == '-':
                            continue  # no residue range given for these chains
                        #TODO - How best to store these, do SeqFeatures make sense?
                        feature = SeqFeature.SeqFeature()
                        feature.type = db_type
                        feature.qualifiers['name'] = db_id
                        feature.qualifiers['method'] = method
                        feature.qualifiers['resolution'] = resolution
                        feature.qualifiers['chains'] = pair[0].split('/')
                        # Start is shifted down by one and end is used as given
                        # (presumably 1-based inclusive -> 0-based half-open).
                        span = pair[1].split('-')
                        start = int(span[0]) - 1
                        end = int(span[1])
                        feature.location = SeqFeature.FeatureLocation(
                            start, end)
                        #self.ParsedSeqRecord.features.append(feature)

        def _parse_reference(element):
            """Build a SeqFeature.Reference from a reference element.

            Reads the citation child (type, journal name, date, volume, pages,
            title, authors and database cross-references) plus any scope and
            source/tissue children. The finished Reference is stored through
            append_to_annotations under the 'references' key, and every
            citation dbReference is also appended to
            self.ParsedSeqRecord.dbxrefs as 'type:id'.
            """
            reference = SeqFeature.Reference()
            authors = []
            scopes = []
            tissues = []
            journal_name = ''
            pub_type = ''
            pub_date = ''
            # Bound up front so the journal formatting below never depends on
            # a citation element having been seen.
            j_volume = ''
            j_first = ''
            j_last = ''
            # Iterate elements directly; Element.getchildren() is deprecated.
            for ref_element in element:
                if ref_element.tag == NS + 'citation':
                    pub_type = ref_element.attrib['type']
                    if pub_type == 'submission':
                        pub_type += ' to the ' + ref_element.attrib['db']
                    if 'name' in ref_element.attrib:
                        journal_name = ref_element.attrib['name']
                    pub_date = ref_element.attrib.get('date', '')
                    j_volume = ref_element.attrib.get('volume', '')
                    j_first = ref_element.attrib.get('first', '')
                    j_last = ref_element.attrib.get('last', '')
                    for cit_element in ref_element:
                        if cit_element.tag == NS + 'title':
                            reference.title = cit_element.text
                        elif cit_element.tag == NS + 'authorList':
                            for person_element in cit_element:
                                authors.append(person_element.attrib['name'])
                        elif cit_element.tag == NS + 'dbReference':
                            xref_type = cit_element.attrib['type']
                            xref_id = cit_element.attrib['id']
                            self.ParsedSeqRecord.dbxrefs.append(
                                xref_type + ':' + xref_id)
                            # The database name lives on the dbReference
                            # itself, not on the enclosing citation (whose
                            # 'type' is the publication type).
                            if xref_type == 'PubMed':
                                reference.pubmed_id = xref_id
                            elif xref_type == 'MEDLINE':
                                reference.medline_id = xref_id
                elif ref_element.tag == NS + 'scope':
                    scopes.append(ref_element.text)
                elif ref_element.tag == NS + 'source':
                    for source_element in ref_element:
                        if source_element.tag == NS + 'tissue':
                            tissues.append(source_element.text)

            scopes_str = 'Scope: ' + ', '.join(scopes) if scopes else ''
            tissues_str = 'Tissue: ' + ', '.join(tissues) if tissues else ''

            #locations cannot be parsed since they are actually written in
            #free text inside scopes so all the references are put in the
            #annotation.
            reference.location = []
            reference.authors = ', '.join(authors)
            if journal_name:
                # Use the full journal template only when every piece of it is
                # present; otherwise fall back to the bare journal name.
                if pub_date and j_volume and j_first and j_last:
                    reference.journal = REFERENCE_JOURNAL % dict(
                        name=journal_name,
                        volume=j_volume,
                        first=j_first,
                        last=j_last,
                        pub_date=pub_date)
                else:
                    reference.journal = journal_name
            reference.comment = ' | '.join(
                (pub_type, pub_date, scopes_str, tissues_str))
            append_to_annotations('references', reference)

        def _parse_position(element, offset=0):
            """Convert a position element into a SeqFeature position object.

            element -- XML element; its optional 'position' attribute is
                       parsed as an integer and its optional 'status'
                       attribute selects the position class.
            offset  -- integer added to the parsed position (default 0).

            Returns an UnknownPosition, ExactPosition, AfterPosition,
            BeforePosition or UncertainPosition from SeqFeature. Raises
            NotImplementedError for any other status value.
            """
            try:
                position = int(element.attrib['position']) + offset
            except KeyError:
                # No 'position' attribute on this element.
                position = None
            status = element.attrib.get('status', '')
            if status == 'unknown':
                # An unknown position is expected to carry no coordinate.
                assert position is None
                return SeqFeature.UnknownPosition()
            elif not status:
                return SeqFeature.ExactPosition(position)
            elif status == 'greater than':
                return SeqFeature.AfterPosition(position)
            elif status == 'less than':
                return SeqFeature.BeforePosition(position)
            elif status == 'uncertain':
                return SeqFeature.UncertainPosition(position)
            else:
                raise NotImplementedError("Position status %r" % status)
예제 #19
0
def FindSpacersInterval(chromPos, chromStartRG, chromEndRG, seqStr,
                        cutoff_spacing, referenceGenomeForDAS, spacerLength,
                        distanceToCutSiteFromPAM_bp):
    """Find spacers adjacent to PAM matches on both strands of an interval.

    The interval sequence is fetched with coords2fa(chromPos, chromStartRG,
    chromEndRG, referenceGenomeForDAS), upper-cased and searched for the PAM
    pattern seqStr with SeqUtils.nt_search. Unless the PAM equals its own
    reverse complement, the reverse complement is searched as well to find
    minus-strand spacers. A spacer is kept only when its start lies more
    than cutoff_spacing after the previously kept spacer on the same strand.

    NOTE: PAMside is not a parameter; it is read from the enclosing scope.
    The value 3 selects the "Cas9-like" branches and anything else the
    "Cpf1-like" branches.

    Returns a tuple (listSpacer, spacerNumTotal, meanDist):
      listSpacer     -- one list per kept spacer: [spacerNum, strand,
                        startLocInRefSeq, endLocInRefSeq, chromPos,
                        startLocInRefGenome, endLocInRefGenome,
                        cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                        exactPAM, GCcontent]
      spacerNumTotal -- number of spacers kept over both strands
      meanDist       -- mean of distFromPrevSpacer over all kept spacers
                        (the first spacer of each strand contributes 0);
                        nan when no spacer was kept
    """
    from Bio import SeqFeature

    if PAMside == 3:
        distanceToCutSiteFrom5pEnd = spacerLength - distanceToCutSiteFromPAM_bp
        # For SpCas9 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 3bp; distanceToCutSiteFrom5pEnd = 17bp
    else:
        distanceToCutSiteFrom5pEnd = distanceToCutSiteFromPAM_bp - 1
        # For AsCpf1 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 19bp; distanceToCutSiteFrom5pEnd = 18bp

    s = coords2fa(chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS)

    s = s.upper()
    PAM = Seq(seqStr, IUPAC.ambiguous_dna)
    PAM_length = len(seqStr)
    PAM_revcomp = str(PAM.reverse_complement())
    # A PAM equal to its own reverse complement would hit the same sites on
    # both strands, so the minus-strand search is skipped in that case.
    DoRevComp = seqStr != PAM_revcomp
    listSpacer = []
    listDistBetweenSpacers = []

    spacerNum = 0
    prevStartLocInRefSeq = -9999
    if PAMside == 3:
        gbStringForSearch = s[spacerLength:]
        # Cas9
    else:
        gbStringForSearch = s[:-spacerLength]
        # Cpf1, get all but last ~20 bases of sequence

    spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM))
    if len(spacerInds) > 1:  # matches found
        del spacerInds[0]  # first result from nt_search is regexp expansion
        print("Plus strand sgRNAs found: {num}".format(num=len(spacerInds)))

        for item in spacerInds:
            startPos = SeqFeature.ExactPosition(
                item)  # start and end pos of PAM
            endPos = SeqFeature.ExactPosition(item + PAM_length)

            if PAMside == 3:  # Cas9-like
                startLocInRefSeq = startPos + 1
                endLocInRefSeq = startLocInRefSeq + spacerLength - 1
            else:  # Cpf1-like
                startLocInRefSeq = endPos  #Starts immediately after PAM
                endLocInRefSeq = startLocInRefSeq + spacerLength

            startLocInRefGenome = chromStartRG + startLocInRefSeq
            endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
            cutSiteInRefGenome = startLocInRefGenome + distanceToCutSiteFrom5pEnd

            # Only add the spacer if it is a certain distance from the previous spacer
            if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                spacerNum += 1
                strand = "+"
                if spacerNum > 1:
                    distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                else:
                    distFromPrevSpacer = 0
                if PAMside == 3:
                    spacerAsStr = str(s[startLocInRefSeq - 1:endLocInRefSeq])
                    exactPAM = s[endLocInRefSeq:endLocInRefSeq + PAM_length]
                else:
                    spacerAsStr = str(s[startLocInRefSeq:endLocInRefSeq])
                    exactPAM = s[startLocInRefSeq -
                                 PAM_length:startLocInRefSeq]
                    # Python slices: second index is first char you *DON'T* want

                GCcontent = SeqUtils.GC(spacerAsStr)
                listItem = [
                    spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                    chromPos, startLocInRefGenome, endLocInRefGenome,
                    cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                    exactPAM, GCcontent
                ]
                listSpacer.append(listItem)
                listDistBetweenSpacers.append(float(distFromPrevSpacer))
                prevStartLocInRefSeq = startLocInRefSeq

    print(
        "Plus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}"
        .format(limit=cutoff_spacing, num=spacerNum))
    spacerNumTotal = spacerNum

    # Search rev complement of PAM
    # Spacer numbering and the spacing filter restart for the minus strand.
    prevStartLocInRefSeq = -9999
    spacerNum = 0
    if DoRevComp:
        if PAMside == 3:
            gbStringForSearch = s[:-spacerLength]
            # get all but last ~20 bases of sequence
        else:
            gbStringForSearch = s[spacerLength:]

        spacerInds = SeqUtils.nt_search(gbStringForSearch, PAM_revcomp)
        if len(spacerInds) > 1:  # matches found
            del spacerInds[
                0]  # first result from nt_search is regexp expansion
            print("Minus strand sgRNAs found: {num}".format(
                num=len(spacerInds)))

            for item in spacerInds:
                startPos = SeqFeature.ExactPosition(item)
                endPos = SeqFeature.ExactPosition(item + PAM_length)

                # Start and end locations are flipped here due to reverse strand
                if PAMside == 3:
                    endLocInRefSeq = endPos + 1  #flipped for reverse strand
                    startLocInRefSeq = endLocInRefSeq + spacerLength - 1  #flipped for reverse strand
                else:
                    # startLocInRefSeq is 5' end of spacer on PAM-containing strand
                    # endLocInRefSeq is 3' end of spacer on PAM-containing strand
                    # Hence endLocInRefSeq <  startLocInRefSeq since this is reverse strand
                    startLocInRefSeq = startPos + spacerLength  # b/c spacer length is the offset between gbStringForSearch to RefSeq
                    endLocInRefSeq = startLocInRefSeq - spacerLength + 1

                startLocInRefGenome = chromStartRG + startLocInRefSeq - 1
                endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
                cutSiteInRefGenome = startLocInRefGenome - distanceToCutSiteFrom5pEnd

                # Only add the spacer if it is a certain distance from the previous spacer
                if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                    spacerNum += 1
                    strand = "-"
                    if spacerNum > 1:
                        distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                    else:
                        distFromPrevSpacer = 0
                    if PAMside == 3:  # Cas9-like
                        spacerRC = Seq(
                            str(s[endLocInRefSeq - 1:startLocInRefSeq]),
                            IUPAC.ambiguous_dna)
                        spacerAsStr = str(spacerRC.reverse_complement())
                        exactPAM = str(
                            Seq(
                                str(s[endLocInRefSeq -
                                      (PAM_length + 1):endLocInRefSeq - 1]),
                                IUPAC.ambiguous_dna).reverse_complement())
                    else:  # Cpf1-like
                        spacerRC = Seq(
                            str(s[endLocInRefSeq - 1:startLocInRefSeq]),
                            IUPAC.ambiguous_dna)
                        spacerAsStr = str(spacerRC.reverse_complement())
                        exactPAM = str(
                            Seq(
                                str(s[startLocInRefSeq:startLocInRefSeq +
                                      PAM_length]),
                                IUPAC.ambiguous_dna).reverse_complement())

                    GCcontent = SeqUtils.GC(spacerAsStr)
                    listItem = [
                        spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                        chromPos, startLocInRefGenome, endLocInRefGenome,
                        cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                        exactPAM, GCcontent
                    ]

                    listSpacer.append(listItem)
                    listDistBetweenSpacers.append(float(distFromPrevSpacer))
                    prevStartLocInRefSeq = startLocInRefSeq

        print(
            "Minus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}"
            .format(limit=cutoff_spacing, num=spacerNum))
        spacerNumTotal += spacerNum

    if listDistBetweenSpacers:
        meanDist = np.mean(np.asarray(listDistBetweenSpacers))
    else:
        # np.mean of an empty array also gives nan, but emits a
        # RuntimeWarning; return the same value without the warning.
        meanDist = np.float64('nan')
    return (listSpacer, spacerNumTotal, meanDist)