コード例 #1
0
ファイル: sam.py プロジェクト: radio1988/pybioinfo
def contain_A(read, n=2):
    '''Tell whether the read, put back in its fastq orientation, ends with A's.

    The sequence taken from the bam line is reverse-complemented when bit 16
    of the flag is set, then checked for a trailing run of ``n`` A bases.

    :param read: alignment record exposing ``query_sequence`` and ``flag``
    :param n: number of consecutive trailing A's required
    :return: true/false
    '''
    sequence = Seq(read.query_sequence)
    # Bit 16 marks a reverse alignment; forward alignments are used as stored.
    on_reverse_strand = read.flag & 16
    oriented = sequence.reverse_complement() if on_reverse_strand else sequence
    poly_a_tail = "A" * n
    return True if oriented.endswith(poly_a_tail) else False
コード例 #2
0
ファイル: genbankOutput.py プロジェクト: ic4f/MitoFinder
def _isRnaFeatureName(featureName):
    '''Return True when the feature name is treated as a tRNA/rRNA rather than a gene.'''
    loweredName = featureName.lower()
    return any(tag in loweredName
               for tag in ('trn', 'rrn', 'ribosomal', 'rnr'))


def _uniqueCodons(codons):
    '''Return the codons as a tuple without duplicates (tuple so startswith/endswith accept it).'''
    uniqueCodons = []
    for codon in codons:
        if codon not in uniqueCodons:
            uniqueCodons.append(codon)
    return tuple(uniqueCodons)


def _extractDna(record, startBase, endBase, reverse=False):
    '''Return record.seq[startBase - 1:endBase] as a new unambiguous-DNA Seq, optionally reverse-complemented.'''
    fragment = Seq(str(record.seq[startBase - 1:endBase]),
                   IUPAC.unambiguous_dna)
    if reverse:
        fragment = fragment.reverse_complement()
    return fragment


def genbankOutput(resultGbFile,
                  resultFile,
                  listOfFeaturesToOutput,
                  buildCloroplast=False,
                  dLoopSize=800,
                  nWalk=20):
    '''
    Creates a genbank file based on a fasta file given (resultFile) and a list of features
    that the genbank file should present (listOfFeaturesToOutput).

    The fasta record is first written, un-annotated, to resultGbFile and read back; the
    features are then attached to the in-memory record, which is returned (the annotated
    record is NOT written back to resultGbFile by this function).

    Each item of listOfFeaturesToOutput must expose ``seq2`` (feature name), ``startBase``,
    ``endBase``, ``frame`` (negative means minus strand) and, for genes, ``translationTable``.
    For genes, ``startBase``/``endBase`` of the item are overwritten in place whenever the
    start/stop codon search moves a boundary.

    :param resultGbFile: path of the genbank file to create
    :param resultFile: path of the fasta file holding exactly one record
    :param listOfFeaturesToOutput: feature alignments to turn into genbank features
    :param buildCloroplast: currently unused (the D-loop detection that read it is disabled)
    :param dLoopSize: currently unused (the D-loop detection that read it is disabled)
    :param nWalk: maximum number of codons to walk when searching for a start/stop codon
    :return: the SeqRecord with all features appended
    '''
    # Create the genbank file, not annotated, to be opened afterwards and have the
    # features inserted. The output is opened first, as before, so side effects keep
    # their order; the fasta handle is now closed instead of being leaked.
    with open(resultGbFile, "w") as outputResult:
        with open(resultFile, 'rU') as fastaHandle:
            finalResults = SeqIO.read(fastaHandle, "fasta", generic_dna)
        finalResults.seq = finalResults.seq.upper()
        # Both values are at most 10 + len('_draft') = 16 characters, the limit the
        # original author noted for the genbank writer, so no further truncation is needed.
        finalResults.name = finalResults.name[0:10] + '_draft'
        # NOTE(review): the id is derived from the already-suffixed name, so a name shorter
        # than 10 characters yields '_draft' twice (e.g. 'abc' -> 'abc_draft_draft').
        # Kept unchanged here; confirm the intent before altering emitted ids.
        finalResults.id = finalResults.name[0:10] + '_draft'
        SeqIO.write(finalResults, outputResult, "genbank")

    # Total number of pieces listed for every non-RNA gene (name up to the first "_").
    dico_intron = {}
    for thisFeatureAlignment in listOfFeaturesToOutput:
        if not _isRnaFeatureName(thisFeatureAlignment.seq2):
            geneName = thisFeatureAlignment.seq2.split("_")[0]
            dico_intron[geneName] = dico_intron.get(geneName, 0) + 1

    # Re-open the output file, this time to get the record the features are inserted in.
    with open(resultGbFile, "rU") as outputResult:
        finalResults = SeqIO.read(outputResult, "genbank", generic_dna)

    # Number of pieces of each gene name seen so far while iterating.
    dico_gene = {}
    for thisFeatureAlignment in listOfFeaturesToOutput:
        featureName = thisFeatureAlignment.seq2

        # 1. Define the feature type and the qualifiers dict where the name is stored
        main_feature_qualifiers = {}
        if _isRnaFeatureName(featureName):
            main_feature_qualifiers['product'] = featureName
            if 'trn' in featureName.lower():
                main_feature_type = "tRNA"
            else:
                main_feature_type = "rRNA"
        else:
            main_feature_qualifiers['gene'] = featureName
            main_feature_type = "gene"

        gene = featureName.split("_")[0]
        dico_gene[gene] = dico_gene.get(gene, 0) + 1

        main_start_pos = SeqFeature.ExactPosition(
            thisFeatureAlignment.startBase)
        main_end_pos = SeqFeature.ExactPosition(thisFeatureAlignment.endBase)

        if main_feature_type == "gene":
            # Nudge the end by one base so the feature length is a multiple of 3.
            codonDiff = ((main_end_pos - main_start_pos + 1) % 3)
            if codonDiff == 2:
                main_end_pos += 1
            elif codonDiff == 1:
                main_end_pos -= 1

        # 2. Use the locations to define a FeatureLocation
        if thisFeatureAlignment.frame < 0:
            strandToOutput = -1
        else:
            strandToOutput = 1
        main_feature_location = SeqFeature.FeatureLocation(
            main_start_pos - 1, main_end_pos, strand=strandToOutput)

        if main_feature_type != "gene":
            # 3./4. tRNA or rRNA: a 'gene' feature followed by the RNA feature itself,
            # both sharing the same location and qualifiers.
            gene_feature = SeqFeature.SeqFeature(
                main_feature_location,
                type='gene',
                qualifiers=main_feature_qualifiers)
            main_feature = SeqFeature.SeqFeature(
                main_feature_location,
                type=main_feature_type,
                qualifiers=main_feature_qualifiers)
            finalResults.features.append(gene_feature)
            finalResults.features.append(main_feature)
            continue

        # Protein-coding gene: try to snap the boundaries onto start/stop codons,
        # then translate the result for the CDS feature.
        cds_qualifiers = dict(main_feature_qualifiers)
        isReverse = strandToOutput == -1
        coding_dna = _extractDna(finalResults,
                                 thisFeatureAlignment.startBase,
                                 thisFeatureAlignment.endBase, isReverse)
        translationTable = thisFeatureAlignment.translationTable
        tableToUse = CodonTable.unambiguous_dna_by_id[translationTable]
        startCodons = _uniqueCodons(tableToUse.start_codons)
        stopCodons = _uniqueCodons(tableToUse.stop_codons)

        nWalkStart = int(nWalk)
        nWalkStop = int(nWalk)

        try:
            # --- Adjust startBase --------------------------------------------------
            # Plus strand: make sure the gene starts with a start codon.
            # Minus strand: startBase is the 3' end, so look for a stop codon there.
            # Best effort: any failure leaves the boundaries found so far untouched.
            try:
                coding_dna_Forward = coding_dna
                coding_dna_Backward = coding_dna
                startBase = int(thisFeatureAlignment.startBase)
                endBase = int(thisFeatureAlignment.endBase)
                n = 0
                if strandToOutput == 1:
                    # Only the first piece of a gene may be extended upstream, one
                    # codon at a time, until a start or stop codon shows up.
                    while (not coding_dna_Forward.startswith(startCodons)
                           and not coding_dna_Backward.startswith(startCodons)
                           and not coding_dna_Backward.startswith(stopCodons)
                           and dico_gene.get(gene) == 1
                           and n < nWalkStart
                           and startBase - (3 * (n + 1)) >= 0):
                        try:
                            n += 1
                            extended = _extractDna(finalResults,
                                                   startBase - (3 * n),
                                                   endBase)
                            coding_dna_Backward = extended
                            coding_dna_Forward = extended
                        except Exception:
                            pass

                    if (coding_dna_Forward.startswith(startCodons)
                            or coding_dna_Backward.startswith(startCodons)):
                        main_start_pos = SeqFeature.ExactPosition(
                            startBase - (3 * n))
                        thisFeatureAlignment.startBase = main_start_pos
                        main_feature_location = SeqFeature.FeatureLocation(
                            main_start_pos - 1,
                            main_end_pos,
                            strand=strandToOutput)
                    elif coding_dna_Backward.startswith(stopCodons):
                        # Hit a stop codon upstream: look for a start inside the hit
                        n = 0
                        while (not coding_dna_Forward.startswith(startCodons)
                               and n < nWalkStart
                               and startBase + (3 * (n + 1)) <= endBase):
                            try:
                                n += 1
                                coding_dna_Forward = _extractDna(
                                    finalResults, startBase + (3 * n),
                                    endBase)
                            except Exception:
                                pass
                        if coding_dna_Forward.startswith(startCodons):
                            main_start_pos = SeqFeature.ExactPosition(
                                startBase + (3 * n))
                            thisFeatureAlignment.startBase = main_start_pos
                            main_feature_location = SeqFeature.FeatureLocation(
                                main_start_pos - 1,
                                main_end_pos,
                                strand=strandToOutput)

                elif strandToOutput == -1:
                    # Walk outwards (Backward) and inwards (Forward) simultaneously
                    # until the reverse-complemented piece ends with a stop codon.
                    while (not coding_dna_Forward.endswith(stopCodons)
                           and not coding_dna_Backward.endswith(stopCodons)
                           and dico_gene.get(gene) == 1
                           and n < nWalkStart
                           and startBase - (3 * (n + 1)) >= 0
                           and startBase + (3 * (n + 1)) <= endBase):
                        try:
                            n += 1
                            coding_dna_Backward = _extractDna(
                                finalResults, startBase - (3 * n), endBase,
                                True)
                            coding_dna_Forward = _extractDna(
                                finalResults, startBase + (3 * n), endBase,
                                True)
                        except Exception:
                            pass

                    if coding_dna_Forward.endswith(stopCodons):
                        main_start_pos = SeqFeature.ExactPosition(
                            startBase + (3 * n))
                        thisFeatureAlignment.startBase = main_start_pos
                        main_feature_location = SeqFeature.FeatureLocation(
                            main_start_pos - 1,
                            main_end_pos,
                            strand=strandToOutput)
                    elif coding_dna_Backward.endswith(stopCodons):
                        main_start_pos = SeqFeature.ExactPosition(
                            startBase - (3 * n))
                        thisFeatureAlignment.startBase = main_start_pos
                        main_feature_location = SeqFeature.FeatureLocation(
                            main_start_pos - 1,
                            main_end_pos,
                            strand=strandToOutput)
            except Exception:
                pass

            # Updating coding_dna with (new) coordinates
            coding_dna = _extractDna(finalResults,
                                     thisFeatureAlignment.startBase,
                                     thisFeatureAlignment.endBase, isReverse)

            # --- Adjust endBase ----------------------------------------------------
            # Plus strand: make sure the gene ends with a stop codon.
            # Minus strand: endBase is the 5' end, so look for a start codon there.
            try:
                coding_dna_Forward = coding_dna
                coding_dna_Backward = coding_dna
                startBase = int(thisFeatureAlignment.startBase)
                endBase = int(thisFeatureAlignment.endBase)
                n = 0
                if strandToOutput == 1:
                    # Only the last piece of a gene (count so far == total count) may
                    # move; Backward shrinks and Forward grows the end codon by codon.
                    while (not coding_dna_Forward.endswith(stopCodons)
                           and not coding_dna_Backward.endswith(stopCodons)
                           and dico_gene.get(gene) == dico_intron.get(gene)
                           and n < nWalkStop
                           and endBase + (3 * (n + 1)) <= len(finalResults)):
                        try:
                            n += 1
                            coding_dna_Backward = _extractDna(
                                finalResults, startBase, endBase - (3 * n))
                            coding_dna_Forward = _extractDna(
                                finalResults, startBase, endBase + (3 * n))
                        except Exception:
                            pass

                    if coding_dna_Backward.endswith(stopCodons):
                        main_end_pos = SeqFeature.ExactPosition(
                            endBase - (3 * n))
                        thisFeatureAlignment.endBase = main_end_pos
                        main_feature_location = SeqFeature.FeatureLocation(
                            main_start_pos - 1,
                            main_end_pos,
                            strand=strandToOutput)
                    elif coding_dna_Forward.endswith(stopCodons):
                        main_end_pos = SeqFeature.ExactPosition(
                            endBase + (3 * n))
                        thisFeatureAlignment.endBase = main_end_pos
                        main_feature_location = SeqFeature.FeatureLocation(
                            main_start_pos - 1,
                            main_end_pos,
                            strand=strandToOutput)

                elif strandToOutput == -1:
                    # Extend the end codon by codon until the reverse-complemented
                    # piece starts with a start or stop codon.
                    while (not coding_dna_Forward.startswith(startCodons)
                           and not coding_dna_Backward.startswith(startCodons)
                           and not coding_dna_Forward.startswith(stopCodons)
                           and dico_gene.get(gene) == dico_intron.get(gene)
                           and n < nWalkStop
                           and endBase + (3 * (n + 1)) <= len(finalResults)):
                        try:
                            n += 1
                            extended = _extractDna(finalResults, startBase,
                                                   endBase + (3 * n), True)
                            coding_dna_Backward = extended
                            coding_dna_Forward = extended
                        except Exception:
                            pass

                    if (coding_dna_Backward.startswith(startCodons)
                            or coding_dna_Forward.startswith(startCodons)):
                        main_end_pos = SeqFeature.ExactPosition(
                            endBase + (3 * n))
                        thisFeatureAlignment.endBase = main_end_pos
                        main_feature_location = SeqFeature.FeatureLocation(
                            main_start_pos - 1,
                            main_end_pos,
                            strand=strandToOutput)
                    elif coding_dna_Forward.startswith(stopCodons):
                        # Hit a stop codon: look for a start inside the hit
                        n = 0
                        while (not coding_dna_Forward.startswith(startCodons)
                               and n < nWalkStop
                               and endBase - (3 * (n + 1)) >= startBase):
                            try:
                                n += 1
                                coding_dna_Forward = _extractDna(
                                    finalResults, startBase,
                                    endBase - (3 * n), True)
                            except Exception:
                                pass
                        if coding_dna_Forward.startswith(startCodons):
                            main_end_pos = SeqFeature.ExactPosition(
                                endBase - (3 * n))
                            thisFeatureAlignment.endBase = main_end_pos
                            main_feature_location = SeqFeature.FeatureLocation(
                                main_start_pos - 1,
                                main_end_pos,
                                strand=strandToOutput)
            except Exception:
                pass

            # Translate the final coordinates, read on the feature's own strand.
            coding_dna = _extractDna(finalResults,
                                     thisFeatureAlignment.startBase,
                                     thisFeatureAlignment.endBase, isReverse)
            cds_qualifiers['translation'] = coding_dna.translate(
                table=translationTable)
        except Exception:
            # Keep the feature but make the failed translation visible in the output.
            cds_qualifiers['translation'] = 'ERROR'

        # 3. Create the SeqFeatures with the (possibly adjusted) location
        cds_feature = SeqFeature.SeqFeature(main_feature_location,
                                            type='CDS',
                                            qualifiers=cds_qualifiers)
        main_feature = SeqFeature.SeqFeature(
            main_feature_location,
            type=main_feature_type,
            qualifiers=main_feature_qualifiers)
        # 4. Append the newly created SeqFeatures to the SeqRecord
        finalResults.features.append(main_feature)
        finalResults.features.append(cds_feature)

    # Returns the final SeqRecord object, with all features, so that the script that
    # called genbankOutput can output this result whatever way it wants.
    return finalResults