Esempi in Python per translate, esempi in Python per biocodeutils.translate

Esempio n. 1

0

Mostra file

File: check_gff_for_internal_stops.py Progetto: 452990729/biocode

def main():
    """Report mRNAs whose translated CDS contains internal (non-terminal) stop codons.

    Reads a GFF3 file (plus an optional genome FASTA), translates the CDS of
    every mRNA and counts those whose translation still contains '*' once any
    trailing '*' characters have been stripped.  Offending translations can be
    written to a FASTA file (-o) and the first N of them printed (-p).  The
    totals are printed to STDOUT at the end.
    """
    # The description previously talked about phase columns (copied from another
    # script); it now describes what this script actually checks.
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence to report non-terminal internal stops.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0, help='Optional.  Pass the number of sequences with internal stops you want printed (usually for debugging purposes)' )
    parser.add_argument('-o', '--output_fasta', type=str, required=False, help='Optional.  Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None

    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    # try/finally guarantees the FASTA handle is flushed and closed even if a
    # feature fails to translate part-way through the run.
    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    is_debug_target = debug_mRNA is not None and mRNA.id == debug_mRNA

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # Translate once and reuse the result for both the check and the output.
                    translated_seq = biocodeutils.translate(coding_seq)

                    # A terminal '*' is expected; only stops left after stripping it count.
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assemblies[assembly_id])
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand) )
                        fasta_out_fh.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))

                    if is_debug_target:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )

                    # Only the first N offenders are echoed to STDOUT.
                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id) )
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))

Esempio n. 2

0

Mostra file

File: correct_gff3_CDS_phase_column.py Progetto: wyim-pgl/biocode

def check_and_update_phase(CDS):
    """Set the CDS location's phase to the offset whose translation has the fewest internal stops.

    The residues are translated starting at offsets 0, 1 and 2.  Trailing '*'
    characters are stripped from each translation and the remaining '*' are
    counted.  The lowest count wins, with ties going to the smallest offset.
    When the winner differs from the phase currently stored on CDS.location(),
    an INFO line is printed and the stored phase is overwritten in place.
    """
    loc = CDS.location()
    # presumably populates CDS.residues, which is read below -- confirm in biothings
    CDS.get_residues()

    orig_phase_stop_count = None
    best_phase = None
    best_phase_stop_count = None

    for candidate in range(3):
        translated = biocodeutils.translate(CDS.residues[candidate:])
        internal_stops = translated.rstrip('*').count('*')

        # Remember how the currently recorded phase performs, for the report below.
        if candidate == loc.phase:
            orig_phase_stop_count = internal_stops

        # Strict '<' keeps the earliest offset when several are equally good.
        is_improvement = best_phase is None or internal_stops < best_phase_stop_count
        if is_improvement:
            best_phase, best_phase_stop_count = candidate, internal_stops

    if best_phase == loc.phase:
        return

    message = "INFO: CDS {0} at coordinate:{1}, phase:{2} had {3} stops.  Updating to phase:{4} which had {5}"
    print(message.format(CDS.id, loc.fmin, loc.phase, orig_phase_stop_count,
                         best_phase, best_phase_stop_count))
    loc.phase = best_phase

Esempio n. 3

0

Mostra file

File: correct_gff3_CDS_phase_column.py Progetto: IGS/biocode

def check_and_update_phase(CDS):
    """Re-evaluate the phase recorded for a CDS and replace it if another offset translates better.

    For each offset 0, 1 and 2 the residues from that offset onward are
    translated, trailing '*' characters are removed, and the remaining '*'
    are counted.  The offset with the fewest is selected, and the first one
    wins a tie.  If it is not the phase stored on CDS.location(), an INFO
    message is printed and the location's phase attribute is updated.
    """
    loc = CDS.location()
    # presumably fills in CDS.residues for the slicing below -- verify against biothings
    CDS.get_residues()

    candidate_phases = (0, 1, 2)
    stops_by_phase = []
    orig_phase_stop_count = None

    for phase in candidate_phases:
        protein_seq = biocodeutils.translate(CDS.residues[phase:])
        stops_by_phase.append(protein_seq.rstrip('*').count('*'))

        # Stays None when the stored phase is not one of 0, 1 or 2.
        if phase == loc.phase:
            orig_phase_stop_count = stops_by_phase[phase]

    # min() returns the first minimal element, so ties resolve to the lowest offset.
    best_phase = min(candidate_phases, key=lambda p: stops_by_phase[p])
    best_phase_stop_count = stops_by_phase[best_phase]

    if best_phase != loc.phase:
        report = "INFO: CDS {0} at coordinate:{1}, phase:{2} had {3} stops.  Updating to phase:{4} which had {5}".format(
            CDS.id, loc.fmin, loc.phase, orig_phase_stop_count, best_phase, best_phase_stop_count)
        print(report)
        loc.phase = best_phase

Esempio n. 4

0

Mostra file

def main():
    """Export the protein or CDS sequence of every mRNA in a GFF3 file as FASTA.

    Each record's identifier is the mRNA's locus_tag when present, otherwise
    the mRNA id.  The first polypeptide product name found is appended to the
    header line.  Output goes to --output_file, or to STDOUT when that option
    is omitted.  With --check_ends, non-canonical start and stop codons are
    reported on STDERR.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        required=False,
                        default='protein',
                        choices=['protein', 'cds'],
                        help='Type of features to export')
    parser.add_argument(
        '-f',
        '--fasta',
        type=str,
        required=False,
        help=
        'If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option'
    )
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    # try/finally ensures a file opened above is flushed and closed even if
    # parsing or translation raises part-way through.
    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():

                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = mRNA.id
                    export_header = None

                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## Add the gene product name if there is one
                    for polypeptide in mRNA.polypeptides():
                        if polypeptide.annotation is not None:
                            if polypeptide.annotation.product_name is not None:
                                export_header = polypeptide.annotation.product_name
                                break

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    coding_seq = mRNA.get_CDS_residues()

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical start codon ({0}) in mRNA {1}\n"
                                .format(start_codon, mRNA.id))

                        # check the final codon
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical stop codon ({0}) in mRNA {1}\n"
                                .format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(
                            biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(
                            biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # Only close a handle this function opened; STDOUT is left alone.
        if fout is not sys.stdout:
            fout.close()

Esempio n. 5

0

Mostra file

File: write_fasta_from_gff.py Progetto: Klortho/biocode

def main():
    """Export the protein or CDS sequence of every mRNA in a GFF3 file as FASTA.

    Each record's identifier is the mRNA's locus_tag when present, otherwise
    the mRNA id.  The first polypeptide product name found is appended to the
    header line.  Output goes to --output_file, or to STDOUT when that option
    is omitted.  With --check_ends, non-canonical start and stop codons are
    reported on STDERR.
    """
    parser = argparse.ArgumentParser( description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GFF3 file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output FASTA file to be created' )
    parser.add_argument('-t', '--type', type=str, required=False, default='protein', choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option' )
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    # try/finally ensures a file opened above is flushed and closed even if
    # parsing or translation raises part-way through.
    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons  = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():

                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = mRNA.id
                    export_header = None

                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## Add the gene product name if there is one
                    for polypeptide in mRNA.polypeptides():
                        if polypeptide.annotation is not None:
                            if polypeptide.annotation.product_name is not None:
                                export_header = polypeptide.annotation.product_name
                                break

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    coding_seq = mRNA.get_CDS_residues(for_translation=True)

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, mRNA.id))

                        # check the final codon
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # Only close a handle this function opened; STDOUT is left alone.
        if fout is not sys.stdout:
            fout.close()

Esempio n. 6

0

Mostra file

File: extend_genes_to_stops.py Progetto: wyim-pgl/biocode

def main():
    """Report mRNAs whose CDS lacks a terminal stop and look for the next in-frame stop.

    Every mRNA's CDS is translated.  Those whose translation ends in '*' are
    counted as having a terminal stop.  For the others the assembly sequence
    is walked one codon at a time from the 3' end of the CDS, until either a
    stop codon is seen or the position passes the mRNA boundary.  Progress and
    the resulting position are printed to STDOUT, followed by summary totals.

    NOTE(review): --output_gff is parsed but never read in this function, so no
    GFF3 output is written here.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence to report/correct phase columns.'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']

    # Reverse complements of the stop codons above.  The triplets examined below
    # are sliced straight from the assembly residues (assumed to be the forward
    # strand -- confirm), so for a reverse-strand gene a stop appears as one of
    # these rather than as TAG/TAA/TGA.
    reverse_strand_stop_codons = ['CTA', 'TTA', 'TCA']

    for assembly_id in assemblies:
        print("Assembly {0} has length {1}".format(
            assembly_id, assemblies[assembly_id].length))
        for gene in assemblies[assembly_id].genes():
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = biocodeutils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                else:
                    print("gene:{1}, mRNA: {0} is missing a stop".format(
                        mRNA.id, gene.id))
                    mRNA_loc = mRNA.location_on(assemblies[assembly_id])

                    CDSs = sorted(mRNA.CDSs())
                    codon_step_size = 3

                    if mRNA_loc.strand == 1:
                        # forward strand: walk right from the end of the last CDS
                        CDS_pos = CDSs[-1].location_on(
                            assemblies[assembly_id]).fmax
                        mRNA_limit = mRNA_loc.fmax
                        stops_on_assembly = stop_codons
                    else:
                        # reverse strand: walk left from the start of the first CDS
                        CDS_pos = CDSs[0].location_on(
                            assemblies[assembly_id]).fmin
                        mRNA_limit = mRNA_loc.fmin
                        codon_step_size = -3
                        stops_on_assembly = reverse_strand_stop_codons

                    print("\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(
                        mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos),
                          end='')

                    new_stop_found = False

                    # We have to step backwards to start if on the reverse strand
                    if codon_step_size < 0:
                        CDS_pos += codon_step_size

                    while True:
                        if (codon_step_size < 0 and CDS_pos < mRNA_limit) or (
                                codon_step_size > 0 and CDS_pos > mRNA_limit):
                            print(" Reached the mRNA limit")
                            break
                        else:
                            # The triplet is printed exactly as it appears on the assembly.
                            next_codon = assemblies[assembly_id].residues[
                                CDS_pos:CDS_pos + 3]
                            print(".{0}({1})".format(next_codon, CDS_pos),
                                  end='')

                            # Compare against the stop set matching the gene's strand.
                            if next_codon in stops_on_assembly:
                                new_stop_found = True
                                print(" Found a stop")
                                break

                        CDS_pos += codon_step_size

                    if new_stop_found:
                        print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    else:
                        print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with terminal stops: {0}".format(mRNAs_with_terminal_stops))

Esempio n. 7

0

Mostra file

def print_biogene(gene=None, fh=None, on=None):
    '''
    Write one gene, and the mRNA and CDS features beneath it, to fh as GenBank
    flat-file feature table entries.

    gene: the gene object to print.  An Exception is raised if it is None.
    fh:   handle whose write() method receives the output.  It is used
          unconditionally, so a handle must be supplied despite the None default.
    on:   the molecule the printed coordinates refer to.  When None it is taken
          from gene.location().on.

    Coordinates are written as fmin + 1 through fmax.  A warning is written to
    STDERR if the gene has no locus_tag.  An Exception is raised if any mRNA has
    no CDS children.
    '''
    if gene is None:
        raise Exception(
            "ERROR: The print_biogene() function requires a biogene to be passed via the 'gene' argument"
        )

    ## we can auto-detect the molecule if the user didn't pass one
    #   and if there's only one.
    if on is None:
        on = gene.location().on

    gene_loc = gene.location_on(on)
    gene_start = gene_loc.fmin + 1
    gene_stop = gene_loc.fmax

    # area to hack if you want to set default values, for debugging
    #gene.locus_tag = 'Tparva_0000002'

    # Any strand value other than 1 is written as complement(...)
    if gene_loc.strand == 1:
        fh.write("     gene            {0}..{1}\n".format(
            gene_start, gene_stop))
    else:
        fh.write("     gene            complement({0}..{1})\n".format(
            gene_start, gene_stop))

    if gene.locus_tag is None:
        sys.stderr.write("WARNING: No locus_tag found on gene {0}\n".format(
            gene.id))
    else:
        fh.write("                     /locus_tag=\"{0}\"\n".format(
            gene.locus_tag))

    for mRNA in sorted(gene.mRNAs()):
        mRNA_loc = mRNA.location_on(on)

        ###########################
        ## write the mRNA feature (made up of exon fragments)
        mRNA_loc_segments = list()
        for exon in sorted(mRNA.exons()):
            exon_loc = exon.location_on(on)
            mRNA_loc_segments.append([exon_loc.fmin + 1, exon_loc.fmax])

        # segments_to_string is defined elsewhere; presumably it renders the
        # [start, stop] pairs as a GenBank location string -- confirm.
        mRNA_loc_string = segments_to_string(mRNA_loc_segments)

        if mRNA_loc.strand == 1:
            fh.write("     mRNA            {0}\n".format(mRNA_loc_string))
        else:
            fh.write("     mRNA            complement({0})\n".format(
                mRNA_loc_string))

        # Handle the locus tag, but we've already warned if not present on the gene, so don't
        #  do it again here.
        if gene.locus_tag is not None:
            fh.write("                     /locus_tag=\"{0}\"\n".format(
                gene.locus_tag))

        if mRNA.annotation is not None:
            # debug:  You can try out some annotation defaults for printing here
            # NOTE(review): this assignment is unconditional, so it replaces any
            # existing product_name on the caller's object and the check below is
            # always true.  It looks like leftover debug code -- confirm.
            mRNA.annotation.product_name = "Hypothetical protein"

            if mRNA.annotation.product_name is not None:
                fh.write("                     /product=\"{0}\"\n".format(
                    mRNA.annotation.product_name))

        ###########################
        ## write the CDS feature (made up of CDS fragments)
        cds_loc_segments = list()

        if len(mRNA.CDSs()) < 1:
            raise Exception(
                "ERROR: Encountered an mRNA ({0}) without an CDS children".
                format(mRNA.id))

        for cds in sorted(mRNA.CDSs()):
            cds_loc = cds.location_on(on)
            cds_loc_segments.append([cds_loc.fmin + 1, cds_loc.fmax])

        cds_loc_string = segments_to_string(cds_loc_segments)

        # cds_loc still refers to the last CDS of the sorted loop above; its
        # strand decides the orientation written for the whole CDS feature.
        if cds_loc.strand == 1:
            fh.write("     CDS             {0}\n".format(cds_loc_string))
        else:
            fh.write("     CDS             complement({0})\n".format(
                cds_loc_string))

        # Handle the locus tag, but we've already warned if not present on the gene, so don't
        #  do it again here.
        if gene.locus_tag is not None:
            fh.write("                     /locus_tag=\"{0}\"\n".format(
                gene.locus_tag))

        ## if there is annotation on the polypeptide, include it here
        # Annotation is only written when the mRNA has exactly one polypeptide.
        polypeptides = mRNA.polypeptides()
        if len(polypeptides) == 1 and polypeptides[0].annotation is not None:
            annot = polypeptides[0].annotation
            if annot.product_name is not None:
                fh.write("                     /product=\"{0}\"\n".format(
                    annot.product_name))

            if len(annot.ec_numbers) > 0:
                for ec_num in annot.ec_numbers:
                    fh.write(
                        "                     /EC_number=\"{0}\"\n".format(
                            ec_num.number))

            if len(annot.go_annotations) > 0:
                for go_annot in annot.go_annotations:
                    fh.write(
                        "                     /db_xref=\"GO:{0}\"\n".format(
                            go_annot.go_id))

        cds_residues = mRNA.get_CDS_residues()
        polypeptide_residues = biocodeutils.translate(cds_residues)

        # No /translation qualifier is written for an empty translation.
        if len(polypeptide_residues) > 0:
            # This is the easiest case first, where no wrapping is needed.
            if len(polypeptide_residues) < MAX_FTABLE_CONTENT_WIDTH - 15:
                fh.write("                     /translation=\"{0}\"\n".format(
                    polypeptide_residues))
            else:
                # If we get here, we must wrap
                # The first line carries MAX_FTABLE_CONTENT_WIDTH - 14 residues
                # after the opening /translation=" text.
                fh.write("                     /translation=\"{0}\n".format(
                    polypeptide_residues[0:MAX_FTABLE_CONTENT_WIDTH - 14]))
                remaining = polypeptide_residues[MAX_FTABLE_CONTENT_WIDTH -
                                                 14:]
                # Tracks whether the closing double quote has been written yet.
                closing_parens_written = False

                while len(remaining) > 0:
                    if len(remaining) > MAX_FTABLE_CONTENT_WIDTH - 1:
                        # Full-width continuation line, no closing quote yet.
                        fh.write("                     {0}\n".format(
                            remaining[0:MAX_FTABLE_CONTENT_WIDTH]))
                        remaining = remaining[MAX_FTABLE_CONTENT_WIDTH:]
                    else:
                        # Final, shorter line: append the closing quote.
                        fh.write(
                            "                     {0}\"\n".format(remaining))
                        remaining = ""
                        closing_parens_written = True

                # Reached when the residues ended exactly at a line boundary, so
                # the loop never wrote a short final line.  The closing quote
                # then goes on a line of its own.
                if closing_parens_written == False:
                    # G675_02159
                    fh.write("                     \"\n")

Esempio n. 8

0

Mostra file

File: biocodegenbank.py Progetto: 452990729/biocode

def print_biogene( gene=None, fh=None, on=None ):
    '''
    Write one gene, and the mRNA and CDS features beneath it, to fh as GenBank
    flat-file feature table entries.

    gene: the gene object to print.  An Exception is raised if it is None.
    fh:   handle whose write() method receives the output.  It is used
          unconditionally, so a handle must be supplied despite the None default.
    on:   the molecule the printed coordinates refer to.  When None it is taken
          from gene.location().on.

    Coordinates are written as fmin + 1 through fmax.  A warning is written to
    STDERR if the gene has no locus_tag.  An Exception is raised if any mRNA has
    no CDS children.
    '''
    if gene is None:
        raise Exception( "ERROR: The print_biogene() function requires a biogene to be passed via the 'gene' argument" )

    # Fall back to the molecule recorded on the gene's own location.
    if on is None:
        on = gene.location().on

    def write_locus_tag_if_present():
        # The missing-locus_tag warning is issued once at gene level, never here.
        if gene.locus_tag is not None:
            fh.write("                     /locus_tag=\"{0}\"\n".format(gene.locus_tag))

    gene_loc = gene.location_on( on )
    gene_start = gene_loc.fmin + 1
    gene_stop = gene_loc.fmax

    # Any strand value other than 1 is written as complement(...)
    if gene_loc.strand == 1:
        gene_template = "     gene            {0}..{1}\n"
    else:
        gene_template = "     gene            complement({0}..{1})\n"
    fh.write(gene_template.format(gene_start, gene_stop))

    if gene.locus_tag is None:
        sys.stderr.write("WARNING: No locus_tag found on gene {0}\n".format(gene.id))
    else:
        fh.write("                     /locus_tag=\"{0}\"\n".format(gene.locus_tag))

    for mRNA in sorted(gene.mRNAs()):
        mRNA_loc = mRNA.location_on( on )

        # ---- mRNA feature: one [start, stop] pair per exon ----
        exon_locs = (exon.location_on(on) for exon in sorted(mRNA.exons()))
        exon_segments = [[loc.fmin + 1, loc.fmax] for loc in exon_locs]
        mRNA_loc_string = segments_to_string(exon_segments)

        if mRNA_loc.strand == 1:
            mRNA_template = "     mRNA            {0}\n"
        else:
            mRNA_template = "     mRNA            complement({0})\n"
        fh.write(mRNA_template.format(mRNA_loc_string))

        write_locus_tag_if_present()

        if mRNA.annotation is not None:
            # debug: annotation defaults for printing can be tried out here
            mRNA.annotation.product_name = "Hypothetical protein"

            if mRNA.annotation.product_name is not None:
                fh.write("                     /product=\"{0}\"\n".format(mRNA.annotation.product_name))

        # ---- CDS feature: one [start, stop] pair per CDS fragment ----
        if len(mRNA.CDSs()) < 1:
            raise Exception("ERROR: Encountered an mRNA ({0}) without an CDS children".format(mRNA.id))

        cds_locs = [cds.location_on(on) for cds in sorted(mRNA.CDSs())]
        cds_segments = [[loc.fmin + 1, loc.fmax] for loc in cds_locs]
        cds_loc_string = segments_to_string(cds_segments)

        # The last CDS in sorted order decides the orientation of the whole feature.
        if cds_locs[-1].strand == 1:
            cds_template = "     CDS             {0}\n"
        else:
            cds_template = "     CDS             complement({0})\n"
        fh.write(cds_template.format(cds_loc_string))

        write_locus_tag_if_present()

        # Polypeptide annotation is only written when there is exactly one polypeptide.
        polypeptides = mRNA.polypeptides()
        if len(polypeptides) == 1 and polypeptides[0].annotation is not None:
            annot = polypeptides[0].annotation

            if annot.product_name is not None:
                fh.write("                     /product=\"{0}\"\n".format(annot.product_name))

            if len(annot.ec_numbers) > 0:
                for ec_num in annot.ec_numbers:
                    fh.write("                     /EC_number=\"{0}\"\n".format(ec_num.number))

            if len(annot.go_annotations) > 0:
                for go_annot in annot.go_annotations:
                    fh.write("                     /db_xref=\"GO:{0}\"\n".format(go_annot.go_id))

        # ---- /translation qualifier, wrapped across lines when too long ----
        protein = biocodeutils.translate(mRNA.get_CDS_residues())

        # Nothing is written for an empty translation.
        if len(protein) == 0:
            continue

        if len(protein) < MAX_FTABLE_CONTENT_WIDTH - 15:
            # Short enough to fit on a single line, quotes included.
            fh.write("                     /translation=\"{0}\"\n".format(protein))
            continue

        # The first line is shorter because it also carries the qualifier name.
        first_line_width = MAX_FTABLE_CONTENT_WIDTH - 14
        fh.write("                     /translation=\"{0}\n".format(protein[0:first_line_width]))
        tail = protein[first_line_width:]
        quote_closed = False

        while len(tail) > 0:
            if len(tail) > MAX_FTABLE_CONTENT_WIDTH - 1:
                # Full-width continuation line, no closing quote yet.
                fh.write("                     {0}\n".format(tail[0:MAX_FTABLE_CONTENT_WIDTH]))
                tail = tail[MAX_FTABLE_CONTENT_WIDTH:]
            else:
                # Final, shorter line carries the closing quote.
                fh.write("                     {0}\"\n".format(tail))
                tail = ""
                quote_closed = True

        # The residues ended exactly on a line boundary, so the closing quote
        # goes on a line of its own (seen with G675_02159).
        if not quote_closed:
            fh.write("                     \"\n")

Esempio n. 9

0

Mostra file

File: check_gff_for_internal_stops.py Progetto: wyim-pgl/biocode

def main():
    """Report mRNAs whose translated CDS contains internal (non-terminal) stop codons.

    Reads a GFF3 file (plus an optional genome FASTA), translates the CDS of
    every mRNA and counts those whose translation still contains '*' once any
    trailing '*' characters have been stripped.  Offending translations can be
    written to a FASTA file (-o) and the first N of them printed (-p).  The
    totals are printed to STDOUT at the end.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence report non-terminal internal stops.'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-p',
        '--print_n_with_stops',
        type=int,
        required=False,
        default=0,
        help=
        'Optional.  Pass the number of sequences with internal stops you want printed (usually for debugging purposes)'
    )
    parser.add_argument(
        '-o',
        '--output_fasta',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output (translated) FASTA file for all those features which had internal stops'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None

    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    # try/finally guarantees the FASTA handle is flushed and closed even if a
    # feature fails to translate part-way through the run.
    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    is_debug_target = (debug_mRNA is not None
                                       and mRNA.id == debug_mRNA)

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # Translate once and reuse the result for both the check and the output.
                    translated_seq = biocodeutils.translate(coding_seq)

                    # A terminal '*' is expected; only stops left after stripping it count.
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assemblies[assembly_id])
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(
                            mRNA.id, assembly_id, loc.fmin + 1, loc.fmax,
                            loc.strand))
                        fasta_out_fh.write("{0}\n".format(
                            biocodeutils.wrapped_fasta(translated_seq)))

                    if is_debug_target:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(
                            translated_seq, mRNA.id))

                    # Only the first N offenders are echoed to STDOUT.
                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id))
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(
                            translated_seq, mRNA.id))
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))