def main():
    """Report mRNAs whose translated CDS contains an internal stop codon.

    Reads the GFF3 passed with -i (plus the genome FASTA passed with -g, when
    given), translates the CDS of every mRNA and counts those whose
    translation still contains a '*' once all trailing '*' characters have
    been stripped.  Those translations are optionally written to the FASTA
    file passed with -o, and the first -p of them are echoed to STDOUT.  The
    total and affected mRNA counts are always printed at the end.
    """
    # NOTE(review): the description below mentions phase columns, which this
    #  function never touches - it looks like it was copied from another script.
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0, help='Optional.  Pass the number of sequences with internal stops you want printed (usually for debugging purposes)' )
    parser.add_argument('-o', '--output_fasta', type=str, required=False, help='Optional.  Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None

    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("CDS:{0}".format(coding_seq))

                    # Translate once and reuse the result for both the check and the reports
                    translated_seq = biocodeutils.translate(coding_seq)

                    # Trailing '*' characters are terminal stops; only a '*' left after
                    #  stripping them counts as an internal stop.
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assemblies[assembly_id])
                        # fmin + 1: presumably converts a 0-based start to 1-based for display - TODO confirm
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand) )
                        fasta_out_fh.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )

                    # Only the first N offending mRNAs are echoed (N = --print_n_with_stops)
                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id) )
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )
    finally:
        # Make sure the FASTA output is flushed and closed even if a feature fails to process
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """Create NCBI submission files (<output_base>.tbl and <output_base>.fna) from a GFF3 file.

    Loads the GFF3 passed with -i (plus the genomic FASTA passed with -gf,
    when given).  Unless the assembly IDs are already in the pre-formatted
    'gnl|WGS:' style, every assembly is renamed to
    gnl|WGS:<prefix>|SeqID|gb|<prefix>01<six-digit counter> using the
    --ncbi_acc_prefix value.  The feature table is then written with
    biocodetbl.print_tbl_from_assemblies() and the sequences with
    biothings.AssemblySet.write_fasta().
    """
    parser = argparse.ArgumentParser( description='Create a TBL file for submission to NCBI from GFF3')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name of output files to be created' )
    parser.add_argument('-ln', '--lab_name', type=str, required=True, help='Required by NCBI to identify the submitting group' )
    parser.add_argument('-nap', '--ncbi_acc_prefix', type=str, required=True, help='Required and assigned by NCBI' )
    parser.add_argument('-gf', '--genomic_fasta', type=str, required=False, help='FASTA file of genomic sequence, if not embedded in GFF' )
    parser.add_argument('-go', '--go_obo', type=str, required=False, help='GO terms will not be exported unless you pass the path to a GO OBO file')

    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    if args.genomic_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genomic_fasta)

    ## We need to first check the ID format.  Pre-formatted IDs are like this:
    ##   gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    ## A single pre-formatted ID disables renaming for the whole set, so decide
    ## this BEFORE touching any assembly; otherwise assemblies visited ahead of
    ## the pre-formatted one would keep a rewritten .id that no longer matches
    ## their dictionary key.
    reformat_IDs = not any(asm_id.startswith('gnl|WGS:') for asm_id in assemblies)

    if reformat_IDs:
        ## old IDs (like tp.assembly.567468735.1) are replaced by new ones built
        ## from the NCBI prefix and a running counter (like AAGK01000001)
        new_assemblies = dict()

        for asm_num, asm_id in enumerate(assemblies, start=1):
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(args.ncbi_acc_prefix, asm_num)
            assembly = assemblies[asm_id]
            assembly.id = new_id
            new_assemblies[new_id] = assembly

        assemblies = new_assemblies

    # >gi|68352484|gb|AAGK01000001.1|
    # AAGK01000001	NC_007344.1	tp.assembly.567468735.1

    # The context manager guarantees the table is flushed and closed
    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        biocodetbl.print_tbl_from_assemblies(assemblies=assemblies, ofh=ofh, go_obo=args.go_obo, lab_name=args.lab_name)

    mset = biothings.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
Ejemplo n.º 3
0
def main():
    """Split a GFF3 into kept and removed genes based on N-masking of their CDS.

    For every gene in the GFF3 passed with -i, the CDS residues of each of its
    mRNAs are taken from the masked FASTA passed with -m.  If the percentage
    of 'N' characters in any one mRNA's CDS is at or above the -p cutoff, the
    gene is dropped (and written to the -r file when one was passed);
    otherwise it is written to the -o file.  A summary line with the kept and
    total gene counts is printed at the end.
    """
    parser = argparse.ArgumentParser( description='Removes gene models whose sequence has been masked.')

    ## command-line options
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-m', '--masked_fasta', type=str, required=True, help='FASTA with sequence masked with N characters')
    parser.add_argument('-p', '--percent_repeat_coverage_cutoff', type=int, required=True, help='Genes with an mRNA covered by this percentage of repeats will be excluded' )
    parser.add_argument('-o', '--output_gff3', type=str, required=False, help='Path to GFF3 output file to be created')
    parser.add_argument('-r', '--removed_gff3', type=str, required=False, help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    # The option is declared optional, but the kept genes have nowhere else to
    #  go.  Report a usage error instead of the TypeError open(None) would raise.
    if args.output_gff3 is None:
        parser.error("-o/--output_gff3 must be specified")

    (assemblies, features) = biocodegff.get_gff3_features( args.input_gff3 )
    biocodeutils.add_assembly_fasta(assemblies, args.masked_fasta)

    rem_out = None
    gene_count = 0
    kept_count = 0

    gff_out = open(args.output_gff3, 'wt')

    try:
        gff_out.write("##gff-version 3\n")

        if args.removed_gff3 is not None:
            rem_out = open(args.removed_gff3, 'wt')
            rem_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                keep = True
                gene_count += 1

                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()

                    # An mRNA without CDS residues has no coverage to measure;
                    #  skipping it also avoids dividing by zero.
                    if len(coding_seq) == 0:
                        continue

                    n_count = coding_seq.count('N')
                    perc_repeat = (n_count / len(coding_seq)) * 100

                    if perc_repeat >= args.percent_repeat_coverage_cutoff:
                        # one masked mRNA is enough to drop the whole gene
                        keep = False
                        break

                if keep:
                    kept_count += 1
                    gene.print_as(fh=gff_out, source='IGS', format='gff3')
                elif rem_out is not None:
                    gene.print_as(fh=rem_out, source='IGS', format='gff3')
    finally:
        # Flush and close both outputs even if a gene fails to process
        gff_out.close()

        if rem_out is not None:
            rem_out.close()

    # Guard the percentage against an input that contains no genes at all
    perc_kept = (kept_count / gene_count) * 100 if gene_count > 0 else 0.0
    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(kept_count, gene_count, perc_kept))
Ejemplo n.º 4
0
def main():
    """Export the CDS or protein sequence of every mRNA in a GFF3 file as FASTA.

    Each mRNA becomes one FASTA record written to the -o file, or to STDOUT
    when -o is not given.  The record ID is the mRNA's locus_tag when it has
    one and its feature ID otherwise; the first polypeptide product name
    found, if any, is appended to the header.  With -t cds the CDS residues
    are written as-is, otherwise (the default, 'protein') they are translated
    first.  With --check_ends a warning goes to STDERR for every CDS whose
    first or last three bases are not in the hard-coded start/stop codon sets.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        required=False,
                        default='protein',
                        choices=['protein', 'cds'],
                        help='Type of features to export')
    parser.add_argument(
        '-f',
        '--fasta',
        type=str,
        required=False,
        help=
        'If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option'
    )
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ('ATG', 'GTG', 'TTG')
        stop_codons = ('TAG', 'TAA', 'TGA')

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():

                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = mRNA.id
                    export_header = None

                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## Add the gene product name if there is one (first one found wins)
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation
                        if annotation is not None and annotation.product_name is not None:
                            export_header = annotation.product_name
                            break

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    coding_seq = mRNA.get_CDS_residues()

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical start codon ({0}) in mRNA {1}\n"
                                .format(start_codon, mRNA.id))

                        # check the terminal codon
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical stop codon ({0}) in mRNA {1}\n"
                                .format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(
                            biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(
                            biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # Close the output only if we opened it; STDOUT belongs to the interpreter
        if fout is not sys.stdout:
            fout.close()
Ejemplo n.º 5
0
def main():
    """Export the CDS or protein sequence of every mRNA in a GFF3 file as FASTA.

    Each mRNA becomes one FASTA record written to the -o file, or to STDOUT
    when -o is not given.  The record ID is the mRNA's locus_tag when it has
    one and its feature ID otherwise; the first polypeptide product name
    found, if any, is appended to the header.  With -t cds the CDS residues
    are written as-is, otherwise (the default, 'protein') they are translated
    first.  With --check_ends a warning goes to STDERR for every CDS whose
    first or last three bases are not in the hard-coded start/stop codon sets.
    """
    parser = argparse.ArgumentParser( description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GFF3 file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output FASTA file to be created' )
    parser.add_argument('-t', '--type', type=str, required=False, default='protein', choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option' )
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    writing_to_file = args.output_file is not None
    fout = open(args.output_file, 'wt') if writing_to_file else sys.stdout

    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ('ATG', 'GTG', 'TTG')
        stop_codons  = ('TAG', 'TAA', 'TGA')

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():

                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = mRNA.id
                    export_header = None

                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## Add the gene product name if there is one (first one found wins)
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation
                        if annotation is not None and annotation.product_name is not None:
                            export_header = annotation.product_name
                            break

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    # NOTE(review): for_translation=True presumably adjusts the residues for
                    #  the CDS phase before translating - confirm against get_CDS_residues()
                    coding_seq = mRNA.get_CDS_residues(for_translation=True)

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, mRNA.id))

                        # check the terminal codon
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # Close the output only if we opened it; STDOUT belongs to the interpreter
        if writing_to_file:
            fout.close()
Ejemplo n.º 6
0
def main():
    """Report mRNAs lacking a terminal stop and scan for the nearest in-frame stop.

    For every mRNA in the GFF3 passed with -i, the CDS is translated.  If the
    translation does not end in '*', the assembly sequence is walked one codon
    at a time from the end of the CDS towards the end of the mRNA, printing
    each codon examined, until a stop codon is seen or the mRNA boundary is
    passed.  Results are only printed; nothing is modified or written to a
    file.  Totals are printed at the end.
    """
    # NOTE(review): the description below mentions phase columns, which this
    #  function never touches - it looks like it was copied from another script.
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence to report/correct phase columns.'
    )

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    # NOTE(review): --output_gff is accepted but never read below; no GFF3 is written yet.
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']

    # The scan slices the assembly residues directly by coordinate.  Assuming
    #  those residues are the forward strand (TODO confirm), a stop codon of a
    #  reverse-strand gene shows up there as its reverse complement.
    revcomp_stop_codons = ['CTA', 'TTA', 'TCA']

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id,
                                                   assembly.length))

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = biocodeutils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(
                    mRNA.id, gene.id))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())

                # Without any CDS there is no end position to extend from
                if not CDSs:
                    print("\tNo CDS features found, skipping")
                    continue

                if mRNA_loc.strand == 1:
                    CDS_pos = CDSs[-1].location_on(assembly).fmax
                    mRNA_limit = mRNA_loc.fmax
                    codon_step_size = 3
                    strand_stop_codons = stop_codons
                else:
                    CDS_pos = CDSs[0].location_on(assembly).fmin
                    mRNA_limit = mRNA_loc.fmin
                    codon_step_size = -3
                    strand_stop_codons = revcomp_stop_codons

                print("\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(
                    mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos),
                      end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                if codon_step_size < 0:
                    CDS_pos += codon_step_size

                while True:
                    # NOTE(review): on the forward strand the codon starting exactly at
                    #  the mRNA limit is still examined; confirm whether that is intended.
                    if (codon_step_size < 0 and CDS_pos < mRNA_limit) or (
                            codon_step_size > 0 and CDS_pos > mRNA_limit):
                        print(" Reached the mRNA limit")
                        break

                    next_codon = assembly.residues[CDS_pos:CDS_pos + 3]
                    print(".{0}({1})".format(next_codon, CDS_pos), end='')

                    # upper(): soft-masked (lowercase) sequence must still match
                    if next_codon.upper() in strand_stop_codons:
                        new_stop_found = True
                        print(" Found a stop")
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with terminal stops: {0}".format(mRNAs_with_terminal_stops))
Ejemplo n.º 7
0
def main():
    """Create NCBI submission files (<output_base>.tbl and <output_base>.fna) from a GFF3 file.

    Loads the GFF3 passed with -i (plus the genomic FASTA passed with -gf,
    when given).  Unless the assembly IDs are already in the pre-formatted
    'gnl|WGS:' style, every assembly is renamed to
    gnl|WGS:<prefix>|SeqID|gb|<prefix>01<six-digit counter> using the
    --ncbi_acc_prefix value.  The feature table is then written with
    biocodetbl.print_tbl_from_assemblies() and the sequences with
    biothings.AssemblySet.write_fasta().
    """
    parser = argparse.ArgumentParser(
        description='Create a TBL file for submission to NCBI from GFF3')

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o',
                        '--output_base',
                        type=str,
                        required=True,
                        help='Base name of output files to be created')
    parser.add_argument(
        '-ln',
        '--lab_name',
        type=str,
        required=True,
        help='Required by NCBI to identify the submitting group')
    parser.add_argument('-nap',
                        '--ncbi_acc_prefix',
                        type=str,
                        required=True,
                        help='Required and assigned by NCBI')
    parser.add_argument(
        '-gf',
        '--genomic_fasta',
        type=str,
        required=False,
        help='FASTA file of genomic sequence, if not embedded in GFF')
    parser.add_argument(
        '-go',
        '--go_obo',
        type=str,
        required=False,
        help=
        'GO terms will not be exported unless you pass the path to a GO OBO file'
    )

    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    if args.genomic_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genomic_fasta)

    ## We need to first check the ID format.  Pre-formatted IDs are like this:
    ##   gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    ## One pre-formatted ID switches renaming off for the whole set, so this is
    ## decided up front.  Deciding it while renaming would leave the assemblies
    ## seen earlier with a rewritten .id that disagrees with their dictionary key.
    already_formatted = False

    for asm_id in assemblies:
        if asm_id.startswith('gnl|WGS:'):
            already_formatted = True
            break

    if not already_formatted:
        ## old IDs (like tp.assembly.567468735.1) are replaced by new ones built
        ## from the NCBI prefix and a running counter (like AAGK01000001)
        renamed = dict()
        asm_num = 1

        for asm_id in assemblies:
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(
                args.ncbi_acc_prefix, asm_num)
            asm_num += 1
            renamed[new_id] = assemblies[asm_id]
            renamed[new_id].id = new_id

        assemblies = renamed

    # The context manager guarantees the table is flushed and closed
    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        biocodetbl.print_tbl_from_assemblies(assemblies=assemblies,
                                             ofh=ofh,
                                             go_obo=args.go_obo,
                                             lab_name=args.lab_name)

    mset = biothings.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
def main():
    """Report mRNAs whose translated CDS contains an internal stop codon.

    Reads the GFF3 passed with -i (plus the genome FASTA passed with -g, when
    given), translates the CDS of every mRNA and counts those whose
    translation still contains a '*' once all trailing '*' characters have
    been stripped.  Those translations are optionally written to the FASTA
    file passed with -o, and the first -p of them are echoed to STDOUT.  The
    total and affected mRNA counts are always printed at the end.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence report non-terminal internal stops.'
    )

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-p',
        '--print_n_with_stops',
        type=int,
        required=False,
        default=0,
        help=
        'Optional.  Pass the number of sequences with internal stops you want printed (usually for debugging purposes)'
    )
    parser.add_argument(
        '-o',
        '--output_fasta',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output (translated) FASTA file for all those features which had internal stops'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None

    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    is_debug_target = (debug_mRNA is not None
                                       and mRNA.id == debug_mRNA)

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # Translate once; the same string is checked and reported
                    translated_seq = biocodeutils.translate(coding_seq)

                    # Trailing '*' characters are terminal stops, so they are
                    # stripped before looking for an internal one.
                    internal_stops = translated_seq.rstrip('*').count('*')

                    if internal_stops > 0:
                        mRNAs_with_stops += 1

                        if fasta_out_fh is not None:
                            loc = mRNA.location_on(assemblies[assembly_id])
                            # fmin + 1: presumably converts a 0-based start
                            # to 1-based for display - TODO confirm
                            fasta_out_fh.write(
                                ">{0} {1} {2}-{3} ({4})\n".format(
                                    mRNA.id, assembly_id, loc.fmin + 1,
                                    loc.fmax, loc.strand))
                            fasta_out_fh.write("{0}\n".format(
                                biocodeutils.wrapped_fasta(translated_seq)))

                        if is_debug_target:
                            print("TRANSLATION WITH STOP ({1}): {0}".format(
                                translated_seq, mRNA.id))

                        # Only the first N offending mRNAs are echoed
                        # (N = --print_n_with_stops)
                        if mRNAs_with_stops <= args.print_n_with_stops:
                            print("\nmRNA id: {0}".format(mRNA.id))
                            print("\tCDS:{0}".format(coding_seq))
                            print("\tTRANSLATION WITH STOP ({1}): {0}".format(
                                translated_seq, mRNA.id))
    finally:
        # Make sure the FASTA output is flushed and closed even if a feature
        # fails to process
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
Ejemplo n.º 9
0
def main():
    """Split a GFF3 into kept and removed genes based on N-masking of their CDS.

    For every gene in the GFF3 passed with -i, the CDS residues of each of its
    mRNAs are taken from the masked FASTA passed with -m.  If the percentage
    of 'N' characters in any one mRNA's CDS is at or above the -p cutoff, the
    gene is dropped (and written to the -r file when one was passed);
    otherwise it is written to the -o file.  A summary line with the kept and
    total gene counts is printed at the end.
    """
    parser = argparse.ArgumentParser(
        description='Removes gene models whose sequence has been masked.')

    ## command-line options
    parser.add_argument('-i',
                        '--input_gff3',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-m',
                        '--masked_fasta',
                        type=str,
                        required=True,
                        help='FASTA with sequence masked with N characters')
    parser.add_argument(
        '-p',
        '--percent_repeat_coverage_cutoff',
        type=int,
        required=True,
        help=
        'Genes with an mRNA covered by this percentage of repeats will be excluded'
    )
    parser.add_argument('-o',
                        '--output_gff3',
                        type=str,
                        required=False,
                        help='Path to GFF3 output file to be created')
    parser.add_argument(
        '-r',
        '--removed_gff3',
        type=str,
        required=False,
        help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    # The option is declared optional, but the kept genes have nowhere else to
    # go.  Report a usage error instead of the TypeError open(None) would raise.
    if args.output_gff3 is None:
        parser.error("-o/--output_gff3 must be specified")

    (assemblies, features) = biocodegff.get_gff3_features(args.input_gff3)
    biocodeutils.add_assembly_fasta(assemblies, args.masked_fasta)

    cutoff = args.percent_repeat_coverage_cutoff
    rem_out = None
    gene_count = 0
    kept_count = 0

    gff_out = open(args.output_gff3, 'wt')

    try:
        gff_out.write("##gff-version 3\n")

        if args.removed_gff3 is not None:
            rem_out = open(args.removed_gff3, 'wt')
            rem_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                gene_count += 1
                keep = True

                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    cds_length = len(coding_seq)

                    # An mRNA without CDS residues has no coverage to measure;
                    # skipping it also avoids dividing by zero.
                    if cds_length == 0:
                        continue

                    perc_repeat = (coding_seq.count('N') / cds_length) * 100

                    if perc_repeat >= cutoff:
                        # one masked mRNA is enough to drop the whole gene
                        keep = False
                        break

                if keep:
                    kept_count += 1
                    gene.print_as(fh=gff_out, source='IGS', format='gff3')
                elif rem_out is not None:
                    gene.print_as(fh=rem_out, source='IGS', format='gff3')
    finally:
        # Flush and close both outputs even if a gene fails to process
        gff_out.close()

        if rem_out is not None:
            rem_out.close()

    # Guard the percentage against an input that contains no genes at all
    perc_kept = (kept_count / gene_count) * 100 if gene_count > 0 else 0.0
    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(
        kept_count, gene_count, perc_kept))