def main():
    """
    Command-line entry point: masks long homopolymer runs in a FASTA file.

    Reads the FASTA file given by -i/--input and, in every sequence, replaces
    each run of one repeated residue (other than N) that is longer than
    -hll/--homopolymer_length_limit with the same number of N characters.  The
    records are written to -o/--output, or to STDOUT when that option is
    omitted.  One WARNING line per masked run is written to STDERR, reporting
    the 1-based start position of the run.
    """
    parser = argparse.ArgumentParser( description='Replaces long homopolymeric stretches with N characters')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-o', '--output', type=str, required=False, help='Path to an output FASTA file to be created' )
    parser.add_argument('-hll', '--homopolymer_length_limit', type=int, required=True, help='Stretches of non-N residues longer than this will be replaced with Ns' )
    args = parser.parse_args()

    if args.output is None:
        out_fh = sys.stdout
    else:
        out_fh = open( args.output, 'wt' )

    try:
        sys.stderr.write("INFO: Parsing input FASTA\n")
        sys.stderr.flush()
        seqs = biocodeutils.fasta_dict_from_file( args.input )

        length_limit = args.homopolymer_length_limit
        sys.stderr.write("INFO: Looking for homopolymeric runs > {0} bp\n".format(length_limit))
        sys.stderr.flush()

        for seq_id in seqs:
            residues = seqs[seq_id]['s']

            # Masked copy of the sequence.  Masking never changes the length, so
            # indexes computed while scanning 'residues' remain valid for it.
            current_seq = residues
            current_homopolymer_base = None
            current_homopolymer_length = 0
            current_homopolymer_start_idx = 0

            for base_index, base in enumerate(residues):
                if base == current_homopolymer_base:
                    current_homopolymer_length += 1
                    continue

                # The run which ended just before base_index is now complete.
                if current_homopolymer_length > length_limit and current_homopolymer_base != 'N':
                    sys.stderr.write("WARNING: Replacing {3} bp of {2}s in Sequence ID {0} starting at position {1}\n".format(
                        seq_id, current_homopolymer_start_idx + 1, current_homopolymer_base, current_homopolymer_length))
                    sys.stderr.flush()

                    # Build on current_seq (not the original residues) so that runs
                    # masked earlier in this sequence are preserved.
                    current_seq = "{0}{1}{2}".format(current_seq[0:current_homopolymer_start_idx],
                                                     'N' * current_homopolymer_length,
                                                     current_seq[base_index:])

                current_homopolymer_base = base
                current_homopolymer_length = 1
                current_homopolymer_start_idx = base_index

            ## check after the last base for any run which terminates the sequence
            if current_homopolymer_length > length_limit and current_homopolymer_base != 'N':
                # reported 1-based, consistent with the runs found inside the sequence
                sys.stderr.write("WARNING: Replacing {3} bp of {2} bases in Sequence ID {0} starting at position {1}\n".format(
                    seq_id, current_homopolymer_start_idx + 1, current_homopolymer_base, current_homopolymer_length))
                sys.stderr.flush()

                # the run reaches the end of the sequence, so nothing follows it
                current_seq = "{0}{1}".format(current_seq[0:current_homopolymer_start_idx],
                                              'N' * current_homopolymer_length)

            seqs[seq_id]['s'] = current_seq
            out_fh.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            out_fh.write(biocodeutils.wrapped_fasta(seqs[seq_id]['s']))
            out_fh.write("\n")
    finally:
        # only close a file opened here, never STDOUT
        if out_fh is not sys.stdout:
            out_fh.close()
Beispiel #2
0
    def write_fasta(self, fh=None, path=None):
        '''
        Exports every molecule of this set as a FASTA record.

        The records go to 'path' when it is given (the file is opened for writing in text
        mode); otherwise they go to the already-open handle passed as 'fh'.  The handle that
        was written to is closed before returning.

        Header lines depend on the set type: a PolypeptideSet uses each molecule's
        annotation_string(), an AssemblySet uses each molecule's id.  Any other set type
        raises an Exception.
        '''
        if path is not None:
            fh = open(path, 'wt')

        set_class = self.__class__
        is_polypeptide_set = set_class == PolypeptideSet

        if is_polypeptide_set:
            molecules = self.polypeptides
        elif set_class == AssemblySet:
            molecules = self.assemblies
        else:
            raise Exception("ERROR: writing FASTA not supported in MoleculeSets of this type: {0}".format(self.__class__))

        for molecule in molecules:
            # only the two supported set types can reach this point
            header = molecule.annotation_string() if is_polypeptide_set else molecule.id
            fh.write(">{0}\n".format(header))

            sequence_lines = biocodeutils.wrapped_fasta(molecule.residues)
            fh.write("{0}\n".format(sequence_lines))

        fh.close()
Beispiel #3
0
    def write_fasta(self, fh=None, path=None):
        '''
        Writes all molecules of this set in FASTA format.

        Pass 'path' to have a file opened (text mode, for writing) and used as the
        destination, or pass an already-open handle as 'fh'.  The destination handle is
        closed once every record has been written.

        Only PolypeptideSet and AssemblySet are supported; any other set type raises an
        Exception.  Each record's header line is the molecule's annotation_string().
        '''
        if path is not None:
            fh = open(path, 'wt')

        # pick the attribute holding the molecules for this kind of set
        set_type = self.__class__
        if set_type == PolypeptideSet:
            member_attribute = 'polypeptides'
        elif set_type == AssemblySet:
            member_attribute = 'assemblies'
        else:
            raise Exception(
                "ERROR: writing FASTA not supported in MoleculeSets of this type: {0}"
                .format(self.__class__))

        for entry in getattr(self, member_attribute):
            fh.write(">{0}\n".format(entry.annotation_string()))
            wrapped_residues = biocodeutils.wrapped_fasta(entry.residues)
            fh.write("{0}\n".format(wrapped_residues))

        fh.close()
def main():
    """
    Command-line entry point: keeps only the longest sequence of each component.

    Every sequence ID in the -i/--input FASTA file must contain a component
    prefix matching 'compN_'.  For each distinct component only the longest
    sequence is kept (the first one seen wins a tie) and written, wrapped at 60
    residues per line, to -o/--output or to STDOUT when that option is omitted.

    Raises:
        Exception: if a sequence ID does not contain a 'compN_' prefix.
    """
    parser = argparse.ArgumentParser( description='Filters trinity output for longest subcomponents based on naming convention')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-o', '--output', type=str, required=False, help='Output file to be created.  Default = STDOUT' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output is not None:
        fout = open(args.output, 'wt')

    try:
        seqs = biocodeutils.fasta_dict_from_file( args.input )

        # raw string: '\d' in a normal string literal is an invalid escape sequence
        component_pattern = re.compile(r"(comp\d+)_")

        # component ID -> (ID of its longest sequence so far, that sequence's entry)
        longest = dict()

        for seq_id in seqs:
            m = component_pattern.search(seq_id)
            if not m:
                raise Exception("ERROR: This ID wasn't in the expected format of compN_cN_seqN: {0}".format(seq_id))

            component_id = m.group(1)

            if component_id not in longest or len(seqs[seq_id]['s']) > len(longest[component_id][1]['s']):
                longest[component_id] = (seq_id, seqs[seq_id])

        for c_id in longest:
            # use the ID remembered with the entry, not a leftover loop variable
            longest_id, entry = longest[c_id]
            seq_wrapped = biocodeutils.wrapped_fasta(entry['s'], every=60)
            fout.write(">{0} {1}\n{2}\n".format(longest_id, entry['h'], seq_wrapped))
    finally:
        # only close a file opened here, never STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """
    Command-line entry point: reports mRNAs whose translation has internal stops.

    Loads the features of the -i/--input_file GFF3 (plus the sequences from
    -g/--genome_fasta when given), translates the CDS residues of every mRNA and
    counts those whose translation still contains a '*' after trailing '*'
    characters are stripped.  Optionally writes those translations to
    -o/--output_fasta and prints details for the first -p/--print_n_with_stops
    of them.  Totals are printed to STDOUT at the end.
    """
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence to report non-terminal internal stops.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0, help='Optional.  Pass the number of sequences with internal stops you want printed (usually for debugging purposes)' )
    parser.add_argument('-o', '--output_fasta', type=str, required=False, help='Optional.  Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None

    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("CDS:{0}".format(coding_seq))

                    # translate once and reuse the result for both the test and the report
                    translated_seq = biocodeutils.translate(coding_seq)

                    # stops at the very end are expected; only internal ones count
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assemblies[assembly_id])
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand) )
                        fasta_out_fh.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )

                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id) )
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """
    Command-line entry point: keeps only the longest sequence of each component.

    Every sequence ID in the -i/--input FASTA file must contain a component
    prefix matching 'compN_'.  For each distinct component only the longest
    sequence is kept (the first one seen wins a tie).  It is written under its
    own sequence ID, wrapped at 60 residues per line, to -o/--output or to
    STDOUT when that option is omitted.

    Raises:
        Exception: if a sequence ID does not contain a 'compN_' prefix.
    """
    parser = argparse.ArgumentParser(
        description=
        'Filters trinity output for longest subcomponents based on naming convention'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=False,
                        help='Output file to be created.  Default = STDOUT')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output is not None:
        fout = open(args.output, 'wt')

    try:
        seqs = biocodeutils.fasta_dict_from_file(args.input)

        # raw string: '\d' in a normal string literal is an invalid escape sequence
        component_regex = re.compile(r"(comp\d+)_")

        components = dict()
        # component ID -> ID of the sequence currently stored in 'components'
        longest_id_by_component = dict()

        for seq_id in seqs:
            m = component_regex.search(seq_id)
            if m is None:
                raise Exception(
                    "ERROR: This ID wasn't in the expected format of compN_cN_seqN: {0}"
                    .format(seq_id))

            component_id = m.group(1)

            if component_id not in components or len(seqs[seq_id]['s']) > len(
                    components[component_id]['s']):
                components[component_id] = seqs[seq_id]
                longest_id_by_component[component_id] = seq_id

        for c_id in components:
            seq_wrapped = biocodeutils.wrapped_fasta(components[c_id]['s'],
                                                     every=60)
            fout.write(">{0} {1}\n{2}\n".format(longest_id_by_component[c_id],
                                                components[c_id]['h'],
                                                seq_wrapped))
    finally:
        # only close a file opened here, never STDOUT
        if fout is not sys.stdout:
            fout.close()
Beispiel #7
0
def main():
    """
    Command-line entry point: merges the N-masking of several FASTA files.

    The last FASTA file on the command line is the seed.  Every other file is
    compared against it residue by residue: wherever the other file has an 'N'
    and the seed differs, the seed residue becomes 'N'.  Where the two differ
    and neither is 'N', a WARNING is printed and the seed residue is kept.  The
    merged seed sequences are written to -o/--output_file or to STDOUT.

    Raises:
        Exception: if another file contains a sequence ID missing from the seed
            file, or a sequence whose length differs from the seed's.
    """
    parser = argparse.ArgumentParser( description='Merge masked FASTA files')

    ## output file to be written
    parser.add_argument('fasta_files', metavar='N', type=str, nargs='+', help='Pass one or more FASTA files')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        # the last file is the seed, all others are merged into it
        # (nargs='+' guarantees at least one file was passed)
        seed_file = args.fasta_files[-1]
        other_files = args.fasta_files[:-1]

        seqs = biocodeutils.fasta_dict_from_file( seed_file )

        # python strings are immutable, so we need to transform these into lists
        for seq_id in seqs:
            seqs[seq_id]['s'] = list(seqs[seq_id]['s'])

        for fasta_file in other_files:
            new_seqs = biocodeutils.fasta_dict_from_file( fasta_file )

            for seq_id in new_seqs:
                # make sure it exists in the source file
                if seq_id not in seqs:
                    raise Exception("ERROR: Seq ID {0} was found in file {1} but not in the seed file".format(seq_id, fasta_file) )

                seed_residues = seqs[seq_id]['s']
                new_residues = new_seqs[seq_id]['s']

                # they should also be the same length (compare the residues, not
                # the record dicts which always have the same number of keys)
                if len(seed_residues) != len(new_residues):
                    raise Exception("ERROR: Seq ID {0} was found in {1} and the seed file but had different lengths".format(seq_id, fasta_file))

                for i, base in enumerate(new_residues):
                    if base == seed_residues[i]:
                        continue

                    if base == 'N':
                        seed_residues[i] = 'N'
                    elif seed_residues[i] != 'N':
                        print("WARNING: Disagreement {0}-{1} at position {2}".format(base, seed_residues[i], i) )

        # now done, print out the results
        for seq_id in seqs:
            ofh.write( ">{0} {1}\n{2}\n".format( seq_id, seqs[seq_id]['h'], biocodeutils.wrapped_fasta(''.join(seqs[seq_id]['s'])) ) )
    finally:
        # only close a file opened here, never STDOUT
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """
    Command-line entry point: merges the N-masking of several FASTA files.

    The last FASTA file passed is used as the seed; the remaining files are
    merged into it one at a time.  At every position where a file and the seed
    disagree, an 'N' in the file overwrites the seed residue, while a
    disagreement between two non-N residues only prints a WARNING (the seed
    residue is kept).  The merged sequences are written to -o/--output_file, or
    to STDOUT when that option is omitted.

    Raises:
        Exception: if a merged file contains a sequence ID missing from the seed
            file, or a sequence whose length differs from the seed's.
    """
    parser = argparse.ArgumentParser( description='Merge masked FASTA files')

    ## output file to be written
    parser.add_argument('fasta_files', metavar='N', type=str, nargs='+', help='Pass one or more FASTA files')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        # split explicitly instead of pop()-ing from the list argparse returned
        files_to_merge = list(args.fasta_files)
        seed_file = files_to_merge.pop()

        seqs = biocodeutils.fasta_dict_from_file( seed_file )

        # python strings are immutable, so we need to transform these into lists
        for seq_id in seqs:
            seqs[seq_id]['s'] = list(seqs[seq_id]['s'])

        for fasta_file in files_to_merge:
            new_seqs = biocodeutils.fasta_dict_from_file( fasta_file )

            for seq_id in new_seqs:
                # make sure it exists in the source file
                if seq_id not in seqs:
                    raise Exception("ERROR: Seq ID {0} was found in file {1} but not in the seed file".format(seq_id, fasta_file) )

                merged = seqs[seq_id]['s']
                incoming = new_seqs[seq_id]['s']

                # the sequences themselves must be the same length; len() of the
                # record dicts would only count their keys
                if len(merged) != len(incoming):
                    raise Exception("ERROR: Seq ID {0} was found in {1} and the seed file but had different lengths".format(seq_id, fasta_file))

                # lengths are equal here, so zip() visits every position
                for i, (base, seed_base) in enumerate(zip(incoming, merged)):
                    if base != seed_base:
                        if base == 'N':
                            merged[i] = 'N'
                        elif seed_base != 'N':
                            print("WARNING: Disagreement {0}-{1} at position {2}".format(base, seed_base, i) )

        # now done, print out the results
        for seq_id in seqs:
            ofh.write( ">{0} {1}\n{2}\n".format( seq_id, seqs[seq_id]['h'], biocodeutils.wrapped_fasta(''.join(seqs[seq_id]['s'])) ) )
    finally:
        # only close a file opened here, never STDOUT
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """
    Command-line entry point: re-wraps the sequence lines of a FASTA file.

    Every record of the -i/--input file is written back out with its residues
    wrapped via biocodeutils.wrapped_fasta at -w/--width characters (default
    60).  Output goes to -o/--output, or to STDOUT when that option is omitted.
    """
    cli = argparse.ArgumentParser(
        description='Reformats a FASTA file such that there are no more than -w characters of sequence residues per line.')

    ## command-line options
    cli.add_argument('-i', '--input', type=str, required=True,
                     help='Path to an input FASTA file')
    cli.add_argument('-w', '--width', type=int, required=False, default=60,
                     help='Width - number of residues per line')
    cli.add_argument('-o', '--output', type=str, required=False,
                     help='Output file to be created.  Default = STDOUT')
    options = cli.parse_args()

    ## write to the requested file, falling back to STDOUT
    if options.output is None:
        destination = sys.stdout
    else:
        destination = open(options.output, 'wt')

    records = biocodeutils.fasta_dict_from_file(options.input)

    for record_id in records:
        record = records[record_id]
        wrapped = biocodeutils.wrapped_fasta(record['s'], every=options.width)
        destination.write(">{0} {1}\n{2}\n".format(record_id, record['h'], wrapped))
def write_fasta_results(f, polypeptides):
    """
    Writes each polypeptide in the passed collection to f as a FASTA record.

    'polypeptides' is iterated for IDs and indexed by ID to get each
    polypeptide.  Headers look like this, where the gene, ec and go parts are
    only present when the annotation has them:

    >ID PRODUCT_NAME gene::GENE_SYMBOL ec::EC_NUMBERS go::GO_TERMS
    """
    for polypeptide_id in polypeptides:
        polypeptide = polypeptides[polypeptide_id]
        annotation = polypeptide.annotation

        # comma-separated lists; trailing commas are stripped afterwards
        go_string = "".join(
            "GO:{0},".format(go_annot.go_id)
            for go_annot in annotation.go_annotations).rstrip(',')
        ec_string = "".join(
            "{0},".format(ec_annot.number)
            for ec_annot in annotation.ec_numbers).rstrip(',')

        # collect the space-separated pieces of the header
        header_parts = ["{0} {1}".format(polypeptide_id, annotation.product_name)]

        if annotation.gene_symbol is not None:
            header_parts.append("gene::{0}".format(annotation.gene_symbol))

        if ec_string:
            header_parts.append("ec::{0}".format(ec_string))

        if go_string:
            header_parts.append("go::{0}".format(go_string))

        f.write(">{0}\n".format(" ".join(header_parts)))
        f.write("{0}\n".format(biocodeutils.wrapped_fasta(
            polypeptide.residues)))
Beispiel #11
0
def main():
    """
    Command-line entry point: re-wraps (and optionally upper-cases) a FASTA file.

    Every record of the -i/--input file is written with its residues wrapped
    via biocodeutils.wrapped_fasta at -w/--width characters (default 60).  With
    -uc/--upper_case the residues are converted to upper case first.  Output
    goes to -o/--output, or to STDOUT when that option is omitted.
    """
    cli = argparse.ArgumentParser(
        description='Reformats a FASTA file such that there are no more than -w characters of sequence residues per line.')

    ## command-line options
    cli.add_argument('-i', '--input', type=str, required=True,
                     help='Path to an input FASTA file')
    cli.add_argument('-w', '--width', type=int, required=False, default=60,
                     help='Width - number of residues per line')
    cli.add_argument('-o', '--output', type=str, required=False,
                     help='Output file to be created.  Default = STDOUT')
    cli.add_argument('-uc', '--upper_case', action='store_true', required=False,
                     help='Forces all bases to be upper-case')
    options = cli.parse_args()

    ## write to the requested file, falling back to STDOUT
    if options.output is None:
        destination = sys.stdout
    else:
        destination = open(options.output, 'wt')

    records = biocodeutils.fasta_dict_from_file(options.input)

    for record_id in records:
        record = records[record_id]

        if options.upper_case:
            record['s'] = record['s'].upper()

        wrapped = biocodeutils.wrapped_fasta(record['s'], every=options.width)
        destination.write(">{0} {1}\n{2}\n".format(record_id, record['h'], wrapped))
def main():
    """
    Command-line entry point: checks CDS phases and writes a new GFF3 file.

    Loads the features of the -i/--input_file GFF3, optionally attaches the
    sequences from -g/--genome_fasta, runs check_and_update_phase on every CDS
    of every mRNA and prints each gene as GFF3 to -o/--output_file with column
    2 set to -s/--source.  Assemblies with a length greater than zero are then
    appended in a ##FASTA section.
    """
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-s', '--source', type=str, required=False, default='.', help='Optional.  Sets the value for column 2 in all rows.  Default = .' )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    # the with-block guarantees the output is flushed and closed, even on error
    with open(args.output_file, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        # deal with the FASTA file if the user passed one
        if args.genome_fasta is not None:
            process_assembly_fasta(assemblies, args.genome_fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for CDS in mRNA.CDSs():
                        check_and_update_phase(CDS)

                gene.print_as(fh=fout, source=args.source, format='gff3')

        # the ##FASTA directive is written once, before the first sequence
        fasta_header_written = False

        for assembly_id in assemblies:
            if assemblies[assembly_id].length > 0:
                if fasta_header_written is False:
                    fout.write("##FASTA\n")
                    fasta_header_written = True

                fout.write(">{0}\n".format(assemblies[assembly_id].id) )
                fout.write("{0}\n".format(biocodeutils.wrapped_fasta(assemblies[assembly_id].residues)))
def write_fasta_results( f, polypeptides ):
    """
    Writes one FASTA record per polypeptide to the handle f.

    Each polypeptide is looked up in 'polypeptides' by its ID.  The header
    always starts with the ID and product name; the gene symbol, EC numbers
    and GO terms are appended only when present:

    >ID PRODUCT_NAME gene::GENE_SYMBOL ec::EC_NUMBERS go::GO_TERMS
    """
    for polypeptide_id in polypeptides:
        polypeptide = polypeptides[polypeptide_id]
        annot = polypeptide.annotation

        # each item carries a trailing comma; the final one is stripped off
        go_items = ["GO:{0},".format(go_annot.go_id) for go_annot in annot.go_annotations]
        go_string = "".join(go_items).rstrip(',')

        ec_items = ["{0},".format(ec_annot.number) for ec_annot in annot.ec_numbers]
        ec_string = "".join(ec_items).rstrip(',')

        header = "{0} {1}".format(polypeptide_id, annot.product_name)

        if annot.gene_symbol is not None:
            header += " gene::{0}".format(annot.gene_symbol)

        if ec_string:
            header += " ec::{0}".format(ec_string)

        if go_string:
            header += " go::{0}".format(go_string)

        f.write( ">{0}\n".format( header ) )

        wrapped_residues = biocodeutils.wrapped_fasta(polypeptide.residues)
        f.write( "{0}\n".format( wrapped_residues ) )
def main():
    """
    Command-line entry point: checks CDS phases and writes a new GFF3 file.

    Loads the features of the -i/--input_file GFF3, optionally attaches the
    sequences from -g/--genome_fasta, runs check_and_update_phase on every CDS
    of every mRNA and prints each gene as GFF3 to -o/--output_file with column
    2 set to -s/--source.  Assemblies with a length greater than zero are then
    appended in a ##FASTA section.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence to report/correct phase columns.'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-s',
        '--source',
        type=str,
        required=False,
        default='.',
        help='Optional.  Sets the value for column 2 in all rows.  Default = .'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # opened in a with-block so the file is closed even if processing fails
    with open(args.output_file, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        # deal with the FASTA file if the user passed one
        if args.genome_fasta is not None:
            process_assembly_fasta(assemblies, args.genome_fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for CDS in mRNA.CDSs():
                        check_and_update_phase(CDS)

                gene.print_as(fh=fout, source=args.source, format='gff3')

        # the ##FASTA directive is emitted only once, ahead of the first sequence
        fasta_header_written = False

        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            if assembly.length > 0:
                if not fasta_header_written:
                    fout.write("##FASTA\n")
                    fasta_header_written = True

                fout.write(">{0}\n".format(assembly.id))
                fout.write("{0}\n".format(
                    biocodeutils.wrapped_fasta(assembly.residues)))
Beispiel #15
0
def main():
    """
    Command-line entry point: exports one FASTA record per mRNA of a GFF3 file.

    For every mRNA of every gene the record ID is the mRNA's locus_tag when it
    has one, otherwise its id.  The product name of the first polypeptide that
    has one is appended to the header.  Depending on -t/--type the record body
    is the CDS residues ('cds') or their translation ('protein', the default).
    With --check_ends, warnings about non-canonical start/stop codons are
    written to STDERR.  Output goes to -o/--output_file or STDOUT.
    """
    cli = argparse.ArgumentParser(
        description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## command-line options
    cli.add_argument('-i', '--input_file', type=str, required=True,
                     help='Path to an input GFF3 file to be read')
    cli.add_argument('-o', '--output_file', type=str, required=False,
                     help='Path to an output FASTA file to be created')
    cli.add_argument('-t', '--type', type=str, required=False,
                     default='protein', choices=['protein', 'cds'],
                     help='Type of features to export')
    cli.add_argument(
        '-f', '--fasta', type=str, required=False,
        help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option')
    cli.add_argument('--check_ends', dest='check_ends', action='store_true')
    cli.set_defaults(check_ends=False)
    args = cli.parse_args()

    ## write to the requested file, falling back to STDOUT
    if args.output_file is None:
        fout = sys.stdout
    else:
        fout = open(args.output_file, 'wt')

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # only doing the standard codon table for now
    start_codons = ('ATG', 'GTG', 'TTG')
    stop_codons = ('TAG', 'TAA', 'TGA')

    ## add sequence residues from external FASTA file if the user passed one
    if args.fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.fasta)

    for assembly_id in assemblies:
        for gene in assemblies[assembly_id].genes():
            for mRNA in gene.mRNAs():
                ## the locus tag, when present, replaces the mRNA id in the header
                export_id = mRNA.id if mRNA.locus_tag is None else mRNA.locus_tag

                ## first available gene product name, or None if there isn't one
                export_header = next(
                    (polypeptide.annotation.product_name
                     for polypeptide in mRNA.polypeptides()
                     if polypeptide.annotation is not None
                     and polypeptide.annotation.product_name is not None),
                    None)

                fout.write(">{0}".format(export_id))
                if export_header is None:
                    fout.write("\n")
                else:
                    fout.write(" {0}\n".format(export_header))

                coding_seq = mRNA.get_CDS_residues()

                if args.check_ends:
                    # first three residues
                    start_codon = coding_seq[0:3].upper()
                    if start_codon not in start_codons:
                        sys.stderr.write(
                            "WARN: Non-canonical start codon ({0}) in mRNA {1}\n"
                            .format(start_codon, mRNA.id))

                    # last three residues
                    stop_codon = coding_seq[-3:].upper()
                    if stop_codon not in stop_codons:
                        sys.stderr.write(
                            "WARN: Non-canonical stop codon ({0}) in mRNA {1}\n"
                            .format(stop_codon, mRNA.id))

                if args.type == 'cds':
                    export_residues = coding_seq
                else:
                    export_residues = biocodeutils.translate(coding_seq)

                fout.write("{0}\n".format(
                    biocodeutils.wrapped_fasta(export_residues)))
def main():
    """
    Command-line entry point: converts a GenBank flat file to GFF3.

    Each record of the -i/--input_file becomes an assembly.  Its gene, mRNA,
    tRNA, rRNA and CDS features are built into biothings objects, and every
    gene is printed as GFF3 to -o/--output_file (or STDOUT) once it is
    complete: when the next gene feature is met, or at the end of the record.
    'source' features are ignored; any other feature type is reported on STDOUT
    and counted as skipped.  Unless --no_fasta is passed, a ##FASTA section
    with the assembly sequences is appended when any record had sequence.

    Raises:
        Exception: if a feature has no strand (checked before the feature type
            is examined), or if two RNAs end up with the same generated ID.
    """
    parser = argparse.ArgumentParser( description='Convert GenBank flat files to GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GBK file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    parser.add_argument('--with_fasta', dest='fasta', action='store_true', help='Include the FASTA section with genomic sequence at end of file.  (default)' )
    parser.add_argument('--no_fasta', dest='fasta', action='store_false' )
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    ofh.write("##gff-version 3\n")

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_RNA = None

    # counters used to generate unique IDs for RNAs, CDSs and exons
    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)

    seqs_pending_writes = False

    features_skipped_count = 0

    # each gb_record is a SeqRecord object
    for gb_record in SeqIO.parse(open(args.input_file, "r"), "genbank"):
        mol_id = gb_record.name

        if mol_id not in assemblies:
            assemblies[mol_id] = biothings.Assembly( id=mol_id )

        if len(str(gb_record.seq)) > 0:
            seqs_pending_writes = True
            assemblies[mol_id].residues = str(gb_record.seq)
            assemblies[mol_id].length = len(str(gb_record.seq))

        current_assembly = assemblies[mol_id]
            
        # each feat is a SeqFeature object
        for feat in gb_record.features:
            #print(feat)
            fmin = int(feat.location.start)
            fmax = int(feat.location.end)

            if feat.location.strand == 1:
                strand = '+'
            elif feat.location.strand == -1:
                strand = '-'
            else:
                raise Exception("ERROR: unstranded feature encountered: {0}".format(feat))

            #print("{0} located at {1}-{2} strand:{3}".format( locus_tag, fmin, fmax, strand ) )
            if feat.type == 'source':
                continue
            
            if feat.type == 'gene':
                # print the previous gene (if there is one)
                if current_gene is not None:
                    current_gene.print_as(fh=ofh, source='GenBank', format='gff3')
                
                locus_tag = feat.qualifiers['locus_tag'][0]
                gene = biothings.Gene( id=locus_tag )
                gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                current_gene = gene

            elif feat.type == 'mRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.mRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                
                mRNA = biothings.mRNA( id=feat_id, parent=current_gene )
                mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_mRNA(mRNA)
                current_RNA = mRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'tRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.tRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                
                tRNA = biothings.tRNA( id=feat_id, parent=current_gene )
                tRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_tRNA(tRNA)
                current_RNA = tRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'rRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.rRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                
                rRNA = biothings.rRNA( id=feat_id, parent=current_gene )
                rRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_rRNA(rRNA)
                current_RNA = rRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0
            
            elif feat.type == 'CDS':
                locus_tag = feat.qualifiers['locus_tag'][0]
                exon_count_by_RNA[current_RNA.id] += 1
                cds_id = "{0}.CDS.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                current_CDS_phase = 0
                
                # every part of the location becomes one CDS fragment (all sharing
                # cds_id) plus one exon with its own numbered ID
                for loc in feat.location.parts:
                    subfmin = int(loc.start)
                    subfmax = int(loc.end)
                    
                    CDS = biothings.CDS( id=cds_id, parent=current_RNA )
                    CDS.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand, phase=current_CDS_phase )
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature (in case there is one)
                    # 0 + 6 = 0     TTGCAT
                    # 0 + 7 = 2     TTGCATG
                    # 1 + 6 = 1     TTGCAT
                    # 2 + 7 = 1     TTGCATG
                    # general: 3 - ((length - previous phase) % 3)
                    current_CDS_phase = 3 - (((subfmax - subfmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    exon_id = "{0}.exon.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                    exon = biothings.Exon( id=exon_id, parent=current_RNA )
                    exon.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand )
                    current_RNA.add_exon(exon)
                    exon_count_by_RNA[current_RNA.id] += 1
                
                # NOTE: not used afterwards, but this lookup fails for a CDS
                # without a 'product' qualifier
                product = feat.qualifiers['product'][0]

            else:
                print("WARNING: The following feature was skipped:\n{0}".format(feat))
                features_skipped_count += 1

        # don't forget to do the last gene, if there were any
        if current_gene is not None:
            current_gene.print_as(fh=ofh, source='GenBank', format='gff3')
            # this gene is done; without the reset it would be printed a second
            # time while the next record is processed
            current_gene = None

    if args.fasta is True:
        if seqs_pending_writes is True:
            ofh.write("##FASTA\n")
            for assembly_id in assemblies:
                ofh.write(">{0}\n".format(assembly_id))
                ofh.write("{0}\n".format(biocodeutils.wrapped_fasta(assemblies[assembly_id].residues)))

    if features_skipped_count > 0:
        print("Warning: {0} unsupported feature types were skipped".format(features_skipped_count))
def main():
    """Report mRNAs whose CDS translation contains internal stop codons.

    Loads the features of a GFF3 file (and, optionally, the residues from a
    separate genome FASTA), translates the CDS of every mRNA and counts those
    whose translation still contains a '*' once trailing '*' characters have
    been stripped.  Translations with internal stops can be written to a
    FASTA file (-o) and the first N of them printed to STDOUT (-p).  A
    summary of the totals is always printed at the end.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence report non-terminal internal stops.'
    )

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-p',
        '--print_n_with_stops',
        type=int,
        required=False,
        default=0,
        help=
        'Optional.  Pass the number of sequences with internal stops you want printed (usually for debugging purposes)'
    )
    parser.add_argument(
        '-o',
        '--output_fasta',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output (translated) FASTA file for all those features which had internal stops'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None

    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1

                    is_debug_target = debug_mRNA is not None and mRNA.id == debug_mRNA

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # translate once and reuse the result for both the check
                    # and the reporting below
                    translated_seq = biocodeutils.translate(coding_seq)

                    # terminal stops are expected; only a '*' left after
                    # stripping them marks an internal stop
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assemblies[assembly_id])
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(
                            mRNA.id, assembly_id, loc.fmin + 1, loc.fmax,
                            loc.strand))
                        fasta_out_fh.write("{0}\n".format(
                            biocodeutils.wrapped_fasta(translated_seq)))

                    if is_debug_target:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(
                            translated_seq, mRNA.id))

                    # only the first N offenders are echoed to STDOUT
                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id))
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(
                            translated_seq, mRNA.id))
    finally:
        # make sure buffered FASTA output reaches disk even on failure
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
# Example #18
# 0
def main():
    """Write the protein or CDS sequence of every mRNA in a GFF3 file as FASTA.

    Each record is labelled with the mRNA's locus tag when it has one
    (otherwise its ID), followed by the product name of the first annotated
    polypeptide, if any.  With --check_ends, a warning goes to STDERR for
    every CDS whose first or last codon is not in the standard start/stop
    codon lists.  Output goes to -o, or to STDOUT when -o is omitted.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS seqeunces from a GFF3 file')

    # command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t', '--type', type=str, required=False,
                        default='protein', choices=['protein', 'cds'],
                        help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    # write to the requested file, falling back to STDOUT
    if args.output_file is None:
        fout = sys.stdout
    else:
        fout = open(args.output_file, 'wt')

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # standard codon table only, for now
    start_codons = ['ATG', 'GTG', 'TTG']
    stop_codons = ['TAG', 'TAA', 'TGA']

    # residues may come from a separate FASTA file instead of the GFF3
    if args.fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.fasta)

    for assembly_id in assemblies:
        for gene in assemblies[assembly_id].genes():
            for mRNA in gene.mRNAs():
                # the locus tag, when present, takes precedence over the ID
                export_id = mRNA.id if mRNA.locus_tag is None else mRNA.locus_tag

                # the first polypeptide carrying a product name supplies the header
                export_header = None
                for polypeptide in mRNA.polypeptides():
                    annotation = polypeptide.annotation
                    if annotation is None or annotation.product_name is None:
                        continue
                    export_header = annotation.product_name
                    break

                header_line = ">{0}".format(export_id)
                if export_header is None:
                    header_line += "\n"
                else:
                    header_line += " {0}\n".format(export_header)
                fout.write(header_line)

                coding_seq = mRNA.get_CDS_residues(for_translation=True)

                if args.check_ends:
                    first_codon = coding_seq[0:3].upper()
                    if first_codon not in start_codons:
                        sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(first_codon, mRNA.id))

                    last_codon = coding_seq[-3:].upper()
                    if last_codon not in stop_codons:
                        sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(last_codon, mRNA.id))

                # nucleotides for 'cds', otherwise the translation
                if args.type == 'cds':
                    export_seq = coding_seq
                else:
                    export_seq = biocodeutils.translate(coding_seq)

                fout.write("{0}\n".format(biocodeutils.wrapped_fasta(export_seq)))
def main():
    """Infer transcript orientation from read alignments in a SAM file.

    Every alignment is tallied per reference name (column 3) by mate number,
    taken from the last character of the read name, and by whether SAM flag
    bit 16 (sequence reverse-complemented) is set.  A transcript counts as
    correctly oriented when its 1:T tally exceeds its 1:F tally; every other
    transcript is reverse-complemented when a FASTA is supplied via -fi and
    written to -fo.  Per-transcript tallies and overall totals are printed
    to STDOUT.

    A change of reference name closes the running tally, so alignments for
    the same transcript must be adjacent in the SAM file.

    Raises:
        Exception: if -fi is passed without -fo.
        KeyError: if a read name does not end in '1' or '2'.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here')

    ## command-line options
    parser.add_argument('-s', '--sam_file', type=str, required=True, help='Input SAM file with reads aligned to reference' )
    parser.add_argument('-fi', '--fasta_in', type=str, required=False, help='Path to a FASTA file representing sequences that were aligned against.  If this is passed, you should also pass the -fo argument' )
    parser.add_argument('-fo', '--fasta_out', type=str, required=False, help='If passed along with -fi, the orientation-corrected sequences will be written here.' )
    args = parser.parse_args()
    seqs = dict()

    if args.fasta_in is not None:
        ## validate before parsing so a missing -fo fails immediately
        if args.fasta_out is None:
            raise Exception("ERROR: You must pass a value for -fo if you pass -fi")

        seqs = biocodeutils.fasta_dict_from_file( args.fasta_in )

    total_read_mappings = 0
    last_transcript_id = None
    counts = { '1':{'T':0,'F':0}, '2':{'T':0,'F':0} }
    transcript_count = 0
    correct_orientation_count = 0
    incorrect_orientation_count = 0

    transcripts_to_correct = dict()

    def close_transcript():
        """Classify and report the transcript whose tally has just been completed."""
        nonlocal correct_orientation_count, incorrect_orientation_count

        ## Given an RF library, the 1:T count should outnumber the 1:F one
        if counts['1']['T'] > counts['1']['F']:
            correct_orientation_count += 1
        else:
            incorrect_orientation_count += 1
            transcripts_to_correct[last_transcript_id] = 1

        ## report counts
        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(last_transcript_id, counts['1']['T'], counts['1']['F'], counts['2']['T'], counts['2']['F']))

    with open(args.sam_file) as sam_fh:
        for line in sam_fh:
            if line.startswith('@'): continue

            cols = line.split("\t")
            if len(cols) < 5: continue

            read_dir = cols[0][-1]
            transcript_id = cols[2]
            total_read_mappings += 1

            if int(cols[1]) & 16:
                seq_revcomped = 'T'
            else:
                seq_revcomped = 'F'

            if transcript_id != last_transcript_id:
                transcript_count += 1

                if last_transcript_id is not None:
                    close_transcript()

                ## start a fresh tally for the new transcript
                last_transcript_id = transcript_id
                counts = { '1':{'T':0,'F':0}, '2':{'T':0,'F':0} }

            ## tally every read, including the first one of each transcript
            counts[read_dir][seq_revcomped] += 1

    ## the final transcript has no successor to trigger its evaluation
    if last_transcript_id is not None:
        close_transcript()

    if args.fasta_in is not None:
        with open(args.fasta_out, 'w') as out_fh:
            for seq_id in seqs:
                seq = seqs[seq_id]

                if seq_id in transcripts_to_correct:
                    seq['s'] = biocodeutils.reverse_complement(seq['s'])

                out_fh.write(">{0} {2}\n{1}\n".format(seq_id, biocodeutils.wrapped_fasta(seq['s']), seq['h']))

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(correct_orientation_count))
    print("Transcripts in reverse orientation: {0}".format(incorrect_orientation_count))
def main():
    """Mask long homopolymer runs in a FASTA file with N characters.

    Every run of one repeated character that is longer than
    --homopolymer_length_limit, and is not already made of 'N', is replaced
    by the same number of 'N' characters.  A warning with the 1-based start
    position is written to STDERR for each replaced run.  The masked
    sequences are written as FASTA to -o, or to STDOUT when -o is omitted.
    """
    parser = argparse.ArgumentParser(
        description='Replaces long homopolymeric stretches with N characters')

    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument(
        '-hll',
        '--homopolymer_length_limit',
        type=int,
        required=True,
        help=
        'Stretches of non-N residues longer than this will be replaced with Ns'
    )
    args = parser.parse_args()

    if args.output is None:
        out_fh = sys.stdout
    else:
        out_fh = open(args.output, 'wt')

    try:
        sys.stderr.write("INFO: Parsing input FASTA\n")
        sys.stderr.flush()
        seqs = biocodeutils.fasta_dict_from_file(args.input)

        sys.stderr.write(
            "INFO: Looking for homopolymeric runs > {0} bp\n".format(
                args.homopolymer_length_limit))
        sys.stderr.flush()

        for seq_id in seqs:
            original_seq = seqs[seq_id]['s']

            # Masking never changes the length, so indexes taken from the
            # original sequence stay valid in the masked copy.
            current_seq = original_seq
            current_homopolymer_base = None
            current_homopolymer_length = 0
            current_homopolymer_start_idx = 0

            for base_index, base in enumerate(original_seq):
                if base == current_homopolymer_base:
                    current_homopolymer_length += 1
                    continue

                # a different base closes the run that was being tracked
                if current_homopolymer_length > args.homopolymer_length_limit and current_homopolymer_base != 'N':
                    sys.stderr.write(
                        "WARNING: Replacing {3} bp of {2}s in Sequence ID {0} starting at position {1}\n"
                        .format(seq_id, current_homopolymer_start_idx + 1,
                                current_homopolymer_base,
                                current_homopolymer_length))
                    sys.stderr.flush()

                    # build on the already-masked copy so that earlier
                    # replacements in this sequence are kept
                    current_seq = "{0}{1}{2}".format(
                        current_seq[0:current_homopolymer_start_idx],
                        'N' * current_homopolymer_length,
                        current_seq[base_index:])

                current_homopolymer_base = base
                current_homopolymer_length = 1
                current_homopolymer_start_idx = base_index

            ## check after the last base for any run which terminates the sequence
            if current_homopolymer_length > args.homopolymer_length_limit and current_homopolymer_base != 'N':
                sys.stderr.write(
                    "WARNING: Replacing {3} bp of {2} bases in Sequence ID {0} starting at position {1}\n"
                    .format(seq_id, current_homopolymer_start_idx + 1,
                            current_homopolymer_base,
                            current_homopolymer_length))
                sys.stderr.flush()

                # the run extends to the end, so nothing follows the Ns
                current_seq = "{0}{1}".format(
                    current_seq[0:current_homopolymer_start_idx],
                    'N' * current_homopolymer_length)

            seqs[seq_id]['s'] = current_seq
            out_fh.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            out_fh.write(biocodeutils.wrapped_fasta(seqs[seq_id]['s']))
            out_fh.write("\n")
    finally:
        # never close STDOUT; only a file this function opened itself
        if out_fh is not sys.stdout:
            out_fh.close()
def main():
    """Export each gene's sequence as FASTA with its introns in lower case.

    The gene residues are upper-cased, then every intron of the gene's single
    mRNA is lower-cased in place.  With '-t CDS' the sequence is additionally
    trimmed to the span between the lowest CDS fmin and the highest CDS fmax.
    Genes with more than one mRNA are skipped with a message on STDOUT.
    Records are labelled with the gene's locus tag, or its ID when it has
    none, and written to -o (STDOUT when omitted).
    """
    parser = argparse.ArgumentParser(description='')

    # command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output file to be created')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help="Required if you don't have GFF3 with embedded FASTA")
    parser.add_argument('-t', '--type', type=str, required=False,
                        default='mRNA', choices=['mRNA', 'CDS'],
                        help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # set to a gene ID to restrict processing to that gene and print debug
    # statements; None disables it
    debugging_gene = None

    # attach residues from the external FASTA to the matching assemblies
    if args.fasta is not None:
        fasta_entries = biocodeutils.fasta_dict_from_file(args.fasta)
        for seq_id in fasta_entries:
            if seq_id not in assemblies:
                continue
            target = assemblies[seq_id]
            target.residues = fasta_entries[seq_id]['s']
            target.length = len(target.residues)

    # write to the requested file, falling back to STDOUT
    if args.output_file is None:
        ofh = sys.stdout
    else:
        ofh = open(args.output_file, 'wt')

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            debug_mode = debugging_gene is not None
            if debug_mode and gene.id != debugging_gene:
                continue

            gene_label = gene.id if gene.locus_tag is None else gene.locus_tag

            gene_seq = gene.get_residues().upper()
            gene_loc = gene.location_on(assembly)
            on_reverse_strand = gene_loc.strand == -1

            # flip reverse-strand genes so the assembly-based offsets used
            # below index into the string correctly
            if on_reverse_strand:
                gene_seq = gene_seq[::-1]

            if debug_mode:
                print("INFO: Processing gene with length {0} at {1}-{2}".format(len(gene_seq), gene_loc.fmin, gene_loc.fmax))

            transcripts = gene.mRNAs()
            if len(transcripts) > 1:
                print("ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)".format(gene.id))
                continue

            # assembly coordinate of the gene's first base; subtracting it
            # turns assembly coordinates into indexes within gene_seq
            offset = gene_loc.fmin

            for mRNA in transcripts:
                for intron in mRNA.introns(on=assembly):
                    intron_loc = intron.location_on(assembly)
                    rel_start = intron_loc.fmin - offset
                    rel_end = intron_loc.fmax - offset
                    lower_mid = gene_seq[rel_start:rel_end].lower()
                    gene_seq = gene_seq[:rel_start] + lower_mid + gene_seq[rel_end:]

                    if debug_mode:
                        print("INFO:\tfound intron at {0}-{1}".format(intron_loc.fmin, intron_loc.fmax))
                        print("INFO:\tlower-casing offset adjusted coordinates: {0}-{1}".format(rel_start, rel_end))
                        print("INFO:\tgenerating lower case seq of length: {0}\n".format(len(lower_mid)))

                if debug_mode:
                    print("INFO: seq length before CDS processing is: {0}".format(len(gene_seq)))

                # trimming only applies when the CDS range was requested
                if args.type != 'CDS':
                    continue

                CDSs = sorted(mRNA.CDSs())
                CDS_min = CDSs[0].location_on(assembly).fmin
                CDS_max = CDSs[-1].location_on(assembly).fmax

                if debug_mode:
                    print("INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}".format(CDS_max, CDS_min, CDS_max - CDS_min))

                # nothing to trim when the CDS already spans the whole gene
                if gene_loc.fmin == CDS_min and gene_loc.fmax == CDS_max:
                    continue

                fmin_chomp = CDS_min - offset
                fmax_chomp = gene_loc.fmax - CDS_max

                if debug_mode:
                    print("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(
                        gene.id, gene_loc.fmin, gene_loc.fmax, gene_loc.strand,
                        CDS_min, CDS_max))
                    print("\tfmin_chomp:{0}, fmax_chomp:{1}".format(fmin_chomp, fmax_chomp))
                    print("\tpulling range: gene_seq[{0} : {1}]".format(fmin_chomp, len(gene_seq) - fmax_chomp))

                gene_seq = gene_seq[fmin_chomp : len(gene_seq) - fmax_chomp]

                if debug_mode:
                    print("\tGene {0} CDS seq: {1}".format(gene.id, gene_seq))

            # undo the earlier flip before writing
            if on_reverse_strand:
                gene_seq = gene_seq[::-1]

            ofh.write(">{0}\n{1}\n".format(gene_label, biocodeutils.wrapped_fasta(gene_seq)))
def main():
    """Convert a GenBank flat file to GFF3.

    Each GenBank record becomes an assembly.  'gene', 'mRNA', 'tRNA', 'rRNA'
    and 'CDS' features are turned into a gene graph that is printed as GFF3
    one gene at a time; 'source' features are ignored and any other feature
    type is skipped with a warning on STDOUT.  A CDS met before any RNA of
    its gene gets an mRNA created for it.  Unless --no_fasta is given, the
    record sequences are appended in a ##FASTA section.

    Raises:
        Exception: for a feature without a strand, for two RNAs that end up
            with the same ID, or for an RNA/CDS feature that appears before
            any gene feature of its record.
    """
    parser = argparse.ArgumentParser( description='Convert GenBank flat files to GFF3 format')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GBK file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    parser.add_argument('--with_fasta', dest='fasta', action='store_true', help='Include the FASTA section with genomic sequence at end of file.  (default)' )
    parser.add_argument('--no_fasta', dest='fasta', action='store_false' )
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    ofh.write("##gff-version 3\n")

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_RNA = None

    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)

    seqs_pending_writes = False

    features_skipped_count = 0

    with open(args.input_file, "r") as gbk_fh:
        # each gb_record is a SeqRecord object
        for gb_record in SeqIO.parse(gbk_fh, "genbank"):
            mol_id = gb_record.name

            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly( id=mol_id )

            residues = str(gb_record.seq)
            if len(residues) > 0:
                seqs_pending_writes = True
                assemblies[mol_id].residues = residues
                assemblies[mol_id].length = len(residues)

            current_assembly = assemblies[mol_id]

            # each feat is a SeqFeature object
            for feat in gb_record.features:
                fmin = int(feat.location.start)
                fmax = int(feat.location.end)

                if feat.location.strand == 1:
                    strand = '+'
                elif feat.location.strand == -1:
                    strand = '-'
                else:
                    raise Exception("ERROR: unstranded feature encountered: {0}".format(feat))

                if feat.type == 'source':
                    continue

                if feat.type == 'gene':
                    # print the previous gene (if there is one)
                    if current_gene is not None:
                        current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

                    locus_tag = feat.qualifiers['locus_tag'][0]
                    current_gene = biothings.Gene( id=locus_tag )
                    current_gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    current_RNA = None

                elif feat.type in ('mRNA', 'tRNA', 'rRNA'):
                    if current_gene is None:
                        raise Exception("ERROR: {0} feature found before any gene feature: {1}".format(feat.type, feat))

                    locus_tag = feat.qualifiers['locus_tag'][0]
                    rna_count_by_gene[locus_tag] += 1
                    feat_id = "{0}.{1}.{2}".format( locus_tag, feat.type, rna_count_by_gene[locus_tag] )

                    # the biothings class and the Gene.add_* method are both
                    # named after the feature type (mRNA, tRNA, rRNA)
                    RNA = getattr(biothings, feat.type)( id=feat_id, parent=current_gene )
                    RNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    getattr(current_gene, 'add_' + feat.type)(RNA)
                    current_RNA = RNA

                    if feat_id in exon_count_by_RNA:
                        raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                    exon_count_by_RNA[feat_id] = 0

                elif feat.type == 'CDS':
                    if current_gene is None:
                        raise Exception("ERROR: {0} feature found before any gene feature: {1}".format(feat.type, feat))

                    locus_tag = feat.qualifiers['locus_tag'][0]
                    # If processing a prokaryotic GBK, we'll encounter CDS before mRNA, so we have to
                    #  manually make one
                    if current_RNA is None:
                        feat_id = "{0}.mRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                        mRNA = biothings.mRNA( id=feat_id, parent=current_gene )
                        mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA

                    exon_count_by_RNA[current_RNA.id] += 1
                    cds_id = "{0}.CDS.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                    current_CDS_phase = 0

                    # one CDS and one exon segment per part of the location
                    for loc in feat.location.parts:
                        subfmin = int(loc.start)
                        subfmax = int(loc.end)

                        CDS = biothings.CDS( id=cds_id, parent=current_RNA )
                        CDS.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand, phase=current_CDS_phase )
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature (in case there is one)
                        # 0 + 6 = 0     TTGCAT
                        # 0 + 7 = 2     TTGCATG
                        # 1 + 6 = 1     TTGCAT
                        # 2 + 7 = 1     TTGCATG
                        # general: 3 - ((length - previous phase) % 3), with 3 wrapping to 0
                        current_CDS_phase = (3 - (((subfmax - subfmin) - current_CDS_phase) % 3)) % 3

                        exon_id = "{0}.exon.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                        exon = biothings.Exon( id=exon_id, parent=current_RNA )
                        exon.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand )
                        current_RNA.add_exon(exon)
                        exon_count_by_RNA[current_RNA.id] += 1

                else:
                    print("WARNING: The following feature was skipped:\n{0}".format(feat))
                    features_skipped_count += 1

            # don't forget to do the last gene of this record, if there were any
            if current_gene is not None:
                current_gene.print_as(fh=ofh, source='GenBank', format='gff3')
                # Reset, otherwise the first gene of the next record would
                # print this one a second time.
                current_gene = None
                current_RNA = None

    if args.fasta and seqs_pending_writes:
        ofh.write("##FASTA\n")
        for assembly_id in assemblies:
            ofh.write(">{0}\n".format(assembly_id))
            ofh.write("{0}\n".format(biocodeutils.wrapped_fasta(assemblies[assembly_id].residues)))

    # never close STDOUT; only a file this function opened itself
    if ofh is not sys.stdout:
        ofh.close()

    if features_skipped_count > 0:
        print("Warning: {0} unsupported feature types were skipped".format(features_skipped_count))
def main():
    """Infer transcript orientation from read alignments in a SAM file.

    Every alignment is tallied per reference name (column 3) by mate number,
    taken from the last character of the read name, and by whether SAM flag
    bit 16 (sequence reverse-complemented) is set.  A transcript counts as
    correctly oriented when its 1:T tally exceeds its 1:F tally; every other
    transcript is reverse-complemented when a FASTA is supplied via -fi and
    written to -fo.  Per-transcript tallies and overall totals are printed
    to STDOUT.

    A change of reference name closes the running tally, so alignments for
    the same transcript must be adjacent in the SAM file.

    Raises:
        Exception: if -fi is passed without -fo.
        KeyError: if a read name does not end in '1' or '2'.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')

    ## command-line options
    parser.add_argument('-s',
                        '--sam_file',
                        type=str,
                        required=True,
                        help='Input SAM file with reads aligned to reference')
    parser.add_argument(
        '-fi',
        '--fasta_in',
        type=str,
        required=False,
        help=
        'Path to a FASTA file representing sequences that were aligned against.  If this is passed, you should also pass the -fo argument'
    )
    parser.add_argument(
        '-fo',
        '--fasta_out',
        type=str,
        required=False,
        help=
        'If passed along with -fi, the orientation-corrected sequences will be written here.'
    )
    args = parser.parse_args()
    seqs = dict()

    if args.fasta_in is not None:
        ## validate before parsing so a missing -fo fails immediately
        if args.fasta_out is None:
            raise Exception(
                "ERROR: You must pass a value for -fo if you pass -fi")

        seqs = biocodeutils.fasta_dict_from_file(args.fasta_in)

    total_read_mappings = 0
    last_transcript_id = None
    counts = {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}
    transcript_count = 0
    correct_orientation_count = 0
    incorrect_orientation_count = 0

    transcripts_to_correct = dict()

    def close_transcript():
        """Classify and report the transcript whose tally has just been completed."""
        nonlocal correct_orientation_count, incorrect_orientation_count

        ## Given an RF library, the 1:T count should outnumber the 1:F one
        if counts['1']['T'] > counts['1']['F']:
            correct_orientation_count += 1
        else:
            incorrect_orientation_count += 1
            transcripts_to_correct[last_transcript_id] = 1

        ## report counts
        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(
            last_transcript_id, counts['1']['T'], counts['1']['F'],
            counts['2']['T'], counts['2']['F']))

    with open(args.sam_file) as sam_fh:
        for line in sam_fh:
            if line.startswith('@'): continue

            cols = line.split("\t")
            if len(cols) < 5: continue

            read_dir = cols[0][-1]
            transcript_id = cols[2]
            total_read_mappings += 1

            if int(cols[1]) & 16:
                seq_revcomped = 'T'
            else:
                seq_revcomped = 'F'

            if transcript_id != last_transcript_id:
                transcript_count += 1

                if last_transcript_id is not None:
                    close_transcript()

                ## start a fresh tally for the new transcript
                last_transcript_id = transcript_id
                counts = {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}

            ## tally every read, including the first one of each transcript
            counts[read_dir][seq_revcomped] += 1

    ## the final transcript has no successor to trigger its evaluation
    if last_transcript_id is not None:
        close_transcript()

    if args.fasta_in is not None:
        with open(args.fasta_out, 'w') as out_fh:
            for seq_id in seqs:
                seq = seqs[seq_id]

                if seq_id in transcripts_to_correct:
                    seq['s'] = biocodeutils.reverse_complement(seq['s'])

                out_fh.write(">{0} {2}\n{1}\n".format(
                    seq_id, biocodeutils.wrapped_fasta(seq['s']), seq['h']))

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(
        correct_orientation_count))
    print("Transcripts in reverse orientation: {0}".format(
        incorrect_orientation_count))
def main():
    """Write one FASTA record per gene with intron residues in lower case.

    Genes are read from the GFF3 file given by -i/--input_file.  When
    -f/--fasta is supplied, its sequences are attached to the assemblies
    that share the same ID.  For every gene the residues are upper-cased,
    the spans covered by the introns of its mRNA are lower-cased and, with
    ``--type CDS``, the sequence is trimmed to the range running from the
    lowest CDS fmin to the highest CDS fmax.  Records are written to
    -o/--output_file or, when that is omitted, to STDOUT.

    Genes with more than one mRNA are skipped.  All diagnostics (the skip
    message and the optional debug trace) go to STDERR so that they can
    never be interleaved with FASTA records written to STDOUT.
    """
    parser = argparse.ArgumentParser(description='')

    ## input GFF3 to be read
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    ## output file to be written
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-f',
        '--fasta',
        type=str,
        required=False,
        help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        required=False,
                        default='mRNA',
                        choices=['mRNA', 'CDS'],
                        help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, _features) = biocodegff.get_gff3_features(args.input_file)

    # set this to None if you don't want the debug print statements
    #debugging_gene = 'D9AE6116893A0D5711D56C0F1E6CF58C'
    debugging_gene = None
    debug_mode = debugging_gene is not None

    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                # in debug mode only the selected gene is processed
                if debug_mode and gene.id != debugging_gene:
                    continue

                if gene.locus_tag is None:
                    gene_label = gene.id
                else:
                    gene_label = gene.locus_tag

                gene_seq = gene.get_residues().upper()
                gene_loc = gene.location_on(assembly)

                ## we have to do this here because of the coordinates: the
                ## slicing below indexes from gene_loc.fmin, so a reverse
                ## strand sequence is flipped first and flipped back at the end
                if gene_loc.strand == -1:
                    gene_seq = gene_seq[::-1]

                if debug_mode:
                    print(
                        "INFO: Processing gene with length {0} at {1}-{2}".format(
                            len(gene_seq), gene_loc.fmin, gene_loc.fmax),
                        file=sys.stderr)

                mRNAs = gene.mRNAs()

                if len(mRNAs) > 1:
                    #raise Exception("ERROR: script doesn't currently support multi-isoform genes, but found one: {0}".format(gene.id))
                    print(
                        "ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)"
                        .format(gene.id),
                        file=sys.stderr)
                    continue

                for mRNA in mRNAs:
                    introns = mRNA.introns(on=assembly)

                    # this helps us get where the intron is on the gene
                    offset = gene_loc.fmin

                    for intron in introns:
                        intron_loc = intron.location_on(assembly)
                        intron_start = intron_loc.fmin - offset
                        intron_end = intron_loc.fmax - offset

                        # splice a lower-cased copy of the intron span back in
                        lower_mid = gene_seq[intron_start:intron_end].lower()
                        gene_seq = (gene_seq[:intron_start] + lower_mid +
                                    gene_seq[intron_end:])

                        if debug_mode:
                            print("INFO:\tfound intron at {0}-{1}".format(
                                intron_loc.fmin, intron_loc.fmax),
                                  file=sys.stderr)
                            print(
                                "INFO:\tlower-casing offset adjusted coordinates: {0}-{1}"
                                .format(intron_start, intron_end),
                                file=sys.stderr)
                            print(
                                "INFO:\tgenerating lower case seq of length: {0}\n"
                                .format(len(lower_mid)),
                                file=sys.stderr)

                    if debug_mode:
                        print("INFO: seq length before CDS processing is: {0}".
                              format(len(gene_seq)),
                              file=sys.stderr)

                    ## do we need to trim down to the CDS range?
                    if args.type == 'CDS':
                        # NOTE(review): relies on CDS objects being orderable
                        # (presumably by coordinate -- confirm), and an mRNA
                        # with no CDS features raises IndexError here.
                        CDSs = sorted(mRNA.CDSs())
                        CDS_min = CDSs[0].location_on(assembly).fmin
                        CDS_max = CDSs[-1].location_on(assembly).fmax

                        if debug_mode:
                            print(
                                "INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}"
                                .format(CDS_max, CDS_min, CDS_max - CDS_min),
                                file=sys.stderr)

                        if gene_loc.fmin != CDS_min or gene_loc.fmax != CDS_max:
                            # number of residues to drop from each end
                            fmin_chomp = CDS_min - offset
                            fmax_chomp = gene_loc.fmax - CDS_max

                            if debug_mode:
                                print(
                                    "gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}"
                                    .format(gene.id, gene_loc.fmin,
                                            gene_loc.fmax, gene_loc.strand,
                                            CDS_min, CDS_max),
                                    file=sys.stderr)

                                print("\tfmin_chomp:{0}, fmax_chomp:{1}".format(
                                    fmin_chomp, fmax_chomp),
                                      file=sys.stderr)
                                print(
                                    "\tpulling range: gene_seq[{0} : {1}]".format(
                                        fmin_chomp,
                                        len(gene_seq) - fmax_chomp),
                                    file=sys.stderr)

                            # an explicit end index (rather than -fmax_chomp)
                            # keeps the slice correct when fmax_chomp is 0
                            gene_seq = gene_seq[fmin_chomp:len(gene_seq) -
                                                fmax_chomp]

                            if debug_mode:
                                print("\tGene {0} CDS seq: {1}".format(
                                    gene.id, gene_seq),
                                      file=sys.stderr)

                ## make sure to switch it back
                if gene_loc.strand == -1:
                    gene_seq = gene_seq[::-1]

                #print("INFO: Got gene with length {0} after modification".format(len(gene_seq)))
                ofh.write(">{0}\n{1}\n".format(
                    gene_label, biocodeutils.wrapped_fasta(gene_seq)))
    finally:
        # close (and thereby flush) a file we opened ourselves; STDOUT is
        # left alone
        if ofh is not sys.stdout:
            ofh.close()