Beispiel #1
0
def main():
    """Parse the command line and load the reference and alignment GFF3 files.

    Both GFF3 inputs are read with ``gff.get_gff3_features``; the returned
    assembly and feature collections are kept in local variables.
    """
    cli = argparse.ArgumentParser(
        description='Reports statistics of reference gene coverage and extension by aligned RNA-seq transcript data.')

    # command-line options
    cli.add_argument(
        '-r', '--reference_file', type=str, required=True,
        help='GFF3 file of a reference annotation')
    cli.add_argument(
        '-q', '--alignment_file', type=str, required=True,
        help='GFF3 file with RNA-seq assembly transcript features aligned to the same reference genome.  Usually with something like GMAP.')
    cli.add_argument(
        '-o', '--output_file', type=str, required=True,
        help='Path to an output file to be created')
    opts = cli.parse_args()

    ref_assemblies, ref_features = gff.get_gff3_features(opts.reference_file)
    qry_assemblies, qry_features = gff.get_gff3_features(opts.alignment_file)
def main():
    """Split genes with several mRNA children into one gene per mRNA.

    Reads the GFF3 named by --input_file and writes GFF3 to --output_file, or
    to STDOUT when no output file is given.  For every gene, each mRNA after
    the first is removed from the gene, attached to a new gene with the ID
    ``<gene id>_<n>`` located at that mRNA's coordinates, and printed.  A gene
    that had more than one mRNA is then resized to the coordinates of its
    first mRNA before being printed itself.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks for genes with multiple mRNA children and creates new genes for each.'
    )

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        for assembly_id in assemblies:
            current_assembly = assemblies[assembly_id]

            for gene in current_assembly.genes():
                # Iterate over a snapshot: remove_mRNA() is called inside the
                # loop, and if mRNAs() hands back the gene's own child list,
                # removing while iterating would silently skip entries.
                mRNAs = list(gene.mRNAs())

                for rnas_found, mRNA in enumerate(mRNAs, start=1):
                    if rnas_found == 1:
                        # the first mRNA stays with the original gene
                        continue

                    mRNA_loc = mRNA.location_on(current_assembly)
                    gene.remove_mRNA(mRNA)

                    # Diagnostics go to STDERR so they can never end up inside
                    # the GFF3 stream when the output is STDOUT.
                    print("INFO: splitting mRNA off gene {0}".format(gene.id),
                          file=sys.stderr)
                    new_gene = things.Gene(
                        id="{0}_{1}".format(gene.id, rnas_found))
                    new_gene.locate_on(target=current_assembly,
                                       fmin=mRNA_loc.fmin,
                                       fmax=mRNA_loc.fmax,
                                       strand=mRNA_loc.strand)
                    new_gene.add_RNA(mRNA)
                    new_gene.print_as(fh=ofh, format='gff3')

                if len(mRNAs) > 1:
                    # shrink the original gene to its single remaining mRNA
                    gene_loc = gene.location_on(current_assembly)
                    mRNA_loc = mRNAs[0].location_on(current_assembly)
                    gene_loc.fmin = mRNA_loc.fmin
                    gene_loc.fmax = mRNA_loc.fmax
                    gene_loc.strand = mRNA_loc.strand

                gene.print_as(fh=ofh, format='gff3')
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Randomly split the mRNAs of a GFF3 annotation into training and evaluation sets.

    Every mRNA of every gene in --input_file is a candidate unless it has more
    exons than --max_exon_count.  --training_set_size and
    --evaluation_set_size candidates are then drawn at random, without
    overlap, and handed to ``export_mRNAs_to_file`` for the two output files.

    Raises:
        Exception: if --retain_composition is passed (not implemented yet), or
            if there are fewer candidate mRNAs than the two set sizes combined.
        ValueError: if either set size is negative.
    """
    parser = argparse.ArgumentParser( description='Split an annotation GFF3 into training and evaluation sets')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-ot', '--output_training_file', type=str, required=True, help='GFF3 file to be created with the training genes' )
    parser.add_argument('-oe', '--output_evaluation_file', type=str, required=True, help='GFF3 file to be created with the evaluation genes' )
    parser.add_argument('-ts', '--training_set_size', type=int, required=False, default=200, help='Number of transcripts to select for training' )
    parser.add_argument('-es', '--evaluation_set_size', type=int, required=False, default=100, help='Number of transcripts to select for evaluation' )
    parser.add_argument('-me', '--max_exon_count', type=int, required=False, help='Skips any mRNAs with more exons than this' )
    parser.add_argument('--retain_composition', dest='retain_composition',action='store_true')
    parser.add_argument('--no_retain_composition', dest='retain_composition',action='store_false')
    parser.set_defaults(retain_composition=False)
    args = parser.parse_args()

    # Composition-aware sampling (grouping mRNAs by exon count) is not written
    # yet, so everything below only handles the plain random split.
    if args.retain_composition is True:
        raise Exception("ERROR: --retain_composition option not yet implemented")

    if args.training_set_size < 0 or args.evaluation_set_size < 0:
        raise ValueError("ERROR: training_set_size and evaluation_set_size must not be negative")

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # all mRNAs that pass the optional exon-count filter
    mRNAs = list()

    for asm_id in assemblies:
        for gene in assemblies[asm_id].genes():
            for mRNA in gene.mRNAs():
                exon_count = mRNA.exon_count()

                if args.max_exon_count is None or exon_count <= args.max_exon_count:
                    mRNAs.append(mRNA)

    mRNA_count = len(mRNAs)

    # sanity check on the number of available mRNAs
    if (args.training_set_size + args.evaluation_set_size) > mRNA_count:
        raise Exception("ERROR: acceptable mRNA count ({0}) is less than combined training_set_size ({1}) and evaluation_set_size ({2}) options".format(mRNA_count, args.training_set_size, args.evaluation_set_size) )

    # One draw without replacement, cut in two, gives two disjoint random
    # subsets.  This avoids building sets of mRNA objects, whose iteration
    # order is arbitrary and made the evaluation pool order unpredictable.
    selected = random.sample( mRNAs, args.training_set_size + args.evaluation_set_size )
    training_mRNAs = selected[:args.training_set_size]
    evaluation_mRNAs = selected[args.training_set_size:]

    export_mRNAs_to_file(training_mRNAs, args.output_training_file)
    export_mRNAs_to_file(evaluation_mRNAs, args.output_evaluation_file)
Beispiel #4
0
def main():
    """Write a GFF3 in which every mRNA isoform has its own gene model.

    Genes with a single mRNA (or none) are printed unchanged.  For a gene with
    several mRNAs, one new gene per mRNA is created with the ID
    ``<original gene id>_<n>`` (n counting from 1) and the coordinates of that
    mRNA; the original multi-isoform gene itself is not printed.
    """
    parser = argparse.ArgumentParser( description='Splits all GFF3 mRNA isoforms into their own gene models')

    ## Get the variables
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Input GFF3 file' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Output GFF3 file' )
    args = parser.parse_args()

    # the context manager guarantees the output is flushed and closed
    with open(args.output_file, 'wt') as ofh:
        print("INFO: Parsing GFF3 features\n")
        (assemblies, ref_features) = gff.get_gff3_features(args.input_file)

        print("INFO: Finding genes with isoforms and splitting them\n")
        ofh.write("##gff-version 3\n")
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                mRNAs = gene.mRNAs()

                # only changing the gene features with isoforms
                if len(mRNAs) > 1:
                    # Capture the ID once.  The loop assigns to mRNA.parent.id,
                    # and if that parent is this very gene object
                    # (NOTE(review): likely, confirm in things.py) re-reading
                    # gene.id would compound suffixes: gene_1, gene_1_2, ...
                    base_gene_id = str(gene.id)

                    for counter, mRNA in enumerate(mRNAs, start=1):
                        new_gene_id = base_gene_id + "_" + str(counter)
                        mRNA_loc = mRNA.location()
                        print("Splitting " + base_gene_id)
                        # create a new gene model, correcting the gene coords to the mRNA coords
                        new_gene = things.Gene(id = new_gene_id)
                        new_gene.locate_on( target=assemblies[assembly_id], fmin=mRNA_loc.fmin, fmax=mRNA_loc.fmax, strand=mRNA_loc.strand )
                        mRNA.parent.id = new_gene_id
                        # now add the mRNA to the gene model
                        new_gene.add_mRNA(mRNA)
                        # print out the new gene model
                        new_gene.print_as(fh=ofh, source='IGS', format='gff3')
                else:
                    gene.print_as(fh=ofh, source='IGS', format='gff3')
def main():
    """Convert the GO annotations of a GFF3 file into a GAF 2.0 file.

    One GAF row is written per GO annotation found on each polypeptide of each
    mRNA.  The aspect column is looked up in the table returned by
    ``parse_go_file``.  The annotation date defaults to the modification time
    of the input GFF3 and the assigned-by column defaults to --database.

    Raises:
        Exception: if a GO ID used in the GFF3 is missing from the GO file.
    """
    parser = argparse.ArgumentParser( description='Converts GFF3 files to GO Gene Association Format (GAF)')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-go', '--go_file', type=str, required=True, help='Gene Ontology (GO) file' )
    parser.add_argument('-db', '--database', type=str, required=True, help='Database issuing that IDs.  Example: UniProtKB' )
    parser.add_argument('-dbref', '--db_reference', type=str, required=True, help='DB reference, like PMID:2676709 (column 6)' )
    parser.add_argument('-ec', '--evidence_code', type=str, required=False, default='IEA', help='Like IEA (column 7)' )
    parser.add_argument('-t', '--taxon_id', type=int, required=True, help='NCBI taxon ID (column 13)' )
    parser.add_argument('-ad', '--annotation_date', type=str, required=False, help='Annotation date in YYYYMMDD format.  Default = GFF3 file datestamp' )
    parser.add_argument('-ab', '--assign_by', type=str, required=False, help='Assign by (column 15)  Defaults to --database argument value' )
    args = parser.parse_args()

    print("INFO: Parsing GFF3 objects", file=sys.stderr)
    (assemblies, features) = gff.get_gff3_features(args.input_file)

    print("INFO: Parsing GO file", file=sys.stderr)
    go_lookup = parse_go_file(args.go_file)

    annot_date = args.annotation_date
    if annot_date is None:
        annot_date = time.strftime('%Y%m%d', time.gmtime(os.path.getmtime(args.input_file)))

    assign_by = args.assign_by
    if assign_by is None:
        assign_by = args.database

    with open(args.output_file, 'wt') as ofh:
        ofh.write("!gaf-version: 2.0\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for polypeptide in mRNA.polypeptides():
                        annot = polypeptide.annotation

                        # Missing values must become empty columns; formatting
                        # None directly would write the literal text "None".
                        product = '' if annot.product_name is None else annot.product_name
                        gene_sym = '' if annot.gene_symbol is None else annot.gene_symbol

                        for go_annot in annot.go_annotations:
                            go_id = "GO:{0}".format(go_annot.go_id)

                            if go_id not in go_lookup:
                                raise Exception("ERROR: GO ID {0} not found in provided go.obo file".format(go_id))

                            # Aspect is F, P or C, depending on which component/ontology the term comes from
                            ofh.write("{0}\t{1}\t{1}\t\t{2}\t{3}\t{4}\t\t{5}\t{6}"
                                      "\t{7}\tprotein\ttaxon:{8}\t{9}\t{10}\t"
                                      "\t\n".format(args.database, polypeptide.id, go_id, args.db_reference,
                                                    args.evidence_code, go_lookup[go_id], product, gene_sym,
                                                    args.taxon_id, annot_date, assign_by))

    print("INFO: Conversion complete.", file=sys.stderr)
def main():
    """Report mRNAs whose translated CDS contains internal stop codons.

    Each mRNA's CDS residues are translated with ``utils.translate``; a ``*``
    anywhere other than the trailing end counts as an internal stop.  Totals
    are printed at the end.  Optionally the offending translations are written
    to --output_fasta and the first --print_n_with_stops of them are printed.
    """
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence report non-terminal internal stops.')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0, help='Optional.  Pass the number of sequences with internal stops you want printed (usually for debugging purposes)' )
    parser.add_argument('-o', '--output_fasta', type=str, required=False, help='Optional.  Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None

    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    is_debug_target = debug_mRNA is not None and mRNA.id == debug_mRNA

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # translate once and reuse the result for test and report
                    translated_seq = utils.translate(coding_seq)

                    # a trailing '*' is the normal terminal stop, so ignore it
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assemblies[assembly_id])
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand) )
                        fasta_out_fh.write("{0}\n".format(utils.wrapped_fasta(translated_seq)))

                    if is_debug_target:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )

                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id) )
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """Trim the CDS features of canonical.flawed.gff3 to the ILRI polypeptide ranges.

    Polypeptide rows are read from the ILRI GFF and keyed by their Parent
    attribute.  For each mRNA in the flawed GFF3 that has such a polypeptide,
    CDSs comparing entirely below or above the polypeptide are deleted and
    CDSs overlapping one of its ends are clipped to it.  Every gene is then
    written to canonical.corrected.gff3.  All file names are hard-coded.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = gff.get_gff3_features(flawed_gff_file)

        print("INFO: loaded {0} assemblies and {1} features".format(len(assemblies), len(features)))

        # key: ID of the parent feature, value: Polypeptide located on its assembly
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                polypeptide_id = gff.column_9_value(cols[8], 'ID')
                parent_id = gff.column_9_value(cols[8], 'Parent')
                polypeptides[parent_id] = things.Polypeptide(id=polypeptide_id, parent=parent_id)
                # GFF3 coordinates are 1-based inclusive, hence the -1 on the start
                polypeptides[parent_id].locate_on(target=assemblies[cols[0]], fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6])

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(len(polypeptides)) )

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    polypeptide = polypeptides.get(mRNA.id)

                    if polypeptide is None:
                        print("DEBUG: {0} not found as a parent to any polypeptide".format(mRNA.id))
                        # Nothing to trim against.  Without this skip the CDSs
                        # would be compared with the previous mRNA's
                        # polypeptide, or raise NameError on the first mRNA.
                        continue

                    # take a snapshot before iterating, since the loop may delete some
                    CDSs = list(mRNA.CDSs())

                    for CDS in CDSs:
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin
                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
def main():
    """Compare the gene models of a reference and a query GFF3 file.

    A reference gene and a query gene match when they have the same
    coordinates and share both exon and CDS structure.  Matching ID pairs are
    written to ``<output_base>.matches`` and overall counts to
    ``<output_base>.summary``; two INFO lines are also printed.
    """
    parser = argparse.ArgumentParser( description='Basic comparison of two GFF3 files')

    ## command-line options
    parser.add_argument('-r', '--ref', type=str, required=True, help='Path to the reference GFF3 file' )
    parser.add_argument('-q', '--qry', type=str, required=True, help='Path to the query GFF3 file' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name/path of the output files to be created' )
    args = parser.parse_args()

    (assemblies, ref_features) = gff.get_gff3_features(args.ref)
    ref_genes = get_genes_from_dict(ref_features)

    # the query is parsed onto the assemblies already loaded for the reference
    (assemblies, qry_features) = gff.get_gff3_features(args.qry, assemblies=assemblies)
    qry_genes = get_genes_from_dict(qry_features)

    ref_matches_found = dict()
    qry_matches_found = dict()

    for ref_gene in ref_genes:
        for qry_gene in qry_genes:
            if ref_gene.has_same_coordinates_as( thing=qry_gene ) and \
               ref_gene.shares_exon_structure_with( thing=qry_gene ) and \
               ref_gene.shares_CDS_structure_with( thing=qry_gene ):

                ref_matches_found[ref_gene.id] = qry_gene.id
                qry_matches_found[qry_gene.id] = ref_gene.id

    print("INFO: {0}/{1} reference genes had a match to a qry gene".format( len(ref_matches_found), len(ref_genes) ))
    print("INFO: {0}/{1} qry genes had a match to a reference gene".format( len(qry_matches_found), len(qry_genes) ))

    # open our output files; the context managers flush and close them
    with open("{0}.matches".format(args.output_base), 'wt') as out_matches:
        for ref_gene_id in ref_matches_found:
            out_matches.write("{0}\t{1}\n".format(ref_gene_id, ref_matches_found[ref_gene_id]))

    with open("{0}.summary".format(args.output_base), 'wt') as out_summary:
        out_summary.write("Reference\t{0}\n".format(args.ref) )
        # this line used to repeat the reference path instead of the query path
        out_summary.write("Query\t{0}\n".format(args.qry) )
        out_summary.write("Total identical models (with respect to reference)\t{0}\n".format(len(ref_matches_found)))
        out_summary.write("Models in REF not in QRY\t{0}\n".format( len(ref_genes) - len(ref_matches_found) ))
        out_summary.write("Models in QRY not in REF\t{0}\n".format( len(qry_genes) - len(qry_matches_found) ))
def main():
    """Tag reference genes with the IDs of the query genes they overlap.

    For every assembly present in both files, each reference gene that has at
    least one polypeptide is tested against every query gene on that assembly
    with ``overlaps_with``.  Each overlap adds an
    ``overlaps_old_locusTagID:<query gene id>`` dbxref to the annotation of
    the gene's first polypeptide.  The reference assemblies are then written
    as GFF3 to --output_file.
    """
    parser = argparse.ArgumentParser( description='Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies')

    ## command-line options
    parser.add_argument('-r', '--reference_file', type=str, required=True, help='GFF3 file of a reference annotation' )
    parser.add_argument('-q', '--query_file', type=str, required=True, help='GFF3 file with alternative annotation (such as an RNA-seq assemby)' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    (ref_assemblies, ref_feats) = gff.get_gff3_features(args.reference_file)
    (qry_assemblies, qry_genes) = gff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print("WARN: expected to find assembly_id {0} in both reference and query sets".format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            overlaps = list()
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".format(ref_gene.id))
                continue

            # dbxrefs are recorded on the first polypeptide's annotation
            ref_annot = polypeptides[0].annotation

            for qry_gene in qry_assemblies[assembly_id].genes():
                overlap = ref_gene.overlaps_with(qry_gene)

                if overlap:
                    overlaps.append(overlap)
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(qry_gene.id))

            if len(overlaps) > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(ref_gene.id, len(overlaps)))

    # a context manager makes sure the GFF3 is flushed and the handle closed
    with open(args.output_file, 'w') as ofh:
        gff.print_gff3_from_assemblies(assemblies=ref_assemblies, ofh=ofh)
Beispiel #10
0
def main():
    """Convert a GFF3 file into a BED file.

    The assemblies parsed from --input_file are handed to
    ``bed.print_bed_from_assemblies``, which writes to --output_file.
    """
    parser = argparse.ArgumentParser( description='Create a BED file from GFF3')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to output file to be created' )

    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # the context manager guarantees the BED output is flushed and closed
    with open(args.output_file, 'wt') as ofh:
        bed.print_bed_from_assemblies(assemblies=assemblies, ofh=ofh)
Beispiel #11
0
def parse_go_terms_from_gff(file):
    """Count the GO annotations attached to polypeptides in a GFF3 file.

    The file is parsed with ``gff.get_gff3_features`` and every gene, mRNA and
    polypeptide of every assembly is visited.

    Returns:
        dict: maps each ``go_id`` seen to the number of times it occurred.
    """
    assemblies, features = gff.get_gff3_features(file)
    go_counts = dict()

    for asm_key in assemblies:
        for gene in assemblies[asm_key].genes():
            for transcript in gene.mRNAs():
                for protein in transcript.polypeptides():
                    for go_annot in protein.annotation.go_annotations:
                        go_key = go_annot.go_id
                        # start at zero on first sight, then add one
                        go_counts[go_key] = go_counts.get(go_key, 0) + 1

    return go_counts
def parse_go_terms_from_gff(file):
    """Tally how often each GO ID is used in the polypeptide annotations of a GFF3.

    Walks assembly -> gene -> mRNA -> polypeptide over the result of
    ``gff.get_gff3_features`` and counts each GO annotation's ``go_id``.

    Returns:
        dict: ``go_id`` -> occurrence count.
    """
    parsed_assemblies, parsed_features = gff.get_gff3_features(file)
    tally = {}

    for key in parsed_assemblies:
        assembly = parsed_assemblies[key]

        for gene in assembly.genes():
            for rna in gene.mRNAs():
                for polypeptide in rna.polypeptides():
                    go_annotations = polypeptide.annotation.go_annotations

                    for annotation in go_annotations:
                        # make sure the counter exists, then bump it
                        tally.setdefault(annotation.go_id, 0)
                        tally[annotation.go_id] += 1

    return tally
def main():
    """Remove gene models whose coding sequence is mostly masked.

    The masked FASTA is attached to the assemblies parsed from --input_gff3.
    A gene is dropped when any of its mRNAs has a CDS in which the percentage
    of ``N`` residues reaches --percent_repeat_coverage_cutoff.  Kept genes
    are written to --output_gff3 (STDOUT when omitted) and dropped genes to
    --removed_gff3 when that option is given.  A one-line summary is printed.
    """
    # local import so this function does not depend on a file-level one
    import sys

    parser = argparse.ArgumentParser( description='Removes gene models whose sequence has been masked.')

    ## command-line options
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-m', '--masked_fasta', type=str, required=True, help='FASTA with sequence masked with N characters')
    parser.add_argument('-p', '--percent_repeat_coverage_cutoff', type=int, required=True, help='Genes with an mRNA covered by this percentage of repeats will be excluded' )
    parser.add_argument('-o', '--output_gff3', type=str, required=False, help='Path to GFF3 output file to be created')
    parser.add_argument('-r', '--removed_gff3', type=str, required=False, help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_gff3)
    utils.add_assembly_fasta(assemblies, args.masked_fasta)

    # --output_gff3 is optional; open(None) would raise a TypeError, so fall
    # back to STDOUT when it is omitted
    write_to_stdout = args.output_gff3 is None
    gff_out = sys.stdout if write_to_stdout else open(args.output_gff3, 'wt')

    rem_out = None
    gene_count = 0
    kept_count = 0

    try:
        gff_out.write("##gff-version 3\n")

        if args.removed_gff3 is not None:
            rem_out = open(args.removed_gff3, 'wt')
            rem_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                keep = True
                gene_count += 1

                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()

                    # an mRNA without CDS residues has nothing masked; this
                    # guard also avoids dividing by zero
                    if len(coding_seq) == 0:
                        continue

                    perc_repeat = (coding_seq.count('N') / len(coding_seq)) * 100

                    if perc_repeat >= args.percent_repeat_coverage_cutoff:
                        keep = False
                        # one masked mRNA is enough to drop the gene
                        break

                if keep:
                    kept_count += 1
                    gene.print_as(fh=gff_out, source='IGS', format='gff3')
                elif rem_out is not None:
                    gene.print_as(fh=rem_out, source='IGS', format='gff3')
    finally:
        if not write_to_stdout:
            gff_out.close()
        if rem_out is not None:
            rem_out.close()

    # avoid ZeroDivisionError when the input contained no genes at all
    kept_percent = (kept_count / gene_count) * 100 if gene_count else 0.0

    # keep the summary out of the GFF3 stream when that stream is STDOUT
    summary_fh = sys.stderr if write_to_stdout else sys.stdout
    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(kept_count, gene_count, kept_percent), file=summary_fh)
Beispiel #14
0
def main():
    """Export a GFF3 annotation as an NCBI TBL file plus a genomic FASTA.

    Unless at least one assembly ID already starts with ``gnl|WGS:``, all
    assemblies are renamed to ``gnl|WGS:<prefix>|SeqID|gb|<prefix>01<number>``
    with a six-digit running number.  The TBL is written to
    ``<output_base>.tbl`` and the sequences to ``<output_base>.fna``.
    """
    parser = argparse.ArgumentParser( description='Create a TBL file for submission to NCBI from GFF3')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name of output files to be created' )
    parser.add_argument('-ln', '--lab_name', type=str, required=True, help='Required by NCBI to identify the submitting group' )
    parser.add_argument('-nap', '--ncbi_acc_prefix', type=str, required=True, help='Required and assigned by NCBI' )
    parser.add_argument('-gf', '--genomic_fasta', type=str, required=False, help='FASTA file of genomic sequence, if not embedded in GFF' )
    parser.add_argument('-go', '--go_obo', type=str, required=False, help='GO terms will not be exported unless you pass the path to a GO OBO file')

    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    if args.genomic_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genomic_fasta)

    ## We need to first check the ID format
    # pre-formatted IDs are like this: gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    # Decide before touching anything: the previous single-pass loop had
    # already overwritten the .id of earlier assemblies by the time it met a
    # pre-formatted ID and gave up on reformatting.
    reformat_IDs = not any(asm_id.startswith('gnl|WGS:') for asm_id in assemblies)

    if reformat_IDs:
        ## re-key old IDs (like tp.assembly.567468735.1) to new ones (like AAGK01000001)
        new_assemblies = dict()

        for asm_num, asm_id in enumerate(assemblies, start=1):
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(args.ncbi_acc_prefix, asm_num)
            new_assemblies[new_id] = assemblies[asm_id]
            new_assemblies[new_id].id = new_id

        assemblies = new_assemblies

    # the context manager guarantees the TBL output is flushed and closed
    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        tbl.print_tbl_from_assemblies(assemblies=assemblies, ofh=ofh, go_obo=args.go_obo, lab_name=args.lab_name)

    mset = things.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
def main():
    """Split genes with several mRNA children into one gene per mRNA.

    Reads the GFF3 named by --input_file and writes GFF3 to --output_file, or
    to STDOUT when no output file is given.  For every gene, each mRNA after
    the first is removed from the gene, attached to a new gene with the ID
    ``<gene id>_<n>`` located at that mRNA's coordinates, and printed.  A gene
    that had more than one mRNA is then resized to the coordinates of its
    first mRNA before being printed itself.
    """
    parser = argparse.ArgumentParser( description='Checks for genes with multiple mRNA children and creates new genes for each.')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        for assembly_id in assemblies:
            current_assembly = assemblies[assembly_id]

            for gene in current_assembly.genes():
                # Iterate over a snapshot: remove_mRNA() is called inside the
                # loop, and if mRNAs() hands back the gene's own child list,
                # removing while iterating would silently skip entries.
                mRNAs = list(gene.mRNAs())

                for rnas_found, mRNA in enumerate(mRNAs, start=1):
                    if rnas_found == 1:
                        # the first mRNA stays with the original gene
                        continue

                    mRNA_loc = mRNA.location_on(current_assembly)
                    gene.remove_mRNA(mRNA)

                    # Diagnostics go to STDERR so they can never end up inside
                    # the GFF3 stream when the output is STDOUT.
                    print("INFO: splitting mRNA off gene {0}".format(gene.id), file=sys.stderr)
                    new_gene = things.Gene(id="{0}_{1}".format(gene.id, rnas_found))
                    new_gene.locate_on(target=current_assembly, fmin=mRNA_loc.fmin, fmax=mRNA_loc.fmax, strand=mRNA_loc.strand)
                    new_gene.add_RNA(mRNA)
                    new_gene.print_as(fh=ofh, format='gff3')

                if len(mRNAs) > 1:
                    # shrink the original gene to its single remaining mRNA
                    gene_loc = gene.location_on(current_assembly)
                    mRNA_loc = mRNAs[0].location_on(current_assembly)
                    gene_loc.fmin = mRNA_loc.fmin
                    gene_loc.fmax = mRNA_loc.fmax
                    gene_loc.strand = mRNA_loc.strand

                gene.print_as(fh=ofh, format='gff3')
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Re-export a biocode GFF3 file through ``ncbigff.print_biogene``.

    Every gene of every assembly parsed from --input_file is printed with the
    source column set to ``INCOMPLETE``.  Output goes to --output_file, or to
    STDOUT when that option is omitted.
    """
    parser = argparse.ArgumentParser( description='Converts biocode GFF3 into NCBI-spec GFF3')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GFF3 file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to a Genbank GFF3 file to be created.' )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                ncbigff.print_biogene(gene=gene, fh=ofh, on=assembly, source='INCOMPLETE')
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Merge optional Barrnap and ARAGORN predictions into a model GFF and serialize it.

    The model GFF is always loaded.  Barrnap and ARAGORN results are added
    only when their options were supplied.  The combined assemblies and
    features are passed to ``utils.serialize_gff3`` with --output_gff as path.
    """
    cli = argparse.ArgumentParser(
        description='Creates a single GFF from the output of a few different model prediction tools (coding and non-coding)')

    # command-line options
    cli.add_argument(
        '-m', '--model_gff', type=str, required=True,
        help='Input (pass-through) GFF file')
    cli.add_argument(
        '-o', '--output_gff', type=str, required=False,
        help='Output file to be written.  Default=STDOUT')
    cli.add_argument(
        '-b', '--barrnap_gff', type=str, required=False,
        help='GFF file from Barrnap prediction')
    cli.add_argument(
        '-a', '--aragorn_out', type=str, required=False,
        help='Raw output file (with -w) from ARAGORN prediction')
    opts = cli.parse_args()

    assemblies, features = gff.get_gff3_features(opts.model_gff)

    # each extra prediction source is folded in only if its file was given
    barrnap_path = opts.barrnap_gff
    if barrnap_path:
        add_barrnap_features(assemblies, features, barrnap_path)

    aragorn_path = opts.aragorn_out
    if aragorn_path:
        add_aragorn_features(assemblies, features, aragorn_path)

    utils.serialize_gff3(
        path=opts.output_gff,
        assemblies=assemblies,
        features=features,
    )
def main():
    """Check CDS phase columns and write the corrected GFF3.

    Loads the GFF3 named by ``--input_file`` (plus ``--genome_fasta`` when
    given), runs ``check_and_update_phase`` on every CDS of every mRNA, and
    prints each gene to ``--output_file`` with ``--source`` as column 2.  A
    ``##FASTA`` section follows for every assembly with a positive length.
    """
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-s', '--source', type=str, required=False, default='.', help='Optional.  Sets the value for column 2 in all rows.  Default = .' )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # The context manager guarantees the output is flushed and closed, even
    #  if one of the processing steps below raises.
    with open(args.output_file, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        # deal with the FASTA file if the user passed one
        if args.genome_fasta is not None:
            process_assembly_fasta(assemblies, args.genome_fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for CDS in mRNA.CDSs():
                        check_and_update_phase(CDS)

                gene.print_as(fh=fout, source=args.source, format='gff3')

        fasta_header_written = False

        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            # NOTE(review): length is assumed to be None when no sequence is
            #  known for the molecule; None > 0 would raise TypeError.
            if assembly.length is not None and assembly.length > 0:
                if not fasta_header_written:
                    fout.write("##FASTA\n")
                    fasta_header_written = True

                fout.write(">{0}\n".format(assembly.id) )
                fout.write("{0}\n".format(utils.wrapped_fasta(assembly.residues)))
Beispiel #19
0
def main():
    """Print summary statistics for the features of a GFF3 file.

    Counts assemblies, genes, mRNAs, exons and CDS fragments, sums their
    lengths, and writes counts, sums, means and a profile of CDS fragments
    per mRNA as tab-delimited text to ``--output_file`` (STDOUT if omitted).
    """
    # NOTE(review): this description looks copied from a phase-checking
    #  script; it is user-facing text and is left as found.
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence to report/correct phase columns.'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    def mean(total, count):
        # A feature type that never occurs has nothing to average; report
        #  0.0 instead of raising ZeroDivisionError.
        return total / count if count else 0.0

    try:
        type_counts = defaultdict(int)
        type_lengths = defaultdict(int)
        assembly_lengths_found = False

        # key is number of CDS fragments, value is number of mRNAs with that many
        CDS_profile = defaultdict(int)

        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]
            type_counts['assembly'] += 1

            if assembly.length is not None:
                type_lengths['assembly'] += assembly.length
                assembly_lengths_found = True

            for gene in assembly.genes():
                type_counts['gene'] += 1
                type_lengths['gene'] += gene.length

                for mRNA in gene.mRNAs():
                    type_counts['mRNA'] += 1
                    type_lengths['mRNA'] += mRNA.length
                    CDS_profile[mRNA.CDS_count()] += 1

                    for exon in mRNA.exons():
                        type_counts['exon'] += 1
                        type_lengths['exon'] += exon.length

                    for CDS in mRNA.CDSs():
                        type_counts['CDS fragments'] += 1
                        type_lengths['CDS fragments'] += CDS.length

        ofh.write("Assembly count\t{0}\n".format(type_counts['assembly']))
        if assembly_lengths_found:
            ofh.write("Assembly length\t{0}\n".format(
                type_lengths['assembly']))
        else:
            ofh.write("Assembly length\tN/A (no FASTA data in GFF?)\n")

        gene_length_mean = mean(type_lengths['gene'], type_counts['gene'])
        mRNA_length_mean = mean(type_lengths['mRNA'], type_counts['mRNA'])
        exon_length_mean = mean(type_lengths['exon'], type_counts['exon'])
        CDS_length_mean = mean(type_lengths['CDS fragments'],
                               type_counts['CDS fragments'])

        mRNAs_per_gene_mean = mean(type_counts['mRNA'], type_counts['gene'])
        exons_per_mRNA_mean = mean(type_counts['exon'], type_counts['mRNA'])
        CDS_per_mRNA_mean = mean(type_counts['CDS fragments'],
                                 type_counts['mRNA'])

        ofh.write("\nGene count\t{0}\n".format(type_counts['gene']))
        ofh.write("Gene length (mean)\t{0:.1f}\n".format(gene_length_mean))
        ofh.write("Gene length (sum)\t{0}\n".format(type_lengths['gene']))

        ofh.write("\nmRNA count\t{0}\n".format(type_counts['mRNA']))
        ofh.write("mRNA length (mean)\t{0:.1f}\n".format(mRNA_length_mean))
        ofh.write("mRNA length (sum)\t{0}\n".format(type_lengths['mRNA']))
        ofh.write(
            "mRNAs per gene (mean)\t{:.1f}\n".format(mRNAs_per_gene_mean))

        ofh.write("\nexon count\t{0}\n".format(type_counts['exon']))
        ofh.write("exon length (mean)\t{0:.1f}\n".format(exon_length_mean))
        ofh.write("exon length (sum)\t{0}\n".format(type_lengths['exon']))
        ofh.write(
            "exons per mRNA (mean)\t{:.1f}\n".format(exons_per_mRNA_mean))

        ofh.write("\nCDS count\t{0}\n".format(type_counts['CDS fragments']))
        ofh.write("CDS length (mean)\t{0:.1f}\n".format(CDS_length_mean))
        ofh.write("CDS fragment length (sum)\t{0}\n".format(
            type_lengths['CDS fragments']))
        ofh.write("CDS per mRNA (mean)\t{:.1f}\n".format(CDS_per_mRNA_mean))

        ofh.write(
            "\n# CDS fragment composition profile: count<tab>percentage\n")
        for cds_count in sorted(CDS_profile):
            # CDS_profile only has entries when at least one mRNA was seen,
            #  so the mRNA count cannot be zero here.
            perc = (CDS_profile[cds_count] / type_counts['mRNA']) * 100
            # '.3g' keeps three significant digits; a bare '.3' would print
            #  100.0 in scientific notation ("1e+02").
            ofh.write("mRNAs with {0} CDS\t{1}\t{2:.3g}\n".format(
                cds_count, CDS_profile[cds_count], perc))
    finally:
        # Close only a handle opened here; STDOUT is not ours to close.
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Mark gene models lacking a start or stop codon as 5'/3' partial.

    For every mRNA the CDS residues are translated.  A translation that does
    not end in '*' marks the 3' end partial; a first codon outside
    ATG/GTG/TTG marks the 5' end partial.  The flag is set on the mRNA, its
    gene, the terminal CDS fragment and the terminal exon.  Counts are
    printed, and the annotation is written as GFF3 when ``--output_gff`` is
    given.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    start_codons = ['ATG', 'GTG', 'TTG']

    def mark_partial(gene_loc, mRNA, mRNA_loc, assembly, at_fmax):
        # Flags one end (fmax when at_fmax is true, otherwise fmin) of the
        #  mRNA, its gene, the terminal CDS fragment and the terminal exon.
        attr = 'fmax_partial' if at_fmax else 'fmin_partial'
        terminal = -1 if at_fmax else 0
        CDSs = sorted(mRNA.CDSs())

        setattr(mRNA_loc, attr, True)
        setattr(gene_loc, attr, True)
        setattr(CDSs[terminal].location_on(assembly), attr, True)

        # The exon is tricky, as there's no direct link between the CDS fragment
        #  and the corresponding exon.  The assumption here is that there won't
        #  be terminal non-coding exons if the CDS is partial.
        setattr(mRNA.exons()[terminal].location_on(assembly), attr, True)

    newly_marked_5prime_partial = 0
    newly_marked_3prime_partial = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]

        for gene in sorted(assembly.genes()):
            gene_loc = gene.location_on(assembly)

            for mRNA in gene.mRNAs():
                mRNA_loc = mRNA.location_on(assembly)
                coding_seq = mRNA.get_CDS_residues()
                translation = utils.translate(coding_seq)
                on_forward = mRNA_loc.strand == 1

                # The 3' end is at fmax on the forward strand, at fmin otherwise
                if not translation.endswith('*'):
                    newly_marked_3prime_partial += 1
                    mark_partial(gene_loc, mRNA, mRNA_loc, assembly,
                                 at_fmax=on_forward)

                # The 5' end is the opposite coordinate of the 3' end
                start_codon = coding_seq[0:3].upper().replace('U', 'T')
                if start_codon not in start_codons:
                    newly_marked_5prime_partial += 1
                    mark_partial(gene_loc, mRNA, mRNA_loc, assembly,
                                 at_fmax=not on_forward)

    print(
        "Genes marked as 5' partial: {0}".format(newly_marked_5prime_partial))
    print(
        "Genes marked as 3' partial: {0}".format(newly_marked_3prime_partial))

    # --output_gff is optional; open(None) would raise TypeError, so the
    #  GFF3 is only written when a path was supplied.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
Beispiel #21
0
def main():
    """Mark gene models lacking a start or stop codon as 5'/3' partial.

    Each mRNA's CDS residues are translated.  If the translation does not end
    in '*' the 3' end is flagged partial; if the first codon is not one of
    ATG/GTG/TTG the 5' end is flagged partial.  Flags are set on the mRNA,
    gene, terminal CDS fragment and terminal exon locations.  The totals are
    printed and, when ``--output_gff`` is given, the annotation is written
    there as GFF3.
    """
    parser = argparse.ArgumentParser( description='Extends GFF gene models to the first in-frame stop')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    start_codons = ['ATG', 'GTG', 'TTG']

    newly_marked_5prime_partial = 0
    newly_marked_3prime_partial = 0

    for assembly_id in sorted(assemblies):
        for gene in sorted(assemblies[assembly_id].genes()):
            gene_loc = gene.location_on(assemblies[assembly_id])

            for mRNA in gene.mRNAs():
                mRNA_loc = mRNA.location_on(assemblies[assembly_id])
                coding_seq = mRNA.get_CDS_residues()
                translation = utils.translate(coding_seq)

                # No terminal stop: the 3' end is partial.  On the forward
                #  strand that is the fmax side, otherwise the fmin side.
                if not translation.endswith('*'):
                    newly_marked_3prime_partial += 1
                    CDSs = sorted(mRNA.CDSs())

                    if mRNA_loc.strand == 1:
                        mRNA_loc.fmax_partial = True
                        CDSs[-1].location_on(assemblies[assembly_id]).fmax_partial = True
                        gene_loc.fmax_partial = True

                        # The exon is tricky, as there's no direct link between the CDS fragment
                        #  and the corresponding exon.  The assumption here is that there won't
                        #  be terminal non-coding exons if the CDS is partial.
                        mRNA.exons()[-1].location_on(assemblies[assembly_id]).fmax_partial = True

                    else:
                        mRNA_loc.fmin_partial = True
                        gene_loc.fmin_partial = True
                        CDSs[0].location_on(assemblies[assembly_id]).fmin_partial = True
                        mRNA.exons()[0].location_on(assemblies[assembly_id]).fmin_partial = True

                # No recognized start codon: the 5' end is partial (fmin side
                #  on the forward strand, fmax side otherwise).
                start_codon = coding_seq[0:3].upper().replace('U', 'T')
                if start_codon not in start_codons:
                    newly_marked_5prime_partial += 1
                    CDSs = sorted(mRNA.CDSs())

                    if mRNA_loc.strand == 1:
                        mRNA_loc.fmin_partial = True
                        CDSs[0].location_on(assemblies[assembly_id]).fmin_partial = True
                        gene_loc.fmin_partial = True

                        # The exon is tricky, as there's no direct link between the CDS fragment
                        #  and the corresponding exon.  The assumption here is that there won't
                        #  be terminal non-coding exons if the CDS is partial.
                        mRNA.exons()[0].location_on(assemblies[assembly_id]).fmin_partial = True

                    else:
                        mRNA_loc.fmax_partial = True
                        gene_loc.fmax_partial = True
                        CDSs[-1].location_on(assemblies[assembly_id]).fmax_partial = True
                        mRNA.exons()[-1].location_on(assemblies[assembly_id]).fmax_partial = True

    print ("Genes marked as 5' partial: {0}".format(newly_marked_5prime_partial))
    print ("Genes marked as 3' partial: {0}".format(newly_marked_3prime_partial))

    # The output path is optional, and open(None) raises TypeError, so only
    #  write the GFF3 when a path was actually given.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Report counts, lengths and means for the features of a GFF3 file.

    Tallies assemblies, genes, mRNAs, exons and CDS fragments and writes a
    tab-delimited report (counts, length sums, means, and a profile of how
    many mRNAs have each number of CDS fragments) to ``--output_file``, or to
    STDOUT when no output path is given.
    """
    # NOTE(review): the description text appears to be copied from a
    #  phase-checking script; left unchanged because it is user-facing.
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    def safe_mean(total, count):
        # Returns 0.0 when nothing was counted, so an input without genes,
        #  mRNAs, exons or CDS does not raise ZeroDivisionError.
        if count == 0:
            return 0.0
        return total / count

    try:
        type_counts = defaultdict(int)
        type_lengths = defaultdict(int)
        assembly_lengths_found = False

        # key is number of CDS fragments, value is number of mRNAs with that many
        CDS_profile = defaultdict(int)

        for assembly_id in assemblies:
            type_counts['assembly'] += 1

            if assemblies[assembly_id].length is not None:
                type_lengths['assembly'] += assemblies[assembly_id].length
                assembly_lengths_found = True

            for gene in assemblies[assembly_id].genes():
                type_counts['gene'] += 1
                type_lengths['gene'] += gene.length

                for mRNA in gene.mRNAs():
                    type_counts['mRNA'] += 1
                    type_lengths['mRNA'] += mRNA.length
                    CDS_profile[mRNA.CDS_count()] += 1

                    for exon in mRNA.exons():
                        type_counts['exon'] += 1
                        type_lengths['exon'] += exon.length

                    for CDS in mRNA.CDSs():
                        type_counts['CDS fragments'] += 1
                        type_lengths['CDS fragments'] += CDS.length

        ofh.write("Assembly count\t{0}\n".format(type_counts['assembly']))
        if assembly_lengths_found:
            ofh.write("Assembly length\t{0}\n".format(type_lengths['assembly']))
        else:
            ofh.write("Assembly length\tN/A (no FASTA data in GFF?)\n")

        gene_length_mean = safe_mean(type_lengths['gene'], type_counts['gene'])
        mRNA_length_mean = safe_mean(type_lengths['mRNA'], type_counts['mRNA'])
        exon_length_mean = safe_mean(type_lengths['exon'], type_counts['exon'])
        CDS_length_mean = safe_mean(type_lengths['CDS fragments'], type_counts['CDS fragments'])

        mRNAs_per_gene_mean = safe_mean(type_counts['mRNA'], type_counts['gene'])
        exons_per_mRNA_mean = safe_mean(type_counts['exon'], type_counts['mRNA'])
        CDS_per_mRNA_mean = safe_mean(type_counts['CDS fragments'], type_counts['mRNA'])

        ofh.write("\nGene count\t{0}\n".format(type_counts['gene']))
        ofh.write("Gene length (mean)\t{0:.1f}\n".format(gene_length_mean))
        ofh.write("Gene length (sum)\t{0}\n".format(type_lengths['gene']))

        ofh.write("\nmRNA count\t{0}\n".format(type_counts['mRNA']))
        ofh.write("mRNA length (mean)\t{0:.1f}\n".format(mRNA_length_mean))
        ofh.write("mRNA length (sum)\t{0}\n".format(type_lengths['mRNA']))
        ofh.write("mRNAs per gene (mean)\t{:.1f}\n".format(mRNAs_per_gene_mean) )

        ofh.write("\nexon count\t{0}\n".format(type_counts['exon']))
        ofh.write("exon length (mean)\t{0:.1f}\n".format(exon_length_mean))
        ofh.write("exon length (sum)\t{0}\n".format(type_lengths['exon']))
        ofh.write("exons per mRNA (mean)\t{:.1f}\n".format(exons_per_mRNA_mean) )

        ofh.write("\nCDS count\t{0}\n".format(type_counts['CDS fragments']))
        ofh.write("CDS length (mean)\t{0:.1f}\n".format(CDS_length_mean))
        ofh.write("CDS fragment length (sum)\t{0}\n".format(type_lengths['CDS fragments']))
        ofh.write("CDS per mRNA (mean)\t{:.1f}\n".format(CDS_per_mRNA_mean) )

        ofh.write("\n# CDS fragment composition profile: count<tab>percentage\n")
        for cds_count in sorted(CDS_profile):
            # The profile is only populated while counting mRNAs, so the
            #  divisor is non-zero whenever this loop body runs.
            perc = (CDS_profile[cds_count] / type_counts['mRNA']) * 100
            # Use '.3g': with a bare '.3' a value of 100.0 is rendered in
            #  scientific notation ("1e+02").
            ofh.write("mRNAs with {0} CDS\t{1}\t{2:.3g}\n".format(cds_count, CDS_profile[cds_count], perc) )
    finally:
        # Leave STDOUT open; close only a file opened above.
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Write one FASTA record per gene with its intron regions lower-cased.

    The gene sequence is upper-cased, every intron of its single mRNA is
    lower-cased in place, and with ``--type CDS`` the sequence is trimmed to
    the span between the first and last CDS fragment.  Genes with more than
    one mRNA are skipped with a printed message.  Records go to
    ``--output_file``, or STDOUT when it is omitted.
    """
    parser = argparse.ArgumentParser( description='')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    parser.add_argument('-f', '--fasta', type=str, required=False, help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t', '--type', type=str, required=False, default='mRNA', choices=['mRNA', 'CDS'], help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # set this to None if you don't want the debug print statements
    #debugging_gene = 'D9AE6116893A0D5711D56C0F1E6CF58C'
    debugging_gene = None

    # Attach residues from the separate FASTA file to the assemblies that
    #  share an ID with one of its records
    if args.fasta is not None:
        seqs = utils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]

        for gene in assembly.genes():

            # When a debugging gene is set, only that gene is processed and
            #  the INFO print statements below are enabled
            if debugging_gene is not None:
                debug_mode = True
                if gene.id != debugging_gene: continue
            else:
                debug_mode = False

            # The FASTA header uses the locus tag when there is one, else the gene ID
            if gene.locus_tag is None:
                gene_label = gene.id
            else:
                gene_label = gene.locus_tag

            gene_seq = gene.get_residues().upper()
            gene_loc = gene.location_on(assembly)

            ## we have to do this here because of the coordinates
            # NOTE(review): presumably get_residues() returns reverse-strand genes
            #  in transcript orientation, so reversing lines the string up with
            #  assembly coordinates for the slicing below -- confirm
            if gene_loc.strand == -1:
                gene_seq = "".join(reversed(gene_seq))

            if debug_mode:
                print("INFO: Processing gene with length {0} at {1}-{2}".format(len(gene_seq), gene_loc.fmin, gene_loc.fmax))

            if len(gene.mRNAs()) > 1:
                #raise Exception("ERROR: script doesn't currently support multi-isoform genes, but found one: {0}".format(gene.id))
                print("ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)".format(gene.id))
                continue


            # At most one mRNA reaches this loop because of the check above
            for mRNA in gene.mRNAs():
                introns = mRNA.introns( on=assembly )

                # this helps us get where the intron is on the gene
                offset = gene_loc.fmin

                for intron in introns:
                    intron_loc = intron.location_on(assembly)
                    # Lower-case the intron span and splice it back into the gene sequence
                    lower_mid = gene_seq[intron_loc.fmin - offset:intron_loc.fmax - offset].lower()
                    gene_seq = gene_seq[0:intron_loc.fmin - offset] + lower_mid + gene_seq[intron_loc.fmax - offset:]

                    if debug_mode:
                        print("INFO:\tfound intron at {0}-{1}".format(intron_loc.fmin, intron_loc.fmax))
                        print("INFO:\tlower-casing offset adjusted coordinates: {0}-{1}".format(intron_loc.fmin - offset, intron_loc.fmax - offset))
                        print("INFO:\tgenerating lower case seq of length: {0}\n".format(len(lower_mid)) )

                if debug_mode:
                    print("INFO: seq length before CDS processing is: {0}".format(len(gene_seq)))

                ## do we need to trim down to the CDS range?
                if args.type == 'CDS':
                    CDSs = sorted(mRNA.CDSs())
                    CDS_min = CDSs[0].location_on(assembly).fmin
                    CDS_max = CDSs[-1].location_on(assembly).fmax

                    if debug_mode:
                        print("INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}".format(CDS_max, CDS_min, CDS_max - CDS_min))

                    if gene_loc.fmin != CDS_min or gene_loc.fmax != CDS_max:
                        # Number of bases to drop from the low and high ends of the gene
                        fmin_chomp = CDS_min - offset
                        fmax_chomp = gene_loc.fmax - CDS_max

                        if debug_mode:
                            print("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(gene.id, gene_loc.fmin, \
                                                                                      gene_loc.fmax, gene_loc.strand, \
                                                                                      CDS_min, CDS_max \
                                                                                     ))

                            print("\tfmin_chomp:{0}, fmax_chomp:{1}".format(fmin_chomp, fmax_chomp))
                            print("\tpulling range: gene_seq[{0} : {1}]".format(fmin_chomp, len(gene_seq) - fmax_chomp))

                        gene_seq = gene_seq[fmin_chomp : len(gene_seq) - fmax_chomp]

                        if debug_mode:
                            print("\tGene {0} CDS seq: {1}".format(gene.id, gene_seq))

            ## make sure to switch it back
            if gene_loc.strand == -1:
                gene_seq = "".join(reversed(gene_seq))

            #print("INFO: Got gene with length {0} after modification".format(len(gene_seq)))
            ofh.write(">{0}\n{1}\n".format(gene_label, utils.wrapped_fasta(gene_seq)))
Beispiel #24
0
def main():
    """Report which mRNAs lacking a terminal stop could be extended to one.

    Every mRNA's CDS residues are translated.  For those not ending in '*',
    the assembly residues are scanned codon by codon past the 3' end of the
    CDS (up to ``mRNA_extension_limit`` bases beyond the mRNA) for an
    in-frame stop codon.  Progress and summary counts are printed; the
    annotation itself is not modified.
    """
    # NOTE(review): the description looks copied from a phase-checking script,
    #  and --output_gff is parsed but never used below; both left as found.
    parser = argparse.ArgumentParser(
        description="Checks the CDS features against a genome sequence to report/correct phase columns."
    )

    ## output file to be written
    parser.add_argument("-i", "--input_file", type=str, required=True, help="Path to the input GFF3")
    parser.add_argument(
        "-g",
        "--genome_fasta",
        type=str,
        required=False,
        help="Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF",
    )
    parser.add_argument(
        "-o",
        "--output_gff",
        type=str,
        required=False,
        help="Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop",
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ["TAG", "TAA", "TGA"]

    # Codons are sliced from the assembly's forward-strand residues, so a stop
    #  codon of a reverse-strand gene shows up as its reverse complement.
    revcomp_stop_codons = ["CTA", "TTA", "TCA"]

    mRNA_extension_limit = 100
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        print("Assembly {0} has length {1}".format(assembly_id, assemblies[assembly_id].length))
        for gene in sorted(assemblies[assembly_id].genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith("*"):
                    mRNAs_with_terminal_stops += 1
                else:
                    print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                    mRNA_loc = mRNA.location_on(assemblies[assembly_id])

                    CDSs = sorted(mRNA.CDSs())
                    codon_step_size = 3

                    if mRNA_loc.strand == 1:
                        CDS_pos = CDSs[-1].location_on(assemblies[assembly_id]).fmax
                        mRNA_limit = mRNA_loc.fmax + mRNA_extension_limit
                        strand_stop_codons = stop_codons
                    else:
                        CDS_pos = CDSs[0].location_on(assemblies[assembly_id]).fmin
                        mRNA_limit = mRNA_loc.fmin - mRNA_extension_limit
                        codon_step_size = -3
                        strand_stop_codons = revcomp_stop_codons

                    print(
                        "\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos),
                        end="",
                    )

                    new_stop_found = False

                    # We have to step backwards to start if on the reverse strand
                    if codon_step_size < 0:
                        CDS_pos += codon_step_size

                    while True:
                        if (codon_step_size < 0 and CDS_pos < mRNA_limit) or (
                            codon_step_size > 0 and CDS_pos > mRNA_limit
                        ):
                            print(" Reached the mRNA limit")
                            break
                        elif CDS_pos < 0:
                            # A negative slice start would wrap around and read
                            #  codons from the far end of the molecule.
                            print(" Reached the beginning of the molecule")
                            break
                        else:
                            next_codon = assemblies[assembly_id].residues[CDS_pos : CDS_pos + 3]
                            print(".{0}({1})".format(next_codon, CDS_pos), end="")

                            # upper() so lower-case (e.g. soft-masked) residues still match
                            if next_codon.upper() in strand_stop_codons:
                                new_stop_found = True
                                print(" Found a stop")
                                break

                        CDS_pos += codon_step_size

                    if new_stop_found:
                        print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                        mRNAs_corrected += 1
                    else:
                        print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(mRNAs_with_terminal_stops))
    print("mRNAs which can be corrected: {0}".format(mRNAs_corrected))
def main():
    '''
    This script reports statistics on the areas of a genome where features aren't - introns and
    intergenic space.  Pass a valid GFF3 file (along with FASTA data) and get a report like this:

    Molecule count: 9

    Gene count: 4171
    Intergenic space count: 4061
    Average intergenic space distance: 361.7 bp
    Median intergenic space distance: 245 bp
    Minimum intergenic space distance: 0 bp
    Maximum intergenic space distance: 6272 bp

    Intron count: 10533
    Intron space count: 989024
    Average intron size: 93.9 bp
    Median intron size: 63 bp
    Minimum intron size: 2 bp
    Maximum intron size: 1676 bp


    Optionally, you can pass the path to a PNG file to be created using the --histogram parameter,
    which will generate a size distribution histogram with two overlaying plots - one representing
    the distribution of intergenic region sizes and the other the intron lengths.  Because these
    can often have long tails, you can limit both the Y- and X-axes values with the --ylimit and
    --xlimit options, respectively.

    FASTA:
    If your FASTA isn't embedded at the end of your GFF3 file after a ##FASTA directive you'll need
    to specify the --fasta option in this script and pass it as a separate file.

    Definitions:
    Intergenic space was a little ambiguous to me as I started writing this.  Does one count the space from
    the beginning of the contig until the first gene, or only between them?  What about short contigs which
    have no annotated genes at all?  From the Sequence Ontology:

    SO:0000605: A region containing or overlapping no genes that is bounded on either side by a gene, or
    bounded by a gene and the end of the chromosome.

    To my reading, this includes contig ends but not gene-less contigs.  To that end, I include the
    former in intergenic space reporting but include the latter as a separate statistic.

    Implementation notes:
    Overlapping or nested genes contribute no intergenic space; the next gap is measured from the
    furthest gene end seen so far on the molecule.  Intron and intergenic summary lines are only
    printed when at least one such region exists.

    Raises an Exception if any molecule has an undefined or zero length.

    Author: Joshua Orvis (jorvis AT gmail)
    '''
    parser = argparse.ArgumentParser( description='Reports statistics on the intron and intergenic space of an annotated genome (GFF3).')

    ## output file to be written
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='GFF3 file of a reference annotation' )
    parser.add_argument('-g', '--histogram', type=str, required=False, help='Optional path to a histogram of intron/intergenic space size distribution to be created (PNG)' )
    parser.add_argument('-x', '--xlimit', type=int, required=False, help='Use this if you want to limit the X-axis of the histogram (feature length)' )
    parser.add_argument('-y', '--ylimit', type=int, required=False, help='Use this if you want to limit the Y-axis of the histogram (feature count)' )
    parser.add_argument('-f', '--fasta', type=str, required=False, help='Required if you don\'t have GFF3 with embedded FASTA')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_gff3)

    if args.fasta is not None:
        seqs = utils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## things to keep stats on and report
    total_molecule_count = len(assemblies)
    total_gene_count = 0

    ## this number is NOT just the total genes N - 1, since there can be multiple molecules
    #   genes can overlap, etc.
    total_intergenic_space_count = 0

    total_intergenic_space_residues = 0
    intergenic_distances = list()

    total_contig_residues = 0
    empty_contig_residues = 0

    total_intron_count = 0
    total_intron_residues = 0
    intron_sizes = list()

    ############################
    ## Calculation section
    ############################

    for asm_id in assemblies:
        assembly = assemblies[asm_id]
        # NOTE(review): the overlap logic below assumes sorting orders genes by start coordinate
        genes = sorted(assembly.genes())
        total_gene_count += len(genes)

        # we should have a length here
        if assembly.length is None or assembly.length == 0:
            raise Exception("ERROR: Detected assembly with undefined or 0 length: {0}".format(assembly.id))

        # Test THIS molecule's genes.  The running total stays non-zero once any
        #  earlier molecule had a gene, which hid every later gene-less molecule.
        if len(genes) == 0:
            empty_contig_residues += assembly.length
            continue

        total_contig_residues += assembly.length

        # Location of the gene whose end reaches furthest along the molecule so far
        previous_gene_loc = None

        for gene in genes:
            gene_loc = gene.location_on(assembly)

            if previous_gene_loc is None:
                # first gene: track the number of bases from the start of the molecule
                total_intergenic_space_count += 1
                intergenic_distance = gene_loc.fmin
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)
                previous_gene_loc = gene_loc

            elif gene_loc.fmin < previous_gene_loc.fmax:
                ## overlapping genes have no space between them.  Only advance the
                #   reference point if this gene extends further; a nested gene must
                #   not pull it backwards.
                if gene_loc.fmax > previous_gene_loc.fmax:
                    previous_gene_loc = gene_loc

            else:
                total_intergenic_space_count += 1
                intergenic_distance = gene_loc.fmin - previous_gene_loc.fmax
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)
                previous_gene_loc = gene_loc

            for mRNA in gene.mRNAs():
                introns = mRNA.introns( on=assembly )

                for intron in sorted(introns):
                    total_intron_count += 1
                    intron_loc = intron.location_on(assembly)
                    intron_size = intron_loc.fmax - intron_loc.fmin

                    if intron_size < 0:
                        print("\tWARN: Intron size ({1}) < 0 reported in gene {0}".format(gene.id, intron_size))

                    intron_sizes.append(intron_size)
                    total_intron_residues += intron_size

        # space between the furthest gene end and the end of the molecule
        if previous_gene_loc is not None:
            total_intergenic_space_count += 1
            intergenic_distance = assembly.length - previous_gene_loc.fmax
            total_intergenic_space_residues += intergenic_distance
            intergenic_distances.append(intergenic_distance)

    # Both lists stay lists (possibly empty) so the reporting and plotting code can test them
    intergenic_distances = sorted(intergenic_distances)
    intron_sizes = sorted(intron_sizes)

    avg_intergenic_space_dist = None
    median_int_space_dist = None
    if total_intergenic_space_count > 0:
        avg_intergenic_space_dist = total_intergenic_space_residues / total_intergenic_space_count
        median_int_space_dist = intergenic_distances[ int(len(intergenic_distances)/2) ]

    # An annotation without introns is legitimate; don't divide by zero
    avg_intron_size = None
    median_intron_size = None
    if total_intron_count > 0:
        avg_intron_size = total_intron_residues / total_intron_count
        median_intron_size = intron_sizes[int(len(intron_sizes)/2)]

    ############################
    ## Reporting section
    ############################

    print("\nMolecule count: {0}".format(total_molecule_count))
    print("Gene count: {0}".format(total_gene_count) )

    print("\nTotal molecule bases: {0} bp".format(total_contig_residues) )
    print("Empty molecule bases: {0} bp".format(empty_contig_residues) )

    if total_intergenic_space_count > 0:
        print("Intergenic space count: {0}".format(total_intergenic_space_count) )
        print("Average intergenic space distance: {0:.1f} bp".format(avg_intergenic_space_dist) )
        print("Median intergenic space distance: {0} bp".format(median_int_space_dist) )
        print("Minimum intergenic space distance: {0} bp".format(intergenic_distances[0]) )
        print("Maximum intergenic space distance: {0} bp\n".format(intergenic_distances[-1]) )
    else:
        print("There were no intergenic spaces found.  This might mean there were no molecules with at least 2 genes.")

    print("Intron count: {0}".format(total_intron_count) )
    print("Intron space count: {0}".format(total_intron_residues) )

    if total_intron_count > 0:
        print("Average intron size: {0:.1f} bp".format(avg_intron_size) )
        print("Median intron size: {0} bp".format(median_intron_size) )
        print("Minimum intron size: {0} bp".format(intron_sizes[0]) )
        print("Maximum intron size: {0} bp\n".format(intron_sizes[-1]) )
    else:
        print("There were no introns found.\n")

    ############################
    ## Graphics section (optional)
    ############################
    if args.histogram is not None:
        import matplotlib.pyplot as plt

        plt.xlabel('length (bp)')
        plt.ylabel('count')
        plt.title('Distribution of intron size and intergenic distances')

        # only plot the series that actually have data
        if intergenic_distances:
            plt.hist(intergenic_distances, bins=50, histtype='stepfilled', color='b', label='Intergenic distances' )

        if intron_sizes:
            plt.hist(intron_sizes, bins=50, histtype='stepfilled', color='r', alpha=0.5, label='Intron sizes' )

        if args.xlimit is not None:
            plt.xlim([0, args.xlimit])

        if args.ylimit is not None:
            plt.ylim([0, args.ylimit])

        if intergenic_distances or intron_sizes:
            plt.legend(loc='best')

        plt.savefig(args.histogram)
Beispiel #26
0
def main():
    """
    Exports the protein or CDS sequence of each mRNA/polypeptide feature in a GFF3 file as FASTA.

    Output goes to --output_file if given, else STDOUT.  Each entry's ID is the parent mRNA's
    locus_tag when it has one (else the feature ID) and the header carries the product name
    when annotation provides one.  With --check_ends, non-canonical start/stop codons are
    reported on STDERR; with --check_internal_stops, internal '*' characters in the translation
    are reported on STDERR.

    Raises an Exception if --check_internal_stops is combined with --type cds.
    """
    parser = argparse.ArgumentParser( description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GFF3 file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output FASTA file to be created' )
    parser.add_argument('-t', '--type', type=str, required=False, default='protein', choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option' )
    parser.add_argument('-ft', '--feature_type', type=str, required=False, default='mRNA', choices=['mRNA', 'polypeptide'], help='IDs and coordinates will come from this feature type' )
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.add_argument('--check_internal_stops', dest='check_internal_stops', action='store_true')
    parser.set_defaults(check_ends=False, check_internal_stops=False)
    args = parser.parse_args()

    # sanity option check - done before the output file is opened so that a rejected
    #  option combination doesn't leave an empty file (and an open handle) behind
    if args.check_internal_stops and args.type == 'cds':
        raise Exception("Error:  Checking internal stops for CDS features not currently supported.")

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # only doing the standard codon table for now
    start_codons = ['ATG', 'GTG', 'TTG']
    stop_codons  = ['TAG', 'TAA', 'TGA']

    ## add sequence residues from external FASTA file if the user passed one
    if args.fasta is not None:
        utils.add_assembly_fasta(assemblies, args.fasta)

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                if args.feature_type == 'mRNA':
                    feats = gene.mRNAs()
                elif args.feature_type == 'polypeptide':
                    feats = gene.polypeptides()

                for feat in feats:
                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = feat.id
                    export_header = None

                    ## Add the gene product name if there is one
                    if args.feature_type == 'mRNA':
                        for polypeptide in feat.polypeptides():
                            if polypeptide.annotation is not None:
                                if polypeptide.annotation.product_name is not None:
                                    export_header = polypeptide.annotation.product_name
                                    break

                        coding_seq = feat.get_CDS_residues(for_translation=True)
                        if feat.locus_tag is not None:
                            export_id = feat.locus_tag

                    elif args.feature_type == 'polypeptide':
                        # same guard as the mRNA branch: un-annotated polypeptides have no product name
                        if feat.annotation is not None:
                            export_header = feat.annotation.product_name

                        coding_seq = feat.parent.get_CDS_residues(for_translation=True)
                        if feat.parent.locus_tag is not None:
                            export_id = feat.parent.locus_tag

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, feat.id))

                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, feat.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(utils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = utils.translate(coding_seq)

                        if args.check_internal_stops:
                            # the final residue is excluded so a terminal stop isn't counted
                            internal_stop_count = translated_seq[:-1].count('*')
                            if internal_stop_count > 0:
                                sys.stderr.write("Found {0} internal stops in mRNA {1}\n".format(internal_stop_count, feat.id))

                        fout.write("{0}\n".format(utils.wrapped_fasta(translated_seq)))
    finally:
        # flush and release the output file, but never close the shared STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """
    Parses multiple sources of evidence to generate a consensus functional annotation.

    Each optional evidence list (HMM, BLAST/RAPSEARCH2 vs SWISS-PROT, TrEMBL, KEGG, UniRef100,
    TMHMM) is applied in turn to the polypeptides read from --input_fasta, and the result is
    written in the requested --format to --output_file (or STDOUT).  A '<output_file>.sources.log'
    file is written alongside.

    Raises an Exception when an option is passed without the companion option it depends on
    (evidence without its SQLite3 db, gff3 export without --source_gff, --export_organism_names
    without --output_file, RAPSEARCH2 UniRef100 evidence without --uniref100_fasta).
    """
    parser = argparse.ArgumentParser( description='Parses multiple sources of evidence to generate a consensus functional annotation')

    ## output file to be written
    parser.add_argument('-f', '--input_fasta', type=str, required=True, help='Protein FASTA file of source molecules' )
    parser.add_argument('-m', '--hmm_htab_list', type=str, required=False, help='List of htab files from hmmpfam3' )
    parser.add_argument('-bs', '--blast_sprot_btab_list', type=str, required=False, help='List of btab files from BLAST against UniProtKB/SWISS-PROT' )
    parser.add_argument('-rs', '--rapsearch_sprot_btab_list', type=str, required=False, help='List of m8 files from RAPSEARCH2 against UniProtKB/SWISS-PROT' )
    parser.add_argument('-bt', '--blast_trembl_btab_list', type=str, required=False, help='List of btab files from BLAST against UniProtKB/Trembl' )
    parser.add_argument('-bk', '--blast_kegg_btab_list', type=str, required=False, help='List of btab files from BLAST against KEGG' )
    parser.add_argument('-bu100', '--blast_uniref100_btab_list', type=str, required=False, help='List of btab files from BLAST against UniRef100' )
    parser.add_argument('-ru100', '--rapsearch_uniref100_btab_list', type=str, required=False, help='List of m8 files from RAPSEARCH2 against UniRef100' )
    parser.add_argument('-u100f', '--uniref100_fasta', type=str, required=False, help='Only required if also passing RAPSEARCH2 against UniRef100 evidence' )
    parser.add_argument('-tm', '--tmhmm_raw_list', type=str, required=False, help='List of raw files from a tmhmm search' )
    parser.add_argument('-d', '--hmm_db', type=str, required=False, help='SQLite3 db with HMM information' )
    parser.add_argument('-u', '--uniprot_sprot_db', type=str, required=False, help='SQLite3 db with UNIPROT/SWISSPROT information' )
    parser.add_argument('-ur', '--uniref_db', type=str, required=False, help='SQLite3 db with UNIREF information' )
    parser.add_argument('-a', '--format', type=str, required=False, default='tab', help='Output format.  Current options are: "tab", "fasta", "gff3"' )
    parser.add_argument('-s', '--source_gff', type=str, required=False, help='Source GFF file from which proteins were derived.  Required if you want to export any format other than tab-delimited.' )
    parser.add_argument('-e', '--blast_eval_cutoff', type=float, required=False, default=1e-5, help='Skip BLAST hits unless they have an E-value at least as low as this' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    parser.add_argument('-r', '--organism_table', type=str, required=False, help='Optional table with counts of organism frequency based on top BLAST match for each protein' )
    parser.add_argument('-g', '--genomic_fasta', type=str, required=False, help='If passed, the genomic FASTA sequence will be included in the exported GFF3')
    parser.add_argument('-eon', '--export_organism_names', help='If passed, includes organism names from top BLAST hit into 9th column when available.  Mostly useful for metagenomic samples.', action='store_true')
    args = parser.parse_args()

    check_arguments(args)

    # If --rapsearch_uniref100_btab_list passed, --uniref100_fasta is required
    if args.rapsearch_uniref100_btab_list is not None:
        if args.uniref100_fasta is None:
            raise Exception("ERROR: --uniref100_fasta required if --rapsearch_uniref100_btab_list is passed")

    # The GFF3 writer needs the parsed source annotation; fail here rather than with a
    #  NameError after all of the evidence has been parsed.
    if args.format == 'gff3' and args.source_gff is None:
        raise Exception("ERROR: --source_gff is required when exporting with --format gff3")

    # Organism names are appended to the output file after it is written, so there must be one.
    #  Checked up front so the user doesn't wait for a full run before seeing the error.
    if args.export_organism_names is True and not args.output_file:
        raise Exception("ERROR: an --output_file must be specified when using the --export_organism_names option.")

    # only populated if --source_gff was passed
    assemblies = None
    features = None

    sources_log_fh = open("{0}.sources.log".format(args.output_file), 'wt')

    try:
        # this is a dict of biothings.Polypeptide objects
        polypeptides = initialize_polypeptides( sources_log_fh, args.input_fasta )

        # Keyed on polypeptide ID (from the FASTA, which is actually the mRNA gff feature ID), the
        #  values here are the organism name for the top BLAST match of each
        polypeptide_blast_org = dict()

        # get source structural annotation, if necessary:
        if args.source_gff is not None:
            print("INFO: parsing source GFF")
            (assemblies, features) = gff.get_gff3_features(args.source_gff)

        if args.hmm_htab_list is not None:
            # has to be checked BEFORE connecting, else sqlite3 fails first with a less helpful error
            if args.hmm_db is None:
                raise Exception("ERROR: You specified HMM results but not the db with the -d option")

            # connection to the HMM-associated SQLite3 database
            hmm_db_conn = sqlite3.connect(args.hmm_db)
            hmm_db_curs = hmm_db_conn.cursor()

            print("INFO: parsing HMM evidence")
            parse_hmm_evidence( sources_log_fh, polypeptides, args.hmm_htab_list, hmm_db_curs )
            hmm_db_curs.close()
            hmm_db_conn.close()

        if args.blast_sprot_btab_list is not None:
            if args.uniprot_sprot_db is None:
                raise Exception("ERROR: You specified BLAST evidence vs UnitProt/SwissProt results but not the db with the -u option")

            # connection to the UniProt_Sprot SQLite3 database
            usp_db_conn = sqlite3.connect(args.uniprot_sprot_db)
            usp_db_curs = usp_db_conn.cursor()
            print("INFO: parsing BLAST (SWISS-PROT) evidence")
            parse_sprot_blast_evidence( sources_log_fh, polypeptides, polypeptide_blast_org, args.blast_sprot_btab_list, usp_db_curs, args.blast_eval_cutoff, 'blast' )
            usp_db_curs.close()
            usp_db_conn.close()

        if args.rapsearch_sprot_btab_list is not None:
            if args.uniprot_sprot_db is None:
                raise Exception("ERROR: You specified RAPSEARCH2 evidence vs UnitProt/SwissProt results but not the db with the -u option")

            # connection to the UniProt_Sprot SQLite3 database
            usp_db_conn = sqlite3.connect(args.uniprot_sprot_db)
            usp_db_curs = usp_db_conn.cursor()
            print("INFO: parsing RAPSEARCH2 (SWISS-PROT) evidence")
            parse_sprot_blast_evidence( sources_log_fh, polypeptides, polypeptide_blast_org, args.rapsearch_sprot_btab_list, usp_db_curs, args.blast_eval_cutoff, 'rapsearch2' )
            usp_db_curs.close()
            usp_db_conn.close()

        if args.blast_trembl_btab_list is not None:
            print("INFO: parsing BLAST (TrEMBL) evidence")
            parse_trembl_blast_evidence(polypeptides, args.blast_trembl_btab_list, args.blast_eval_cutoff)

        if args.blast_kegg_btab_list is not None:
            print("INFO: parsing BLAST (KEGG) evidence")
            parse_kegg_blast_evidence(sources_log_fh, polypeptides, args.blast_kegg_btab_list, args.blast_eval_cutoff)

        if args.blast_uniref100_btab_list is not None:
            if args.uniref_db is None:
                raise Exception("ERROR: You specified BLAST evidence vs UniRef100 results but not the db with the -ur option")

            print("INFO: parsing BLAST (UniRef100) evidence")
            # connection to the UniRef SQLite3 database
            uniref_db_conn = sqlite3.connect(args.uniref_db)
            uniref_db_curs = uniref_db_conn.cursor()
            parse_uniref100_blast_evidence(sources_log_fh, polypeptides, args.blast_uniref100_btab_list, uniref_db_curs, args.blast_eval_cutoff, 'blast', args.uniref100_fasta)
            uniref_db_curs.close()
            uniref_db_conn.close()

        if args.rapsearch_uniref100_btab_list is not None:
            if args.uniref_db is None:
                raise Exception("ERROR: You specified RAPSEARCH2 evidence vs UniRef100 results but not the db with the -ur option")

            print("INFO: parsing RAPSEARCH2 (UniRef100) evidence")
            # connection to the UniRef SQLite3 database
            uniref_db_conn = sqlite3.connect(args.uniref_db)
            uniref_db_curs = uniref_db_conn.cursor()
            parse_uniref100_blast_evidence(sources_log_fh, polypeptides, args.rapsearch_uniref100_btab_list, uniref_db_curs, args.blast_eval_cutoff, 'rapsearch2', args.uniref100_fasta)
            uniref_db_curs.close()
            uniref_db_conn.close()

        if args.tmhmm_raw_list is not None:
            print("INFO: parsing TMHMM evidence")
            parse_tmhmm_evidence(sources_log_fh, polypeptides, args.tmhmm_raw_list)

        ## output will either be a file or STDOUT
        print("INFO: writing output")
        fout = sys.stdout
        if args.output_file is not None:
            fout = open(args.output_file, 'wt')

        try:
            if args.format == 'tab':
                write_tab_results(fout, polypeptides)
            elif args.format == 'fasta':
                write_fasta_results(fout, polypeptides)
            elif args.format == 'gff3':
                write_gff3_results(fout, polypeptides, assemblies, features, args.genomic_fasta)
        finally:
            # the file must be closed before organism names are appended to it below,
            #  but STDOUT is shared and must stay open
            if fout is not sys.stdout:
                fout.close()
    finally:
        sources_log_fh.close()

    ## There isn't a method in biocodegff3 to add arbitrary key=value pairs.  So we have to cheat here.
    if args.export_organism_names is True:
        append_organism_names_to_gff(args.output_file, polypeptide_blast_org)

    if args.organism_table is not None:
        create_organism_table(args.organism_table, polypeptide_blast_org)
Beispiel #28
0
def main():
    """
    Reports mRNAs whose CDS translation lacks a terminal stop codon.

    For each such mRNA the genome is scanned codon-by-codon past the end of the CDS, up to
    100 bp beyond the mRNA boundary, looking for an in-frame stop.  Progress is printed to
    STDOUT, followed by summary counts.  This version only reports; no GFF3 is written
    (--output_gff is accepted but not used in this function).
    """
    parser = argparse.ArgumentParser(
        description=
        'Reports mRNAs whose CDS lacks a terminal stop codon and whether an in-frame stop can be found by a limited extension.'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']

    # used to reverse-complement codons read for reverse-strand transcripts
    complement_table = str.maketrans('ACGTacgt', 'TGCAtgca')

    mRNA_extension_limit = 100
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                mRNA_loc = mRNA.location_on(assembly)
                residues = assembly.residues

                CDSs = sorted(mRNA.CDSs())
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    CDS_pos = CDSs[-1].location_on(assembly).fmax
                    mRNA_limit = mRNA_loc.fmax + mRNA_extension_limit
                else:
                    CDS_pos = CDSs[0].location_on(assembly).fmin
                    mRNA_limit = mRNA_loc.fmin - mRNA_extension_limit
                    codon_step_size = -3

                print("\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(
                    mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos), end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                if codon_step_size < 0:
                    CDS_pos += codon_step_size

                while True:
                    if (codon_step_size < 0 and CDS_pos < mRNA_limit) or (
                            codon_step_size > 0 and CDS_pos > mRNA_limit):
                        print(" Reached the mRNA limit")
                        break

                    # A window hanging off either end of the molecule can't hold a full codon
                    #  (and a negative start would make the slice wrap around).
                    if CDS_pos < 0 or CDS_pos + 3 > len(residues):
                        print(" Reached the end of the molecule")
                        break

                    next_codon = residues[CDS_pos:CDS_pos + 3]

                    # The stop codon list is in transcript orientation, so codons of
                    #  reverse-strand transcripts must be reverse-complemented first.
                    if codon_step_size < 0:
                        next_codon = next_codon.translate(complement_table)[::-1]

                    print(".{0}({1})".format(next_codon, CDS_pos), end='')

                    # compared in upper case so lower-case (soft-masked) sequence is handled too
                    if next_codon.upper() in stop_codons:
                        new_stop_found = True
                        print(" Found a stop")
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(
        mRNAs_with_terminal_stops))
    print("mRNAs which can be corrected: {0}".format(mRNAs_corrected))
Beispiel #29
0
def main():
    """
    Counts how the GO terms annotated in a GFF3 file map onto a set of slim target terms.

    The slim targets are the terms directly connected (by incoming edges) to
    GO:0008150 (biological_process) in the parsed ontology graph.  Each unique source GO term
    is assigned to whichever target(s) it has the shortest graph path to; terms which are not
    in the ontology or reach no target are counted as 'unknown'.  Counts are printed to STDOUT.

    NOTE(review): --slim_terms is required but not read here, and --input_file is parsed
    as GFF3 even though its help text describes a one-term-per-line file.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Input file with one term per line. This is your file with ALL terms.' )
    parser.add_argument('-s', '--slim_terms', type=str, required=True, help='Plain text file with one term per line - your slim.' )
    parser.add_argument('-obo', '--ontology_file', type=str, required=True, help='Full obo file, providing the network of terms' )
    args = parser.parse_args()

    # Parse the OBO file and store a term lookup as well as graphs for each namespace
    terms, g = parse_obo_graph(args.ontology_file)
    bp_graph = g['biological_process']

    # parse list of source GO terms, counting how often each is annotated
    source_go_terms = dict()
    assemblies, features = gff.get_gff3_features(args.input_file)
    for assembly_id in assemblies:
        for gene in assemblies[assembly_id].genes():
            for mRNA in gene.mRNAs():
                for polypeptide in mRNA.polypeptides():
                    annot = polypeptide.annotation

                    # polypeptides without functional annotation have no GO terms to count
                    if annot is None:
                        continue

                    for go_annot in annot.go_annotations:
                        go_id = go_annot.go_id
                        source_go_terms[go_id] = source_go_terms.get(go_id, 0) + 1

    print("Slimming {0} unique source GO terms".format(len(source_go_terms)))

    # For testing, show all descendents of
    # GO:0008150 - biological_process
    # GO:0005575 - cellular_component
    # GO:0003674 - molecular_function
    biological_process_idx = terms['GO:0008150']['idx']
    slim_targets = {'unknown': 0}
    for edge_id in bp_graph.incident(biological_process_idx, mode='IN'):
        # only looking at those getting more specific: the source vertex of each
        #  incoming edge becomes a slim target
        slim_targets[bp_graph.es[edge_id].source] = 0

    # how many do we find of each target?  Each unique source term is counted once,
    #  regardless of how many times it was annotated.
    for source_id in source_go_terms:
        # IDs are stored without the namespace prefix the ontology lookup is keyed on
        source_id = "GO:{0}".format(source_id)
        matching_targets = list()
        best_path_dist = 1000

        # check this annotated GO ID against the SLIM targets
        if source_id in terms:
            for target_idx in slim_targets:
                if target_idx != 'unknown':
                    paths = bp_graph.shortest_paths_dijkstra(source=terms[source_id]['idx'], target=target_idx)
                    path_dist = paths[0][0]

                    # infinite distance means the target can't be reached from this term
                    if path_dist != float('inf'):
                        if path_dist < best_path_dist:
                            best_path_dist = path_dist
                            matching_targets = [target_idx]
                        elif path_dist == best_path_dist:
                            # ties are credited to every equally-close target
                            matching_targets.append(target_idx)

        if len(matching_targets) > 0:
            for t_idx in matching_targets:
                slim_targets[t_idx] += 1
        else:
            slim_targets['unknown'] += 1

    print("Slim counts:")
    for target_key in slim_targets:
        print("\t{0} - {1}".format(target_key, slim_targets[target_key]))
Beispiel #30
0
def main():
    """Extend CDS features lacking a terminal stop to the next in-frame stop.

    Reads a GFF3 file (plus an optional genome FASTA), translates the CDS of
    every mRNA and, for each one whose translation does not end in '*',
    walks codon by codon away from the CDS end along the assembly residues
    until a stop codon is found, the position passes --extension_limit bases
    beyond the mRNA boundary, or the position drops below 1.  Progress is
    printed to STDOUT.  The (possibly extended) models are written as GFF3
    only when --output_gff is supplied.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    parser.add_argument(
        '-el',
        '--extension_limit',
        type=int,
        required=False,
        default=100,
        help=
        'Optional.  Limits how far an extension will happen looking for an in-frame stop codon'
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id,
                                                   assembly.length))
        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(
                    mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print(
                    "\tcoding sequence ends with {0}, last three a.a.: {1}"
                    .format(coding_seq[-3:], translation[-3:]))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())
                # bases left over after the last complete codon
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(
                        assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(
                        assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print(
                    "\tmRNA:{0}-{1} ({3}), CDS end: {2}.  Extending ... \n\t"
                    .format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos,
                            mRNA_loc.strand),
                    end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (
                            mRNA_loc.strand == -1
                            and CDS_pos < mRNA_limit):
                        print("  Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        print("  Reached beginning of the molecule")
                        break
                    else:
                        next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                        if mRNA_loc.strand == -1:
                            next_codon = utils.reverse_complement(next_codon)
                            print(".{0}({1}-{2})".format(
                                next_codon, CDS_pos, CDS_pos - 3),
                                  end='')
                        else:
                            print(".{0}({1}-{2})".format(
                                next_codon, CDS_pos - 3, CDS_pos),
                                  end='')

                        if next_codon in stop_codons:
                            if mRNA_loc.strand == 1:
                                mRNA.extend_stop(on=assembly,
                                                 to=(CDS_pos + 3))
                                print(
                                    " Found a stop, extending to: {0} ({1})"
                                    .format(CDS_pos + 3, mRNA_loc.strand))
                            else:
                                mRNA.extend_stop(on=assembly, to=CDS_pos)
                                print(
                                    " Found a stop, extending to: {0} ({1})"
                                    .format(CDS_pos, mRNA_loc.strand))

                            new_stop_found = True
                            break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(
        mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # --output_gff is optional: only write (and always close) the GFF3 when
    # the user asked for it, instead of crashing on open(None)
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Export the protein or CDS sequence of each mRNA/polypeptide as FASTA.

    Parses the GFF3 given by --input_file (optionally attaching residues
    from --fasta), then writes one FASTA record per feature of the chosen
    --feature_type to --output_file, or to STDOUT when no output file is
    given.  The record ID is the feature ID, overridden by the (parent)
    mRNA locus tag when one exists; the header text is the polypeptide
    product name when available.  Features without CDS residues are skipped
    with a warning on STDERR.

    Raises:
        Exception: if --check_internal_stops is combined with --type cds.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS sequences from a GFF3 file')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        required=False,
                        default='protein',
                        choices=['protein', 'cds'],
                        help='Type of features to export')
    parser.add_argument(
        '-f',
        '--fasta',
        type=str,
        required=False,
        help=
        'If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option'
    )
    parser.add_argument(
        '-ft',
        '--feature_type',
        type=str,
        required=False,
        default='mRNA',
        choices=['mRNA', 'polypeptide'],
        help='IDs and coordinates will come from this feature type')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.add_argument('--check_internal_stops',
                        dest='check_internal_stops',
                        action='store_true')
    parser.set_defaults(check_ends=False, check_internal_stops=False)
    args = parser.parse_args()

    # sanity option check (done before the output file is created so an
    # invalid option combination does not leave an empty file behind)
    if args.check_internal_stops and args.type == 'cds':
        raise Exception(
            "Error:  Checking internal stops for CDS features not currently supported."
        )

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # only doing the standard codon table for now
    start_codons = ['ATG', 'GTG', 'TTG']
    stop_codons = ['TAG', 'TAA', 'TGA']

    ## add sequence residues from external FASTA file if the user passed one
    if args.fasta is not None:
        utils.add_assembly_fasta(assemblies, args.fasta)

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                if args.feature_type == 'mRNA':
                    feats = gene.mRNAs()
                elif args.feature_type == 'polypeptide':
                    feats = gene.polypeptides()

                for feat in feats:
                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = feat.id
                    export_header = None

                    ## Add the gene product name if there is one
                    if args.feature_type == 'mRNA':
                        for polypeptide in feat.polypeptides():
                            annot = polypeptide.annotation
                            if annot is not None and annot.product_name is not None:
                                export_header = annot.product_name
                                break

                        coding_seq = feat.get_CDS_residues(
                            for_translation=True)
                        if feat.locus_tag is not None:
                            export_id = feat.locus_tag

                    elif args.feature_type == 'polypeptide':
                        # a polypeptide may carry no annotation at all;
                        # mirror the guard used for the mRNA case
                        if feat.annotation is not None:
                            export_header = feat.annotation.product_name
                        coding_seq = feat.parent.get_CDS_residues(
                            for_translation=True)
                        if feat.parent.locus_tag is not None:
                            export_id = feat.parent.locus_tag

                    if len(coding_seq) == 0:
                        print(
                            "WARNING: Skipped feature {0} because it had no associated CDS features"
                            .format(export_id),
                            file=sys.stderr)
                        continue

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical start codon ({0}) in mRNA {1}\n"
                                .format(start_codon, feat.id))

                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical stop codon ({0}) in mRNA {1}\n"
                                .format(stop_codon, feat.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(
                            utils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = utils.translate(coding_seq)

                        if args.check_internal_stops:
                            # the final residue is excluded so a terminal
                            # stop is not counted as internal
                            internal_stop_count = translated_seq[:-1].count(
                                '*')
                            if internal_stop_count > 0:
                                sys.stderr.write(
                                    "Found {0} internal stops in mRNA {1}\n".
                                    format(internal_stop_count, feat.id))

                        fout.write("{0}\n".format(
                            utils.wrapped_fasta(translated_seq)))
    finally:
        # never close STDOUT, but do release a file we opened ourselves
        if fout is not sys.stdout:
            fout.close()
Beispiel #32
0
def main():
    """Randomly split the mRNAs of a GFF3 into training and evaluation sets.

    Collects every mRNA from --input_file (optionally skipping those with
    more exons than --max_exon_count), draws --training_set_size of them at
    random, then draws --evaluation_set_size more from the remainder, and
    hands each selection to export_mRNAs_to_file() for writing.

    Raises:
        Exception: if --retain_composition is passed (not implemented), or
            if fewer acceptable mRNAs exist than the two set sizes combined.
    """
    parser = argparse.ArgumentParser(
        description='Split an annotation GFF3 into training and evaluation sets'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-ot',
                        '--output_training_file',
                        type=str,
                        required=True,
                        help='GFF3 file to be created with the training genes')
    parser.add_argument(
        '-oe',
        '--output_evaluation_file',
        type=str,
        required=True,
        help='GFF3 file to be created with the evaluation genes')
    parser.add_argument('-ts',
                        '--training_set_size',
                        type=int,
                        required=False,
                        default=200,
                        help='Number of transcripts to select for training')
    parser.add_argument('-es',
                        '--evaluation_set_size',
                        type=int,
                        required=False,
                        default=100,
                        help='Number of transcripts to select for evaluation')
    parser.add_argument('-me',
                        '--max_exon_count',
                        type=int,
                        required=False,
                        help='Skips any mRNAs with more exons than this')
    parser.add_argument('--retain_composition',
                        dest='retain_composition',
                        action='store_true')
    parser.add_argument('--no_retain_composition',
                        dest='retain_composition',
                        action='store_false')
    parser.set_defaults(retain_composition=False)
    args = parser.parse_args()

    if args.retain_composition is True:
        raise Exception(
            "ERROR: --retain_composition option not yet implemented")

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # key: exon count, value = list of mRNA objects with that count
    # which of these gets used depends on whether --retain_composition is passed
    mRNAs_by_exon_count = defaultdict(list)
    mRNAs = list()
    mRNA_count = 0

    for asm_id in assemblies:
        for gene in assemblies[asm_id].genes():
            for mRNA in gene.mRNAs():
                exon_count = mRNA.exon_count()

                if args.max_exon_count is not None and exon_count > args.max_exon_count:
                    continue

                mRNA_count += 1

                if args.retain_composition is True:
                    mRNAs_by_exon_count[exon_count].append(mRNA)
                else:
                    mRNAs.append(mRNA)

    # sanity check on the number of available mRNAs
    if (args.training_set_size + args.evaluation_set_size) > mRNA_count:
        raise Exception(
            "ERROR: acceptable mRNA count ({0}) is less than combined training_set_size ({1}) and evaluation_set_size ({2}) options"
            .format(mRNA_count, args.training_set_size,
                    args.evaluation_set_size))

    training_mRNAs = list()
    evaluation_mRNAs = list()

    if args.retain_composition is True:
        # placeholder for the not-yet-implemented composition-aware split
        print("DEBUG: retaining composition")
    else:
        training_mRNAs = random.sample(mRNAs, args.training_set_size)
        # the evaluation pool is everything not already picked for training
        unselected_mRNAs = list(set(mRNAs) - set(training_mRNAs))
        evaluation_mRNAs = random.sample(unselected_mRNAs,
                                         args.evaluation_set_size)

    export_mRNAs_to_file(training_mRNAs, args.output_training_file)
    export_mRNAs_to_file(evaluation_mRNAs, args.output_evaluation_file)
def main():
    """Convert BLAST / RAPSearch2 tabular hits to BED lines.

    Each data row with at least 12 tab-separated columns becomes one BED
    line: molecule, start, stop, subject ID (column 2), score, strand.
    Without --map_to_gff_coords the query ID and the query start/end
    columns are used as-is with strand '+'.  With it, the query ID is looked
    up in the GFF3 feature set and coordinates are projected onto that
    feature's molecule (three bases per residue).

    The score is the bit score (column 12) or, for any other --score_by
    value, derived from the e-value column (column 11); either way it is
    clamped to the range 0-1000.  An e-value of exactly 0 has no logarithm
    and is given the maximum score of 1000.

    Raises:
        Exception: if the input file does not exist, or a query ID is
            missing from the GFF3 feature set when mapping coordinates.
    """
    parser = argparse.ArgumentParser( description='Converts BLAST and RAPSearch2 tabular output to BED')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-m', '--map_to_gff_coords', type=str, required=False, help='GFF3 file to use to map coordinates onto genomic space' )
    parser.add_argument('-s', '--score_by', type=str, required=False, default='bitscore', help='Populates the score column by bitscore or evalue' )
    args = parser.parse_args()

    if not os.path.exists(args.input_file):
        raise Exception("Input file passed {0} wasn't found".format(args.input_file))

    features = None
    if args.map_to_gff_coords is not None:
        (assemblies, features) = gff.get_gff3_features(args.map_to_gff_coords)

    # set once a comment line mentioning RAPSearch has been seen
    rapsearch_detected = False

    with open(args.input_file) as ifh, open(args.output_file, 'wt') as ofh:
        for line in ifh:
            if line.startswith('#'):
                if 'RAPSearch' in line:
                    rapsearch_detected = True

                continue

            cols = line.rstrip().split("\t")
            if len(cols) < 12:
                continue

            if args.map_to_gff_coords is None:
                start = cols[6]
                stop = cols[7]
                mol_id = cols[0]
                strand = '+'
            else:
                if cols[0] not in features:
                    raise Exception("ERROR: Failed to find feature {0} in GFF3 feature set".format(cols[0]))

                feat_loc = features[cols[0]].location()
                strand = '+' if feat_loc.strand == 1 else '-'

                protein_length = int(cols[7]) - int(cols[6]) + 1

                start = feat_loc.fmin + ((int(cols[6]) - 1) * 3)
                stop = start + ((protein_length + 1) * 3)

                mol_id = feat_loc.on.id

            if args.score_by == 'bitscore':
                adj_score = int(float(cols[11]))
            else:
                e_val = float(cols[10])
                if rapsearch_detected:
                    # the RAPSearch value is negated directly, without a log
                    # NOTE(review): presumably this column is already log-scaled - confirm
                    adj_score = int(0 - e_val)
                elif e_val <= 0:
                    # an e-value of 0.0 has no logarithm (math.log would
                    # raise ValueError); treat it as the best possible score
                    adj_score = 1000
                else:
                    adj_score = int(0 - math.log(e_val))

            # clamp the score to the 0-1000 range
            adj_score = max(0, min(1000, adj_score))

            ofh.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(mol_id, start, stop, cols[1], adj_score, strand))
def main():
    """Select training gene IDs from agreement between several evidence sets.

    Parses GeneMark-ES, CEGMA and AAT GFF3 files (and optional expression
    alignments) onto a shared set of assemblies, then finds the genes on
    which the sources agree:

      * GeneMark-ES + CEGMA (same coordinates and exon structure)
      * of those, the ones sharing CDS structure with an expression gene
      * GeneMark-ES + AAT (stop-tolerant exon structure match)
      * GeneMark-ES + CEGMA + AAT
      * CEGMA + AAT where no GeneMark-ES gene matched the CEGMA gene

    Each group's IDs are written to a fixed-name '*.ids' file in the current
    directory.  The groups are then passed, in the order used below, to
    recruit_training_genes() (defined elsewhere; presumably it appends IDs
    up to --max_genes) and the resulting ID list is written to
    --output_file.
    """

    parser = argparse.ArgumentParser( description='Selects training gene IDs supported by agreement between GeneMark-ES, CEGMA, AAT and (optionally) expression evidence')

    ## output file to be written
    parser.add_argument('-g', '--genemark', type=str, required=True, help='Path to the results from GeneMark-ES' )
    parser.add_argument('-c', '--cegma', type=str, required=True, help='Path to the results from CEGMA, converted to GFF3' )
    parser.add_argument('-a', '--aat', type=str, required=True, help='Path to the results from AAT, converted to GFF3' )
    parser.add_argument('-e', '--expression', type=str, required=False, help='Any expression data aligned using GMAP (in gff3_gene mode)' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-m', '--max_genes', type=int, required=False, help='Limits gene IDs exported to the top N by strongest evidence class' )
    args = parser.parse_args()

    print("INFO: parsing Genemark-ES data")
    (assemblies, gm_es_features) = gff.get_gff3_features(args.genemark)
    gm_es_genes = get_genes_from_dict(gm_es_features)
    print("\tINFO: Got {0} Genemark-ES genes".format(len(gm_es_genes)))

    print("INFO: parsing CEGMA data")
    (assemblies, cegma_features) = gff.get_gff3_features(args.cegma, assemblies=assemblies)
    cegma_genes = get_genes_from_dict(cegma_features)
    print("\tINFO: Got {0} CEGMA genes".format(len(cegma_genes)))

    print("INFO: parsing AAT results")
    (assemblies, aat_features) = gff.get_gff3_features(args.aat, assemblies=assemblies)
    aat_genes = get_genes_from_dict(aat_features)
    print("\tINFO: Got {0} AAT 'genes'".format(len(aat_genes)))

    expression_genes = list()
    if args.expression is not None:
        print("INFO: parsing expression results")
        (assemblies, expression_features) = gff.get_gff3_features(args.expression, assemblies=assemblies)
        expression_genes = get_genes_from_dict(expression_features)
        print("\tINFO: Got {0} expression 'genes'".format(len(expression_genes)))

    #############################################################################
    ## GeneMark-ES genes with an identical CEGMA gene
    genemark_cegma_shared_genes = list()

    with open('gmes_cegma.shared.ids', 'wt') as gmes_cegma_fh:
        for gm_es_gene in gm_es_genes:
            for cegma_gene in cegma_genes:
                if gm_es_gene.has_same_coordinates_as( thing=cegma_gene ):
                    if gm_es_gene.shares_exon_structure_with( thing=cegma_gene ) == True:
                        genemark_cegma_shared_genes.append(gm_es_gene)
                        gmes_cegma_fh.write("{0}\n".format(gm_es_gene.id))
                        break

    print("\n{0} genes were shared perfectly between Genemark-ES and CEGMA".format(len(genemark_cegma_shared_genes)) )

    #############################################################################
    ## ... of which also share CDS structure with an expression gene
    genemark_cegma_expression_shared_genes = list()

    with open('gmes_cegma_exp.shared.ids', 'wt') as gmes_cegma_exp_fh:
        for gm_es_gene in genemark_cegma_shared_genes:
            for exp_gene in expression_genes:
                if gm_es_gene.shares_CDS_structure_with( exp_gene ):
                    genemark_cegma_expression_shared_genes.append(gm_es_gene)
                    # record the ID like every other group does; previously
                    # this file was created but always left empty
                    gmes_cegma_exp_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("{0} genes were shared perfectly between Genemark-ES and CEGMA and expression data".format(len(genemark_cegma_expression_shared_genes)) )

    #############################################################################
    ## GeneMark-ES genes with a (stop-tolerant) AAT match
    genemark_aat_shared_genes = list()

    with open('gmes_aat.shared.ids', 'wt') as gmes_aat_fh:
        for gm_es_gene in gm_es_genes:
            for aat_gene in aat_genes:
                if gm_es_gene.shares_exon_structure_with( thing=aat_gene, stop_tolerant=True ) == True:
                    genemark_aat_shared_genes.append(gm_es_gene)
                    gmes_aat_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("{0} Genemark-ES genes had an exact AAT match".format(len(genemark_aat_shared_genes)) )

    ##############################################################################
    ## CEGMA genes matching a GeneMark-ES gene, and the subset also seen by AAT
    cegma_matching_gm_es = list()
    genemark_aat_cegma_shared_genes = list()

    with open('gmes_aat_cegma.shared.ids', 'wt') as gmes_aat_cegma_fh:
        for cegma_gene in cegma_genes:
            match_found = False

            for gm_es_gene in gm_es_genes:
                if cegma_gene.has_same_coordinates_as( thing=gm_es_gene ):
                    if cegma_gene.shares_exon_structure_with( thing=gm_es_gene ) == True:
                        match_found = True

                        if gm_es_gene in genemark_aat_shared_genes and gm_es_gene not in genemark_aat_cegma_shared_genes:
                            genemark_aat_cegma_shared_genes.append(gm_es_gene)
                            gmes_aat_cegma_fh.write("{0}\n".format(gm_es_gene.id))

                        break

            if match_found:
                cegma_matching_gm_es.append(cegma_gene)

    print("{0} genes with GeneMark-ES, CEGMA and AAT agreement".format(len(genemark_aat_cegma_shared_genes)) )

    with open('training_gene.ids', 'wt') as training_fh:
        for gene in genemark_aat_cegma_shared_genes:
            training_fh.write("{0}\n".format(gene.id) )

    ##############################################################################
    ## CEGMA genes lacking a GeneMark-ES match but supported by AAT
    cegma_with_aat_not_gm_es = list()

    with open('cegma_aat_nogmes.shared.ids', 'wt') as cegma_aat_nogmes_fh:
        for cegma_gene in cegma_genes:
            if cegma_gene in cegma_matching_gm_es:
                continue

            for aat_gene in aat_genes:
                if cegma_gene.shares_exon_structure_with( thing=aat_gene ) == True:
                    cegma_with_aat_not_gm_es.append(cegma_gene)
                    cegma_aat_nogmes_fh.write("{0}\n".format(cegma_gene.id))
                    break

    print("{0} CEGMA genes had no GeneMark-ES match but did have an AAT one".format(len(cegma_with_aat_not_gm_es)) )


    ##############################################################################
    ## now to assemble the results
    training_ids = list()

    # 0. Start with genes shared between GeneMark-ES, CEGMA and expression evidence
    recruit_training_genes( training_ids, genemark_cegma_expression_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES, CEGMA and expression data".format(len(training_ids)))

    # 1. Pull in the genes with shared evidence across GeneMark-ES, CEGMA and AAT
    recruit_training_genes( training_ids, genemark_aat_cegma_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES, CEGMA and AAT".format(len(training_ids)))

    # 2. Next include the CEGMA genes backed by AAT but not by GeneMark-ES
    recruit_training_genes( training_ids, cegma_with_aat_not_gm_es, args.max_genes )
    print("DEBUG: {0} genes after recruitment of CEGMA + AAT without GM-ES".format(len(training_ids)))

    # 3. Then the GeneMark-ES + CEGMA agreements
    recruit_training_genes( training_ids, genemark_cegma_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES + CEGMA".format(len(training_ids)))

    # 4. Finally the GeneMark-ES + AAT agreements
    recruit_training_genes( training_ids, genemark_aat_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES + AAT".format(len(training_ids)))

    with open(args.output_file, 'wt') as output_list_fh:
        for training_id in training_ids:
            output_list_fh.write("{0}\n".format(training_id))
Beispiel #35
0
def main():
    """Extend CDS features lacking a terminal stop to the next in-frame stop.

    Reads a GFF3 file (plus an optional genome FASTA), translates the CDS of
    every mRNA and, for each one whose translation does not end in '*',
    walks codon by codon away from the CDS end along the assembly residues
    until a stop codon is found, the position passes --extension_limit bases
    beyond the mRNA boundary, or the position drops below 1.  Progress is
    printed to STDOUT.  The (possibly extended) models are written as GFF3
    only when --output_gff is supplied.
    """
    parser = argparse.ArgumentParser( description='Extends GFF gene models to the first in-frame stop')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    parser.add_argument('-el', '--extension_limit', type=int, required=False, default=100, help='Optional.  Limits how far an extension will happen looking for an in-frame stop codon')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))
        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}".format(coding_seq[-3:], translation[-3:]))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())
                # bases left over after the last complete codon
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print("\tmRNA:{0}-{1} ({3}), CDS end: {2}.  Extending ... \n\t".format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos, mRNA_loc.strand), end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (mRNA_loc.strand == -1 and CDS_pos < mRNA_limit):
                        print("  Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        print("  Reached beginning of the molecule")
                        break
                    else:
                        next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                        if mRNA_loc.strand == -1:
                            next_codon = utils.reverse_complement(next_codon)
                            print(".{0}({1}-{2})".format(next_codon, CDS_pos, CDS_pos - 3), end='')
                        else:
                            print(".{0}({1}-{2})".format(next_codon, CDS_pos - 3, CDS_pos), end='')

                        if next_codon in stop_codons:
                            if mRNA_loc.strand == 1:
                                mRNA.extend_stop(on=assembly, to=(CDS_pos + 3))
                                print(" Found a stop, extending to: {0} ({1})".format(CDS_pos + 3, mRNA_loc.strand))
                            else:
                                mRNA.extend_stop(on=assembly, to=CDS_pos)
                                print(" Found a stop, extending to: {0} ({1})".format(CDS_pos, mRNA_loc.strand))

                            new_stop_found = True
                            break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))


    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # --output_gff is optional: only write (and always close) the GFF3 when
    # the user asked for it, instead of crashing on open(None)
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Cross-check GeneMark-ES, CEGMA and AAT gene models and write ID lists.

    Three hard-coded GFF3 files are read from the current directory.  One gene
    ID per line is then written to each of these files:

    * ``gmes_cegma.shared.ids`` - GeneMark-ES genes that have the same
      coordinates and exon structure as a CEGMA gene.
    * ``gmes_aat.shared.ids`` - GeneMark-ES genes whose exon structure matches
      an AAT gene (compared with ``stop_tolerant=True``).
    * ``gmes_aat_cegma.shared.ids`` and ``training_gene.ids`` - GeneMark-ES
      genes that matched a CEGMA gene and are also in the AAT-matched list.
    * ``cegma_aat_nogmes.shared.ids`` - CEGMA genes without a GeneMark-ES
      match whose exon structure matches an AAT gene.

    Summary counts are printed to STDOUT.  Every output file is opened in a
    ``with`` block so it is flushed and closed even if a comparison raises.
    """
    gm_es_file = 'genemark_hmm.gff3'
    cegma_file = 'output.cegma.gff3'
    # Alternative AAT inputs used previously:
    #   bail_training_genes.aat.1500maxintron.80percid.gff3
    #   aat.merged.gff3
    aat_file = 'aat.bail_hominis_filtered_training.gff3'

    print("INFO: parsing Genemark-ES data")
    (assemblies, gm_es_features) = gff.get_gff3_features(gm_es_file)
    gm_es_genes = get_genes_from_dict(gm_es_features)
    print("\tINFO: Got {0} Genemark-ES genes".format(len(gm_es_genes)))

    # The assemblies from the first parse are passed into the later parses so
    # that all three feature sets refer to the same assembly collection.
    print("INFO: parsing CEGMA data")
    (assemblies, cegma_features) = gff.get_gff3_features(cegma_file, assemblies=assemblies)
    cegma_genes = get_genes_from_dict(cegma_features)
    print("\tINFO: Got {0} CEGMA genes".format(len(cegma_genes)))

    print("INFO: parsing AAT results")
    (assemblies, aat_features) = gff.get_gff3_features(aat_file, assemblies=assemblies)
    aat_genes = get_genes_from_dict(aat_features)
    print("\tINFO: Got {0} AAT 'genes'".format(len(aat_genes)))

    #############################################################################
    ## GeneMark-ES vs. CEGMA: identical coordinates and exon structure
    genemark_cegma_shared_genes = list()

    with open('gmes_cegma.shared.ids', 'wt') as gmes_cegma_fh:
        for gm_es_gene in gm_es_genes:
            for cegma_gene in cegma_genes:
                if gm_es_gene.has_same_coordinates_as( thing=cegma_gene ):
                    if gm_es_gene.shares_exon_structure_with( thing=cegma_gene ) == True:
                        genemark_cegma_shared_genes.append(gm_es_gene)
                        gmes_cegma_fh.write("{0}\n".format(gm_es_gene.id))
                        # the first matching CEGMA gene is enough
                        break

    print("\n{0} genes were shared perfectly between Genemark-ES and CEGMA".format(len(genemark_cegma_shared_genes)) )

    #############################################################################
    ## GeneMark-ES vs. AAT: exon structure only, stop-tolerant
    genemark_aat_shared_genes = list()

    with open('gmes_aat.shared.ids', 'wt') as gmes_aat_fh:
        for gm_es_gene in gm_es_genes:
            for aat_gene in aat_genes:
                if gm_es_gene.shares_exon_structure_with( thing=aat_gene, stop_tolerant=True ) == True:
                    genemark_aat_shared_genes.append(gm_es_gene)
                    gmes_aat_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("{0} Genemark-ES genes had an exact AAT match".format(len(genemark_aat_shared_genes)) )

    ##############################################################################
    ## CEGMA genes with a GeneMark-ES match, and the subset also backed by AAT
    cegma_matching_gm_es = list()
    genemark_aat_cegma_shared_genes = list()

    with open('gmes_aat_cegma.shared.ids', 'wt') as gmes_aat_cegma_fh:
        for cegma_gene in cegma_genes:
            match_found = False

            for gm_es_gene in gm_es_genes:
                if cegma_gene.has_same_coordinates_as( thing=gm_es_gene ):
                    if cegma_gene.shares_exon_structure_with( thing=gm_es_gene ) == True:
                        match_found = True

                        # record each three-way supported gene only once
                        if gm_es_gene in genemark_aat_shared_genes and gm_es_gene not in genemark_aat_cegma_shared_genes:
                            genemark_aat_cegma_shared_genes.append(gm_es_gene)
                            gmes_aat_cegma_fh.write("{0}\n".format(gm_es_gene.id))

                        break

            if match_found:
                cegma_matching_gm_es.append(cegma_gene)

    print("{0} genes with GeneMark-ES, CEGMA and AAT agreement".format(len(genemark_aat_cegma_shared_genes)) )

    with open('training_gene.ids', 'wt') as training_fh:
        for gene in genemark_aat_cegma_shared_genes:
            training_fh.write("{0}\n".format(gene.id) )

    ##############################################################################
    ## CEGMA genes lacking a GeneMark-ES match but supported by AAT
    cegma_with_aat_not_gm_es = list()

    with open('cegma_aat_nogmes.shared.ids', 'wt') as cegma_aat_nogmes_fh:
        for cegma_gene in cegma_genes:
            if cegma_gene in cegma_matching_gm_es:
                continue

            for aat_gene in aat_genes:
                if cegma_gene.shares_exon_structure_with( thing=aat_gene, stop_tolerant=True ) == True:
                    cegma_with_aat_not_gm_es.append(cegma_gene)
                    cegma_aat_nogmes_fh.write("{0}\n".format(cegma_gene.id))
                    break

    print("{0} CEGMA genes had no GeneMark-ES match but did have an AAT one".format(len(cegma_with_aat_not_gm_es)) )
def main():
    """Exercise the comparison operators and overlap helpers of GFF3 features.

    Loads ``biothings_coordinate_comparisons.data`` from the directory holding
    this script and runs a positive and a negative check for each of ``<``,
    ``>``, ``<=``, ``>=``, ``overlaps_with()`` and ``overlap_size_with()``.
    Each check prints one line to STDOUT: ``INFO: ...`` when the observed
    result is the expected one and ``ERROR: ...`` when it is not.
    """
    bin_dir = os.path.abspath(os.path.dirname(__file__))
    test_gff_file = os.path.join(bin_dir, 'biothings_coordinate_comparisons.data')

    (assemblies, features) = gff.get_gff3_features(test_gff_file)

    ###########################################################################################
    ## <

    if features['TP03_0010'] < features['TP03_0012.t01_polypeptide']:
        print("INFO: < positive check successful")
    else:
        print("ERROR: < check unsuccessful")

    if features['TP03_0012'] < features['TP03_0012.t01_polypeptide']:
        print("ERROR: < check unsuccessful")
    else:
        print("INFO: < negative check successful")

    ###########################################################################################
    ## >

    if features['TP03_0012'] > features['TP03_0010']:
        print("INFO: > positive check successful")
    else:
        print("ERROR: > check unsuccessful")

    if features['TP03_0010'] > features['TP03_0012.t01_polypeptide']:
        print("ERROR: > check unsuccessful")
    else:
        print("INFO: > negative check successful")

    ###########################################################################################
    ## <=

    if features['TP03_0012.t01_exon-auto15079'] <= features['TP03_0012.t01_polypeptide']:
        print("INFO: <= positive check successful")
    else:
        print("ERROR: <= check unsuccessful")

    if features['TP03_0010'] <= features['TP03_0012']:
        print("ERROR: <= check unsuccessful")
    else:
        print("INFO: <= negative check successful")

    ###########################################################################################
    ## >=

    if features['TP03_0012.t01_exon-auto15085'] >= features['TP03_0012.t01_polypeptide']:
        print("INFO: >= positive check successful")
    else:
        print("ERROR: >= check unsuccessful")

    if features['TP03_0010'] >= features['TP03_0012']:
        print("ERROR: >= check unsuccessful")
    else:
        print("INFO: >= negative check successful")

    ###########################################################################################
    ## overlaps_with()

    if features['TP03_0012.t01_exon-auto15079'].overlaps_with(features['TP03_0012.t01_polypeptide']):
        print("INFO: overlaps_with() positive check successful")
    else:
        print("ERROR: overlaps_with() positive check unsuccessful")

    if features['TP03_0002'].overlaps_with(features['TP03_0010']):
        print("ERROR: overlaps_with() negative check unsuccessful")
    else:
        print("INFO: overlaps_with() negative check successful")

    ###########################################################################################
    ## overlap_size_with()

    overlap_size = features['TP03_0012.t01_polypeptide'].overlap_size_with(features['TP03_0012.t01_CDS-auto15079'])

    if overlap_size == 224:
        print("INFO: overlap_size_with() positive check successful")
    else:
        print("ERROR: overlap_size_with() positive check unsuccessful (overlap returned: {0})".format(overlap_size))

    # Negative check: a different CDS must NOT report the same 224 overlap, so
    # seeing 224 here is the failure case and gets the ERROR label.
    if features['TP03_0012.t01_polypeptide'].overlap_size_with(features['TP03_0012.t01_CDS-auto15085']) == 224:
        print("ERROR: overlap_size_with() negative check unsuccessful")
    else:
        print("INFO: overlap_size_with() negative check successful")
Beispiel #38
0
def _bed_score(cols, score_by, line_num):
    """Return the BED score for one HMMer row, clamped to the range 0-1000.

    With ``score_by == 'totalscore'`` the score is column 13 (``cols[12]``)
    truncated to an integer.  Any other value uses ``-ln(E-value)`` computed
    from column 20 (``cols[19]``); an E-value of exactly 0 maps to 1000.

    Raises:
        Exception: if the logarithm of the E-value cannot be computed.
    """
    if score_by == 'totalscore':
        score = int(float(cols[12]))
    else:
        e_val = float(cols[19])

        if e_val == 0:
            score = 1000
        else:
            try:
                score = int(0 - math.log(e_val))
            except ValueError:
                raise Exception("ERROR doing math on this log value: ({0}) on line {1}".format(e_val, line_num))

    return max(0, min(score, 1000))


def main():
    """Convert HMMer tabular output to a six-column BED file.

    Lines starting with ``#`` and lines with fewer than 24 tab-separated
    columns are skipped.  Each remaining row becomes one BED line of
    molecule ID, start, stop, the value of column 1, score and strand.

    Without ``--map_to_gff_coords`` the molecule, start and stop are copied
    from columns 6, 9 and 10 and the strand is always ``+``.  With it, column 6
    is looked up as a feature ID in the given GFF3 file and the start/stop are
    scaled by 3 and offset by that feature's ``fmin``.

    Raises:
        Exception: if the input file is missing, if a row names a feature
            that is absent from the GFF3 file, or if a score cannot be
            computed.
    """
    parser = argparse.ArgumentParser( description='Converts HMMer tabular output to BED')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-m', '--map_to_gff_coords', type=str, required=False, help='GFF3 file to use to map coordinates onto genomic space' )
    parser.add_argument('-s', '--score_by', type=str, required=False, default='totalscore', help='Populates the score column by totalscore or evalue' )
    args = parser.parse_args()

    if not os.path.exists(args.input_file):
        raise Exception("Input file passed {0} wasn't found".format(args.input_file))

    features = None
    if args.map_to_gff_coords is not None:
        (assemblies, features) = gff.get_gff3_features(args.map_to_gff_coords)

    # Both handles are context-managed so they are closed on every exit path,
    # including the exceptions raised for bad rows.
    with open(args.input_file) as ifh, open(args.output_file, 'wt') as ofh:
        for line_num, line in enumerate(ifh, start=1):
            if line.startswith('#'):
                continue

            cols = line.rstrip().split("\t")
            if len(cols) < 24:
                continue

            if args.map_to_gff_coords is None:
                start = cols[8]
                stop = cols[9]
                mol_id = cols[5]
                strand = '+'
            elif cols[5] in features:
                feat_loc = features[cols[5]].location()
                strand = '+' if feat_loc.strand == 1 else '-'

                start = feat_loc.fmin + ((int(cols[8]) - 1) * 3)
                if strand == '+':
                    stop = feat_loc.fmin + (int(cols[9]) * 3)
                else:
                    # NOTE(review): the reverse-strand case is still measured
                    # from fmin and uses (stop + 1); confirm this is intended
                    # rather than an offset from fmax.
                    stop = feat_loc.fmin + ((int(cols[9]) + 1) * 3)

                mol_id = feat_loc.on.id
            else:
                # Report the ID that was actually looked up (column 6).
                raise Exception("ERROR: Failed to find feature {0} in GFF3 feature set".format(cols[5]))

            adj_score = _bed_score(cols, args.score_by, line_num)

            ofh.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(mol_id, start, stop, cols[0], adj_score, strand))
class _FeatureTally:
    """Unique gene and exon/CDS coordinate keys collected from one annotation.

    The lists keep first-seen order because they are handed on to
    ``base_comparison`` and ``compare_cds``; the parallel sets exist only so
    that duplicate checks do not have to scan those lists.
    """

    def __init__(self):
        self.genes = []    # unique gene coordinate keys
        self.exons = []    # unique exon/CDS coordinate keys
        self.spans = []    # one "asm:gene_id:start:stop:strand" string per unique gene
        self._seen_genes = set()
        self._seen_exons = set()

    def add_gene(self, cord):
        """Record a gene key; return True if it had not been seen before."""
        if cord in self._seen_genes:
            return False
        self._seen_genes.add(cord)
        self.genes.append(cord)
        return True

    def add_exon(self, cord):
        """Record an exon/CDS key unless it is already present."""
        if cord not in self._seen_exons:
            self._seen_exons.add(cord)
            self.exons.append(cord)


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    if not whole:
        return 0.0
    return (part / whole) * 100


def _sub_features(mrna, feature):
    """Return the exons (``"Exon"``) or CDSs (``"CDS"``) of *mrna*."""
    if feature == "Exon":
        return mrna.exons()
    if feature == "CDS":
        return mrna.CDSs()
    raise ValueError("Unsupported feature type '{0}' (expected 'Exon' or 'CDS')".format(feature))


def _collect_annotation(asm_id, assembly, genes, feature, tally):
    """Add the genes and exons/CDSs of one assembly to *tally*.

    Returns the set of exon/CDS coordinate keys found on this assembly.
    """
    asm_cords = set()

    for gene in genes:
        gene_loc = gene.location_on(assembly)
        is_new_gene = tally.add_gene(cordinate(asm_id, gene_loc))

        starts = []
        stops = []

        for mrna in sorted(gene.mRNAs()):
            for feat in sorted(_sub_features(mrna, feature)):
                feat_loc = feat.location_on(assembly)
                cord = cordinate(asm_id, feat_loc)
                tally.add_exon(cord)
                asm_cords.add(cord)
                starts.append(feat_loc.fmin)
                stops.append(feat_loc.fmax)

        # Only the first gene seen at a given coordinate contributes a span.
        if not is_new_gene:
            continue

        if starts:
            span_start, span_stop = min(starts), max(stops)
        else:
            # no exon/CDS children: fall back to the gene's own extent
            span_start, span_stop = gene_loc.fmin, gene_loc.fmax

        tally.spans.append(asm_id + ":" + gene.id + ":" + str(span_start) + ":" + str(span_stop) + ":" + str(gene_loc.strand))

    return asm_cords


def _gene_feature_cords(asm_id, assembly, gene, feature):
    """Return the set of exon/CDS coordinate keys over all mRNAs of *gene*."""
    cords = set()
    for mrna in sorted(gene.mRNAs()):
        for feat in sorted(_sub_features(mrna, feature)):
            cords.add(cordinate(asm_id, feat.location_on(assembly)))
    return cords


def _record_exact_matches(asm_id, assembly_1, known_genes, assembly_2, pred_genes, feature, gene_true, true_file):
    """Find predicted genes whose exon/CDS set equals that of a known gene.

    A predicted gene matches the first same-strand, overlapping known gene
    with an identical set of exon/CDS coordinate keys.  The predicted gene's
    key is added to *gene_true* and a "known<TAB>predicted" ID line is written
    to *true_file*.
    """
    # Resolve each known gene's strand once rather than once per predicted gene.
    known = [(gene, gene.location_on(assembly_1).strand) for gene in known_genes]

    for gene_2 in pred_genes:
        gene_2_loc = gene_2.location_on(assembly_2)
        cord_g = cordinate(asm_id, gene_2_loc)

        # A predicted gene at these coordinates was already counted as true.
        if cord_g in gene_true:
            continue

        pred_cords = None

        for gene_1, strand_1 in known:
            if strand_1 != gene_2_loc.strand or not gene_2.overlaps_with(gene_1):
                continue

            if pred_cords is None:
                pred_cords = _gene_feature_cords(asm_id, assembly_2, gene_2, feature)

            if _gene_feature_cords(asm_id, assembly_1, gene_1, feature) == pred_cords:
                gene_true.add(cord_g)
                true_file.write(gene_1.id+"\t"+gene_2.id+"\n")
                break


def process_files(args):
    """Compare a predicted annotation against a known one and report statistics.

    Reads ``args.annotation_1`` (known) and ``args.annotation_2`` (predicted)
    as GFF3, compares them at gene, exon/CDS (``args.feature`` is ``"Exon"``
    or ``"CDS"``) and base level, prints the results to STDOUT and writes
    these files into ``args.output_dir``:

    * ``true_predicted_genes.txt`` - ID pairs of known/predicted genes with
      identical exon/CDS sets
    * ``summary.txt`` - counts plus SN and PPV per level
    * ``final_stats.txt`` - gene overlap statistics from ``compare_cds``
    * ``true_pred.txt`` - coordinate keys of the truly predicted genes

    SN and PPV are reported as 0.0 when their denominator is zero.  Exits via
    ``sys.exit`` if ``args.output_dir`` does not exist.
    """
    # Checked up front: every output below is created inside this directory.
    if not os.path.exists(args.output_dir):
        sys.exit("Directory does not exist.")

    (assemblies_1, _) = gff.get_gff3_features(args.annotation_1)
    (assemblies_2, _) = gff.get_gff3_features(args.annotation_2)

    known = _FeatureTally()
    predicted = _FeatureTally()

    exon_pred_all = set()      # exon/CDS keys present in both annotations
    gene_true = set()          # keys of predicted genes with an exact known match
    shared_asm_ids = set()     # assemblies present in both annotations

    with open(os.path.join(args.output_dir, 'true_predicted_genes.txt'), 'w') as true_file:
        true_file.write("Known\tPredicted\n")

        for asm_id in assemblies_1:
            assembly_1 = assemblies_1[asm_id]
            known_genes = sorted(assembly_1.genes())
            anno_exons = _collect_annotation(asm_id, assembly_1, known_genes, args.feature, known)

            assembly_2 = assemblies_2.get(asm_id)
            if assembly_2 is None:
                # assembly absent from the predicted annotation
                continue

            shared_asm_ids.add(asm_id)
            pred_genes = sorted(assembly_2.genes())
            pred_exons = _collect_annotation(asm_id, assembly_2, pred_genes, args.feature, predicted)

            exon_pred_all.update(pred_exons.intersection(anno_exons))

            _record_exact_matches(asm_id, assembly_1, known_genes, assembly_2, pred_genes,
                                  args.feature, gene_true, true_file)

    # Assemblies that exist only in the predicted annotation still count
    # towards the predicted totals.
    for asm_id in assemblies_2:
        if asm_id in shared_asm_ids:
            continue
        assembly_2 = assemblies_2[asm_id]
        _collect_annotation(asm_id, assembly_2, sorted(assembly_2.genes()), args.feature, predicted)

    # SN/PPV for bases
    (a_base_val, p_base_val, true_base) = base_comparison(predicted.exons, known.exons)
    base_sn = _percent(true_base, a_base_val)
    base_sp = _percent(true_base, p_base_val)

    # SN/PPV for exons
    annotated_exon = len(known.exons)
    predicted_exon = len(predicted.exons)
    true_pred_exon = len(exon_pred_all)
    exon_sn = _percent(true_pred_exon, annotated_exon)
    exon_sp = _percent(true_pred_exon, predicted_exon)

    # SN/PPV for genes
    annotated_gene = len(known.genes)
    predicted_gene = len(predicted.genes)
    true_pred_gene = len(gene_true)
    gene_sn = _percent(true_pred_gene, annotated_gene)
    gene_sp = _percent(true_pred_gene, predicted_gene)

    header = "Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n"
    rows = [
        "Gene\t"+str(annotated_gene)+"\t"+str(predicted_gene)+"\t"+str(true_pred_gene)+"\t"+str(gene_sn)+"\t"+str(gene_sp),
        args.feature+"\t"+str(annotated_exon)+"\t"+str(predicted_exon)+"\t"+str(true_pred_exon)+"\t"+str(exon_sn)+"\t"+str(exon_sp),
        "Base\t"+str(a_base_val)+"\t"+str(p_base_val)+"\t"+str(true_base)+"\t"+str(base_sn)+"\t"+str(base_sp),
    ]

    # The header carries its own newline, so STDOUT shows a blank line after it.
    print(header)
    for row in rows:
        print(row)

    with open(os.path.join(args.output_dir, 'summary.txt'), 'w') as fout:
        fout.write(header)
        fout.write("\n".join(rows) + "\n\n")

    arr_pred = compare_cds(predicted.spans, known.spans, "pred")
    arr_known = compare_cds(known.spans, predicted.spans, "known")
    arr_pred_same = compare_cds(predicted.spans, predicted.spans, "pred_same")

    new_gene = arr_pred[2]
    gene_merge = arr_pred[3]
    gene_found = arr_pred[0]
    gene_opp = arr_pred[1]
    gene_missing = arr_known[2]
    gene_known_single = arr_known[0]
    gene_opp_known = arr_known[1]
    gene_split = arr_known[3]
    gene_pred_overlap_opp = arr_pred_same[1]

    # (console label, file label or None when identical, value).  The few
    # differing file labels are kept so both outputs stay exactly as before.
    stats = [
        ("1. No. of known gene : ", None, len(known.spans)),
        ("2. No. of predicted gene : ", None, len(predicted.spans)),
        ("3. No. of predicted gene overlapping  0 known gene (new gene): ", None, new_gene),
        ("4. No. of predicted gene overlapping > 1 known gene (gene merge) : ", None, gene_merge),
        ("5. No. of predicted gene overlaping 1 known gene : ", None, gene_found),
        ("6. No. of predicted gene overlapping >= 1 known gene in opp strand : ", None, gene_opp),
        ("7. No. of predicted gene overlapping  1 known gene (exact intron/exon boundaries) : ",
         "7. No. of predicted gene overlapping  1 known gene (exact intron/exon boundary) : ", true_pred_gene),
        ("8. No. of predicted gene overlapping >= 1 predicted gene in opp strand : ",
         "8. No. of predicted gene overlapping >= 1  predicted gene in opp strand : ", gene_pred_overlap_opp),
        ("9. No. of known gene overlapping  0 predicted gene (gene missing): ", None, gene_missing),
        ("10. No. of known gene overlapping > 1 predicted gene(gene split) : ",
         "10. No. of known gene overlapping > 1 predicted gene (gene_split): ", gene_split),
        ("11. No. of known gene overlaping 1 predicted gene : ", None, gene_known_single),
        ("12. No. of known gene overlapping >= 1 predicted gene in opp strand : ", None, gene_opp_known),
    ]

    for console_label, _file_label, value in stats:
        print(console_label, value)

    with open(os.path.join(args.output_dir, 'final_stats.txt'), 'w') as fout:
        for console_label, file_label, value in stats:
            label = console_label if file_label is None else file_label
            fout.write(label + str(value) + "\n")

    with open(os.path.join(args.output_dir, 'true_pred.txt'), 'w') as fout_true:
        for true_gene in gene_true:
            fout_true.write(true_gene+"\n")

    # Clean up intermediate BED files (presumably left by base_comparison --
    # TODO confirm).  os.remove avoids building a shell command from the path.
    for name in ('exon_1.bed', 'exon_2.bed', 'exon_1_merged.bed', 'exon_2_merged.bed', 'exon_1_2_intersect.bed'):
        try:
            os.remove(os.path.join(args.output_dir, name))
        except FileNotFoundError:
            pass  # nothing to clean up for this name
        except OSError as err:
            print("WARNING: could not remove {0}: {1}".format(name, err), file=sys.stderr)
Beispiel #40
0
def main():
    """Convert the GO annotations of a GFF3 file to GO Annotation File (GAF) 2.0.

    One GAF row is written for every GO annotation on every polypeptide found
    under gene -> mRNA -> polypeptide in the input.  The aspect column is taken
    from the lookup built by ``parse_go_file``.  The annotation date defaults
    to the modification time of the input file (UTC) and the assigned-by
    column defaults to ``--database``.  A polypeptide without a product name
    or gene symbol gets an empty value in the corresponding column.

    Raises:
        Exception: if an annotated GO ID is missing from the GO file.
    """
    parser = argparse.ArgumentParser(
        description='Converts GFF3 files to GO Gene Association Format (GAF)')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-go',
                        '--go_file',
                        type=str,
                        required=True,
                        help='Gene Ontology (GO) file')
    parser.add_argument('-db',
                        '--database',
                        type=str,
                        required=True,
                        help='Database issuing that IDs.  Example: UniProtKB')
    parser.add_argument('-dbref',
                        '--db_reference',
                        type=str,
                        required=True,
                        help='DB reference, like PMID:2676709 (column 6)')
    parser.add_argument('-ec',
                        '--evidence_code',
                        type=str,
                        required=False,
                        default='IEA',
                        help='Like IEA (column 7)')
    parser.add_argument('-t',
                        '--taxon_id',
                        type=int,
                        required=True,
                        help='NCBI taxon ID (column 13)')
    parser.add_argument(
        '-ad',
        '--annotation_date',
        type=str,
        required=False,
        help=
        'Annotation date in YYYYMMDD format.  Default = GFF3 file datestamp')
    parser.add_argument(
        '-ab',
        '--assign_by',
        type=str,
        required=False,
        help='Assign by (column 15)  Defaults to --database argument value')
    args = parser.parse_args()

    print("INFO: Parsing GFF3 objects", file=sys.stderr)
    (assemblies, features) = gff.get_gff3_features(args.input_file)

    print("INFO: Parsing GO file", file=sys.stderr)
    go_lookup = parse_go_file(args.go_file)

    annot_date = args.annotation_date
    if annot_date is None:
        annot_date = time.strftime(
            '%Y%m%d', time.gmtime(os.path.getmtime(args.input_file)))

    assign_by = args.assign_by
    if assign_by is None:
        assign_by = args.database

    # Context-managed so the GAF file is flushed and closed even when an
    # unknown GO ID aborts the conversion.
    with open(args.output_file, 'wt') as ofh:
        ofh.write("!gaf-version: 2.0\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for polypeptide in mRNA.polypeptides():
                        for go_annot in polypeptide.annotation.go_annotations:
                            go_id = "GO:{0}".format(go_annot.go_id)

                            if go_id not in go_lookup:
                                raise Exception(
                                    "ERROR: GO ID {0} not found in provided go.obo file"
                                    .format(go_id))

                            # Columns 10 and 11 are optional in GAF; leave them
                            # empty rather than writing the text "None".
                            product = polypeptide.annotation.product_name
                            if product is None:
                                product = ''
                            gene_sym = polypeptide.annotation.gene_symbol
                            if gene_sym is None:
                                gene_sym = ''

                            # Aspect is F, P or C, depending on which component/ontology the term comes from
                            ofh.write("{0}\t{1}\t{1}\t\t{2}\t{3}\t{4}\t\t{5}\t{6}"
                                      "\t{7}\tprotein\ttaxon:{8}\t{9}\t{10}\t"
                                      "\t\n".format(args.database, polypeptide.id,
                                                    go_id, args.db_reference,
                                                    args.evidence_code,
                                                    go_lookup[go_id], product,
                                                    gene_sym, args.taxon_id,
                                                    annot_date, assign_by))

    print("INFO: Conversion complete.", file=sys.stderr)