def main():
    """Parse the command line and load the reference and alignment GFF3 files.

    Both inputs are read with ``biocodegff.get_gff3_features``.

    NOTE(review): ``--output_file`` is required but never written here, and
    the loaded assemblies/features are not used after loading -- presumably
    the reporting logic is missing from this excerpt; confirm.
    """
    cli = argparse.ArgumentParser(
        description='Reports statistics of reference gene coverage and extension by aligned RNA-seq transcript data.'
    )

    # command-line options: two GFF3 inputs plus an output path
    cli.add_argument(
        '-r', '--reference_file',
        type=str,
        required=True,
        help='GFF3 file of a reference annotation',
    )
    cli.add_argument(
        '-q', '--alignment_file',
        type=str,
        required=True,
        help='GFF3 file with RNA-seq assembly transcript features aligned to the same reference genome.  Usually with something like GMAP.',
    )
    cli.add_argument(
        '-o', '--output_file',
        type=str,
        required=True,
        help='Path to an output file to be created',
    )
    args = cli.parse_args()

    # reference annotation first, then the aligned transcripts
    ref_assemblies, ref_features = biocodegff.get_gff3_features(args.reference_file)
    qry_assemblies, qry_features = biocodegff.get_gff3_features(args.alignment_file)
def main():
    """Split every gene with several mRNAs into one gene per mRNA.

    The first mRNA stays on the original gene, whose coordinates are shrunk to
    that mRNA.  Each additional mRNA is detached and written under a new gene
    named ``<gene id>_<n>`` located at the mRNA's own coordinates.  GFF3 is
    written to ``--output_file`` or, when omitted, to STDOUT.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks for genes with multiple mRNA children and creates new genes for each.'
    )

    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        for assembly_id in assemblies:
            current_assembly = assemblies[assembly_id]

            for gene in current_assembly.genes():
                # Snapshot the children: gene.mRNAs() may return the gene's
                # live child list, and removing entries from a list while
                # iterating it silently skips elements.
                mRNAs = list(gene.mRNAs())

                for rnas_found, mRNA in enumerate(mRNAs, start=1):
                    mRNA_loc = mRNA.location_on(current_assembly)

                    if rnas_found > 1:
                        gene.remove_mRNA(mRNA)

                        # Progress goes to STDERR so it can never be mixed
                        # into the GFF3 stream when that is STDOUT.
                        print("INFO: splitting mRNA off gene {0}".format(gene.id),
                              file=sys.stderr)
                        new_gene = biothings.Gene(
                            id="{0}_{1}".format(gene.id, rnas_found))
                        new_gene.locate_on(target=current_assembly,
                                           fmin=mRNA_loc.fmin,
                                           fmax=mRNA_loc.fmax,
                                           strand=mRNA_loc.strand)
                        new_gene.add_RNA(mRNA)
                        new_gene.print_as(fh=ofh, format='gff3')

                # the original gene keeps only its first mRNA, so shrink it
                # to that mRNA's span
                if len(mRNAs) > 1:
                    gene_loc = gene.location_on(current_assembly)
                    mRNA_loc = mRNAs[0].location_on(current_assembly)
                    gene_loc.fmin = mRNA_loc.fmin
                    gene_loc.fmax = mRNA_loc.fmax
                    gene_loc.strand = mRNA_loc.strand

                gene.print_as(fh=ofh, format='gff3')
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
# Example no. 3
# 0
def main():
    """Check/correct the phase of every CDS and write the genes back as GFF3.

    Loads the GFF3 named by ``--input_file``, optionally attaches genome
    sequence from ``--genome_fasta``, runs ``check_and_update_phase`` on each
    CDS of each mRNA, and prints every gene to ``--output_file`` with
    ``--source`` as its source column.
    """
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-s', '--source', type=str, required=False, default='.', help='Optional.  Sets the value for column 2 in all rows.  Default = .' )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    # the context manager guarantees the output is flushed and closed even if
    # processing fails part-way
    with open(args.output_file, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        # deal with the FASTA file if the user passed one
        if args.genome_fasta is not None:
            process_assembly_fasta(assemblies, args.genome_fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for CDS in mRNA.CDSs():
                        check_and_update_phase(CDS)

                gene.print_as(fh=fout, source=args.source, format='gff3')
# Example no. 4
# 0
def main():
    """Write each mRNA isoform of a multi-isoform gene as its own gene model.

    Genes with a single mRNA (or none) are printed unchanged.  For genes with
    several mRNAs, one new gene named ``<gene id>_<n>`` is created per mRNA,
    located at that mRNA's coordinates, and printed instead of the original.
    """
    parser = argparse.ArgumentParser( description='Splits all GFF3 mRNA isoforms into their own gene models')

    ## Get the variables
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Input GFF3 file' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Output GFF3 file' )
    args = parser.parse_args()

    print("INFO: Parsing GFF3 features\n")
    (assemblies, ref_features) = biocodegff.get_gff3_features( args.input_file )

    print("INFO: Finding genes with isoforms and splitting them\n")

    # Open the output only after the input parsed successfully, so a failed
    # parse does not leave a truncated output file behind; the context manager
    # closes it on every exit path.
    with open(args.output_file, 'wt') as ofh:
        ofh.write("##gff-version 3\n")
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                isoforms = gene.mRNAs()

                # only changing the gene features with isoforms
                if len(isoforms) > 1:
                    # Capture the ID once: the mRNA's parent is presumably this
                    # very gene, so the parent-ID assignment below would
                    # otherwise rename it and later IDs would compound
                    # (g_1, g_1_2, ...).
                    base_gene_id = str(gene.id)

                    for counter, mRNA in enumerate(list(isoforms), start=1):
                        new_gene_id = base_gene_id + "_" + str(counter)
                        mRNA_loc = mRNA.location()
                        print("Splitting " + base_gene_id)
                        # create a new gene model, correcting the gene coords to the mRNA coords
                        new_gene = biothings.Gene( id = new_gene_id)
                        new_gene.locate_on( target=assemblies[assembly_id], fmin=mRNA_loc.fmin, fmax=mRNA_loc.fmax, strand=mRNA_loc.strand )
                        mRNA.parent.id = new_gene_id
                        # Now add the mRNA to the gene model
                        new_gene.add_mRNA(mRNA)
                        # print out the new gene model
                        new_gene.print_as(fh=ofh, source='IGS', format='gff3')
                else:
                    gene.print_as(fh=ofh, source='IGS', format='gff3')
def main():
    """Randomly split the mRNAs of a GFF3 annotation into two disjoint sets.

    mRNAs with more exons than ``--max_exon_count`` (when given) are ignored.
    ``--training_set_size`` mRNAs go to the training file and
    ``--evaluation_set_size`` different mRNAs go to the evaluation file, both
    written with ``export_mRNAs_to_file``.

    Raises:
        Exception: if ``--retain_composition`` is passed (not implemented), or
            if fewer acceptable mRNAs exist than the two set sizes combined.
    """
    parser = argparse.ArgumentParser( description='Split an annotation GFF3 into training and evaluation sets')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-ot', '--output_training_file', type=str, required=True, help='GFF3 file to be created with the training genes' )
    parser.add_argument('-oe', '--output_evaluation_file', type=str, required=True, help='GFF3 file to be created with the evaluation genes' )
    parser.add_argument('-ts', '--training_set_size', type=int, required=False, default=200, help='Number of transcripts to select for training' )
    parser.add_argument('-es', '--evaluation_set_size', type=int, required=False, default=100, help='Number of transcripts to select for evaluation' )
    parser.add_argument('-me', '--max_exon_count', type=int, required=False, help='Skips any mRNAs with more exons than this' )
    parser.add_argument('--retain_composition', dest='retain_composition',action='store_true')
    parser.add_argument('--no_retain_composition', dest='retain_composition',action='store_false')
    parser.set_defaults(retain_composition=False)
    args = parser.parse_args()

    if args.retain_composition is True:
        raise Exception("ERROR: --retain_composition option not yet implemented")

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    # key: exon count, value = list of mRNA objects with that count
    # which of these gets used depends on whether --retain_composition is passed
    mRNAs_by_exon_count = defaultdict(list)
    mRNAs = list()
    mRNA_count = 0

    for asm_id in assemblies:
        for gene in assemblies[asm_id].genes():
            for mRNA in gene.mRNAs():
                exon_count = mRNA.exon_count()

                if args.max_exon_count is None or exon_count <= args.max_exon_count:
                    mRNA_count += 1

                    if args.retain_composition is True:
                        mRNAs_by_exon_count[exon_count].append(mRNA)
                    else:
                        mRNAs.append(mRNA)

    # sanity check on the number of available mRNAs
    if (args.training_set_size + args.evaluation_set_size) > mRNA_count:
        raise Exception("ERROR: acceptable mRNA count ({0}) is less than combined training_set_size ({1}) and evaluation_set_size ({2}) options".format(mRNA_count, args.training_set_size, args.evaluation_set_size) )

    training_mRNAs = list()
    evaluation_mRNAs = list()

    if args.retain_composition is True:
        # placeholder branch; unreachable until the option is implemented
        print("DEBUG: retaining composition")
    else:
        # One draw without replacement, cut in two: random.sample returns its
        # picks in selection order, so each slice is itself a valid random
        # sample and the two sets cannot overlap.  This also avoids building
        # sets of mRNA objects (which would require them to be hashable).
        selected = random.sample( mRNAs, args.training_set_size + args.evaluation_set_size )
        training_mRNAs = selected[:args.training_set_size]
        evaluation_mRNAs = selected[args.training_set_size:]

    export_mRNAs_to_file(training_mRNAs, args.output_training_file)
    export_mRNAs_to_file(evaluation_mRNAs, args.output_evaluation_file)
def main():
    """Export the GO annotations of a GFF3 file as GAF 2.0 rows.

    One row is written per GO annotation of each polypeptide.  The aspect
    column comes from the lookup returned by ``parse_go_file``.  The annotation
    date defaults to the input file's modification time and "assigned by"
    defaults to ``--database``.

    Raises:
        Exception: if an annotated GO ID is missing from the GO file.
    """
    parser = argparse.ArgumentParser( description='Converts GFF3 files to GO Gene Association Format (GAF)')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-go', '--go_file', type=str, required=True, help='Gene Ontology (GO) file' )
    parser.add_argument('-db', '--database', type=str, required=True, help='Database issuing that IDs.  Example: UniProtKB' )
    parser.add_argument('-dbref', '--db_reference', type=str, required=True, help='DB reference, like PMID:2676709 (column 6)' )
    parser.add_argument('-ec', '--evidence_code', type=str, required=False, default='IEA', help='Like IEA (column 7)' )
    parser.add_argument('-t', '--taxon_id', type=int, required=True, help='NCBI taxon ID (column 13)' )
    parser.add_argument('-ad', '--annotation_date', type=str, required=False, help='Annotation date in YYYYMMDD format.  Default = GFF3 file datestamp' )
    parser.add_argument('-ab', '--assign_by', type=str, required=False, help='Assign by (column 15)  Defaults to --database argument value' )
    args = parser.parse_args()

    print("INFO: Parsing GFF3 objects", file=sys.stderr)
    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    print("INFO: Parsing GO file", file=sys.stderr)
    go_lookup = parse_go_file(args.go_file)

    annot_date = args.annotation_date
    if annot_date is None:
        annot_date = time.strftime('%Y%m%d', time.gmtime(os.path.getmtime(args.input_file)))

    assign_by = args.assign_by
    if assign_by is None:
        assign_by = args.database

    # context manager: the GAF file is closed (and flushed) on every exit path
    with open(args.output_file, 'wt') as ofh:
        ofh.write("!gaf-version: 2.0\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for polypeptide in mRNA.polypeptides():
                        for go_annot in polypeptide.annotation.go_annotations:
                            go_id = "GO:{0}".format(go_annot.go_id)

                            if go_id not in go_lookup:
                                raise Exception("ERROR: GO ID {0} not found in provided go.obo file".format(go_id))

                            # Missing values become empty columns; formatting
                            # None directly would write the literal text
                            # "None" into the file.
                            product = polypeptide.annotation.product_name
                            if product is None:
                                product = ''

                            gene_sym = polypeptide.annotation.gene_symbol
                            if gene_sym is None:
                                gene_sym = ''

                            # Aspect is F, P or C, depending on which component/ontology the term comes from
                            ofh.write("{0}\t{1}\t{1}\t\t{2}\t{3}\t{4}\t\t{5}\t{6}"
                                      "\t{7}\tprotein\ttaxon:{8}\t{9}\t{10}\t"
                                      "\t\n".format(args.database, polypeptide.id, go_id, args.db_reference,
                                                    args.evidence_code, go_lookup[go_id], product, gene_sym,
                                                    args.taxon_id, annot_date, assign_by))

    print("INFO: Conversion complete.", file=sys.stderr)
def main():
    """Count mRNAs whose translated CDS contains a non-terminal stop codon.

    Each mRNA's CDS residues are translated with ``biocodeutils.translate``;
    a ``*`` anywhere except the trailing end counts as an internal stop.  The
    first ``--print_n_with_stops`` offenders are printed, all offenders are
    optionally written to ``--output_fasta``, and totals are printed at the end.
    """
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0, help='Optional.  Pass the number of sequences with internal stops you want printed (usually for debugging purposes)' )
    parser.add_argument('-o', '--output_fasta', type=str, required=False, help='Optional.  Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None

    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("CDS:{0}".format(coding_seq))

                    # translate once and reuse the result for the check and the reports
                    translated_seq = biocodeutils.translate(coding_seq)

                    # trailing stops are expected; only stops left after
                    # stripping them are internal
                    if translated_seq.rstrip('*').count('*') == 0:
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assemblies[assembly_id])
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand) )
                        fasta_out_fh.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )

                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id) )
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )
    finally:
        # make sure the optional FASTA report is flushed and closed
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """Trim the CDS features of each mRNA to the span of its polypeptide.

    Polypeptide coordinates are read from a second GFF file and keyed by their
    ``Parent`` attribute.  For every mRNA with a matching polypeptide, CDSs
    are deleted or have an end clamped to the polypeptide's coordinates,
    depending on how the CDS compares with the polypeptide.  mRNAs without a
    polypeptide are reported and left untouched.  All genes are written to the
    corrected GFF3.

    Input/output paths and the source column are hard-coded below.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = biocodegff.get_gff3_features( flawed_gff_file )

        print("INFO: loaded {0} assemblies and {1} features".format(len(assemblies), len(features)))

        # key: ID of the parent mRNA, value: Polypeptide object
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                # drop the line terminator so it cannot leak into the last
                # column-9 attribute value
                cols = line.rstrip("\n").split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                polypeptide_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')
                polypeptides[parent] = biothings.Polypeptide( id=polypeptide_id, parent=parent )
                # GFF coordinates are 1-based inclusive; fmin is stored 0-based
                polypeptides[parent].locate_on(target=assemblies[cols[0]], fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6])

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(len(polypeptides)) )

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    polypeptide = polypeptides.get(mRNA.id)

                    if polypeptide is None:
                        print("DEBUG: {0} not found as a parent to any polypeptide".format(mRNA.id))
                        # Nothing to trim against.  Falling through would
                        # reuse the previous mRNA's polypeptide, or fail with
                        # a NameError on the very first mRNA.
                        continue

                    # Iterate over a copy: CDSs may be deleted from the mRNA
                    # inside the loop.
                    for CDS in list(mRNA.CDSs()):
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin

                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
def main():
    """Report which reference genes have an identical model in the query GFF3.

    Two genes match when they have the same coordinates and share both exon
    and CDS structure.  Matching ID pairs are written to
    ``<output_base>.matches`` and summary counts to ``<output_base>.summary``.
    """
    parser = argparse.ArgumentParser( description='Basic comparison of two GFF3 files')

    parser.add_argument('-r', '--ref', type=str, required=True, help='Path to the reference GFF3 file' )
    parser.add_argument('-q', '--qry', type=str, required=True, help='Path to the query GFF3 file' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name/path of the output files to be created' )
    args = parser.parse_args()

    (assemblies, ref_features) = biocodegff.get_gff3_features( args.ref )
    ref_genes = get_genes_from_dict(ref_features)

    # the query is loaded onto the assemblies already created for the reference
    (assemblies, qry_features) = biocodegff.get_gff3_features( args.qry, assemblies=assemblies )
    qry_genes = get_genes_from_dict(qry_features)

    # gene ID -> ID of its counterpart in the other file
    ref_matches_found = dict()
    qry_matches_found = dict()

    for ref_gene in ref_genes:
        for qry_gene in qry_genes:
            if ref_gene.has_same_coordinates_as( thing=qry_gene ) and \
               ref_gene.shares_exon_structure_with( thing=qry_gene ) and \
               ref_gene.shares_CDS_structure_with( thing=qry_gene ):

                ref_matches_found[ref_gene.id] = qry_gene.id
                qry_matches_found[qry_gene.id] = ref_gene.id

    # open our output files; the context managers close both on every exit path
    with open("{0}.matches".format(args.output_base), 'wt') as out_matches, \
         open("{0}.summary".format(args.output_base), 'wt') as out_summary:

        print("INFO: {0}/{1} reference genes had a match to a qry gene".format( len(ref_matches_found), len(ref_genes) ))
        print("INFO: {0}/{1} qry genes had a match to a reference gene".format( len(qry_matches_found), len(qry_genes) ))

        for ref_gene_id in ref_matches_found:
            out_matches.write("{0}\t{1}\n".format(ref_gene_id, ref_matches_found[ref_gene_id]))

        out_summary.write("Reference\t{0}\n".format(args.ref) )
        # the Query row must name the query file, not the reference again
        out_summary.write("Query\t{0}\n".format(args.qry) )
        out_summary.write("Total identical models (with respect to reference)\t{0}\n".format(len(ref_matches_found)))
        out_summary.write("Models in REF not in QRY\t{0}\n".format( len(ref_genes) - len(ref_matches_found) ))
        out_summary.write("Models in QRY not in REF\t{0}\n".format( len(qry_genes) - len(qry_matches_found) ))
def main():
    """Tag reference genes with the IDs of the query genes they overlap.

    For every reference gene that has a polypeptide, each overlapping query
    gene on the same assembly adds an ``overlaps_old_locusTagID:<id>`` dbxref
    to the annotation of the gene's first polypeptide.  The annotated
    reference assemblies are then written as GFF3 to ``--output_file``.
    """
    parser = argparse.ArgumentParser( description='Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies')

    parser.add_argument('-r', '--reference_file', type=str, required=True, help='GFF3 file of a reference annotation' )
    parser.add_argument('-q', '--query_file', type=str, required=True, help='GFF3 file with alternative annotation (such as an RNA-seq assemby)' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    (ref_assemblies, ref_feats) = biocodegff.get_gff3_features(args.reference_file)
    (qry_assemblies, qry_genes) = biocodegff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print("WARN: expected to find assembly_id {0} in both reference and query sets".format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            overlaps = list()
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".format(ref_gene.id))
                continue

            # dbxrefs are recorded on the first polypeptide's annotation
            ref_annot = polypeptides[0].annotation

            for qry_gene in qry_assemblies[assembly_id].genes():
                overlap = ref_gene.overlaps_with(qry_gene)

                if overlap:
                    overlaps.append(overlap)
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(qry_gene.id))

            if len(overlaps) > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(ref_gene.id, len(overlaps)))

    # the context manager closes (and flushes) the output once it is written
    with open(args.output_file, 'w') as ofh:
        biocodegff.print_gff3_from_assemblies(assemblies=ref_assemblies, ofh=ofh)
# Example no. 11
# 0
def main():
    """Export a GFF3 annotation as an NCBI TBL file plus a genomic FASTA file.

    Writes ``<output_base>.tbl`` and ``<output_base>.fna``.  Unless some
    assembly ID already starts with ``gnl|WGS:``, every assembly is renamed to
    ``gnl|WGS:<prefix>|SeqID|gb|<prefix>01<6-digit number>``, numbered in
    iteration order starting at 1.
    """
    parser = argparse.ArgumentParser( description='Create a TBL file for submission to NCBI from GFF3')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name of output files to be created' )
    parser.add_argument('-ln', '--lab_name', type=str, required=True, help='Required by NCBI to identify the submitting group' )
    parser.add_argument('-nap', '--ncbi_acc_prefix', type=str, required=True, help='Required and assigned by NCBI' )
    parser.add_argument('-gf', '--genomic_fasta', type=str, required=False, help='FASTA file of genomic sequence, if not embedded in GFF' )
    parser.add_argument('-go', '--go_obo', type=str, required=False, help='GO terms will not be exported unless you pass the path to a GO OBO file')

    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    if args.genomic_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genomic_fasta)

    ## We need to first check the ID format
    # pre-formatted IDs are like this: gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    # Decide BEFORE touching anything: renaming while scanning would leave some
    # assemblies renamed and others not if a pre-formatted ID showed up later.
    reformat_IDs = not any(asm_id.startswith('gnl|WGS:') for asm_id in assemblies)

    if reformat_IDs:
        new_assemblies = dict()

        for asm_num, asm_id in enumerate(assemblies, start=1):
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(args.ncbi_acc_prefix, asm_num)
            assembly = assemblies[asm_id]
            assembly.id = new_id
            new_assemblies[new_id] = assembly

        assemblies = new_assemblies

    # >gi|68352484|gb|AAGK01000001.1|
    # AAGK01000001	NC_007344.1	tp.assembly.567468735.1

    # the context manager closes (and flushes) the TBL file once it is written
    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        biocodetbl.print_tbl_from_assemblies(assemblies=assemblies, ofh=ofh, go_obo=args.go_obo, lab_name=args.lab_name)

    mset = biothings.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
def main():
    """Write the features of a GFF3 file in TBL format.

    Loads ``--input_file`` and hands the assemblies to
    ``biocodetbl.print_tbl_from_assemblies``, writing to ``--output_file``.
    """
    # the original description was still the template placeholder text
    parser = argparse.ArgumentParser( description='Create a TBL file from GFF3')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-go', '--go_obo', type=str, required=False, help='GO terms will not be exported unless you pass the path to a GO OBO file')
    parser.add_argument('-ln', '--lab_name', type=str, required=True, help='Required by NCBI to identify the submitting group' )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    # the context manager closes (and flushes) the output once it is written
    with open(args.output_file, 'wt') as ofh:
        biocodetbl.print_tbl_from_assemblies(assemblies=assemblies, ofh=ofh, go_obo=args.go_obo, lab_name=args.lab_name)
# Example no. 13
# 0
def main():
    """Drop genes whose coding sequence is mostly masked.

    A gene is removed when any of its mRNAs has a CDS in which the percentage
    of ``N`` characters is at or above ``--percent_repeat_coverage_cutoff``.
    Kept genes go to ``--output_gff3``; removed genes optionally go to
    ``--removed_gff3``.  A kept/total summary is printed at the end.
    """
    parser = argparse.ArgumentParser( description='Removes gene models whose sequence has been masked.')

    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-m', '--masked_fasta', type=str, required=True, help='FASTA with sequence masked with N characters')
    parser.add_argument('-p', '--percent_repeat_coverage_cutoff', type=int, required=True, help='Genes with an mRNA covered by this percentage of repeats will be excluded' )
    parser.add_argument('-o', '--output_gff3', type=str, required=False, help='Path to GFF3 output file to be created')
    parser.add_argument('-r', '--removed_gff3', type=str, required=False, help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    # The option is declared optional but the script cannot run without it;
    # fail with a usage message instead of a TypeError from open(None).
    if args.output_gff3 is None:
        parser.error("--output_gff3 is required")

    (assemblies, features) = biocodegff.get_gff3_features( args.input_gff3 )
    biocodeutils.add_assembly_fasta(assemblies, args.masked_fasta)

    rem_out = None
    gene_count = 0
    kept_count = 0

    with open(args.output_gff3, 'wt') as gff_out:
        gff_out.write("##gff-version 3\n")

        try:
            if args.removed_gff3 is not None:
                rem_out = open(args.removed_gff3, 'wt')
                rem_out.write("##gff-version 3\n")

            for assembly_id in assemblies:
                for gene in assemblies[assembly_id].genes():
                    keep = True
                    gene_count += 1

                    for mRNA in gene.mRNAs():
                        coding_seq = mRNA.get_CDS_residues()

                        # an empty CDS has nothing masked; skipping it also
                        # avoids dividing by zero below
                        if len(coding_seq) == 0:
                            continue

                        n_count = coding_seq.count('N')
                        perc_repeat = (n_count / len(coding_seq)) * 100

                        if perc_repeat >= args.percent_repeat_coverage_cutoff:
                            keep = False

                    if keep:
                        kept_count += 1
                        gene.print_as(fh=gff_out, source='IGS', format='gff3')
                    elif rem_out is not None:
                        gene.print_as(fh=rem_out, source='IGS', format='gff3')
        finally:
            # close the optional "removed" file on every exit path
            if rem_out is not None:
                rem_out.close()

    # guard the percentage against an input with no genes at all
    kept_percent = (kept_count / gene_count) * 100 if gene_count else 0.0
    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(kept_count, gene_count, kept_percent))
# Example no. 14
# 0
def main():
    """Write each mRNA isoform of a multi-isoform gene as its own gene model.

    Genes with a single mRNA (or none) are printed unchanged.  For genes with
    several mRNAs, one new gene named ``<gene id>_<n>`` is created per mRNA,
    located at that mRNA's coordinates, and printed instead of the original.
    """
    parser = argparse.ArgumentParser(
        description='Splits all GFF3 mRNA isoforms into their own gene models')

    ## Get the variables
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Input GFF3 file')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Output GFF3 file')
    args = parser.parse_args()

    print("INFO: Parsing GFF3 features\n")
    (assemblies, ref_features) = biocodegff.get_gff3_features(args.input_file)

    print("INFO: Finding genes with isoforms and splitting them\n")

    # Open the output only after the input parsed successfully, so a failed
    # parse does not leave a truncated output file behind; the context manager
    # closes it on every exit path.
    with open(args.output_file, 'wt') as ofh:
        ofh.write("##gff-version 3\n")
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                isoforms = gene.mRNAs()

                # only changing the gene features with isoforms
                if len(isoforms) > 1:
                    # Capture the ID once: the mRNA's parent is presumably this
                    # very gene, so the parent-ID assignment below would
                    # otherwise rename it and later IDs would compound
                    # (g_1, g_1_2, ...).
                    base_gene_id = str(gene.id)

                    for counter, mRNA in enumerate(list(isoforms), start=1):
                        new_gene_id = base_gene_id + "_" + str(counter)
                        mRNA_loc = mRNA.location()
                        print("Splitting " + base_gene_id)
                        # create a new gene model, correcting the gene coords to the mRNA coords
                        new_gene = biothings.Gene(id=new_gene_id)
                        new_gene.locate_on(target=assemblies[assembly_id],
                                           fmin=mRNA_loc.fmin,
                                           fmax=mRNA_loc.fmax,
                                           strand=mRNA_loc.strand)
                        mRNA.parent.id = new_gene_id
                        # Now add the mRNA to the gene model
                        new_gene.add_mRNA(mRNA)
                        # print out the new gene model
                        new_gene.print_as(fh=ofh, source='IGS', format='gff3')
                else:
                    gene.print_as(fh=ofh, source='IGS', format='gff3')
def main():
    """Split every gene with several mRNAs into one gene per mRNA.

    The first mRNA stays on the original gene, whose coordinates are shrunk to
    that mRNA.  Each additional mRNA is detached and written under a new gene
    named ``<gene id>_<n>`` located at the mRNA's own coordinates.  GFF3 is
    written to ``--output_file`` or, when omitted, to STDOUT.
    """
    parser = argparse.ArgumentParser( description='Checks for genes with multiple mRNA children and creates new genes for each.')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        for assembly_id in assemblies:
            current_assembly = assemblies[assembly_id]

            for gene in current_assembly.genes():
                # Snapshot the children: gene.mRNAs() may return the gene's
                # live child list, and removing entries from a list while
                # iterating it silently skips elements.
                mRNAs = list(gene.mRNAs())

                for rnas_found, mRNA in enumerate(mRNAs, start=1):
                    mRNA_loc = mRNA.location_on(current_assembly)

                    if rnas_found > 1:
                        gene.remove_mRNA(mRNA)

                        # Progress goes to STDERR so it can never be mixed
                        # into the GFF3 stream when that is STDOUT.
                        print("INFO: splitting mRNA off gene {0}".format(gene.id), file=sys.stderr)
                        new_gene = biothings.Gene( id="{0}_{1}".format(gene.id, rnas_found) )
                        new_gene.locate_on(target=current_assembly, fmin=mRNA_loc.fmin, fmax=mRNA_loc.fmax, strand=mRNA_loc.strand)
                        new_gene.add_RNA(mRNA)
                        new_gene.print_as(fh=ofh, format='gff3')

                # the original gene keeps only its first mRNA, so shrink it
                # to that mRNA's span
                if len(mRNAs) > 1:
                    gene_loc = gene.location_on(current_assembly)
                    mRNA_loc = mRNAs[0].location_on(current_assembly)
                    gene_loc.fmin = mRNA_loc.fmin
                    gene_loc.fmax = mRNA_loc.fmax
                    gene_loc.strand = mRNA_loc.strand

                gene.print_as(fh=ofh, format='gff3')
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Reset each gene's coordinates to the outer span of its child mRNAs.

    Reads the GFF3 given with -i/--input_gff3 and writes every gene to
    -o/--output_gff3.  For a gene with at least one mRNA, fmin becomes the
    smallest mRNA fmin and fmax the largest mRNA fmax; a DEBUG line is
    printed whenever that changes the gene.  Genes without mRNAs are written
    unchanged.

    NOTE: the parser description says "longest child mRNA", but the code
    uses the combined span of all child mRNAs.
    """
    parser = argparse.ArgumentParser( description='Shortens gene feature coordinates to their longest child mRNA')

    ## output file to be written
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_gff3', type=str, required=True, help='Path to GFF3 output file to be created')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_gff3 )

    # the context manager guarantees the output is flushed and closed,
    # even if printing a gene raises
    with open(args.output_gff3, 'wt') as gff_out:
        gff_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                gene_loc = gene.location()
                mRNA_locs = [mRNA.location() for mRNA in gene.mRNAs()]

                if mRNA_locs:
                    # outer boundaries across all child mRNAs
                    min_coord = min(loc.fmin for loc in mRNA_locs)
                    max_coord = max(loc.fmax for loc in mRNA_locs)

                    if min_coord != gene_loc.fmin or max_coord != gene_loc.fmax:
                        print("DEBUG: Changed gene {0} from {1}-{2} to {3}-{4}".format(gene.id, gene_loc.fmin, gene_loc.fmax, min_coord, max_coord))
                        gene_loc.fmin = min_coord
                        gene_loc.fmax = max_coord

                gene.print_as(fh=gff_out, source='IGS', format='gff3')
def main():
    """Count mRNAs whose CDS translation contains an internal stop codon.

    Reads the GFF3 given with -i/--input_file (plus -g/--genome_fasta when
    supplied), translates the CDS residues of every mRNA and treats any '*'
    left after stripping trailing '*' characters as an internal stop.

    Optional outputs:
      -o/--output_fasta       translated sequences of the offending mRNAs
      -p/--print_n_with_stops print CDS and translation for the first N hits

    Totals are printed to STDOUT at the end.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence report non-terminal internal stops.'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-p',
        '--print_n_with_stops',
        type=int,
        required=False,
        default=0,
        help=
        'Optional.  Pass the number of sequences with internal stops you want printed (usually for debugging purposes)'
    )
    parser.add_argument(
        '-o',
        '--output_fasta',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output (translated) FASTA file for all those features which had internal stops'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None

    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    is_debug_target = (debug_mRNA is not None
                                       and mRNA.id == debug_mRNA)

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # translate once and reuse the result for both the check
                    # and the reporting below
                    translated_seq = biocodeutils.translate(coding_seq)

                    # terminal stops are fine; anything left is internal
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assemblies[assembly_id])
                        # fmin + 1: header coordinates are written 1-based
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(
                            mRNA.id, assembly_id, loc.fmin + 1, loc.fmax,
                            loc.strand))
                        fasta_out_fh.write("{0}\n".format(
                            biocodeutils.wrapped_fasta(translated_seq)))

                    if is_debug_target:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(
                            translated_seq, mRNA.id))

                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id))
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(
                            translated_seq, mRNA.id))
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """Cross-compare Genemark-ES, CEGMA and AAT gene models and list agreements.

    Takes no command-line options: the three input GFF3 files are hard-coded
    names expected in the current working directory.  Writes these ID lists
    (one gene ID per line) into the current working directory:

      gmes_cegma.shared.ids      Genemark-ES genes with identical coordinates
                                 and exon structure to a CEGMA gene
      gmes_aat.shared.ids        Genemark-ES genes sharing exon structure with
                                 an AAT gene (stop_tolerant=True)
      gmes_aat_cegma.shared.ids  Genemark-ES genes supported by both of the above
      training_gene.ids          same IDs as gmes_aat_cegma.shared.ids
      cegma_aat_nogmes.shared.ids CEGMA genes with no Genemark-ES match but an
                                 AAT match

    Counts for each category are printed to STDOUT.
    """
    gm_es_file = 'genemark_hmm.gff3'
    cegma_file = 'output.cegma.gff3'
    #aat_file = 'bail_training_genes.aat.1500maxintron.80percid.gff3'
    aat_file = 'aat.bail_hominis_filtered_training.gff3'
    #aat_file = 'aat.merged.gff3'

    print("INFO: parsing Genemark-ES data")
    (assemblies, gm_es_features) = biocodegff.get_gff3_features( gm_es_file )
    gm_es_genes = get_genes_from_dict(gm_es_features)
    print("\tINFO: Got {0} Genemark-ES genes".format(len(gm_es_genes)))

    # the same assemblies collection is passed back in for the later files
    print("INFO: parsing CEGMA data")
    (assemblies, cegma_features) = biocodegff.get_gff3_features( cegma_file, assemblies=assemblies )
    cegma_genes = get_genes_from_dict(cegma_features)
    print("\tINFO: Got {0} CEGMA genes".format(len(cegma_genes)))

    print("INFO: parsing AAT results")
    (assemblies, aat_muris_features) = biocodegff.get_gff3_features( aat_file, assemblies=assemblies)
    aat_genes = get_genes_from_dict(aat_muris_features)
    print("\tINFO: Got {0} AAT 'genes'".format(len(aat_genes)))

    # Each output file is scoped with 'with' so it is flushed and closed as
    # soon as its section finishes, instead of being left open until exit.

    genemark_cegma_shared_genes = list()

    with open('gmes_cegma.shared.ids', 'wt') as gmes_cegma_fh:
        for gm_es_gene in gm_es_genes:
            for cegma_gene in cegma_genes:
                if gm_es_gene.has_same_coordinates_as( thing=cegma_gene ):
                    if gm_es_gene.shares_exon_structure_with( thing=cegma_gene ) == True:
                        genemark_cegma_shared_genes.append(gm_es_gene)
                        gmes_cegma_fh.write("{0}\n".format(gm_es_gene.id))
                        # first match is enough; record each gene once
                        break

    print("\n{0} genes were shared perfectly between Genemark-ES and CEGMA".format(len(genemark_cegma_shared_genes)) )

    #############################################################################

    genemark_aat_shared_genes = list()

    with open('gmes_aat.shared.ids', 'wt') as gmes_aat_fh:
        for gm_es_gene in gm_es_genes:
            for aat_gene in aat_genes:
                if gm_es_gene.shares_exon_structure_with( thing=aat_gene, stop_tolerant=True ) == True:
                    genemark_aat_shared_genes.append(gm_es_gene)
                    gmes_aat_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("{0} Genemark-ES genes had an exact AAT match".format(len(genemark_aat_shared_genes)) )

    ##############################################################################
    cegma_matching_gm_es = list()
    genemark_aat_cegma_shared_genes = list()

    with open('gmes_aat_cegma.shared.ids', 'wt') as gmes_aat_cegma_fh:
        for cegma_gene in cegma_genes:
            match_found = False

            for gm_es_gene in gm_es_genes:
                if cegma_gene.has_same_coordinates_as( thing=gm_es_gene ):
                    if cegma_gene.shares_exon_structure_with( thing=gm_es_gene ) == True:
                        match_found = True

                        # keep only those also backed by AAT, without duplicates
                        if gm_es_gene in genemark_aat_shared_genes and gm_es_gene not in genemark_aat_cegma_shared_genes:
                            genemark_aat_cegma_shared_genes.append(gm_es_gene)
                            gmes_aat_cegma_fh.write("{0}\n".format(gm_es_gene.id))

                        break

            if match_found == True:
                cegma_matching_gm_es.append(cegma_gene)

    print("{0} genes with GeneMark-ES, CEGMA and AAT agreement".format(len(genemark_aat_cegma_shared_genes)) )

    with open('training_gene.ids', 'wt') as training_fh:
        for gene in genemark_aat_cegma_shared_genes:
            training_fh.write("{0}\n".format(gene.id) )

    ##############################################################################
    cegma_with_aat_not_gm_es = list()

    with open('cegma_aat_nogmes.shared.ids', 'wt') as cegma_aat_nogmes_fh:
        for cegma_gene in cegma_genes:
            # only CEGMA genes that Genemark-ES did not reproduce
            if cegma_gene in cegma_matching_gm_es:
                continue

            for aat_gene in aat_genes:
                if cegma_gene.shares_exon_structure_with( thing=aat_gene, stop_tolerant=True ) == True:
                    cegma_with_aat_not_gm_es.append(cegma_gene)
                    cegma_aat_nogmes_fh.write("{0}\n".format(cegma_gene.id))
                    break

    print("{0} CEGMA genes had no GeneMark-ES match but did have an AAT one".format(len(cegma_with_aat_not_gm_es)) )
def main():
    """Export one FASTA record per gene with intron bases lower-cased.

    Reads the GFF3 given with -i/--input_file; residues come from
    -f/--fasta when it is passed.  Output goes to -o/--output_file or STDOUT.

    For every single-isoform gene the gene sequence is upper-cased, the bases
    of each intron are lower-cased, and with --type CDS the sequence is
    trimmed to the range between the lowest CDS fmin and the highest CDS
    fmax.  Genes with more than one mRNA are skipped with an ERROR message.
    The FASTA header is the gene's locus_tag when set, otherwise its id.

    NOTE: the argparse description is an empty string, so --help shows no
    summary for this script.
    """
    parser = argparse.ArgumentParser( description='')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    parser.add_argument('-f', '--fasta', type=str, required=False, help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t', '--type', type=str, required=False, default='mRNA', choices=['mRNA', 'CDS'], help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    # set this to None if you don't want the debug print statements
    #debugging_gene = 'D9AE6116893A0D5711D56C0F1E6CF58C'
    debugging_gene = None

    # attach residues from the external FASTA to the assemblies with matching IDs;
    # FASTA entries whose ID is not an assembly in the GFF3 are ignored
    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file( args.fasta )
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## output will either be a file or STDOUT
    # NOTE(review): the handle is never closed explicitly in this function
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]

        for gene in assembly.genes():

            # when a debugging gene is set, every other gene is skipped entirely
            if debugging_gene is not None:
                debug_mode = True
                if gene.id != debugging_gene: continue
            else:
                debug_mode = False

            # prefer the locus tag for the FASTA header, falling back to the ID
            if gene.locus_tag is None:
                gene_label = gene.id
            else:
                gene_label = gene.locus_tag

            gene_seq = gene.get_residues().upper()
            gene_loc = gene.location_on(assembly)

            ## we have to do this here because of the coordinates
            # The string is reversed (not complemented) so that index 0 lines up
            # with gene_loc.fmin for the slicing below; it is reversed back
            # before writing.
            # NOTE(review): presumably get_residues() returns minus-strand genes
            # already reverse-complemented - confirm against biothings.
            if gene_loc.strand == -1:
                gene_seq = "".join(reversed(gene_seq))

            if debug_mode:
                print("INFO: Processing gene with length {0} at {1}-{2}".format(len(gene_seq), gene_loc.fmin, gene_loc.fmax))

            if len(gene.mRNAs()) > 1:
                #raise Exception("ERROR: script doesn't currently support multi-isoform genes, but found one: {0}".format(gene.id))
                print("ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)".format(gene.id))
                continue


            # at most one mRNA reaches this loop because of the check above
            for mRNA in gene.mRNAs():
                introns = mRNA.introns( on=assembly )

                # this helps us get where the intron is on the gene
                offset = gene_loc.fmin

                for intron in introns:
                    intron_loc = intron.location_on(assembly)
                    # replace the intron slice with its lower-cased copy; the
                    # overall sequence length does not change
                    lower_mid = gene_seq[intron_loc.fmin - offset:intron_loc.fmax - offset].lower()
                    gene_seq = gene_seq[0:intron_loc.fmin - offset] + lower_mid + gene_seq[intron_loc.fmax - offset:]

                    if debug_mode:
                        print("INFO:\tfound intron at {0}-{1}".format(intron_loc.fmin, intron_loc.fmax))
                        print("INFO:\tlower-casing offset adjusted coordinates: {0}-{1}".format(intron_loc.fmin - offset, intron_loc.fmax - offset))
                        print("INFO:\tgenerating lower case seq of length: {0}\n".format(len(lower_mid)) )

                if debug_mode:
                    print("INFO: seq length before CDS processing is: {0}".format(len(gene_seq)))

                ## do we need to trim down to the CDS range?
                if args.type == 'CDS':
                    # NOTE(review): sorted() relies on CDS objects being orderable,
                    # presumably by position - confirm.  An mRNA with no CDS
                    # raises IndexError on the next line.
                    CDSs = sorted(mRNA.CDSs())
                    CDS_min = CDSs[0].location_on(assembly).fmin
                    CDS_max = CDSs[-1].location_on(assembly).fmax

                    if debug_mode:
                        print("INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}".format(CDS_max, CDS_min, CDS_max - CDS_min))

                    if gene_loc.fmin != CDS_min or gene_loc.fmax != CDS_max:
                        # number of bases to drop from the fmin and fmax ends
                        # of the gene so that only the CDS range remains
                        fmin_chomp = CDS_min - offset
                        fmax_chomp = gene_loc.fmax - CDS_max

                        if debug_mode:
                            print("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(gene.id, gene_loc.fmin, \
                                                                                      gene_loc.fmax, gene_loc.strand, \
                                                                                      CDS_min, CDS_max \
                                                                                     ))

                            print("\tfmin_chomp:{0}, fmax_chomp:{1}".format(fmin_chomp, fmax_chomp))
                            print("\tpulling range: gene_seq[{0} : {1}]".format(fmin_chomp, len(gene_seq) - fmax_chomp))

                        gene_seq = gene_seq[fmin_chomp : len(gene_seq) - fmax_chomp]

                        if debug_mode:
                            print("\tGene {0} CDS seq: {1}".format(gene.id, gene_seq))

            ## make sure to switch it back
            if gene_loc.strand == -1:
                gene_seq = "".join(reversed(gene_seq))

            #print("INFO: Got gene with length {0} after modification".format(len(gene_seq)))
            ofh.write(">{0}\n{1}\n".format(gene_label, biocodeutils.wrapped_fasta(gene_seq)))
def main():
    """Annotate reference genes with the IDs of overlapping query genes.

    Reads a reference GFF3 (-r) and a query GFF3 (-q).  For every reference
    gene that has at least one polypeptide, each query gene on the same
    assembly that overlaps it adds a dbxref of the form
    "overlaps_old_locusTagID:<query gene id>" to the annotation of the
    reference gene's first polypeptide.  The updated reference annotation is
    written as GFF3 to -o/--output_file.

    Assemblies missing from the query set and genes without polypeptides are
    skipped with a WARN message.
    """
    parser = argparse.ArgumentParser(
        description=
        'Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies'
    )

    ## output file to be written
    parser.add_argument('-r',
                        '--reference_file',
                        type=str,
                        required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument(
        '-q',
        '--query_file',
        type=str,
        required=True,
        help=
        'GFF3 file with alternative annotation (such as an RNA-seq assemby)')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    (ref_assemblies,
     ref_feats) = biocodegff.get_gff3_features(args.reference_file)
    (qry_assemblies, qry_genes) = biocodegff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print(
                "WARN: expected to find assembly_id {0} in both reference and query sets"
                .format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            overlaps = list()
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".
                      format(ref_gene.id))
                continue

            # reuse the list fetched above rather than asking the gene again
            ref_annot = polypeptides[0].annotation

            for qry_gene in qry_assemblies[assembly_id].genes():
                overlap = ref_gene.overlaps_with(qry_gene)

                if overlap:
                    #print("DEBUG: {0} and {1} appear to overlap".format(ref_gene.id, qry_gene.id) )
                    overlaps.append(overlap)
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(
                        qry_gene.id))

            if len(overlaps) > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(
                    ref_gene.id, len(overlaps)))

    # the context manager makes sure the GFF3 is flushed and the handle closed
    with open(args.output_file, 'w') as ofh:
        biocodegff.print_gff3_from_assemblies(assemblies=ref_assemblies,
                                              ofh=ofh)
Ejemplo n.º 21
0
def main():
    """Report how many bases of each reference sequence have evidence coverage.

    Positional arguments are one or more evidence files; only names ending in
    'pileup' or 'sam' are parsed, while 'bed' and 'gff3' are accepted but
    ignored with an INFO message.  -f/--fasta supplies the reference sequences
    and -r/--reference must look like FILE:TYPE with FILE ending in '.gff3'.

    One tab-separated line per FASTA sequence is written to -o/--output_file
    (or STDOUT): sequence ID, sequence length, number of bases whose coverage
    depth is greater than zero.

    Raises:
        Exception: for an evidence file with an unsupported extension, a
            --reference value without ':', or a reference file that is not
            '.gff3'.
    """
    parser = argparse.ArgumentParser(
        description='Provides coverage information for features in a GFF3 file'
    )

    ## output file to be written
    parser.add_argument(
        'evidence_files',
        metavar='N',
        type=str,
        nargs='+',
        help='Path to one or more evidence files, separated by spaces')
    parser.add_argument(
        '-r',
        '--reference',
        type=str,
        required=True,
        help=
        'Input path to the reference GFF3 file. So we know what feature type to report on, format should be like FILE:TYPE'
    )
    parser.add_argument('-f',
                        '--fasta',
                        type=str,
                        required=True,
                        help='Input path to the reference FASTA file.')
    parser.add_argument(
        '-o',
        '--output_file',
        type=str,
        required=False,
        help=
        'Optional path to an output file to be created, else prints on STDOUT')
    args = parser.parse_args()

    ## parse the fasta
    fasta = biocodeutils.fasta_dict_from_file(args.fasta)

    ## open the output file
    if args.output_file is None:
        fout = codecs.getwriter('utf8')(sys.stdout.buffer)
    else:
        fout = open(args.output_file, "w")

    try:
        ####################################################
        ## Sanity checks

        allowed_extensions = ['bed', 'gff3', 'pileup', 'sam']
        for ev_file in args.evidence_files:
            # str.endswith accepts a tuple of candidate suffixes
            if not ev_file.endswith(tuple(allowed_extensions)):
                raise Exception(
                    "ERROR: Evidence file passed with unsupported file extension: {0}.  Supported extensions are {1}"
                    .format(ev_file, allowed_extensions))

        ## The input file should be defined as $path:$feattype
        if ':' not in args.reference:
            raise Exception(
                "ERROR: input_file must be like /path/to/some.gff3:mRNA")

        ref_file_parts = args.reference.split(':')
        print("DEBUG: part count: {0}".format(len(ref_file_parts)))

        if ref_file_parts[0].endswith('.gff3'):
            # parsed for validation only; the result is not used further here
            (ref_assemblies,
             ref_features) = biocodegff.get_gff3_features(ref_file_parts[0])
        else:
            # the reference is passed with -r, not -i
            raise Exception(
                "ERROR: Expected input file (-r) to have a gff3 extension, got {0}"
                .format(ref_file_parts[0]))

        ####################################################
        ## Initialize the coverage arrays

        fasta_cov = dict()
        for seq_id in fasta:
            # create a list of 0s the length of the molecule
            fasta_cov[seq_id] = [0] * len(fasta[seq_id]['s'])

        ####################################################
        ## Now parse the evidence files

        for ev_file in args.evidence_files:
            if ev_file.endswith('pileup'):
                parse_pileup(fasta_cov, ev_file)
            elif ev_file.endswith('sam'):
                parse_sam(fasta_cov, ev_file)
            else:
                print(
                    "INFO: ignoring evidence file {0} because code to handle its file type isn't currently implemented"
                    .format(ev_file))

        for seq_id in fasta_cov:
            # Count positions by their depth value.  The values must not be
            # reused as list indexes: that would test the wrong positions and
            # could run past the end of the list.
            covered_bases = sum(1 for depth in fasta_cov[seq_id] if depth > 0)

            fout.write("{0}\t{1}\t{2}\n".format(seq_id,
                                                len(fasta[seq_id]['s']),
                                                covered_bases))
    finally:
        # only close a real file; the STDOUT wrapper must stay open
        if args.output_file is not None:
            fout.close()
Ejemplo n.º 22
0
def main():
    """Convert a GFF3 annotation into GenBank flat file output.

    Reads the GFF3 given with -i/--input_file (plus -g/--genome_fasta when
    supplied) and writes one GenBank record per assembly: a header rendered
    from the 'genbank_flat_file_header.template' template, a FEATURES table
    with a source entry and every gene, and, with --include_sequence, an
    ORIGIN section.

    Output destination:
      * -od/--output_dir: one "<locus_id_prefix><assembly_id>.gbk" file per
        assembly inside that directory (takes precedence over -o, with a
        warning on STDERR if both are given);
      * -o/--output_file: all records in that single file;
      * neither: all records on STDOUT.

    Exits with status 1 if --output_dir is given but is not a directory.
    """
    parser = argparse.ArgumentParser(
        description='Converts GFF3 into a GenBank flat file')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument(
        '-o',
        '--output_file',
        type=str,
        required=False,
        help=
        'Path to a Genbank flat file to be created. Ignored if --output_dir is also specified.'
    )
    parser.add_argument(
        '-od',
        '--output_dir',
        type=str,
        required=False,
        help=
        'Path to an output directory. If this option is specified then each input assembly will be written to a separate GenBank output file, named with the assembly_id.'
    )
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument('-mt',
                        '--molecule_type',
                        type=str,
                        required=False,
                        default='DNA',
                        help='Molecule type')
    parser.add_argument('-gbd',
                        '--genbank_division',
                        type=str,
                        required=False,
                        default='.',
                        help='GenBank Division (3-letter abbreviation)')
    parser.add_argument(
        '-md',
        '--modification_date',
        type=str,
        required=False,
        default='DD-MMM-YYYY',
        help='The modification date for header in format like 21-JUN-1999')
    parser.add_argument('-org',
                        '--organism',
                        type=str,
                        required=False,
                        default='.',
                        help='Full organism name (including strain)')
    parser.add_argument(
        '-str',
        '--strain',
        type=str,
        required=False,
        help=
        "Only the strain designation, which is written to the FEATURES.source element"
    )
    parser.add_argument(
        '-d',
        '--definition',
        type=str,
        required=False,
        default='.',
        help=
        'Brief description of sequence; includes information such as source organism, gene name/protein name, or some description of the sequence\'s function.'
    )
    parser.add_argument(
        '-s',
        '--source',
        type=str,
        required=False,
        default='.',
        help=
        'Free-format information including an abbreviated form of the organism name, sometimes followed by a molecule type.'
    )
    parser.add_argument('-t',
                        '--taxon_id',
                        type=int,
                        required=False,
                        help='NCBI taxon ID, if known')
    parser.add_argument(
        '-l',
        '--lineage',
        type=str,
        required=False,
        default='Unknown',
        help=
        'Semicolon-delimited lineage of the organism e.g., "Eukaryota; Alveolata; Apicomplexa; Aconoidasida; Piroplasmida; Theileriidae; Theileria"'
    )
    parser.add_argument(
        '-seq',
        '--include_sequence',
        action='store_true',
        help='Include sequence (if present) in the output GenBank flat file(s).'
    )
    parser.add_argument(
        '-p',
        '--locus_id_prefix',
        required=False,
        default='',
        help=
        'Prefix to add to the GenBank LOCUS id in the output GenBank flat file(s).'
    )
    args = parser.parse_args()

    # check that output directory exists
    if args.output_dir is not None:
        if not os.path.isdir(args.output_dir):
            sys.stderr.write("FATAL: the specified output directory (" +
                             args.output_dir + ") does not exist\n")
            # sys.exit works even where the site-provided exit() is missing
            sys.exit(1)

    # line-wrap lineage to stay below 79 character GenBank flat file width
    lineage = biocodegenbank.line_wrap_lineage_string(args.lineage)

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)
    ofh = sys.stdout
    if args.output_file is not None:
        if args.output_dir is None:
            ofh = open(args.output_file, 'wt')
        else:
            sys.stderr.write(
                "WARN: both -o/--output_file and -od/--output_dir were passed so the former will be ignored\n"
            )

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        process_assembly_fasta(assemblies, args.genome_fasta)

    for assembly_id in assemblies:
        locus_id = args.locus_id_prefix + assembly_id
        if args.output_dir is not None:
            # one GenBank file per assembly
            ofn = os.path.join(args.output_dir, locus_id + ".gbk")
            ofh = open(ofn, 'wt')
        assembly = assemblies[assembly_id]

        context = {
            'locus': locus_id,
            'molecule_size': assembly.length,
            'molecule_type': args.molecule_type,
            'division': args.genbank_division,
            'modification_date': args.modification_date,
            'accession': '.',
            'version': '.',
            'source': args.source,
            'definition': args.definition,
            'organism': args.organism,
            'lineage': lineage
        }
        header = TEMPLATE_ENVIRONMENT.get_template(
            'genbank_flat_file_header.template').render(context)
        ofh.write(header)
        ofh.write("\nFEATURES             Location/Qualifiers\n")
        ofh.write("     source          1..{0}\n".format(assembly.length))
        ofh.write("                     /organism=\"{0}\"\n".format(
            args.organism))
        ofh.write("                     /mol_type=\"genomic DNA\"\n")

        if args.strain is not None:
            ofh.write("                     /strain=\"{0}\"\n".format(
                args.strain))

        if args.taxon_id is not None:
            ofh.write("                     /db_xref=\"taxon:{0}\"\n".format(
                args.taxon_id))

        for gene in assembly.genes():
            biocodegenbank.print_biogene(gene=gene, fh=ofh, on=assembly)

        if args.include_sequence:
            ofh.write("ORIGIN\n")
            biocodegenbank.print_sequence(seq=assembly.residues, fh=ofh)

        # record terminator
        ofh.write("//\n")
        # there may be multiple output files
        if args.output_dir is not None:
            ofh.close()

    # there is only one output file; leave STDOUT open for the caller
    if args.output_dir is None and ofh is not sys.stdout:
        ofh.close()
Ejemplo n.º 23
0
def main():
    """Drop genes whose coding sequence is mostly masked with N characters.

    Reads the GFF3 given with -i/--input_gff3 and attaches residues from
    -m/--masked_fasta.  A gene is removed when any of its mRNAs has a CDS in
    which the percentage of 'N' characters is at or above
    -p/--percent_repeat_coverage_cutoff.  Kept genes are written as GFF3 to
    -o/--output_gff3 (STDOUT when omitted); removed genes are written to
    -r/--removed_gff3 when that option is passed.

    A summary line with the kept/total counts is printed at the end; it goes
    to STDERR when the GFF3 itself is being written to STDOUT so the two do
    not mix.
    """
    parser = argparse.ArgumentParser(
        description='Removes gene models whose sequence has been masked.')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_gff3',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-m',
                        '--masked_fasta',
                        type=str,
                        required=True,
                        help='FASTA with sequence masked with N characters')
    parser.add_argument(
        '-p',
        '--percent_repeat_coverage_cutoff',
        type=int,
        required=True,
        help=
        'Genes with an mRNA covered by this percentage of repeats will be excluded'
    )
    parser.add_argument('-o',
                        '--output_gff3',
                        type=str,
                        required=False,
                        help='Path to GFF3 output file to be created')
    parser.add_argument(
        '-r',
        '--removed_gff3',
        type=str,
        required=False,
        help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_gff3)
    biocodeutils.add_assembly_fasta(assemblies, args.masked_fasta)

    # --output_gff3 is optional, so fall back to STDOUT instead of passing
    # None to open()
    gff_out = sys.stdout
    if args.output_gff3 is not None:
        gff_out = open(args.output_gff3, 'wt')

    # keep the summary line out of the GFF3 stream
    info_fh = sys.stderr if gff_out is sys.stdout else sys.stdout

    rem_out = None
    gene_count = 0
    kept_count = 0

    try:
        gff_out.write("##gff-version 3\n")

        if args.removed_gff3 is not None:
            rem_out = open(args.removed_gff3, 'wt')
            rem_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                keep = True
                gene_count += 1

                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()

                    # an empty CDS has nothing masked; treat it as 0% rather
                    # than dividing by zero
                    if len(coding_seq) == 0:
                        perc_repeat = 0
                    else:
                        n_count = coding_seq.count('N')
                        perc_repeat = (n_count / len(coding_seq)) * 100

                    if perc_repeat >= args.percent_repeat_coverage_cutoff:
                        keep = False

                if keep:
                    kept_count += 1
                    gene.print_as(fh=gff_out, source='IGS', format='gff3')
                elif rem_out is not None:
                    gene.print_as(fh=rem_out, source='IGS', format='gff3')
    finally:
        if rem_out is not None:
            rem_out.close()
        if gff_out is not sys.stdout:
            gff_out.close()

    # an input without genes reports 0.0% instead of raising ZeroDivisionError
    kept_percent = (kept_count / gene_count) * 100 if gene_count > 0 else 0.0

    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(
        kept_count, gene_count, kept_percent), file=info_fh)
def main():
    """Check/correct the phase of every CDS and rewrite the GFF3.

    Reads the GFF3 given with -i/--input_file (plus -g/--genome_fasta when
    supplied), passes each CDS of each mRNA to check_and_update_phase(), and
    writes every gene to -o/--output_file with column 2 set to -s/--source.
    Assemblies with a length greater than zero are appended after a
    "##FASTA" directive.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence to report/correct phase columns.'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-s',
        '--source',
        type=str,
        required=False,
        default='.',
        help='Optional.  Sets the value for column 2 in all rows.  Default = .'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # the context manager closes the output even if a phase check raises
    with open(args.output_file, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        # deal with the FASTA file if the user passed one
        if args.genome_fasta is not None:
            process_assembly_fasta(assemblies, args.genome_fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for CDS in mRNA.CDSs():
                        check_and_update_phase(CDS)

                gene.print_as(fh=fout, source=args.source, format='gff3')

        # the ##FASTA directive is written once, and only if at least one
        # assembly actually has sequence
        fasta_header_written = False

        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            if assembly.length > 0:
                if not fasta_header_written:
                    fout.write("##FASTA\n")
                    fasta_header_written = True

                fout.write(">{0}\n".format(assembly.id))
                fout.write("{0}\n".format(
                    biocodeutils.wrapped_fasta(assembly.residues)))
Ejemplo n.º 25
0
def main():
    """Export the protein or CDS sequence of every mRNA in a GFF3 file as FASTA.

    Each mRNA becomes one FASTA record.  The record ID is the mRNA's
    locus_tag when it has one, otherwise the mRNA ID.  The product name of the
    first annotated polypeptide (if any) is appended to the header line.

    With --check_ends, a warning is written to STDERR for every mRNA whose
    coding sequence does not begin with one of the start codons or end with
    one of the stop codons listed below.

    Output goes to --output_file when given, otherwise to STDOUT.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS sequences from a GFF3 file')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        required=False,
                        default='protein',
                        choices=['protein', 'cds'],
                        help='Type of features to export')
    parser.add_argument(
        '-f',
        '--fasta',
        type=str,
        required=False,
        help=
        'If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option'
    )
    parser.add_argument(
        '--check_ends',
        dest='check_ends',
        action='store_true',
        help='Warn on STDERR about non-canonical start/stop codons')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():

                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = mRNA.id
                    export_header = None

                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## Add the gene product name if there is one
                    for polypeptide in mRNA.polypeptides():
                        if polypeptide.annotation is not None:
                            if polypeptide.annotation.product_name is not None:
                                export_header = polypeptide.annotation.product_name
                                break

                    if export_header is not None:
                        fout.write(">{0} {1}\n".format(export_id,
                                                       export_header))
                    else:
                        fout.write(">{0}\n".format(export_id))

                    coding_seq = mRNA.get_CDS_residues()

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical start codon ({0}) in mRNA {1}\n"
                                .format(start_codon, mRNA.id))

                        # check the terminal codon
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical stop codon ({0}) in mRNA {1}\n"
                                .format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(
                            biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(
                            biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # never close STDOUT, only a file this function opened itself
        if fout is not sys.stdout:
            fout.close()
def main():
    '''
    This script reports statistics on the areas of a genome where features aren't - introns and
    intergenic space.  Pass a valid GFF3 file (along with FASTA data) and get a report like this:

    Molecule count: 9

    Gene count: 4171
    Intergenic space count: 4061
    Average intergenic space distance: 361.7 bp
    Median intergenic space distance: 245 bp
    Minimum intergenic space distance: 0 bp
    Maximum intergenic space distance: 6272 bp

    Intron count: 10533
    Intron space count: 989024
    Average intron size: 93.9 bp
    Median intron size: 63 bp
    Minimum intron size: 2 bp
    Maximum intron size: 1676 bp


    Optionally, you can pass the path to a PNG file to be created using the --histogram parameter,
    which will generate a size distribution histogram with two overlaying plots - one representing
    the distribution of intergenic region sizes and the other the intron lengths.  Because these
    can often have long tails, you can limit both the Y- and X-axes values with the --ylimit and
    --xlimit options, respectively.

    FASTA:
    If your FASTA isn't embedded at the end of your GFF3 file after a ##FASTA directive you'll need
    to specify the --fasta option in this script and pass it as a separate file.

    Definitions:
    Intergenic space was a little ambiguous to me as I started writing this.  Does one count the space from
    the beginning of the contig until the first gene, or only between them?  What about short contigs which
    have no annotated genes at all?  From the Sequence Ontology:

    SO:0000605: A region containing or overlapping no genes that is bounded on either side by a gene, or
    bounded by a gene and the end of the chromosome.

    To my reading, this includes contig ends but not gene-less contigs.  To that end, I include the
    former in intergenic space reporting but include the latter as a separate statistic.

    Raises an Exception if any molecule has an undefined or zero length.

    Author: Joshua Orvis (jorvis AT gmail)
    '''
    parser = argparse.ArgumentParser(
        description=
        'Reports statistics on the intergenic and intron space of an annotated genome (GFF3).'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_gff3',
                        type=str,
                        required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument(
        '-g',
        '--histogram',
        type=str,
        required=False,
        help=
        'Optional path to a histogram of intron/intergenic space size distribution to be created (PNG)'
    )
    parser.add_argument(
        '-x',
        '--xlimit',
        type=int,
        required=False,
        help=
        'Use this if you want to limit the X-axis of the histogram (feature length)'
    )
    parser.add_argument(
        '-y',
        '--ylimit',
        type=int,
        required=False,
        help=
        'Use this if you want to limit the Y-axis of the histogram (feature count)'
    )
    parser.add_argument(
        '-f',
        '--fasta',
        type=str,
        required=False,
        help='Required if you don\'t have GFF3 with embedded FASTA')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_gff3)

    # attach residues (and therefore lengths) from the external FASTA file
    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## things to keep stats on and report
    total_molecule_count = len(assemblies)
    total_gene_count = 0

    ## this number is NOT just the total genes N - 1, since there can be multiple molecules
    #   genes can overlap, etc.
    total_intergenic_space_count = 0

    total_intergenic_space_residues = 0
    intergenic_distances = list()

    total_contig_residues = 0
    empty_contig_residues = 0

    total_intron_count = 0
    total_intron_residues = 0
    intron_sizes = list()

    ############################
    ## Calculation section
    ############################

    for asm_id in assemblies:
        assembly = assemblies[asm_id]
        # NOTE(review): the overlap logic below assumes sorted() orders genes
        #  by their start coordinate - confirm against the gene class
        genes = sorted(assembly.genes())
        total_gene_count += len(genes)

        # we should have a length here
        if assembly.length is None or assembly.length == 0:
            raise Exception(
                "ERROR: Detected assembly with undefined or 0 length: {0}".
                format(assembly.id))

        # Test THIS molecule's gene count.  The running total stops being zero
        #  as soon as any earlier molecule had a gene.
        if not genes:
            empty_contig_residues += assembly.length
            continue

        total_contig_residues += assembly.length

        # location of the gene reaching furthest to the right so far; this is
        #  the left boundary of the next intergenic space
        rightmost_gene_loc = None

        for gene in genes:
            gene_loc = gene.location_on(assembly)

            if rightmost_gene_loc is None:
                # first gene: track the number of bases from the start of the molecule
                total_intergenic_space_count += 1
                intergenic_distance = gene_loc.fmin
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)
            elif gene_loc.fmin >= rightmost_gene_loc.fmax:
                # no overlap with anything seen so far, so there is a gap
                total_intergenic_space_count += 1
                intergenic_distance = gene_loc.fmin - rightmost_gene_loc.fmax
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)

            for mRNA in gene.mRNAs():
                introns = mRNA.introns(on=assembly)

                for intron in sorted(introns):
                    total_intron_count += 1
                    intron_loc = intron.location_on(assembly)
                    intron_size = intron_loc.fmax - intron_loc.fmin

                    if intron_size < 0:
                        print(
                            "\tWARN: Intron size ({1}) < 0 reported in gene {0}"
                            .format(gene.id, intron_size))

                    intron_sizes.append(intron_size)
                    total_intron_residues += intron_size

            # Only move the boundary when this gene extends further right, so a
            #  gene nested inside an earlier one cannot pull it backwards.
            if rightmost_gene_loc is None or gene_loc.fmax > rightmost_gene_loc.fmax:
                rightmost_gene_loc = gene_loc

        # space between the end of the right-most gene and the molecule end
        #  (genes is non-empty here, so rightmost_gene_loc is always set)
        total_intergenic_space_count += 1
        intergenic_distance = assembly.length - rightmost_gene_loc.fmax
        total_intergenic_space_residues += intergenic_distance
        intergenic_distances.append(intergenic_distance)

    intergenic_distances = sorted(intergenic_distances)
    intron_sizes = sorted(intron_sizes)

    if total_intergenic_space_count == 0:
        avg_intergenic_space_dist = None
        median_int_space_dist = None
    else:
        avg_intergenic_space_dist = total_intergenic_space_residues / total_intergenic_space_count
        median_int_space_dist = intergenic_distances[
            len(intergenic_distances) // 2]

    # an annotation without any introns must not crash the report
    if total_intron_count == 0:
        avg_intron_size = None
        median_intron_size = None
    else:
        avg_intron_size = total_intron_residues / total_intron_count
        median_intron_size = intron_sizes[len(intron_sizes) // 2]

    ############################
    ## Reporting section
    ############################

    print("\nMolecule count: {0}".format(total_molecule_count))
    print("Gene count: {0}".format(total_gene_count))

    print("\nTotal molecule bases: {0} bp".format(total_contig_residues))
    print("Empty molecule bases: {0} bp".format(empty_contig_residues))

    if total_intergenic_space_count > 0:
        print(
            "Intergenic space count: {0}".format(total_intergenic_space_count))
        print("Average intergenic space distance: {0:.1f} bp".format(
            avg_intergenic_space_dist))
        print("Median intergenic space distance: {0} bp".format(
            median_int_space_dist))
        print("Minimum intergenic space distance: {0} bp".format(
            intergenic_distances[0]))
        print("Maximum intergenic space distance: {0} bp\n".format(
            intergenic_distances[-1]))
    else:
        print(
            "There were no intergenic spaces found.  This might mean there were no molecules with at least 2 genes."
        )

    print("Intron count: {0}".format(total_intron_count))
    print("Intron space count: {0}".format(total_intron_residues))

    if total_intron_count > 0:
        print("Average intron size: {0:.1f} bp".format(avg_intron_size))
        print("Median intron size: {0} bp".format(median_intron_size))
        print("Minimum intron size: {0} bp".format(intron_sizes[0]))
        print("Maximum intron size: {0} bp\n".format(intron_sizes[-1]))
    else:
        print("There were no introns found.\n")

    ############################
    ## Graphics section (optional)
    ############################
    if args.histogram is not None:
        # imported lazily so matplotlib is only needed when a plot is requested
        import matplotlib.pyplot as plt

        plt.xlabel('length (bp)')
        plt.ylabel('count')
        plt.title('Distribution of intron size and intergenic distances')

        # only plot the series that actually contain data
        if intergenic_distances:
            plt.hist(intergenic_distances,
                     bins=50,
                     histtype='stepfilled',
                     color='b',
                     label='Intergenic distances')
        if intron_sizes:
            plt.hist(intron_sizes,
                     bins=50,
                     histtype='stepfilled',
                     color='r',
                     alpha=0.5,
                     label='Intron sizes')

        if args.xlimit is not None:
            plt.xlim([0, args.xlimit])

        if args.ylimit is not None:
            plt.ylim([0, args.ylimit])

        plt.legend(loc='best')
        plt.savefig(args.histogram)
Ejemplo n.º 27
0
def main():
    """Write the IDs of organism-1 mRNAs supported by both AAT and BLAST evidence.

    Steps:
      1. Parse the organism-1 annotation GFF3 and the AAT FASTA database.
      2. Parse AAT alignments (nucleotide_to_protein_match / match_part rows)
         into Match objects grouped by the assembly they are located on.
      3. Keep every mRNA for which at least one AAT match reaches
         --aat_percent_coverage_cutoff on both the mRNA and the match target.
      4. Keep, of those, the mRNAs that also have a BLAST row passing the
         e-value and percent-identity cutoffs.
      5. Write the surviving mRNA IDs, one per line, to --output_id_list
         (a file name is built from the cutoffs when it is not given).

    Raises an Exception when an overlap is larger than the mRNA, when a match
    target has no entry in the AAT FASTA database, or when a match_part row
    appears before any nucleotide_to_protein_match row.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here')

    # NOTE: argparse %-formats help strings, so a literal percent sign has to be written as %%
    parser.add_argument('-a', '--organism1_annotation', type=str, required=True, help='Annotation GFF for organism 1' )
    parser.add_argument('-p', '--organism1_aat_alignments', type=str, required=True, help='Path to AAT GFF3 (match/match_part)' )
    parser.add_argument('-aatdb', '--aat_fasta_db', type=str, required=True, help='Path to FASTA database that was used in AAT' )
    parser.add_argument('-b', '--organism1_blast_alignments', type=str, required=True, help='Path to BLASTp btab file vs.organism 2 proteins' )
    parser.add_argument('-be', '--blast_eval_cutoff', type=float, required=False, default=1e-5, help='BLAST e-value cutoff' )
    parser.add_argument('-bpi', '--blast_percent_identity_cutoff', type=float, required=False, default=0, help='BLAST %%identity cutoff' )
    parser.add_argument('-ppc', '--aat_percent_coverage_cutoff', type=float, required=False, default=0, help='%% coverage of the query protein by the AAT match' )
    parser.add_argument('-o', '--output_id_list', type=str, required=False, help='List of IDs from organism1 that passed' )
    args = parser.parse_args()

    # set this to an mRNA ID to restrict the comparison to that one transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(args.blast_eval_cutoff, args.blast_percent_identity_cutoff, args.aat_percent_coverage_cutoff)

    print("INFO: Parsing organism1 annotation")
    (assemblies, features) = biocodegff.get_gff3_features( args.organism1_annotation )

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = biocodeutils.fasta_dict_from_file( args.aat_fasta_db )

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0

    # the match currently being assembled from its rows, and the assembly it is on
    current_match = None
    current_match_assembly_id = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith('#') or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            # GFF3 coordinates are 1-based inclusive; convert to 0-based interbase
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = biocodegff.column_9_value(cols[8], 'ID').replace('"', '')

            # only the first whitespace-delimited token of Target is the ID
            target = biocodegff.column_9_value(cols[8], 'Target')
            if target is not None:
                m = re.search(r"^(\S+)", target)
                if m:
                    target = m.group(1)

            if cols[2] == 'nucleotide_to_protein_match':
                # a new match starts, so store the finished one under ITS OWN
                #  assembly (not the assembly of the row that just arrived)
                if current_match is not None:
                    aat_matches.setdefault(current_match_assembly_id, list()).append(current_match)
                    aat_match_count += 1

                current_match = biothings.Match( id=feature_id, target_id=target, subclass='nucleotide_to_protein_match', length=fmax - fmin )
                current_match.locate_on( target=assemblies[assembly_id], fmin=fmin, fmax=fmax, strand=strand )
                current_match_assembly_id = assembly_id

            elif cols[2] == 'match_part':
                if current_match is None:
                    raise Exception("ERROR: Found match_part ({0}) before any nucleotide_to_protein_match".format(feature_id))

                parent_id = biocodegff.column_9_value(cols[8], 'Parent').replace('"', '')
                match_part = biothings.MatchPart( id=feature_id, parent=parent_id, length=fmax - fmin )
                match_part.locate_on( target=assemblies[assembly_id], fmin=fmin, fmax=fmax, strand=strand )
                current_match.add_part(match_part)

    # the last match in the file has no following row to trigger its storage
    if current_match is not None:
        aat_matches.setdefault(current_match_assembly_id, list()).append(current_match)
        aat_match_count += 1

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():

                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)

                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception("ERROR: overlap size ({0}) > mRNA length ({1})".format(overlap_size, mRNA.length))

                    if aat_match.target_id not in aat_seqs:
                        raise Exception("ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb".format(aat_match.target_id))

                    # this is a protein length, so x3
                    match_target_length = len(aat_seqs[aat_match.target_id]['s']) * 3

                    (mRNA_percent_coverage, target_percent_coverage) = calculate_fragmented_coverage(mRNA, aat_match, match_target_length)

                    if mRNA_percent_coverage >= args.aat_percent_coverage_cutoff and target_percent_coverage >= args.aat_percent_coverage_cutoff:
                        o1_with_aat.append(mRNA.id)
                        break   # only need to see if one matched

    print("INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.rstrip("\n").split("\t")

            # blank, comment or truncated rows cannot be evaluated (column 20 is needed)
            if line.startswith('#') or len(cols) < 20:
                continue

            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print("INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    ## IDs of organism 1 mRNAs supported by both AAT and BLAST
    o1_with_o2 = [mRNA_id for mRNA_id in o1_with_aat if mRNA_id in top_blast_hits]

    print("INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2".format(len(o1_with_o2)))

    with open(args.output_id_list, 'wt') as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
Ejemplo n.º 28
0
def _genes_share_cds_overlap(gene_a, gene_b):
    """Return True if any CDS of gene_a overlaps a CDS of gene_b on the same strand."""
    for rna_a in gene_a.RNAs():
        for rna_b in gene_b.RNAs():
            for cds_a in rna_a.CDSs():
                strand_a = cds_a.location().strand
                for cds_b in rna_b.CDSs():
                    # strands are values, so compare with == rather than identity
                    if cds_a.overlaps_with( cds_b ) and cds_b.location().strand == strand_a:
                        return True
    return False


def main():
    """Print reference/query gene pairs that overlap one-to-one at the CDS level.

    Both GFF3 files are parsed onto the same set of assemblies.  A reference
    gene and a query gene are considered overlapping when the genes overlap
    and at least one pair of their CDS features overlaps on the same strand.
    A pair is printed as "<query gene id>TAB<reference gene id>" only when
    the reference gene overlaps exactly that one query gene AND the query
    gene overlaps exactly that one reference gene.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here')

    ## output file to be written
    parser.add_argument('-r', '--reference', type=str, required=True, help='Reference GFF3' )
    parser.add_argument('-q', '--query', type=str, required=True, help='Query GFF3' )
    args = parser.parse_args()

    print("INFO: parsing reference features\n")
    (assemblies, ref_features) = biocodegff.get_gff3_features( args.reference )

    print("INFO: parsing query features\n")
    (assemblies, qry_features) = biocodegff.get_gff3_features( args.query, assemblies=assemblies )

    # sort once instead of on every iteration of the outer loops
    ref_genes = sorted(get_genes_from_dict( ref_features ))
    qry_genes = sorted(get_genes_from_dict( qry_features ))

    # key=reference gene id, value=id of the only query gene it overlaps
    ref_gene_one_qry_overlap = {}
    # key=query gene id, value=id of the only reference gene it overlaps
    qry_gene_one_ref_overlap = {}

    # Find all of the query genes that overlap each reference gene
    for ref_gene in ref_genes:
        overlapping_qry_ids = set()
        for qry_gene in qry_genes:
            if ref_gene.overlaps_with( qry_gene ) and _genes_share_cds_overlap(ref_gene, qry_gene):
                overlapping_qry_ids.add(qry_gene.id)

        # Store the reference genes that overlap only a single query gene
        if len(overlapping_qry_ids) == 1:
            (only_qry_id,) = overlapping_qry_ids
            ref_gene_one_qry_overlap[ref_gene.id] = only_qry_id

    # Now do the same thing finding all ref overlaps for each query gene
    for qry_gene in qry_genes:
        overlapping_ref_ids = set()
        for ref_gene in ref_genes:
            if qry_gene.overlaps_with( ref_gene ) and _genes_share_cds_overlap(qry_gene, ref_gene):
                overlapping_ref_ids.add(ref_gene.id)

        # Store the query genes that overlap only a single reference gene
        if len(overlapping_ref_ids) == 1:
            (only_ref_id,) = overlapping_ref_ids
            qry_gene_one_ref_overlap[qry_gene.id] = only_ref_id

    # Print the pairs where the one-to-one relationship holds in both directions
    for qry_gene_id, ref_gene_id in qry_gene_one_ref_overlap.items():
        if ref_gene_id in ref_gene_one_qry_overlap and ref_gene_one_qry_overlap[ref_gene_id] == qry_gene_id:
            print(qry_gene_id + "\t" + ref_gene_id)
def main():
    """Convert the GO annotations in a GFF3 file to a GAF 2.0 file.

    One GAF row is written for every GO annotation attached to every
    polypeptide of every mRNA.  The annotation date defaults to the
    modification time of the input file and the "assigned by" column defaults
    to the --database value.  Polypeptides without an annotation are skipped,
    and a missing product name or gene symbol is written as an empty column.

    Raises an Exception when a GO ID is absent from the parsed GO file.
    """
    parser = argparse.ArgumentParser(
        description='Converts GFF3 files to GO Gene Association Format (GAF)')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-go',
                        '--go_file',
                        type=str,
                        required=True,
                        help='Gene Ontology (GO) file')
    parser.add_argument('-db',
                        '--database',
                        type=str,
                        required=True,
                        help='Database issuing that IDs.  Example: UniProtKB')
    parser.add_argument('-dbref',
                        '--db_reference',
                        type=str,
                        required=True,
                        help='DB reference, like PMID:2676709 (column 6)')
    parser.add_argument('-ec',
                        '--evidence_code',
                        type=str,
                        required=False,
                        default='IEA',
                        help='Like IEA (column 7)')
    parser.add_argument('-t',
                        '--taxon_id',
                        type=int,
                        required=True,
                        help='NCBI taxon ID (column 13)')
    parser.add_argument(
        '-ad',
        '--annotation_date',
        type=str,
        required=False,
        help=
        'Annotation date in YYYYMMDD format.  Default = GFF3 file datestamp')
    parser.add_argument(
        '-ab',
        '--assign_by',
        type=str,
        required=False,
        help='Assign by (column 15)  Defaults to --database argument value')
    args = parser.parse_args()

    print("INFO: Parsing GFF3 objects", file=sys.stderr)
    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    print("INFO: Parsing GO file", file=sys.stderr)
    go_lookup = parse_go_file(args.go_file)

    annot_date = args.annotation_date
    if annot_date is None:
        # fall back to the (UTC) modification date of the input file
        annot_date = time.strftime(
            '%Y%m%d', time.gmtime(os.path.getmtime(args.input_file)))

    assign_by = args.assign_by
    if assign_by is None:
        assign_by = args.database

    with open(args.output_file, 'wt') as ofh:
        ofh.write("!gaf-version: 2.0\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation

                        # nothing to export for an unannotated polypeptide
                        if annotation is None:
                            continue

                        # optional columns stay empty rather than holding the text "None"
                        product = ''
                        gene_sym = ''
                        if annotation.product_name is not None:
                            product = annotation.product_name
                        if annotation.gene_symbol is not None:
                            gene_sym = annotation.gene_symbol

                        for go_annot in annotation.go_annotations:
                            go_id = "GO:{0}".format(go_annot.go_id)

                            if go_id not in go_lookup:
                                raise Exception(
                                    "ERROR: GO ID {0} not found in provided go.obo file"
                                    .format(go_id))

                            # Aspect is F, P or C, depending on which component/ontology the term comes from
                            ofh.write(
                                "{0}\t{1}\t{1}\t\t{2}\t{3}\t{4}\t\t{5}\t{6}"
                                "\t{7}\tprotein\ttaxon:{8}\t{9}\t{10}\t"
                                "\t\n".format(args.database, polypeptide.id,
                                              go_id, args.db_reference,
                                              args.evidence_code,
                                              go_lookup[go_id], product,
                                              gene_sym, args.taxon_id,
                                              annot_date, assign_by))

    print("INFO: Conversion complete.", file=sys.stderr)
def main():
    """Write one FASTA record per gene with intron residues lower-cased.

    Genes are read from the GFF3 passed as --input_file.  When --fasta is
    given, the residues and length of each matching assembly are set from it.
    Each gene sequence is upper-cased, the spans covered by its mRNA's introns
    are lower-cased and, when --type is CDS, the sequence is trimmed to the
    range between the lowest CDS fmin and the highest CDS fmax.  Records are
    written to --output_file, or to STDOUT when that option is omitted.

    Genes with more than one mRNA are not supported and are skipped with a
    message.  All diagnostics go to STDERR so that they can never end up in
    the middle of the FASTA stream when the output is STDOUT.
    """
    parser = argparse.ArgumentParser(
        description='Exports one FASTA record per gene with intron residues '
        'in lower case and all other residues in upper case.')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-f',
        '--fasta',
        type=str,
        required=False,
        help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        required=False,
                        default='mRNA',
                        choices=['mRNA', 'CDS'],
                        help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # set this to a gene ID to restrict processing to that gene and enable
    # the debug messages; leave it as None for normal operation
    debugging_gene = None

    def log(message):
        # diagnostics must not share a stream with the FASTA output
        print(message, file=sys.stderr)

    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                debug_mode = debugging_gene is not None
                if debug_mode and gene.id != debugging_gene:
                    continue

                gene_label = gene.id if gene.locus_tag is None else gene.locus_tag

                gene_seq = gene.get_residues().upper()
                gene_loc = gene.location_on(assembly)
                on_reverse = gene_loc.strand == -1

                ## the string is flipped (not complemented) so that slicing
                ## with assembly-based offsets lines up; it is flipped back
                ## before being written
                if on_reverse:
                    gene_seq = gene_seq[::-1]

                if debug_mode:
                    log("INFO: Processing gene with length {0} at {1}-{2}".
                        format(len(gene_seq), gene_loc.fmin, gene_loc.fmax))

                if len(gene.mRNAs()) > 1:
                    log("ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)"
                        .format(gene.id))
                    continue

                # subtracting this turns assembly coordinates into indexes
                # within gene_seq
                offset = gene_loc.fmin

                for mRNA in gene.mRNAs():
                    for intron in mRNA.introns(on=assembly):
                        intron_loc = intron.location_on(assembly)
                        start = intron_loc.fmin - offset
                        end = intron_loc.fmax - offset
                        lower_mid = gene_seq[start:end].lower()
                        gene_seq = gene_seq[:start] + lower_mid + gene_seq[end:]

                        if debug_mode:
                            log("INFO:\tfound intron at {0}-{1}".format(
                                intron_loc.fmin, intron_loc.fmax))
                            log("INFO:\tlower-casing offset adjusted coordinates: {0}-{1}"
                                .format(start, end))
                            log("INFO:\tgenerating lower case seq of length: {0}\n"
                                .format(len(lower_mid)))

                    if debug_mode:
                        log("INFO: seq length before CDS processing is: {0}".
                            format(len(gene_seq)))

                    ## do we need to trim down to the CDS range?
                    if args.type == 'CDS':
                        CDSs = sorted(mRNA.CDSs())
                        CDS_min = CDSs[0].location_on(assembly).fmin
                        CDS_max = CDSs[-1].location_on(assembly).fmax

                        if debug_mode:
                            log("INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}"
                                .format(CDS_max, CDS_min, CDS_max - CDS_min))

                        if gene_loc.fmin != CDS_min or gene_loc.fmax != CDS_max:
                            # number of residues to drop from each end
                            fmin_chomp = CDS_min - offset
                            fmax_chomp = gene_loc.fmax - CDS_max

                            if debug_mode:
                                log("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}"
                                    .format(gene.id, gene_loc.fmin,
                                            gene_loc.fmax, gene_loc.strand,
                                            CDS_min, CDS_max))
                                log("\tfmin_chomp:{0}, fmax_chomp:{1}".format(
                                    fmin_chomp, fmax_chomp))
                                log("\tpulling range: gene_seq[{0} : {1}]".
                                    format(fmin_chomp,
                                           len(gene_seq) - fmax_chomp))

                            gene_seq = gene_seq[fmin_chomp:len(gene_seq) -
                                                fmax_chomp]

                            if debug_mode:
                                log("\tGene {0} CDS seq: {1}".format(
                                    gene.id, gene_seq))

                ## make sure to switch it back
                if on_reverse:
                    gene_seq = gene_seq[::-1]

                ofh.write(">{0}\n{1}\n".format(
                    gene_label, biocodeutils.wrapped_fasta(gene_seq)))
    finally:
        # only close what this function opened; STDOUT stays usable
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Clamp the CDS features of a GFF3 file to externally supplied polypeptide ranges.

    Genes are loaded from 'canonical.flawed.gff3'.  Polypeptide rows of
    'Theileria-all-Theileria1_ourids.gff' are indexed by their Parent
    attribute, which is matched against mRNA IDs.  For every mRNA with a
    polypeptide, each CDS is compared to it with the project-defined ordering
    operators and either deleted from the mRNA or has its fmin/fmax set to the
    polypeptide's.  Every gene is then written to 'canonical.corrected.gff3'.

    mRNAs without a polypeptide are reported and left untouched.  Previously
    the CDS loop still ran for them, using whichever polypeptide the previous
    mRNA had (or failing with a NameError on the very first mRNA).
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = biocodegff.get_gff3_features(flawed_gff_file)

        print("INFO: loaded {0} assemblies and {1} features".format(
            len(assemblies), len(features)))

        # polypeptides keyed by the ID of their parent (compared to mRNA.id below)
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                polypeptide_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')

                located = biothings.Polypeptide(id=polypeptide_id,
                                                parent=parent)
                # the start column is shifted down by one for fmin; the end
                # column is used unchanged as fmax
                located.locate_on(target=assemblies[cols[0]],
                                  fmin=int(cols[3]) - 1,
                                  fmax=int(cols[4]),
                                  strand=cols[6])
                polypeptides[parent] = located

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(
            len(polypeptides)))

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print(
                            "DEBUG: {0} not found as a parent to any polypeptide"
                            .format(mRNA.id))
                        # nothing to correct against; never reuse the
                        # polypeptide of a previous mRNA
                        continue

                    polypeptide = polypeptides[mRNA.id]

                    # iterate over a snapshot, since delete_CDS() may change
                    # the collection being walked
                    for CDS in list(mRNA.CDSs()):
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin
                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
def main():
    """Classify gene predictions by how many evidence sources agree and report them as HTML.

    Five GFF3 files with hard-coded names are loaded (Genemark-ES, CEGMA,
    expression data and two AAT result sets) and compared gene by gene:

    * Type 1 - best:  Genemark-ES gene with the same coordinates and exon
      structure as a CEGMA gene, plus a CDS-structure match in the expression
      data, plus an exon-structure match in either AAT set.
    * Type 2 - best:  as Type 1 but without the AAT match.
    * Type 2 - better:  Genemark-ES gene not shared with CEGMA but with an
      expression match.
    * Type 3 - still better:  CEGMA gene with no structural Genemark-ES
      match, but with an AAT match and an expression match.

    Counts are printed to STDOUT and the four lists are written to
    'gene_classifications.html' through print_gene_list().
    """
    gm_es_file = 'genemark_hmm.gff3'
    cegma_file = 'output.cegma.gff3'
    transcript_file = 'expression_data.gff3'
    aat_muris_file = 'cmuris.aat.gff3'
    aat_parvum_file = 'cparvum.aat.gff3'

    html_out_file = 'gene_classifications.html'

    # for each gene, how many flanking bases should be shown on either side?
    flanking_bases = 1000

    def has_exon_match(gene, candidates):
        # The explicit comparison with True is kept from the original code
        # because the return type of shares_exon_structure_with() is not
        # visible from here.
        for candidate in candidates:
            if gene.shares_exon_structure_with(thing=candidate) == True:
                return True
        return False

    def has_coordinate_and_exon_match(gene, candidates):
        # coordinates are checked first; exon structure only when they agree
        for candidate in candidates:
            if gene.has_same_coordinates_as(thing=candidate):
                if gene.shares_exon_structure_with(thing=candidate) == True:
                    return True
        return False

    def has_cds_match(gene, candidates):
        return any(
            gene.shares_CDS_structure_with(candidate)
            for candidate in candidates)

    type1_best = list()
    type2_best = list()
    type2_better = list()
    type3_still_better = list()

    print("INFO: parsing Genemark-ES data")
    (assemblies, gm_es_features) = biocodegff.get_gff3_features(gm_es_file)
    gm_es_genes = get_genes_from_dict(gm_es_features)
    print("\tINFO: Got {0} Genemark-ES genes".format(len(gm_es_genes)))

    # every later file is loaded with the assemblies collected so far passed in
    print("INFO: parsing CEGMA data")
    (assemblies,
     cegma_features) = biocodegff.get_gff3_features(cegma_file,
                                                    assemblies=assemblies)
    cegma_genes = get_genes_from_dict(cegma_features)
    print("\tINFO: Got {0} CEGMA genes".format(len(cegma_genes)))

    print("INFO: parsing expression data (Trinity, Cufflinks, GMAP cDNAs)")
    (assemblies,
     transcript_features) = biocodegff.get_gff3_features(transcript_file,
                                                         assemblies=assemblies)
    transcript_genes = get_genes_from_dict(transcript_features)
    print("\tINFO: Got {0} expression 'genes'".format(len(transcript_genes)))

    print("INFO: parsing AAT results (C. muris)")
    (assemblies,
     aat_muris_features) = biocodegff.get_gff3_features(aat_muris_file,
                                                        assemblies=assemblies)
    aat_muris_genes = get_genes_from_dict(aat_muris_features)
    print("\tINFO: Got {0} AAT (C. muris) 'genes'".format(
        len(aat_muris_genes)))

    print("INFO: parsing AAT results (C. parvum)")
    (assemblies,
     aat_parvum_features) = biocodegff.get_gff3_features(aat_parvum_file,
                                                         assemblies=assemblies)
    aat_parvum_genes = get_genes_from_dict(aat_parvum_features)
    print("\tINFO: Got {0} AAT (C. parvum) 'genes'".format(
        len(aat_parvum_genes)))

    #############################################################################
    ## Genemark-ES genes with an identical CEGMA gene
    genemark_cegma_shared_genes = [
        gm_es_gene for gm_es_gene in gm_es_genes
        if has_coordinate_and_exon_match(gm_es_gene, cegma_genes)
    ]

    print("\n{0} genes were shared perfectly between Genemark-ES and CEGMA".
          format(len(genemark_cegma_shared_genes)))

    #############################################################################
    ## Genemark-ES genes with an AAT match (C. muris first, then C. parvum)
    genemark_aat_shared_genes = list()

    for gm_es_gene in gm_es_genes:
        if has_exon_match(gm_es_gene, aat_muris_genes):
            genemark_aat_shared_genes.append(gm_es_gene)

        # list membership (not a local flag) decides whether C. parvum is
        # consulted, exactly as before
        if gm_es_gene not in genemark_aat_shared_genes:
            if has_exon_match(gm_es_gene, aat_parvum_genes):
                genemark_aat_shared_genes.append(gm_es_gene)

    print("{0} Genemark-ES genes had an exact AAT match".format(
        len(genemark_aat_shared_genes)))

    ##############################################################################
    ## CEGMA genes without an identical Genemark-ES gene
    cegma_not_matching_gm_es = [
        cegma_gene for cegma_gene in cegma_genes
        if not has_coordinate_and_exon_match(cegma_gene, gm_es_genes)
    ]

    print("{0} CEGMA genes don't have a structural match to a Genemark-ES one".
          format(len(cegma_not_matching_gm_es)))

    #############################################################################
    ## Genemark-ES genes with expression support
    gm_expression_shared_genes = [
        gm_es_gene for gm_es_gene in gm_es_genes
        if has_cds_match(gm_es_gene, transcript_genes)
    ]

    print("{0} Genemark-ES genes had an exact expression match".format(
        len(gm_expression_shared_genes)))

    #############################################################################
    gm_cegma_expression_shared_genes = [
        shared_gene for shared_gene in genemark_cegma_shared_genes
        if shared_gene in gm_expression_shared_genes
    ]

    print(
        "{0} genes were shared perfectly between Genemark-ES and CEGMA with expression support"
        .format(len(gm_cegma_expression_shared_genes)))

    ##############################################################################
    ## split the fully shared genes on whether AAT supports them as well
    gm_cegma_expression_aat_shared_genes = list()

    for shared_gene in gm_cegma_expression_shared_genes:
        if shared_gene in genemark_aat_shared_genes:
            gm_cegma_expression_aat_shared_genes.append(shared_gene)
        else:
            type2_best.append(shared_gene)

    type1_best.extend(gm_cegma_expression_aat_shared_genes)

    print(
        "{0} genes were shared with Genemark-ES, CEGMA, expression, AAT support"
        .format(len(gm_cegma_expression_aat_shared_genes)))

    ##############################################################################
    for gm_es_gene in gm_es_genes:
        if gm_es_gene not in genemark_cegma_shared_genes:
            if gm_es_gene in gm_expression_shared_genes:
                type2_better.append(gm_es_gene)

    ##############################################################################
    ## CEGMA genes with expression support
    cegma_expression_shared_genes = [
        cegma_gene for cegma_gene in cegma_genes
        if has_cds_match(cegma_gene, transcript_genes)
    ]

    print("{0} CEGMA genes had an exact expression match".format(
        len(cegma_expression_shared_genes)))

    ##############################################################################
    ## CEGMA-only genes with AAT support
    cegma_not_gmes_with_aat = list()

    for cegma_gene in cegma_genes:
        if cegma_gene in cegma_not_matching_gm_es:
            if has_exon_match(cegma_gene, aat_muris_genes):
                cegma_not_gmes_with_aat.append(cegma_gene)

            if cegma_gene not in cegma_not_gmes_with_aat:
                if has_exon_match(cegma_gene, aat_parvum_genes):
                    # The CEGMA gene itself must be recorded here.  The old
                    # code appended a leftover Genemark-ES loop variable, so
                    # genes supported only by C. parvum never reached Type 3.
                    cegma_not_gmes_with_aat.append(cegma_gene)

            if cegma_gene in cegma_not_gmes_with_aat:
                if cegma_gene in cegma_expression_shared_genes:
                    type3_still_better.append(cegma_gene)

    print("TYPE 1 - BEST: {0}".format(len(type1_best)))
    print("TYPE 2 - BEST: {0}".format(len(type2_best)))
    print("TYPE 2 - BETTER: {0}".format(len(type2_better)))
    print("TYPE 3 - STILL BETTER: {0}".format(len(type3_still_better)))

    # the context manager guarantees the report is flushed and closed
    with open(html_out_file, 'wt') as html_out:
        html_out.write("<!doctype html>\n")
        html_out.write("<html lang=\"en\">\n")
        html_out.write(
            "<head><meta charset=\"utf-8\"><title>Gene classification</title></head>\n"
        )
        html_out.write("<body>\n")

        html_out.write("<h3>Type 1 - Best ({0})</h3>\n".format(
            len(type1_best)))
        print_gene_list(html_out, type1_best, flanking_bases)

        html_out.write("<h3>Type 2 - Best ({0})</h3>\n".format(
            len(type2_best)))
        print_gene_list(html_out, type2_best, flanking_bases)

        html_out.write("<h3>Type 2 - Better ({0})</h3>\n".format(
            len(type2_better)))
        print_gene_list(html_out, type2_better, flanking_bases)

        html_out.write("<h3>Type 3 - Still better ({0})</h3>\n".format(
            len(type3_still_better)))
        print_gene_list(html_out, type3_still_better, flanking_bases)

        html_out.write("</body>\n")
        html_out.write("</html>\n")
def main():
    '''
    This script reports statistics on the areas of a genome where features aren't - introns and
    intergenic space.  Pass a valid GFF3 file and get a report like this:

    Molecule count: 9

    Gene count: 4171
    Intergenic space count: 4061
    Average intergenic space distance: 361.7 bp
    Median intergenic space distance: 245 bp
    Minimum intergenic space distance: 0 bp
    Maximum intergenic space distance: 6272 bp

    Intron count: 10533
    Intron space count: 989024
    Average intron size: 93.9 bp
    Median intron size: 63 bp
    Minimum intron size: 2 bp
    Maximum intron size: 1676 bp


    Optionally, you can pass the path to a PNG file to be created using the --histogram parameter,
    which will generate a size distribution histogram with two overlaying plots - one representing
    the distribution of intergenic region sizes and the other the intron lengths.  Because these
    can often have long tails, you can limit both the Y- and X-axes values with the --ylimit and
    --xlimit options, respectively.

    Genes are visited in sorted order on each molecule.  A gene that starts before the furthest
    end seen so far overlaps an earlier gene and contributes no intergenic space; the furthest
    end is remembered so that a gene nested inside a larger one cannot create a bogus gap.

    When the annotation has no intergenic spaces or no introns, the corresponding statistics
    are replaced by a short notice instead of failing on an empty list.
    '''
    parser = argparse.ArgumentParser( description='Reports statistics on the intron sizes and intergenic distances of a GFF3 annotation.')

    ## output file to be written
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='GFF3 file of a reference annotation' )
    parser.add_argument('-g', '--histogram', type=str, required=False, help='Optional path to a histogram of intron/intergenic space size distribution to be created (PNG)' )
    parser.add_argument('-x', '--xlimit', type=int, required=False, help='Use this if you want to limit the X-axis of the histogram (feature length)' )
    parser.add_argument('-y', '--ylimit', type=int, required=False, help='Use this if you want to limit the Y-axis of the histogram (feature count)' )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_gff3 )

    ## things to keep stats on and report
    total_molecule_count = len(assemblies)
    total_gene_count = 0

    ## the number of intergenic spaces is NOT just the total genes N - 1, since there can be
    #   multiple molecules, genes can overlap, etc.
    intergenic_distances = list()
    intron_sizes = list()

    ############################
    ## Calculation section
    ############################

    for asm_id in assemblies:
        assembly = assemblies[asm_id]
        genes = assembly.genes()
        total_gene_count += len(genes)

        ## location of the gene reaching furthest along this molecule so far
        last_gene_loc = None

        for gene in sorted(genes):
            gene_loc = gene.location_on(assembly)

            ## only a gene starting at or after the furthest end seen so far
            #   leaves an intergenic space in front of it
            if last_gene_loc is not None and gene_loc.fmin >= last_gene_loc.fmax:
                intergenic_distances.append(gene_loc.fmin - last_gene_loc.fmax)

            for mRNA in gene.mRNAs():
                for intron in sorted(mRNA.introns( on=assembly )):
                    intron_loc = intron.location_on(assembly)
                    intron_size = intron_loc.fmax - intron_loc.fmin

                    if intron_size < 0:
                        print("WARN: Intron size ({1}) < 0 reported in gene {0}".format(gene.id, intron_size))

                    intron_sizes.append(intron_size)

            ## never move the reference point backwards: a gene nested inside the
            #   previous one must not shorten the covered range
            if last_gene_loc is None or gene_loc.fmax > last_gene_loc.fmax:
                last_gene_loc = gene_loc

    total_intergenic_space_count = len(intergenic_distances)
    total_intergenic_space_residues = sum(intergenic_distances)
    intergenic_distances.sort()

    total_intron_count = len(intron_sizes)
    total_intron_residues = sum(intron_sizes)
    intron_sizes.sort()

    ############################
    ## Reporting section
    ############################

    print("\nMolecule count: {0}\n".format(total_molecule_count))
    print("Gene count: {0}".format(total_gene_count) )
    print("Intergenic space count: {0}".format(total_intergenic_space_count) )

    if intergenic_distances:
        avg_intergenic_space_dist = total_intergenic_space_residues / total_intergenic_space_count
        ## for an even number of values this is the upper of the two middle ones
        median_int_space_dist = intergenic_distances[ int(len(intergenic_distances)/2) ]

        print("Average intergenic space distance: {0:.1f} bp".format(avg_intergenic_space_dist) )
        print("Median intergenic space distance: {0} bp".format(median_int_space_dist) )
        print("Minimum intergenic space distance: {0} bp".format(intergenic_distances[0]) )
        print("Maximum intergenic space distance: {0} bp\n".format(intergenic_distances[-1]) )
    else:
        print("No intergenic spaces found, so no distance statistics can be reported\n")

    print("Intron count: {0}".format(total_intron_count) )
    print("Intron space count: {0}".format(total_intron_residues) )

    if intron_sizes:
        avg_intron_size = total_intron_residues / total_intron_count
        median_intron_size = intron_sizes[int(len(intron_sizes)/2)]

        print("Average intron size: {0:.1f} bp".format(avg_intron_size) )
        print("Median intron size: {0} bp".format(median_intron_size) )
        print("Minimum intron size: {0} bp".format(intron_sizes[0]) )
        print("Maximum intron size: {0} bp\n".format(intron_sizes[-1]) )
    else:
        print("No introns found, so no size statistics can be reported\n")

    ############################
    ## Graphics section (optional)
    ############################
    if args.histogram is not None:
        ## imported here so matplotlib is only needed when a plot is requested
        import matplotlib.pyplot as plt

        plt.xlabel('length (bp)')
        plt.ylabel('count')
        plt.title('Distribution of intron size and intergenic distances')
        plt.hist(intergenic_distances, bins=50, histtype='stepfilled', color='b', label='Intergenic distances' )
        plt.hist(intron_sizes, bins=50, histtype='stepfilled', color='r', alpha=0.5, label='Intron sizes' )

        if args.xlimit is not None:
            plt.xlim([0, args.xlimit])

        if args.ylimit is not None:
            plt.ylim([0, args.ylimit])

        plt.legend(loc='best')
        plt.savefig(args.histogram)
# Example no. 34
# 0
def main():
    """Create NCBI submission files (<output_base>.tbl and <output_base>.fna) from a GFF3 file.

    The annotation is read from --input_file, with the genomic sequence taken
    from --genomic_fasta when that option is given.  Unless an assembly ID
    already starts with 'gnl|WGS:', every assembly is renamed to
    'gnl|WGS:<prefix>|SeqID|gb|<prefix>01<six-digit counter>' using
    --ncbi_acc_prefix.  The feature table is written by
    biocodetbl.print_tbl_from_assemblies() and the sequences by
    biothings.AssemblySet.write_fasta().

    The ID format is decided before anything is renamed.  Previously the scan
    renamed assemblies as it went and stopped at the first pre-formatted ID,
    which could leave already-visited assemblies carrying a new ID while
    still being stored under their old key.
    """
    parser = argparse.ArgumentParser(
        description='Create a TBL file for submission to NCBI from GFF3')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o',
                        '--output_base',
                        type=str,
                        required=True,
                        help='Base name of output files to be created')
    parser.add_argument(
        '-ln',
        '--lab_name',
        type=str,
        required=True,
        help='Required by NCBI to identify the submitting group')
    parser.add_argument('-nap',
                        '--ncbi_acc_prefix',
                        type=str,
                        required=True,
                        help='Required and assigned by NCBI')
    parser.add_argument(
        '-gf',
        '--genomic_fasta',
        type=str,
        required=False,
        help='FASTA file of genomic sequence, if not embedded in GFF')
    parser.add_argument(
        '-go',
        '--go_obo',
        type=str,
        required=False,
        help=
        'GO terms will not be exported unless you pass the path to a GO OBO file'
    )

    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    if args.genomic_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genomic_fasta)

    ## We need to first check the ID format.  Pre-formatted IDs are like this:
    ##   gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    ## A single pre-formatted ID means the whole set is left as it is.
    reformat_IDs = not any(
        asm_id.startswith('gnl|WGS:') for asm_id in assemblies)

    if reformat_IDs:
        ## re-key old IDs (like tp.assembly.567468735.1) under new ones
        new_assemblies = dict()

        for asm_num, asm_id in enumerate(assemblies, start=1):
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(
                args.ncbi_acc_prefix, asm_num)
            assembly = assemblies[asm_id]
            assembly.id = new_id
            new_assemblies[new_id] = assembly

        assemblies = new_assemblies

    # the context manager makes sure the table is flushed and closed
    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        biocodetbl.print_tbl_from_assemblies(assemblies=assemblies,
                                             ofh=ofh,
                                             go_obo=args.go_obo,
                                             lab_name=args.lab_name)

    mset = biothings.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
def _feature_key(asm_id, loc):
    """Return the 'assembly:fmin:fmax:strand' string identifying a feature by position."""
    return "{0}:{1}:{2}:{3}".format(asm_id, loc.fmin, loc.fmax, loc.strand)


def _split_feature_key(key):
    """Split a feature key into (assembly id, start, stop, strand string).

    The key is split from the right so that an assembly ID which itself
    contains ':' (e.g. 'gnl|WGS:XXXX|...') stays intact.
    """
    chrom, start, stop, strand = key.rsplit(':', 3)
    return chrom, int(start), int(stop), strand


def _percent(numerator, denominator):
    """Return numerator/denominator as a percentage, or 0.0 for a zero denominator."""
    if denominator == 0:
        return 0.0
    return (numerator / denominator) * 100


def _collect_feature_keys(asm_id, assembly, genes, gene_keys, exon_keys):
    """Record the unique gene and exon keys found on one assembly.

    gene_keys and exon_keys are insertion-ordered mappings used as ordered
    sets; they are updated in place.  Returns the set of exon keys seen on
    this assembly.
    """
    assembly_exons = set()
    for gene in genes:
        gene_keys.setdefault(_feature_key(asm_id, gene.location_on(assembly)))
        for mrna in sorted(gene.mRNAs()):
            for exon in sorted(mrna.exons()):
                exon_key = _feature_key(asm_id, exon.location_on(assembly))
                exon_keys.setdefault(exon_key)
                assembly_exons.add(exon_key)
    return assembly_exons


def _gene_exon_keys(asm_id, assembly, gene):
    """Return the set of keys of every exon of every mRNA of a gene."""
    return set(_feature_key(asm_id, exon.location_on(assembly))
               for mrna in gene.mRNAs()
               for exon in mrna.exons())


def _find_true_genes(asm_id, assembly_1, genes_1, assembly_2, genes_2, gene_true):
    """Add to gene_true the key of each predicted gene whose exons match the known ones.

    genes_1 (known) and genes_2 (predicted) must already be sorted.
    """
    for gene_2 in genes_2:
        gene_2_loc = gene_2.location_on(assembly_2)
        gene_key = _feature_key(asm_id, gene_2_loc)

        # A gene at the same position was already counted as true.
        if gene_key in gene_true:
            continue

        predicted_exons = None
        # NOTE(review): known exons accumulate over every overlapping known
        # gene visited before a match, exactly as the previous implementation
        # did; confirm whether a per-known-gene comparison was intended.
        known_exons = set()

        for gene_1 in genes_1:
            if gene_1.location_on(assembly_1).strand != gene_2_loc.strand:
                continue
            if not gene_2.overlaps_with(gene_1):
                continue

            if predicted_exons is None:
                predicted_exons = _gene_exon_keys(asm_id, assembly_2, gene_2)
            known_exons.update(_gene_exon_keys(asm_id, assembly_1, gene_1))

            if known_exons == predicted_exons:
                gene_true.add(gene_key)
                break


def _write_exon_bed(path, exon_keys):
    """Write one 6-column BED line per exon key (name 'exon', score 0)."""
    with open(path, 'w') as bed:
        for key in exon_keys:
            chrom, start, stop, strand = _split_feature_key(key)
            bed_strand = "+" if strand == "1" else "-"
            bed.write("{0}\t{1}\t{2}\texon\t0\t{3}\n".format(chrom, start, stop, bed_strand))


def _run_bedtools(arguments, out_path):
    """Run 'bedtools <arguments>' with stdout captured in out_path.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are safe.  Raises
    subprocess.CalledProcessError when bedtools exits with a non-zero status.
    """
    import subprocess

    with open(out_path, 'w') as out:
        subprocess.check_call(['bedtools'] + arguments, stdout=out)


def _sum_bed_lines(path, value_of):
    """Sum value_of(columns) over every tab-separated line of a file."""
    total = 0
    with open(path, 'r') as bed:
        for line in bed:
            total += value_of(line.split("\t"))
    return total


def _stat_row(label, known, predicted, true_predicted):
    """Format one 'Feature Known Predicted True_Predicted SN PPV' table row."""
    sensitivity = _percent(true_predicted, known)
    ppv = _percent(true_predicted, predicted)
    return "\t".join([label, str(known), str(predicted), str(true_predicted),
                      str(sensitivity), str(ppv)])


def _group_by_chrom_strand(keys):
    """Map (assembly id, strand) to the (start, stop) pairs of keys, in input order."""
    groups = {}
    for key in keys:
        chrom, start, stop, strand = _split_feature_key(key)
        groups.setdefault((chrom, strand), []).append((start, stop))
    return groups


def _overlap_percentages(key, target_groups):
    """Return, for each target overlapping key, the percent of the target that is covered.

    Only targets on the same assembly and strand are considered.  Scanning
    stops at the first such target starting beyond the end of key, which
    relies on the targets being ordered by start position.
    """
    chrom, start, stop, strand = _split_feature_key(key)
    percentages = []
    for t_start, t_stop in target_groups.get((chrom, strand), ()):
        if t_start > stop:
            break
        if start <= t_stop:
            bounds = sorted((t_start, t_stop, start, stop))
            percentages.append(_percent(bounds[2] - bounds[1], t_stop - t_start))
    return percentages


def _classify_by_overlap(query_keys, target_keys):
    """Bucket each query gene by how many target genes it overlaps.

    Returns a dict of lists keyed 'none' (no overlap), 'single' (exactly one),
    'multi' (several, at least two of them covered by >= 50%) and 'altered'
    (several, fewer than two covered by >= 50%).
    """
    target_groups = _group_by_chrom_strand(target_keys)
    buckets = {'none': [], 'single': [], 'multi': [], 'altered': []}
    for key in query_keys:
        percentages = _overlap_percentages(key, target_groups)
        if not percentages:
            buckets['none'].append(key)
        elif len(percentages) == 1:
            buckets['single'].append(key)
        elif sum(1 for pct in percentages if pct >= 50) >= 2:
            buckets['multi'].append(key)
        else:
            buckets['altered'].append(key)
    return buckets


def process_files(args):
    """Compare a predicted annotation with a known one and report SN/PPV statistics.

    args must provide:
      annotation_1 -- path to the GFF3 of the known (reference) annotation
      annotation_2 -- path to the GFF3 of the predicted annotation
      output_dir   -- existing directory that receives all output files

    Gene, exon and base level counts with sensitivity (SN) and positive
    predictive value (PPV) are printed and written to summary.txt, followed by
    counts of new / merged / altered / single / split / missing genes.  The
    gene keys behind each count are written to pred_*.txt, known_*.txt and
    true_gene.temp7.txt.  Transcript-level statistics are not computed.

    Base-level numbers come from 'bedtools merge' and 'bedtools intersect',
    so bedtools must be on the PATH.  Exits via sys.exit() when output_dir
    does not exist; raises subprocess.CalledProcessError if bedtools fails.
    """
    from collections import OrderedDict

    # Check the output directory before doing any work or creating any file.
    if not os.path.exists(args.output_dir):
        sys.exit("Directory does not exist.")

    assemblies_1 = biocodegff.get_gff3_features(args.annotation_1)[0]
    assemblies_2 = biocodegff.get_gff3_features(args.annotation_2)[0]

    # Ordered sets of unique feature keys: a_* = known, p_* = predicted.
    a_gene = OrderedDict()
    a_exons = OrderedDict()
    p_gene = OrderedDict()
    p_exons = OrderedDict()

    true_exons = set()      # exon keys present in both annotations
    gene_true = set()       # predicted gene keys whose exons match the known ones
    shared_ids = set()      # assemblies present in both annotations

    for asm_id in assemblies_1:
        assembly_1 = assemblies_1[asm_id]
        genes_1 = sorted(assembly_1.genes())
        anno_exons = _collect_feature_keys(asm_id, assembly_1, genes_1, a_gene, a_exons)

        # Assembly absent from the predicted annotation: nothing to compare.
        if asm_id not in assemblies_2:
            continue

        shared_ids.add(asm_id)
        assembly_2 = assemblies_2[asm_id]
        genes_2 = sorted(assembly_2.genes())
        pred_exons = _collect_feature_keys(asm_id, assembly_2, genes_2, p_gene, p_exons)

        true_exons.update(pred_exons & anno_exons)
        _find_true_genes(asm_id, assembly_1, genes_1, assembly_2, genes_2, gene_true)

    # Predicted features on assemblies missing from the known annotation still
    # count towards the predicted totals.
    for asm_id in assemblies_2:
        if asm_id in shared_ids:
            continue
        assembly_2 = assemblies_2[asm_id]
        _collect_feature_keys(asm_id, assembly_2, sorted(assembly_2.genes()), p_gene, p_exons)

    out_dir = args.output_dir
    exon_1_bed = os.path.join(out_dir, 'exon_1.bed')
    exon_2_bed = os.path.join(out_dir, 'exon_2.bed')
    exon_1_merged = os.path.join(out_dir, 'exon_1_merged.bed')
    exon_2_merged = os.path.join(out_dir, 'exon_2_merged.bed')
    exon_intersect = os.path.join(out_dir, 'exon_1_2_intersect.bed')

    _write_exon_bed(exon_2_bed, p_exons)
    _run_bedtools(['merge', '-nms', '-scores', 'sum', '-i', exon_2_bed, '-s'], exon_2_merged)

    _write_exon_bed(exon_1_bed, a_exons)
    _run_bedtools(['merge', '-nms', '-scores', 'sum', '-i', exon_1_bed, '-s'], exon_1_merged)

    _run_bedtools(['intersect', '-s', '-wo', '-a', exon_1_merged, '-b', exon_2_merged],
                  exon_intersect)

    a_base = _sum_bed_lines(exon_1_merged, lambda cols: int(cols[2]) - int(cols[1]))
    p_base = _sum_bed_lines(exon_2_merged, lambda cols: int(cols[2]) - int(cols[1]))
    # Column 13 of the intersect output is read as the overlap length.
    true_base = _sum_bed_lines(exon_intersect, lambda cols: int(cols[12]))

    header = "Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n"
    rows = [
        _stat_row("Gene", len(a_gene), len(p_gene), len(gene_true)),
        _stat_row("Exon", len(a_exons), len(p_exons), len(true_exons)),
        _stat_row("Base", a_base, p_base, true_base),
    ]

    print(header)
    for row in rows:
        print(row)

    predicted = _classify_by_overlap(p_gene, a_gene)
    known = _classify_by_overlap(a_gene, p_gene)

    counts = [
        ("1. No. of predicted gene overlapping  0 known gene (new gene): ", len(predicted['none'])),
        ("2. No. of predicted gene overlapping > 1 known gene by at least 50%: ", len(predicted['multi'])),
        ("3. No. of altered predicted gene: ", len(predicted['altered'])),
        ("4. No. of predicted gene overlaping 1 known gene : ", len(predicted['single'])),
        ("5. No. of known gene overlapping > 1 predicted gene by at least 50% : ", len(known['multi'])),
        ("6. No. of altered known gene: ", len(known['altered'])),
        ("7. No. of known gene overlapping 1 predicted gene : ", len(known['single'])),
        ("8. No. of known gene overlapping 0 predicted gene (gene missing) : ", len(known['none'])),
    ]
    for label, value in counts:
        print(label, value)

    with open(os.path.join(out_dir, 'summary.txt'), 'w') as fout:
        fout.write(header)
        for row in rows:
            fout.write(row + "\n")
        fout.write("\n")
        for label, value in counts:
            fout.write(label + str(value) + "\n")

    gene_lists = [
        ('true_gene.temp7.txt', sorted(gene_true)),
        ('pred_new.txt', predicted['none']),
        ('pred_merged.txt', predicted['multi']),
        ('pred_1.txt', predicted['single']),
        ('known_split.txt', known['multi']),
        ('known_1.txt', known['single']),
        ('known_missed.txt', known['none']),
        ('pred_altered.txt', predicted['altered']),
        ('known_altered.txt', known['altered']),
    ]
    for file_name, gene_keys in gene_lists:
        with open(os.path.join(out_dir, file_name), 'w') as listing:
            for gene_key in gene_keys:
                listing.write(gene_key + "\n")

    # Clean up only the temporary BED files created above, leaving any other
    # .bed files in the output directory untouched.
    for temp_path in (exon_1_bed, exon_2_bed, exon_1_merged, exon_2_merged, exon_intersect):
        if os.path.exists(temp_path):
            os.remove(temp_path)
# Example no. 36
# 0
def main():
    """Report which gene models are shared between GeneMark-ES, CEGMA and AAT.

    Reads three hard-coded GFF3 files from the current directory, compares
    their genes and writes one gene ID per line to:

      gmes_cegma.shared.ids       -- GeneMark-ES genes with the same coordinates
                                     and exon structure as a CEGMA gene
      gmes_aat.shared.ids         -- GeneMark-ES genes sharing exon structure
                                     with an AAT gene (stop_tolerant=True)
      gmes_aat_cegma.shared.ids   -- GeneMark-ES genes in both sets above
      training_gene.ids           -- the same three-way agreement set
      cegma_aat_nogmes.shared.ids -- CEGMA genes with no GeneMark-ES match but
                                     with an AAT match

    A count for each comparison is printed to STDOUT.
    """
    gm_es_file = 'genemark_hmm.gff3'
    cegma_file = 'output.cegma.gff3'
    # Alternative AAT inputs used previously:
    #   bail_training_genes.aat.1500maxintron.80percid.gff3, aat.merged.gff3
    aat_file = 'aat.bail_hominis_filtered_training.gff3'

    print("INFO: parsing Genemark-ES data")
    (assemblies, gm_es_features) = biocodegff.get_gff3_features(gm_es_file)
    gm_es_genes = get_genes_from_dict(gm_es_features)
    print("\tINFO: Got {0} Genemark-ES genes".format(len(gm_es_genes)))

    # The assemblies from the first parse are passed on to the later parses.
    print("INFO: parsing CEGMA data")
    (assemblies, cegma_features) = biocodegff.get_gff3_features(
        cegma_file, assemblies=assemblies)
    cegma_genes = get_genes_from_dict(cegma_features)
    print("\tINFO: Got {0} CEGMA genes".format(len(cegma_genes)))

    print("INFO: parsing AAT results")
    (assemblies, aat_muris_features) = biocodegff.get_gff3_features(
        aat_file, assemblies=assemblies)
    aat_genes = get_genes_from_dict(aat_muris_features)
    print("\tINFO: Got {0} AAT 'genes'".format(len(aat_genes)))

    # GeneMark-ES genes identical (coordinates + exon structure) to a CEGMA gene
    genemark_cegma_shared_genes = list()
    with open('gmes_cegma.shared.ids', 'wt') as gmes_cegma_fh:
        for gm_es_gene in gm_es_genes:
            for cegma_gene in cegma_genes:
                if (gm_es_gene.has_same_coordinates_as(thing=cegma_gene)
                        and gm_es_gene.shares_exon_structure_with(thing=cegma_gene)):
                    genemark_cegma_shared_genes.append(gm_es_gene)
                    gmes_cegma_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("\n{0} genes were shared perfectly between Genemark-ES and CEGMA".
          format(len(genemark_cegma_shared_genes)))

    # GeneMark-ES genes with an AAT match
    genemark_aat_shared_genes = list()
    with open('gmes_aat.shared.ids', 'wt') as gmes_aat_fh:
        for gm_es_gene in gm_es_genes:
            for aat_gene in aat_genes:
                if gm_es_gene.shares_exon_structure_with(thing=aat_gene,
                                                         stop_tolerant=True):
                    genemark_aat_shared_genes.append(gm_es_gene)
                    gmes_aat_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("{0} Genemark-ES genes had an exact AAT match".format(
        len(genemark_aat_shared_genes)))

    # CEGMA genes matching a GeneMark-ES gene, and the subset of those
    # GeneMark-ES genes that also had an AAT match
    cegma_matching_gm_es = list()
    genemark_aat_cegma_shared_genes = list()
    with open('gmes_aat_cegma.shared.ids', 'wt') as gmes_aat_cegma_fh:
        for cegma_gene in cegma_genes:
            for gm_es_gene in gm_es_genes:
                if not (cegma_gene.has_same_coordinates_as(thing=gm_es_gene)
                        and cegma_gene.shares_exon_structure_with(thing=gm_es_gene)):
                    continue

                cegma_matching_gm_es.append(cegma_gene)

                if (gm_es_gene in genemark_aat_shared_genes
                        and gm_es_gene not in genemark_aat_cegma_shared_genes):
                    genemark_aat_cegma_shared_genes.append(gm_es_gene)
                    gmes_aat_cegma_fh.write("{0}\n".format(gm_es_gene.id))

                # only the first matching GeneMark-ES gene is considered
                break

    print("{0} genes with GeneMark-ES, CEGMA and AAT agreement".format(
        len(genemark_aat_cegma_shared_genes)))

    with open('training_gene.ids', 'wt') as training_fh:
        for gene in genemark_aat_cegma_shared_genes:
            training_fh.write("{0}\n".format(gene.id))

    # CEGMA genes without a GeneMark-ES match that still have an AAT match
    cegma_with_aat_not_gm_es = list()
    with open('cegma_aat_nogmes.shared.ids', 'wt') as cegma_aat_nogmes_fh:
        for cegma_gene in cegma_genes:
            if cegma_gene in cegma_matching_gm_es:
                continue

            for aat_gene in aat_genes:
                if cegma_gene.shares_exon_structure_with(thing=aat_gene,
                                                         stop_tolerant=True):
                    cegma_with_aat_not_gm_es.append(cegma_gene)
                    cegma_aat_nogmes_fh.write("{0}\n".format(cegma_gene.id))
                    break

    print("{0} CEGMA genes had no GeneMark-ES match but did have an AAT one".
          format(len(cegma_with_aat_not_gm_es)))
# Example no. 37
# 0
def main():
    """Export the protein or CDS sequence of every mRNA in a GFF3 file as FASTA.

    Command-line options:
      -i/--input_file   GFF3 file to read (required)
      -o/--output_file  FASTA file to create; STDOUT when omitted
      -t/--type         'protein' (default) or 'cds'
      -f/--fasta        FASTA with the assembly sequences, when they are not
                        embedded in the GFF3
      --check_ends      warn on STDERR about start/stop codons that are not in
                        the lists defined below

    Each record is headed by the mRNA's locus_tag when it has one, otherwise
    by its ID, followed by the first polypeptide product name found (if any).
    """
    parser = argparse.ArgumentParser( description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GFF3 file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output FASTA file to be created' )
    parser.add_argument('-t', '--type', type=str, required=False, default='protein', choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option' )
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## parse the input first so that a bad input does not leave an empty output file behind
    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    ## add sequence residues from external FASTA file if the user passed one
    if args.fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.fasta)

    # only doing the standard codon table for now
    start_codons = ['ATG', 'GTG', 'TTG']
    stop_codons  = ['TAG', 'TAA', 'TGA']

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():

                    ## the locus tag, when available, takes precedence over the mRNA ID
                    export_id = mRNA.id
                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## Add the gene product name if there is one
                    export_header = None
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation
                        if annotation is not None and annotation.product_name is not None:
                            export_header = annotation.product_name
                            break

                    if export_header is None:
                        fout.write(">{0}\n".format(export_id))
                    else:
                        fout.write(">{0} {1}\n".format(export_id, export_header))

                    coding_seq = mRNA.get_CDS_residues(for_translation=True)

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, mRNA.id))

                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        export_seq = coding_seq
                    else:
                        export_seq = biocodeutils.translate(coding_seq)

                    fout.write("{0}\n".format(biocodeutils.wrapped_fasta(export_seq)))
    finally:
        ## never close STDOUT, only a file this function opened
        if fout is not sys.stdout:
            fout.close()
def main():
    """Select a random set of transcripts following an exon-complexity profile.

    The profile maps a CDS count to the percentage of the requested total that
    should have that many CDS features.  Each profile bin is filled by random
    sampling; whatever is still missing from --count afterwards is drawn
    randomly from the reservoir (mRNAs whose CDS count is not in the profile,
    plus those left unselected in their bin).

    Command-line options:
      -i/--input_file    GFF3 file to read (required)
      -o/--output_file   ID list file to create (required)
      -ni/--not_included optional file receiving the parent IDs of the mRNAs
                         left in the reservoir
      -c/--count         number of transcripts to pull (required)
      -e/--exclude       optional file of mRNA IDs to skip, one per line
    """
    parser = argparse.ArgumentParser(
        description=
        'Generates a set of transcripts based on a user-defined exon-complexity profile'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Output ID list file to create')
    parser.add_argument('-ni',
                        '--not_included',
                        type=str,
                        required=False,
                        help='Writes the ID list of genes not included')
    parser.add_argument('-c',
                        '--count',
                        type=int,
                        required=True,
                        help='Count of transcripts to pull')
    parser.add_argument('-e',
                        '--exclude',
                        type=str,
                        required=False,
                        help='List of IDs to exclude')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    exclude = set()
    if args.exclude is not None:
        with open(args.exclude) as exclude_fh:
            for line in exclude_fh:
                exclude.add(line.rstrip())

    # profile for 99-892
    #profile = { 1:12.2, 2:13.2, 3:15.4, 4:14.1, 5:11.5, 6:8.77, 7:6.99, 8:4.74, 9:3.81, 10:2.45 }

    # profile for 99-880
    #profile = { 1:19.7, 2:17.9, 3:20.6, 4:15.6, 5:10.1, 6:6.62, 7:4.14, 8:1.58, 9:2.07 }

    # c. hominis TU502
    #profile = { 1:89.7, 2:6.8, 3:2.6, 4:0.78 }

    # c. baileyi TAMU 10GZ1
    #profile = { 1:85.6 , 2:9.85 , 3:4.06 , 4:0.25 , 5:0.25 }
    profile = {1: 85.6, 2: 9.85, 3: 4.06}

    mRNAs = dict()
    unselected_mRNAs = dict()
    target = dict()
    selected = dict()

    for CDS_count in profile:
        # truncation means the bin targets can add up to less than --count;
        # the shortfall is filled from the reservoir further down
        target[CDS_count] = math.trunc(args.count * (profile[CDS_count] / 100))
        selected[CDS_count] = list()
        mRNAs[CDS_count] = list()
        unselected_mRNAs[CDS_count] = list()

    # fill the bins of each target size, then fill a reservoir to select any additional ones from
    reservoir = list()
    total_mRNAs_selected = 0

    for assembly_id in assemblies:
        for gene in assemblies[assembly_id].genes():
            for mRNA in gene.mRNAs():
                if mRNA.id in exclude:
                    continue

                CDS_count = mRNA.CDS_count()

                if CDS_count in profile:
                    mRNAs[CDS_count].append(mRNA)
                else:
                    reservoir.append(mRNA)

    for CDS_count in profile:
        # make sure this many were found
        if target[CDS_count] <= len(mRNAs[CDS_count]):
            selected[CDS_count] = random.sample(mRNAs[CDS_count],
                                                target[CDS_count])
        else:
            print(
                "WARN: Not enough mRNAs of length {0} to meet profile request".
                format(CDS_count))
            selected[CDS_count] = mRNAs[CDS_count]

        total_mRNAs_selected += len(selected[CDS_count])

        # whatever was not picked in this bin becomes available to the reservoir
        unselected_mRNAs[CDS_count] = list(
            set(mRNAs[CDS_count]) - set(selected[CDS_count]))
        reservoir.extend(unselected_mRNAs[CDS_count])

    ofh = open(args.output_file, 'wt')
    ni_ofh = None

    try:
        if args.not_included is not None:
            ni_ofh = open(args.not_included, 'wt')

        print("INFO: selected CDS profile:")
        for ccount in sorted(selected):
            gathered = len(selected[ccount])

            # a bin whose truncated target is 0 has no meaningful ratio; report 0.0
            if target[ccount] == 0:
                target_perc = 0.0
            else:
                target_perc = gathered / target[ccount]

            print("CDS_count:{0}, target:{3}, gathered:{1}, unselected:{4}, target_perc:{2}".format(
                ccount, gathered, target_perc, target[ccount],
                len(unselected_mRNAs[ccount])))

            for mRNA in selected[ccount]:
                ofh.write("{0}\n".format(mRNA.parent.id))

        print("Total selected according to profile: {0}".format(
            total_mRNAs_selected))

        # now, from the rounding portions fill the rest randomly
        remaining = args.count - total_mRNAs_selected

        # random.sample() raises ValueError when asked for more than is available
        if remaining > len(reservoir):
            print("WARN: Only {0} mRNAs left to fill the remaining {1} requested".format(
                len(reservoir), remaining))
            remaining = len(reservoir)

        sample_from_reservoir = random.sample(reservoir, remaining)

        # NOTE(review): profile selections above are written as parent IDs while
        # reservoir selections are written as mRNA IDs; kept as it was, but
        # confirm which one the output list is meant to contain.
        for mRNA in sample_from_reservoir:
            ofh.write("{0}\n".format(mRNA.id))
            reservoir.remove(mRNA)

        total_mRNAs_selected += len(sample_from_reservoir)

        print("Total selected randomly afterwards: {0}".format(
            len(sample_from_reservoir)))

        if ni_ofh is not None:
            for mRNA in reservoir:
                ni_ofh.write("{0}\n".format(mRNA.parent.id))
    finally:
        ofh.close()
        if ni_ofh is not None:
            ni_ofh.close()
# Example no. 39
# 0
class _AnnotationTally:
    """Unique gene and exon/CDS coordinate keys gathered from one annotation.

    The lists keep first-seen order (the exon list is later handed to
    base_comparison()); the companion sets make the "already seen?" test
    O(1) instead of a linear scan of the list.
    """

    def __init__(self):
        # Unique gene coordinate keys, in first-seen order.
        self.gene_cords = []
        # Unique exon/CDS coordinate keys, in first-seen order.
        self.exon_cords = []
        # One "asm:gene_id:start:stop:strand" label per unique gene.
        self.gene_spans = []
        self._seen_genes = set()
        self._seen_exons = set()

    def add_exon(self, cord):
        """Remember an exon/CDS coordinate key the first time it is seen."""
        if cord not in self._seen_exons:
            self._seen_exons.add(cord)
            self.exon_cords.append(cord)

    def add_gene(self, cord, span):
        """Remember a gene key and its span label the first time it is seen."""
        if cord not in self._seen_genes:
            self._seen_genes.add(cord)
            self.gene_cords.append(cord)
            self.gene_spans.append(span)


def _mrna_features(mrna, feature):
    """Return the exon or CDS children of ``mrna`` selected by ``feature``."""
    if feature == "Exon":
        return mrna.exons()
    if feature == "CDS":
        return mrna.CDSs()
    raise ValueError(
        "Unsupported feature type {0!r}; expected 'Exon' or 'CDS'".format(feature))


def _tally_assembly(tally, asm_id, assembly, genes, feature):
    """Add every gene of one assembly to ``tally``; return its exon/CDS keys.

    ``genes`` is iterated in the order given.  The returned set holds the
    coordinate key of every exon/CDS found on this assembly, duplicates
    included (i.e. independent of what ``tally`` had already seen).
    """
    asm_exon_cords = set()

    for gene in genes:
        gene_loc = gene.location_on(assembly)
        gene_cord = cordinate(asm_id, gene_loc)

        # Extent covered by the gene's exons/CDSs across all of its mRNAs.
        span_start = None
        span_stop = None

        for mrna in sorted(gene.mRNAs()):
            for feat in sorted(_mrna_features(mrna, feature)):
                feat_loc = feat.location_on(assembly)
                feat_cord = cordinate(asm_id, feat_loc)
                asm_exon_cords.add(feat_cord)
                tally.add_exon(feat_cord)

                if span_start is None:
                    span_start = feat_loc.fmin
                    span_stop = feat_loc.fmax
                else:
                    span_start = min(span_start, feat_loc.fmin)
                    span_stop = max(span_stop, feat_loc.fmax)

        # A gene without any exon/CDS falls back to its own coordinates.
        if span_start is None:
            span_start = gene_loc.fmin
            span_stop = gene_loc.fmax

        span = asm_id + ":" + gene.id + ":" + str(span_start) + ":" \
            + str(span_stop) + ":" + str(gene_loc.strand)
        tally.add_gene(gene_cord, span)

    return asm_exon_cords


def _gene_feature_cords(gene, assembly, asm_id, feature):
    """Return the coordinate keys of all exons/CDSs of all mRNAs of ``gene``."""
    cords = set()
    for mrna in gene.mRNAs():
        for feat in _mrna_features(mrna, feature):
            cords.add(cordinate(asm_id, feat.location_on(assembly)))
    return cords


def _record_true_genes(asm_id, assembly_1, genes_1, assembly_2, genes_2,
                       feature, gene_true, true_file):
    """Find predicted genes whose exon/CDS set equals that of a known gene.

    Matches are added to ``gene_true`` (keyed by the predicted gene's
    coordinate key) and written to ``true_file`` as "known_id<TAB>predicted_id".
    """
    # Strand of each known gene is loop-invariant; look it up once.
    known_strands = [(gene_1, gene_1.location_on(assembly_1).strand)
                     for gene_1 in genes_1]

    for gene_2 in genes_2:
        gene_2_loc = gene_2.location_on(assembly_2)
        gene_2_cord = cordinate(asm_id, gene_2_loc)

        # A predicted gene at these coordinates was already counted as true.
        if gene_2_cord in gene_true:
            continue

        pred_cords = None   # computed lazily, only if some known gene overlaps
        # NOTE(review): this set is deliberately NOT reset per known gene, to
        # match the long-standing behaviour: exons of every overlapping known
        # gene examined so far accumulate.  That looks unintended (a later
        # exact match can be masked by an earlier partial overlap) -- confirm
        # before changing, since it affects the reported counts.
        known_cords = set()

        for gene_1, strand_1 in known_strands:
            if strand_1 != gene_2_loc.strand or not gene_2.overlaps_with(gene_1):
                continue

            if pred_cords is None:
                pred_cords = _gene_feature_cords(gene_2, assembly_2, asm_id, feature)
            known_cords |= _gene_feature_cords(gene_1, assembly_1, asm_id, feature)

            # Identical exon/CDS sets == exact intron/exon structure.
            if known_cords == pred_cords:
                gene_true.add(gene_2_cord)
                true_file.write(gene_1.id + "\t" + gene_2.id + "\n")
                break


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole`` (0.0 when ``whole`` is 0)."""
    if whole == 0:
        return 0.0
    return (part / whole) * 100


def process_files(args):
    """Compare a predicted GFF3 annotation against a known one.

    Reads ``args.annotation_1`` (known) and ``args.annotation_2``
    (predicted), counts unique genes and exons/CDSs (``args.feature`` is
    "Exon" or "CDS"), and reports sensitivity (SN) and positive predictive
    value (PPV) at gene, exon/CDS and base level, plus gene overlap
    statistics from compare_cds().  Results are printed and also written to
    ``args.output_dir`` as:

      * true_predicted_genes.txt - known/predicted ID pairs with identical
        exon/CDS structure
      * summary.txt              - SN/PPV table
      * final_stats.txt          - gene overlap statistics
      * true_pred.txt            - coordinate keys of the true predictions

    Exits with an error message if ``args.output_dir`` does not exist.
    Raises ValueError if ``args.feature`` is not "Exon" or "CDS" and an mRNA
    has to be inspected.
    """
    # Fail before any parsing or file creation if there is nowhere to write.
    if not os.path.exists(args.output_dir):
        sys.exit("Directory does not exist.")

    (assemblies_1, _) = biocodegff.get_gff3_features(args.annotation_1)
    (assemblies_2, _) = biocodegff.get_gff3_features(args.annotation_2)

    known = _AnnotationTally()
    predicted = _AnnotationTally()

    exon_pred_all = set()     # exon/CDS keys present in both annotations
    gene_true = set()         # coordinate keys of exactly-predicted genes
    shared_asm_ids = set()    # assemblies present in both annotations

    true_genes_path = os.path.join(args.output_dir, 'true_predicted_genes.txt')
    with open(true_genes_path, 'w') as true_file:
        true_file.write("Known\tPredicted\n")

        ## Iterate through each chromosome from the known ref annotation
        for asm_id in assemblies_1:
            assembly_1 = assemblies_1[asm_id]
            genes_1 = sorted(assembly_1.genes())
            anno_exons = _tally_assembly(known, asm_id, assembly_1, genes_1,
                                         args.feature)

            ## If the chromosome is not found in the predicted file, move on.
            assembly_2 = assemblies_2.get(asm_id)
            if assembly_2 is None:
                continue

            shared_asm_ids.add(asm_id)
            genes_2 = sorted(assembly_2.genes())
            pred_exons = _tally_assembly(predicted, asm_id, assembly_2, genes_2,
                                         args.feature)

            exon_pred_all.update(pred_exons.intersection(anno_exons))  # true exons

            _record_true_genes(asm_id, assembly_1, genes_1, assembly_2, genes_2,
                               args.feature, gene_true, true_file)

    ## Chromosomes found only in the predicted annotation still count towards
    ## the predicted totals.
    for asm_id in assemblies_2:
        if asm_id in shared_asm_ids:
            continue
        assembly_2 = assemblies_2[asm_id]
        _tally_assembly(predicted, asm_id, assembly_2,
                        sorted(assembly_2.genes()), args.feature)

    # Calculate SN/SP for bases
    (a_base_val, p_base_val, true_base) = base_comparison(predicted.exon_cords,
                                                          known.exon_cords)
    base_sn = _percent(true_base, a_base_val)
    base_sp = _percent(true_base, p_base_val)

    # Calculate SN/SP for exons
    annotated_exon = len(known.exon_cords)
    predicted_exon = len(predicted.exon_cords)
    true_pred_exon = len(exon_pred_all)
    exon_sn = _percent(true_pred_exon, annotated_exon)
    exon_sp = _percent(true_pred_exon, predicted_exon)

    # Calculate SN/SP for genes
    annotated_gene = len(known.gene_cords)
    predicted_gene = len(predicted.gene_cords)
    true_pred_gene = len(gene_true)
    gene_sn = _percent(true_pred_gene, annotated_gene)
    gene_sp = _percent(true_pred_gene, predicted_gene)

    summary_header = "Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n"
    summary_rows = [
        ("Gene", annotated_gene, predicted_gene, true_pred_gene, gene_sn, gene_sp),
        (args.feature, annotated_exon, predicted_exon, true_pred_exon, exon_sn, exon_sp),
        ("Base", a_base_val, p_base_val, true_base, base_sn, base_sp),
    ]
    summary_lines = ["\t".join(str(value) for value in row) for row in summary_rows]

    # print() adds its own newline, so the header is followed by a blank line
    # on STDOUT (kept as-is).
    print(summary_header)
    for line in summary_lines:
        print(line)

    with open(os.path.join(args.output_dir, 'summary.txt'), 'w') as fout:
        fout.write(summary_header)
        fout.write("\n".join(summary_lines) + "\n\n")

    a_cds = known.gene_spans
    p_cds = predicted.gene_spans

    arr_pred = compare_cds(p_cds, a_cds, "pred")
    arr_known = compare_cds(a_cds, p_cds, "known")
    arr_pred_same = compare_cds(p_cds, p_cds, "pred_same")

    new_gene = arr_pred[2]
    gene_merge = arr_pred[3]
    gene_found = arr_pred[0]
    gene_opp = arr_pred[1]
    gene_missing = arr_known[2]
    gene = arr_known[0]
    gene_opp_known = arr_known[1]
    gene_split = arr_known[3]
    gene_pred_overlap_opp = arr_pred_same[1]

    # (label printed to STDOUT, label written to final_stats.txt, value).
    # A few labels differ slightly between the two outputs; both spellings
    # are preserved so existing consumers of either output keep working.
    stats = [
        ("1. No. of known gene : ",
         "1. No. of known gene : ", len(a_cds)),
        ("2. No. of predicted gene : ",
         "2. No. of predicted gene : ", len(p_cds)),
        ("3. No. of predicted gene overlapping  0 known gene (new gene): ",
         "3. No. of predicted gene overlapping  0 known gene (new gene): ", new_gene),
        ("4. No. of predicted gene overlapping > 1 known gene (gene merge) : ",
         "4. No. of predicted gene overlapping > 1 known gene (gene merge) : ", gene_merge),
        ("5. No. of predicted gene overlaping 1 known gene : ",
         "5. No. of predicted gene overlaping 1 known gene : ", gene_found),
        ("6. No. of predicted gene overlapping >= 1 known gene in opp strand : ",
         "6. No. of predicted gene overlapping >= 1 known gene in opp strand : ", gene_opp),
        ("7. No. of predicted gene overlapping  1 known gene (exact intron/exon boundaries) : ",
         "7. No. of predicted gene overlapping  1 known gene (exact intron/exon boundary) : ", true_pred_gene),
        ("8. No. of predicted gene overlapping >= 1 predicted gene in opp strand : ",
         "8. No. of predicted gene overlapping >= 1  predicted gene in opp strand : ", gene_pred_overlap_opp),
        ("9. No. of known gene overlapping  0 predicted gene (gene missing): ",
         "9. No. of known gene overlapping  0 predicted gene (gene missing): ", gene_missing),
        ("10. No. of known gene overlapping > 1 predicted gene(gene split) : ",
         "10. No. of known gene overlapping > 1 predicted gene (gene_split): ", gene_split),
        ("11. No. of known gene overlaping 1 predicted gene : ",
         "11. No. of known gene overlaping 1 predicted gene : ", gene),
        ("12. No. of known gene overlapping >= 1 predicted gene in opp strand : ",
         "12. No. of known gene overlapping >= 1 predicted gene in opp strand : ", gene_opp_known),
    ]

    for console_label, _, value in stats:
        print(console_label, value)

    with open(os.path.join(args.output_dir, 'final_stats.txt'), 'w') as fout:
        for _, file_label, value in stats:
            fout.write(file_label + str(value) + "\n")

    with open(os.path.join(args.output_dir, 'true_pred.txt'), 'w') as fout_true:
        for true_gene in gene_true:
            fout_true.write(true_gene + "\n")

    # Clean up intermediate BED files (presumably left behind by
    # base_comparison() -- confirm).  os.remove() is used instead of a shell
    # "rm" so directory names with spaces or shell metacharacters are safe.
    delete_file = ['exon_1.bed', 'exon_2.bed', 'exon_1_merged.bed',
                   'exon_2_merged.bed', 'exon_1_2_intersect.bed']
    for f in delete_file:
        path = os.path.join(args.output_dir, f)
        try:
            os.remove(path)
        except OSError as err:
            # Same best-effort behaviour as "rm": report and carry on.
            print("WARN: could not remove {0}: {1}".format(path, err),
                  file=sys.stderr)
Ejemplo n.º 40
0
def main():
    """Build a consensus functional annotation from several evidence sources.

    Parses the command line, loads the polypeptides from the input FASTA,
    applies each evidence type whose list file was passed (HMM, BLAST /
    RAPSEARCH2 vs SWISS-PROT, TrEMBL, KEGG, UniRef100, TMHMM) and writes the
    result in the requested format to --output_file or STDOUT.  Optionally
    appends organism names to the output and writes an organism table.

    Raises Exception with an explanatory message when an evidence list is
    passed without the database/FASTA it depends on, when gff3 export is
    requested without --source_gff, or when --export_organism_names is used
    without --output_file.
    """
    # Local import: closing() guarantees the SQLite handles are released even
    # if an evidence parser raises.
    from contextlib import closing

    parser = argparse.ArgumentParser( description='Parses multiple sources of evidence to generate a consensus functional annotation')

    ## command-line options
    parser.add_argument('-f', '--input_fasta', type=str, required=True, help='Protein FASTA file of source molecules' )
    parser.add_argument('-m', '--hmm_htab_list', type=str, required=False, help='List of htab files from hmmpfam3' )
    parser.add_argument('-bs', '--blast_sprot_btab_list', type=str, required=False, help='List of btab files from BLAST against UniProtKB/SWISS-PROT' )
    parser.add_argument('-rs', '--rapsearch_sprot_btab_list', type=str, required=False, help='List of m8 files from RAPSEARCH2 against UniProtKB/SWISS-PROT' )
    parser.add_argument('-bt', '--blast_trembl_btab_list', type=str, required=False, help='List of btab files from BLAST against UniProtKB/Trembl' )
    parser.add_argument('-bk', '--blast_kegg_btab_list', type=str, required=False, help='List of btab files from BLAST against KEGG' )
    parser.add_argument('-bu100', '--blast_uniref100_btab_list', type=str, required=False, help='List of btab files from BLAST against UniRef100' )
    parser.add_argument('-ru100', '--rapsearch_uniref100_btab_list', type=str, required=False, help='List of m8 files from RAPSEARCH2 against UniRef100' )
    parser.add_argument('-u100f', '--uniref100_fasta', type=str, required=False, help='Only required if also passing RAPSEARCH2 against UniRef100 evidence' )
    parser.add_argument('-tm', '--tmhmm_raw_list', type=str, required=False, help='List of raw files from a tmhmm search' )
    parser.add_argument('-d', '--hmm_db', type=str, required=False, help='SQLite3 db with HMM information' )
    parser.add_argument('-u', '--uniprot_sprot_db', type=str, required=False, help='SQLite3 db with UNIPROT/SWISSPROT information' )
    parser.add_argument('-ur', '--uniref_db', type=str, required=False, help='SQLite3 db with UNIREF information' )
    parser.add_argument('-a', '--format', type=str, required=False, default='tab', help='Output format.  Current options are: "tab", "fasta", "gff3"' )
    parser.add_argument('-s', '--source_gff', type=str, required=False, help='Source GFF file from which proteins were derived.  Required if you want to export any format other than tab-delimited.' )
    parser.add_argument('-e', '--blast_eval_cutoff', type=float, required=False, default=1e-5, help='Skip BLAST hits unless they have an E-value at least as low as this' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    parser.add_argument('-r', '--organism_table', type=str, required=False, help='Optional table with counts of organism frequency based on top BLAST match for each protein' )
    parser.add_argument('-g', '--genomic_fasta', type=str, required=False, help='If passed, the genomic FASTA sequence will be included in the exported GFF3')
    parser.add_argument('-eon', '--export_organism_names', help='If passed, includes organism names from top BLAST hit into 9th column when available.  Mostly useful for metagenomic samples.', action='store_true')
    args = parser.parse_args()

    check_arguments(args)

    # If --rapsearch_uniref100_btab_list passed, --uniref100_fasta is required
    if args.rapsearch_uniref100_btab_list is not None and args.uniref100_fasta is None:
        raise Exception("ERROR: --uniref100_fasta required if --rapsearch_uniref100_btab_list is passed")

    # The gff3 writer needs the parsed source annotation; fail now with a
    # clear message rather than with a NameError after all evidence is parsed.
    if args.format == 'gff3' and args.source_gff is None:
        raise Exception("ERROR: --source_gff is required when exporting in gff3 format")

    # Keyed on polypeptide ID (from the FASTA, which is actually the mRNA gff feature ID), the
    #  values here are the organism name for the top BLAST match of each
    polypeptide_blast_org = dict()

    # Only populated when --source_gff is passed (needed for gff3 export).
    assemblies = None
    features = None

    # NOTE(review): when --output_file is omitted this log is named
    # "None.sources.log"; kept as-is, confirm whether that is intended.
    with open("{0}.sources.log".format(args.output_file), 'wt') as sources_log_fh:

        # this is a dict of biothings.Polypeptide objects
        polypeptides = initialize_polypeptides( sources_log_fh, args.input_fasta )

        # get source structural annotation, if necessary:
        if args.source_gff is not None:
            print("INFO: parsing source GFF")
            (assemblies, features) = biocodegff.get_gff3_features( args.source_gff )

        if args.hmm_htab_list is not None:
            # The db must be validated BEFORE connecting, otherwise a missing
            # -d surfaces as an opaque sqlite3 error.
            if args.hmm_db is None:
                raise Exception("ERROR: You specified HMM results but not the db with the -d option")

            # connection to the HMM-associated SQLite3 database
            with closing(sqlite3.connect(args.hmm_db)) as hmm_db_conn, \
                 closing(hmm_db_conn.cursor()) as hmm_db_curs:
                print("INFO: parsing HMM evidence")
                parse_hmm_evidence( sources_log_fh, polypeptides, args.hmm_htab_list, hmm_db_curs )

        if args.blast_sprot_btab_list is not None:
            if args.uniprot_sprot_db is None:
                raise Exception("ERROR: You specified BLAST evidence vs UnitProt/SwissProt results but not the db with the -u option")

            # connection to the UniProt_Sprot SQLite3 database
            with closing(sqlite3.connect(args.uniprot_sprot_db)) as usp_db_conn, \
                 closing(usp_db_conn.cursor()) as usp_db_curs:
                print("INFO: parsing BLAST (SWISS-PROT) evidence")
                parse_sprot_blast_evidence( sources_log_fh, polypeptides, polypeptide_blast_org, args.blast_sprot_btab_list, usp_db_curs, args.blast_eval_cutoff, 'blast' )

        if args.rapsearch_sprot_btab_list is not None:
            if args.uniprot_sprot_db is None:
                raise Exception("ERROR: You specified RAPSEARCH2 evidence vs UnitProt/SwissProt results but not the db with the -u option")

            # connection to the UniProt_Sprot SQLite3 database
            with closing(sqlite3.connect(args.uniprot_sprot_db)) as usp_db_conn, \
                 closing(usp_db_conn.cursor()) as usp_db_curs:
                print("INFO: parsing RAPSEARCH2 (SWISS-PROT) evidence")
                parse_sprot_blast_evidence( sources_log_fh, polypeptides, polypeptide_blast_org, args.rapsearch_sprot_btab_list, usp_db_curs, args.blast_eval_cutoff, 'rapsearch2' )

        if args.blast_trembl_btab_list is not None:
            print("INFO: parsing BLAST (TrEMBL) evidence")
            parse_trembl_blast_evidence(polypeptides, args.blast_trembl_btab_list, args.blast_eval_cutoff)

        if args.blast_kegg_btab_list is not None:
            print("INFO: parsing BLAST (KEGG) evidence")
            parse_kegg_blast_evidence(sources_log_fh, polypeptides, args.blast_kegg_btab_list, args.blast_eval_cutoff)

        if args.blast_uniref100_btab_list is not None:
            if args.uniref_db is None:
                raise Exception("ERROR: You specified BLAST evidence vs UniRef100 but not the db with the -ur option")

            print("INFO: parsing BLAST (UniRef100) evidence")
            # connection to the UniRef SQLite3 database
            with closing(sqlite3.connect(args.uniref_db)) as uniref_db_conn, \
                 closing(uniref_db_conn.cursor()) as uniref_db_curs:
                parse_uniref100_blast_evidence(sources_log_fh, polypeptides, args.blast_uniref100_btab_list, uniref_db_curs, args.blast_eval_cutoff, 'blast', args.uniref100_fasta)

        if args.rapsearch_uniref100_btab_list is not None:
            if args.uniref_db is None:
                raise Exception("ERROR: You specified RAPSEARCH2 evidence vs UniRef100 but not the db with the -ur option")

            print("INFO: parsing RAPSEARCH2 (UniRef100) evidence")
            # connection to the UniRef SQLite3 database
            with closing(sqlite3.connect(args.uniref_db)) as uniref_db_conn, \
                 closing(uniref_db_conn.cursor()) as uniref_db_curs:
                parse_uniref100_blast_evidence(sources_log_fh, polypeptides, args.rapsearch_uniref100_btab_list, uniref_db_curs, args.blast_eval_cutoff, 'rapsearch2', args.uniref100_fasta)

        if args.tmhmm_raw_list is not None:
            print("INFO: parsing TMHMM evidence")
            parse_tmhmm_evidence(sources_log_fh, polypeptides, args.tmhmm_raw_list)

    ## output will either be a file or STDOUT
    print("INFO: writing output")
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')
    else:
        fout = sys.stdout

    try:
        if args.format == 'tab':
            write_tab_results(fout, polypeptides)
        elif args.format == 'fasta':
            write_fasta_results(fout, polypeptides)
        elif args.format == 'gff3':
            write_gff3_results(fout, polypeptides, assemblies, features, args.genomic_fasta)
    finally:
        # Never close STDOUT itself: later steps may still print to it.
        if fout is sys.stdout:
            fout.flush()
        else:
            fout.close()

    ## There isn't a method in biocodegff3 to add arbitrary key=value pairs.  So we have to cheat here.
    if args.export_organism_names is True:
        if args.output_file:
            append_organism_names_to_gff(args.output_file, polypeptide_blast_org)
        else:
            raise Exception("ERROR: an --output_file must be specified when using the --export_organism_names option.")

    if args.organism_table is not None:
        create_organism_table(args.organism_table, polypeptide_blast_org)
Ejemplo n.º 41
0
def main():
    """Select training gene IDs backed by agreeing lines of gene evidence.

    Parses GeneMark-ES, CEGMA and AAT predictions (plus optional GMAP
    expression alignments), finds the genes on which the sources agree and
    writes the IDs of each agreement class to a ``*.shared.ids`` file in the
    current directory.  The classes are then recruited, strongest evidence
    first and capped by --max_genes, into the final ID list written to
    --output_file.
    """
    parser = argparse.ArgumentParser( description='Selects training gene IDs supported by agreement between GeneMark-ES, CEGMA, AAT and (optionally) expression evidence')

    ## command-line options
    parser.add_argument('-g', '--genemark', type=str, required=True, help='Path to the results from GeneMark-ES' )
    parser.add_argument('-c', '--cegma', type=str, required=True, help='Path to the results from CEGMA, converted to GFF3' )
    parser.add_argument('-a', '--aat', type=str, required=True, help='Path to the results from AAT, converted to GFF3' )
    parser.add_argument('-e', '--expression', type=str, required=False, help='Any expression data aligned using GMAP (in gff3_gene mode)' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-m', '--max_genes', type=int, required=False, help='Limits gene IDs exported to the top N by strongest evidence class' )
    args = parser.parse_args()

    print("INFO: parsing Genemark-ES data")
    (assemblies, gm_es_features) = biocodegff.get_gff3_features( args.genemark )
    gm_es_genes = get_genes_from_dict(gm_es_features)
    print("\tINFO: Got {0} Genemark-ES genes".format(len(gm_es_genes)))

    # Each later parse is given the assemblies collected so far.
    print("INFO: parsing CEGMA data")
    (assemblies, cegma_features) = biocodegff.get_gff3_features( args.cegma, assemblies=assemblies )
    cegma_genes = get_genes_from_dict(cegma_features)
    print("\tINFO: Got {0} CEGMA genes".format(len(cegma_genes)))

    print("INFO: parsing AAT results")
    (assemblies, aat_features) = biocodegff.get_gff3_features( args.aat, assemblies=assemblies)
    aat_genes = get_genes_from_dict(aat_features)
    print("\tINFO: Got {0} AAT 'genes'".format(len(aat_genes)))

    expression_genes = list()
    if args.expression is not None:
        print("INFO: parsing expression results")
        (assemblies, expression_features) = biocodegff.get_gff3_features( args.expression, assemblies=assemblies)
        expression_genes = get_genes_from_dict(expression_features)
        print("\tINFO: Got {0} expression 'genes'".format(len(expression_genes)))

    # NOTE: the "== True" comparisons below are kept on purpose; the
    # comparison helpers are defined outside this function and their exact
    # return type cannot be confirmed from here.

    #############################################################################
    ## GeneMark-ES genes with identical coordinates and exon structure in CEGMA
    genemark_cegma_shared_genes = list()

    with open('gmes_cegma.shared.ids', 'wt') as gmes_cegma_fh:
        for gm_es_gene in gm_es_genes:
            for cegma_gene in cegma_genes:
                if gm_es_gene.has_same_coordinates_as( thing=cegma_gene ) and \
                   gm_es_gene.shares_exon_structure_with( thing=cegma_gene ) == True:
                    genemark_cegma_shared_genes.append(gm_es_gene)
                    gmes_cegma_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("\n{0} genes were shared perfectly between Genemark-ES and CEGMA".format(len(genemark_cegma_shared_genes)) )

    #############################################################################
    ## ... of those, the ones whose CDS structure is also supported by expression
    genemark_cegma_expression_shared_genes = list()

    with open('gmes_cegma_exp.shared.ids', 'wt') as gmes_cegma_exp_fh:
        for gm_es_gene in genemark_cegma_shared_genes:
            for exp_gene in expression_genes:
                if gm_es_gene.shares_CDS_structure_with( exp_gene ):
                    genemark_cegma_expression_shared_genes.append(gm_es_gene)
                    # Record the ID like every other agreement class does;
                    # this file used to be created but left empty.
                    gmes_cegma_exp_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("{0} genes were shared perfectly between Genemark-ES and CEGMA and expression data".format(len(genemark_cegma_expression_shared_genes)) )

    #############################################################################
    ## GeneMark-ES genes with an AAT match (stop-codon tolerant comparison)
    genemark_aat_shared_genes = list()

    with open('gmes_aat.shared.ids', 'wt') as gmes_aat_fh:
        for gm_es_gene in gm_es_genes:
            for aat_gene in aat_genes:
                if gm_es_gene.shares_exon_structure_with( thing=aat_gene, stop_tolerant=True ) == True:
                    genemark_aat_shared_genes.append(gm_es_gene)
                    gmes_aat_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("{0} Genemark-ES genes had an exact AAT match".format(len(genemark_aat_shared_genes)) )    

    ##############################################################################
    ## CEGMA genes matching GeneMark-ES; where that GeneMark-ES gene also has an
    ## AAT match, all three sources agree.
    cegma_matching_gm_es = list()
    genemark_aat_cegma_shared_genes = list()

    with open('gmes_aat_cegma.shared.ids', 'wt') as gmes_aat_cegma_fh:
        for cegma_gene in cegma_genes:
            match_found = False

            for gm_es_gene in gm_es_genes:
                if cegma_gene.has_same_coordinates_as( thing=gm_es_gene ) and \
                   cegma_gene.shares_exon_structure_with( thing=gm_es_gene ) == True:
                    match_found = True

                    if gm_es_gene in genemark_aat_shared_genes and gm_es_gene not in genemark_aat_cegma_shared_genes:
                        genemark_aat_cegma_shared_genes.append(gm_es_gene)
                        gmes_aat_cegma_fh.write("{0}\n".format(gm_es_gene.id))

                    break

            if match_found:
                cegma_matching_gm_es.append(cegma_gene)

    print("{0} genes with GeneMark-ES, CEGMA and AAT agreement".format(len(genemark_aat_cegma_shared_genes)) )

    with open('training_gene.ids', 'wt') as training_fh:
        for gene in genemark_aat_cegma_shared_genes:
            training_fh.write("{0}\n".format(gene.id) )

    ##############################################################################
    ## CEGMA genes without a GeneMark-ES match that AAT nevertheless supports
    cegma_with_aat_not_gm_es = list()

    with open('cegma_aat_nogmes.shared.ids', 'wt') as cegma_aat_nogmes_fh:
        for cegma_gene in cegma_genes:
            if cegma_gene in cegma_matching_gm_es:
                continue

            for aat_gene in aat_genes:
                if cegma_gene.shares_exon_structure_with( thing=aat_gene ) == True:
                    cegma_with_aat_not_gm_es.append(cegma_gene)
                    cegma_aat_nogmes_fh.write("{0}\n".format(cegma_gene.id))
                    break

    print("{0} CEGMA genes had no GeneMark-ES match but did have an AAT one".format(len(cegma_with_aat_not_gm_es)) )

    ##############################################################################
    ## now to assemble the results, one evidence class after another
    training_ids = list()

    # 0. Start with genes shared between GeneMark-ES, CEGMA and expression evidence
    recruit_training_genes( training_ids, genemark_cegma_expression_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES, CEGMA and expression data".format(len(training_ids)))

    # 1. Pull in the genes with shared evidence across GeneMark-ES, CEGMA and AAT
    recruit_training_genes( training_ids, genemark_aat_cegma_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES, CEGMA and AAT".format(len(training_ids)))

    # 2. Next include the CEGMA genes supported by AAT but not by GeneMark-ES
    recruit_training_genes( training_ids, cegma_with_aat_not_gm_es, args.max_genes )
    print("DEBUG: {0} genes after recruitment of CEGMA + AAT without GM-ES".format(len(training_ids)))

    # 3. Then the genes shared between GeneMark-ES and CEGMA only
    recruit_training_genes( training_ids, genemark_cegma_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES + CEGMA".format(len(training_ids)))

    # 4. Finally the genes shared between GeneMark-ES and AAT only
    recruit_training_genes( training_ids, genemark_aat_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES + AAT".format(len(training_ids)))

    with open(args.output_file, 'wt') as output_list_fh:
        for training_id in training_ids:
            output_list_fh.write("{0}\n".format(training_id))
Ejemplo n.º 42
0
def main():
    """Write a summary-statistics report for the features in a GFF3 file.

    Command-line entry point.  Reads ``-i/--input_file`` (required) and
    ``-o/--output_file`` (optional; the report goes to STDOUT when omitted),
    loads the annotation with ``biocodegff.get_gff3_features`` and writes a
    tab-delimited report with counts, summed lengths and means for
    assemblies, genes, mRNAs, exons and CDS fragments, followed by a profile
    of how many mRNAs carry each number of CDS fragments.
    """
    parser = argparse.ArgumentParser(
        description=
        'Reports counts and length statistics for the features in a GFF3 file.'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    def mean(total, count):
        # A GFF3 lacking a feature type gives a zero count; report 0.0 instead
        # of aborting the whole report with ZeroDivisionError.
        return total / count if count else 0.0

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        type_counts = defaultdict(int)
        type_lengths = defaultdict(int)
        assembly_lengths_found = False

        # key is number of CDS fragments, value is number of mRNAs with that many
        CDS_profile = defaultdict(int)

        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]
            type_counts['assembly'] += 1

            if assembly.length is not None:
                type_lengths['assembly'] += assembly.length
                assembly_lengths_found = True

            for gene in assembly.genes():
                type_counts['gene'] += 1
                type_lengths['gene'] += gene.length

                for mRNA in gene.mRNAs():
                    type_counts['mRNA'] += 1
                    type_lengths['mRNA'] += mRNA.length
                    CDS_profile[mRNA.CDS_count()] += 1

                    for exon in mRNA.exons():
                        type_counts['exon'] += 1
                        type_lengths['exon'] += exon.length

                    for CDS in mRNA.CDSs():
                        type_counts['CDS fragments'] += 1
                        type_lengths['CDS fragments'] += CDS.length

        ofh.write("Assembly count\t{0}\n".format(type_counts['assembly']))
        if assembly_lengths_found:
            ofh.write("Assembly length\t{0}\n".format(
                type_lengths['assembly']))
        else:
            ofh.write("Assembly length\tN/A (no FASTA data in GFF?)\n")

        gene_length_mean = mean(type_lengths['gene'], type_counts['gene'])
        mRNA_length_mean = mean(type_lengths['mRNA'], type_counts['mRNA'])
        exon_length_mean = mean(type_lengths['exon'], type_counts['exon'])
        CDS_length_mean = mean(type_lengths['CDS fragments'],
                               type_counts['CDS fragments'])

        mRNAs_per_gene_mean = mean(type_counts['mRNA'], type_counts['gene'])
        exons_per_mRNA_mean = mean(type_counts['exon'], type_counts['mRNA'])
        CDS_per_mRNA_mean = mean(type_counts['CDS fragments'],
                                 type_counts['mRNA'])

        ofh.write("\nGene count\t{0}\n".format(type_counts['gene']))
        ofh.write("Gene length (mean)\t{0:.1f}\n".format(gene_length_mean))
        ofh.write("Gene length (sum)\t{0}\n".format(type_lengths['gene']))

        ofh.write("\nmRNA count\t{0}\n".format(type_counts['mRNA']))
        ofh.write("mRNA length (mean)\t{0:.1f}\n".format(mRNA_length_mean))
        ofh.write("mRNA length (sum)\t{0}\n".format(type_lengths['mRNA']))
        ofh.write(
            "mRNAs per gene (mean)\t{:.1f}\n".format(mRNAs_per_gene_mean))

        ofh.write("\nexon count\t{0}\n".format(type_counts['exon']))
        ofh.write("exon length (mean)\t{0:.1f}\n".format(exon_length_mean))
        ofh.write("exon length (sum)\t{0}\n".format(type_lengths['exon']))
        ofh.write(
            "exons per mRNA (mean)\t{:.1f}\n".format(exons_per_mRNA_mean))

        ofh.write("\nCDS count\t{0}\n".format(type_counts['CDS fragments']))
        ofh.write("CDS length (mean)\t{0:.1f}\n".format(CDS_length_mean))
        ofh.write("CDS fragment length (sum)\t{0}\n".format(
            type_lengths['CDS fragments']))
        ofh.write("CDS per mRNA (mean)\t{:.1f}\n".format(CDS_per_mRNA_mean))

        ofh.write(
            "\n# CDS fragment composition profile: count<tab>percentage\n")
        for cds_count in sorted(CDS_profile):
            # CDS_profile only has entries when at least one mRNA was seen,
            # so the mRNA count is never zero here.
            perc = (CDS_profile[cds_count] / type_counts['mRNA']) * 100
            # Fixed-point on purpose: a bare ".3" precision switches to
            # scientific notation and printed 100.0 as "1e+02".
            ofh.write("mRNAs with {0} CDS\t{1}\t{2:.3f}\n".format(
                cds_count, CDS_profile[cds_count], perc))
    finally:
        # Never close STDOUT; only release a file we opened ourselves.
        if ofh is not sys.stdout:
            ofh.close()
Ejemplo n.º 43
0
def main():
    """Compare the gene models of two GFF3 files and report identical ones.

    Command-line entry point.  Takes a reference GFF3 (``-r/--ref``), a query
    GFF3 (``-q/--qry``) and an output base path (``-o/--output_base``).  A
    reference gene and a query gene count as a match when they have the same
    coordinates and share both exon and CDS structure.

    Two files are created:

    * ``<output_base>.matches`` - one ``ref_id<TAB>qry_id`` line per matched
      reference gene.
    * ``<output_base>.summary`` - the input paths plus the number of
      identical models and of models unique to either file.

    The two match totals are also printed to STDOUT.
    """
    parser = argparse.ArgumentParser(
        description='Basic comparison of two GFF3 files')

    ## output file to be written
    parser.add_argument('-r',
                        '--ref',
                        type=str,
                        required=True,
                        help='Path to the reference GFF3 file')
    parser.add_argument('-q',
                        '--qry',
                        type=str,
                        required=True,
                        help='Path to the query GFF3 file')
    parser.add_argument(
        '-o',
        '--output_base',
        type=str,
        required=True,
        help='Base name/path of the output files to be created')
    args = parser.parse_args()

    (assemblies, ref_features) = biocodegff.get_gff3_features(args.ref)
    ref_genes = get_genes_from_dict(ref_features)

    # The assemblies parsed from the reference are handed to the second parse
    # so that the query features are loaded against the same collection.
    (assemblies,
     qry_features) = biocodegff.get_gff3_features(args.qry,
                                                  assemblies=assemblies)
    qry_genes = get_genes_from_dict(qry_features)

    ref_matches_found = dict()
    qry_matches_found = dict()

    # All-against-all comparison.  There is no early exit, so if several
    # genes match the same partner the last one seen is the one recorded.
    for ref_gene in ref_genes:
        for qry_gene in qry_genes:
            if ref_gene.has_same_coordinates_as( thing=qry_gene ) and \
               ref_gene.shares_exon_structure_with( thing=qry_gene ) and \
               ref_gene.shares_CDS_structure_with( thing=qry_gene ):

                ref_matches_found[ref_gene.id] = qry_gene.id
                qry_matches_found[qry_gene.id] = ref_gene.id

    # open our output files; the context manager guarantees they are flushed
    # and closed even if a write fails
    with open("{0}.matches".format(args.output_base), 'wt') as out_matches, \
         open("{0}.summary".format(args.output_base), 'wt') as out_summary:

        print("INFO: {0}/{1} reference genes had a match to a qry gene".format(
            len(ref_matches_found), len(ref_genes)))
        print("INFO: {0}/{1} qry genes had a match to a reference gene".format(
            len(qry_matches_found), len(qry_genes)))

        for ref_gene_id in ref_matches_found:
            out_matches.write("{0}\t{1}\n".format(
                ref_gene_id, ref_matches_found[ref_gene_id]))

        out_summary.write("Reference\t{0}\n".format(args.ref))
        # Report the query path here (this line used to repeat the reference).
        out_summary.write("Query\t{0}\n".format(args.qry))
        out_summary.write(
            "Total identical models (with respect to reference)\t{0}\n".format(
                len(ref_matches_found)))
        out_summary.write("Models in REF not in QRY\t{0}\n".format(
            len(ref_genes) - len(ref_matches_found)))
        out_summary.write("Models in QRY not in REF\t{0}\n".format(
            len(qry_genes) - len(qry_matches_found)))
Ejemplo n.º 44
0
def main():
    """Write a summary-statistics report for the features in a GFF3 file.

    Command-line entry point.  Reads ``-i/--input_file`` (required) and
    ``-o/--output_file`` (optional; the report goes to STDOUT when omitted),
    loads the annotation with ``biocodegff.get_gff3_features`` and writes a
    tab-delimited report with counts, summed lengths and means for
    assemblies, genes, mRNAs, exons and CDS fragments, followed by a profile
    of how many mRNAs carry each number of CDS fragments.
    """
    parser = argparse.ArgumentParser( description='Reports counts and length statistics for the features in a GFF3 file.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    def safe_mean(total, count):
        # A zero count means the feature type is absent from the input;
        # report 0.0 rather than crashing with ZeroDivisionError.
        if count == 0:
            return 0.0
        return total / count

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        type_counts = defaultdict(int)
        type_lengths = defaultdict(int)
        assembly_lengths_found = False

        # key is number of CDS fragments, value is number of mRNAs with that many
        CDS_profile = defaultdict(int)

        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]
            type_counts['assembly'] += 1

            if assembly.length is not None:
                type_lengths['assembly'] += assembly.length
                assembly_lengths_found = True

            for gene in assembly.genes():
                type_counts['gene'] += 1
                type_lengths['gene'] += gene.length

                for mRNA in gene.mRNAs():
                    type_counts['mRNA'] += 1
                    type_lengths['mRNA'] += mRNA.length
                    CDS_profile[mRNA.CDS_count()] += 1

                    for exon in mRNA.exons():
                        type_counts['exon'] += 1
                        type_lengths['exon'] += exon.length

                    for CDS in mRNA.CDSs():
                        type_counts['CDS fragments'] += 1
                        type_lengths['CDS fragments'] += CDS.length

        ofh.write("Assembly count\t{0}\n".format(type_counts['assembly']))
        if assembly_lengths_found:
            ofh.write("Assembly length\t{0}\n".format(type_lengths['assembly']))
        else:
            ofh.write("Assembly length\tN/A (no FASTA data in GFF?)\n")

        gene_length_mean = safe_mean( type_lengths['gene'], type_counts['gene'] )
        mRNA_length_mean = safe_mean( type_lengths['mRNA'], type_counts['mRNA'] )
        exon_length_mean = safe_mean( type_lengths['exon'], type_counts['exon'] )
        CDS_length_mean = safe_mean( type_lengths['CDS fragments'], type_counts['CDS fragments'] )

        mRNAs_per_gene_mean = safe_mean( type_counts['mRNA'], type_counts['gene'] )
        exons_per_mRNA_mean = safe_mean( type_counts['exon'], type_counts['mRNA'] )
        CDS_per_mRNA_mean = safe_mean( type_counts['CDS fragments'], type_counts['mRNA'] )

        ofh.write("\nGene count\t{0}\n".format(type_counts['gene']))
        ofh.write("Gene length (mean)\t{0:.1f}\n".format(gene_length_mean))
        ofh.write("Gene length (sum)\t{0}\n".format(type_lengths['gene']))

        ofh.write("\nmRNA count\t{0}\n".format(type_counts['mRNA']))
        ofh.write("mRNA length (mean)\t{0:.1f}\n".format(mRNA_length_mean))
        ofh.write("mRNA length (sum)\t{0}\n".format(type_lengths['mRNA']))
        ofh.write("mRNAs per gene (mean)\t{:.1f}\n".format(mRNAs_per_gene_mean) )

        ofh.write("\nexon count\t{0}\n".format(type_counts['exon']))
        ofh.write("exon length (mean)\t{0:.1f}\n".format(exon_length_mean))
        ofh.write("exon length (sum)\t{0}\n".format(type_lengths['exon']))
        ofh.write("exons per mRNA (mean)\t{:.1f}\n".format(exons_per_mRNA_mean) )

        ofh.write("\nCDS count\t{0}\n".format(type_counts['CDS fragments']))
        ofh.write("CDS length (mean)\t{0:.1f}\n".format(CDS_length_mean))
        ofh.write("CDS fragment length (sum)\t{0}\n".format(type_lengths['CDS fragments']))
        ofh.write("CDS per mRNA (mean)\t{:.1f}\n".format(CDS_per_mRNA_mean) )

        ofh.write("\n# CDS fragment composition profile: count<tab>percentage\n")
        for cds_count in sorted(CDS_profile):
            # CDS_profile is only populated while counting mRNAs, so the mRNA
            # count cannot be zero inside this loop.
            perc = (CDS_profile[cds_count] / type_counts['mRNA']) * 100
            # Fixed-point on purpose: a bare ".3" precision switches to
            # scientific notation and printed 100.0 as "1e+02".
            ofh.write("mRNAs with {0} CDS\t{1}\t{2:.3f}\n".format(cds_count, CDS_profile[cds_count], perc) )
    finally:
        # Never close STDOUT; only release a file we opened ourselves.
        if ofh is not sys.stdout:
            ofh.close()
Ejemplo n.º 45
0
def _new_tally():
    """Return empty bookkeeping for one annotation (known or predicted).

    ``genes`` / ``exons`` keep unique coordinate keys in first-seen order and
    ``gene_seen`` / ``exon_seen`` are their set companions for fast lookups.
    ``spans`` holds one ``asm:gene_id:min:max:strand`` string per unique gene
    coordinate.
    """
    return {
        'genes': [],
        'gene_seen': set(),
        'exons': [],
        'exon_seen': set(),
        'spans': [],
    }


def _add_unique(items, seen, key):
    """Append *key* to *items* unless *seen* already holds it; True if new."""
    if key in seen:
        return False
    seen.add(key)
    items.append(key)
    return True


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when *whole* is zero."""
    if not whole:
        return 0.0
    return (part / whole) * 100


def _iter_gene_features(gene, feature):
    """Yield the exons ("Exon") or CDSs ("CDS") of *gene*, mRNA by mRNA.

    mRNAs and their sub-features are each visited in sorted order.

    Raises:
        ValueError: if *feature* is neither "Exon" nor "CDS" (raised when the
            first mRNA is reached).
    """
    for mrna in sorted(gene.mRNAs()):
        if feature == "Exon":
            parts = mrna.exons()
        elif feature == "CDS":
            parts = mrna.CDSs()
        else:
            raise ValueError(
                "Unsupported feature type: {0!r} (expected 'Exon' or 'CDS')".
                format(feature))

        for part in sorted(parts):
            yield part


def _gene_feature_keys(asm_id, assembly, gene, feature):
    """Return the set of coordinate keys of a gene's exons/CDSs."""
    return {
        cordinate(asm_id, part.location_on(assembly))
        for part in _iter_gene_features(gene, feature)
    }


def _collect_annotation(asm_id,
                        assembly,
                        genes,
                        feature,
                        tally,
                        exon_keys=None):
    """Record the unique genes and exons/CDSs of one assembly into *tally*.

    Args:
        asm_id: identifier of the assembly (chromosome) being processed.
        assembly: the assembly object the features are located on.
        genes: the genes to record, in the order they should be processed.
        feature: "Exon" or "CDS"; selects which sub-features are counted.
        tally: bookkeeping dict created by ``_new_tally``; updated in place.
        exon_keys: optional set that additionally receives every exon/CDS
            coordinate key seen on this assembly.
    """
    for gene in genes:
        gene_loc = gene.location_on(assembly)
        ## Chromosome id + start + stop + strand identifies a unique gene.
        gene_key = cordinate(asm_id, gene_loc)
        is_new_gene = _add_unique(tally['genes'], tally['gene_seen'],
                                  gene_key)

        starts = []
        stops = []
        for part in _iter_gene_features(gene, feature):
            part_loc = part.location_on(assembly)
            part_key = cordinate(asm_id, part_loc)
            _add_unique(tally['exons'], tally['exon_seen'], part_key)
            if exon_keys is not None:
                exon_keys.add(part_key)

            starts.append(part_loc.fmin)
            stops.append(part_loc.fmax)

        if not is_new_gene:
            continue

        ## The span runs from the left-most to the right-most exon/CDS edge;
        ## a gene without any such feature falls back to its own extent.
        if starts:
            span_min = min(starts)
            span_max = max(stops)
        else:
            span_min = gene_loc.fmin
            span_max = gene_loc.fmax

        tally['spans'].append(asm_id + ":" + gene.id + ":" + str(span_min) +
                              ":" + str(span_max) + ":" +
                              str(gene_loc.strand))


def process_files(args):
    """Compare a known and a predicted GFF3 annotation and report accuracy.

    Reads ``args.annotation_1`` (known) and ``args.annotation_2`` (predicted),
    counts unique genes and exons or CDSs (selected by ``args.feature``,
    "Exon" or "CDS"), and determines which predicted genes reproduce the
    exon/CDS set of an overlapping known gene on the same strand.

    Results are printed to STDOUT and written into ``args.output_dir``:

    * ``true_predicted_genes.txt`` - known/predicted id pairs of exact matches
    * ``summary.txt`` - counts with SN and PPV for genes, exons/CDSs and bases
    * ``final_stats.txt`` - overlap categories returned by ``compare_cds``
    * ``true_pred.txt`` - coordinate keys of the exactly predicted genes

    Finally the intermediate ``exon_*.bed`` files in the output directory are
    removed (presumably left behind by ``base_comparison`` - verify).

    Exits via ``sys.exit`` when ``args.output_dir`` does not exist.
    """
    ## Check the output directory before any work is done or any file is
    ## opened inside it.
    if not (os.path.exists(args.output_dir)):
        sys.exit("Directory does not exist.")

    (assemblies_1,
     features_1) = biocodegff.get_gff3_features(args.annotation_1)
    (assemblies_2,
     features_2) = biocodegff.get_gff3_features(args.annotation_2)

    known = _new_tally()
    predicted = _new_tally()

    exon_pred_all = set()  ## exon/CDS keys present in both annotations
    gene_true = set()  ## keys of predicted genes with an exact known match
    shared_asm_ids = set()  ## chromosomes present in both annotations

    true_pred_file = args.output_dir + '/true_predicted_genes.txt'
    with open(true_pred_file, 'w') as true_file:
        true_file.write("Known\tPredicted\n")

        for asm_id in assemblies_1:  ## Each chromosome of the known annotation
            assembly_1 = assemblies_1[asm_id]
            genes_1 = sorted(assembly_1.genes())
            anno_exons = set()
            _collect_annotation(asm_id, assembly_1, genes_1, args.feature,
                                known, anno_exons)

            ## If the chromosome is not found in the predicted file, move on.
            assembly_2 = assemblies_2.get(asm_id)
            if assembly_2 is None:
                continue

            genes_2 = sorted(assembly_2.genes())
            shared_asm_ids.add(asm_id)
            pred_exons = set()
            _collect_annotation(asm_id, assembly_2, genes_2, args.feature,
                                predicted, pred_exons)

            exon_pred_all.update(
                pred_exons.intersection(anno_exons))  # true exons

            ## From the predicted genes determine the true ones.
            for gene_2 in genes_2:
                gene_2_loc = gene_2.location_on(assembly_2)
                pred_key = cordinate(asm_id, gene_2_loc)

                ## To prevent duplication, skip coordinates already counted.
                if pred_key in gene_true:
                    continue

                pred_parts = None  ## built on the first overlapping gene

                for gene_1 in genes_1:
                    gene_1_loc = gene_1.location_on(assembly_1)
                    if gene_1_loc.strand != gene_2_loc.strand:
                        continue
                    if not gene_2.overlaps_with(gene_1):
                        continue

                    if pred_parts is None:
                        pred_parts = _gene_feature_keys(
                            asm_id, assembly_2, gene_2, args.feature)
                    known_parts = _gene_feature_keys(asm_id, assembly_1,
                                                     gene_1, args.feature)

                    ## Exact prediction: both genes have the very same set of
                    ## exon/CDS coordinates.
                    if known_parts == pred_parts:
                        gene_true.add(pred_key)
                        true_file.write(gene_1.id + "\t" + gene_2.id + "\n")
                        break

    ## Chromosomes that only exist in the predicted annotation still count
    ## towards the predicted totals.
    for asm_id in assemblies_2:
        if asm_id in shared_asm_ids:
            continue
        assembly_2 = assemblies_2[asm_id]
        _collect_annotation(asm_id, assembly_2, sorted(assembly_2.genes()),
                            args.feature, predicted)

    a_exons = known['exons']
    p_exons = predicted['exons']
    a_cds = known['spans']
    p_cds = predicted['spans']

    #Calculate SN/SP for bases

    (a_base_val, p_base_val, true_base) = base_comparison(p_exons, a_exons)

    base_sn = _percent(true_base, a_base_val)
    base_sp = _percent(true_base, p_base_val)

    #Calculate SN/SP for exons
    annotated_exon = len(a_exons)
    predicted_exon = len(p_exons)
    true_pred_exon = len(exon_pred_all)

    exon_sn = _percent(true_pred_exon, annotated_exon)
    exon_sp = _percent(true_pred_exon, predicted_exon)

    #Calculate SN/SP for genes

    annotated_gene = len(known['genes'])
    predicted_gene = len(predicted['genes'])
    true_pred_gene = len(gene_true)

    gene_sn = _percent(true_pred_gene, annotated_gene)
    gene_sp = _percent(true_pred_gene, predicted_gene)
    print("Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n")
    print("Gene\t" + str(annotated_gene) + "\t" + str(predicted_gene) + "\t" +
          str(true_pred_gene) + "\t" + str(gene_sn) + "\t" + str(gene_sp))
    print(args.feature + "\t" + str(annotated_exon) + "\t" +
          str(predicted_exon) + "\t" + str(true_pred_exon) + "\t" +
          str(exon_sn) + "\t" + str(exon_sp))
    print("Base\t" + str(a_base_val) + "\t" + str(p_base_val) + "\t" +
          str(true_base) + "\t" + str(base_sn) + "\t" + str(base_sp))

    out_file = args.output_dir + '/summary.txt'
    with open(out_file, 'w') as fout:
        fout.write("Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n")
        fout.write("Gene\t" + str(annotated_gene) + "\t" +
                   str(predicted_gene) + "\t" + str(true_pred_gene) + "\t" +
                   str(gene_sn) + "\t" + str(gene_sp) + "\n")
        fout.write(args.feature + "\t" + str(annotated_exon) + "\t" +
                   str(predicted_exon) + "\t" + str(true_pred_exon) + "\t" +
                   str(exon_sn) + "\t" + str(exon_sp) + "\n")
        fout.write("Base\t" + str(a_base_val) + "\t" + str(p_base_val) +
                   "\t" + str(true_base) + "\t" + str(base_sn) + "\t" +
                   str(base_sp) + "\n\n")

    arr_pred = compare_cds(p_cds, a_cds, "pred")
    arr_known = compare_cds(a_cds, p_cds, "known")
    arr_pred_same = compare_cds(p_cds, p_cds, "pred_same")

    ## NOTE(review): the meaning of each index is inferred from the labels
    ## used in the reports below - confirm against compare_cds.
    new_gene = arr_pred[2]
    gene_merge = arr_pred[3]
    gene_found = arr_pred[0]
    gene_opp = arr_pred[1]
    gene_missing = arr_known[2]
    gene = arr_known[0]
    gene_opp_known = arr_known[1]
    gene_split = arr_known[3]
    gene_pred_overlap_opp = arr_pred_same[1]

    print("1. No. of known gene : ", len(a_cds))
    print("2. No. of predicted gene : ", len(p_cds))
    print("3. No. of predicted gene overlapping  0 known gene (new gene): ",
          new_gene)
    print(
        "4. No. of predicted gene overlapping > 1 known gene (gene merge) : ",
        gene_merge)
    print("5. No. of predicted gene overlaping 1 known gene : ", gene_found)
    print(
        "6. No. of predicted gene overlapping >= 1 known gene in opp strand : ",
        gene_opp)
    print(
        "7. No. of predicted gene overlapping  1 known gene (exact intron/exon boundaries) : ",
        true_pred_gene)
    print(
        "8. No. of predicted gene overlapping >= 1 predicted gene in opp strand : ",
        gene_pred_overlap_opp)

    print(
        "9. No. of known gene overlapping  0 predicted gene (gene missing): ",
        gene_missing)
    print(
        "10. No. of known gene overlapping > 1 predicted gene(gene split) : ",
        gene_split)
    print("11. No. of known gene overlaping 1 predicted gene : ", gene)
    print(
        "12. No. of known gene overlapping >= 1 predicted gene in opp strand : ",
        gene_opp_known)

    out_file = args.output_dir + '/final_stats.txt'
    with open(out_file, 'w') as fout:
        fout.write("1. No. of known gene : " + str(len(a_cds)) + "\n")
        fout.write("2. No. of predicted gene : " + str(len(p_cds)) + "\n")
        fout.write(
            "3. No. of predicted gene overlapping  0 known gene (new gene): "
            + str(new_gene) + "\n")
        fout.write(
            "4. No. of predicted gene overlapping > 1 known gene (gene merge) : "
            + str(gene_merge) + "\n")
        fout.write("5. No. of predicted gene overlaping 1 known gene : " +
                   str(gene_found) + "\n")
        fout.write(
            "6. No. of predicted gene overlapping >= 1 known gene in opp strand : "
            + str(gene_opp) + "\n")
        fout.write(
            "7. No. of predicted gene overlapping  1 known gene (exact intron/exon boundary) : "
            + str(true_pred_gene) + "\n")
        fout.write(
            "8. No. of predicted gene overlapping >= 1  predicted gene in opp strand : "
            + str(gene_pred_overlap_opp) + "\n")
        fout.write(
            "9. No. of known gene overlapping  0 predicted gene (gene missing): "
            + str(gene_missing) + "\n")
        fout.write(
            "10. No. of known gene overlapping > 1 predicted gene (gene_split): "
            + str(gene_split) + "\n")
        fout.write("11. No. of known gene overlaping 1 predicted gene : " +
                   str(gene) + "\n")
        fout.write(
            "12. No. of known gene overlapping >= 1 predicted gene in opp strand : "
            + str(gene_opp_known) + "\n")

    true_pred_file = args.output_dir + '/true_pred.txt'
    with open(true_pred_file, 'w') as fout_true:
        for true_gene in gene_true:
            fout_true.write(true_gene + "\n")

    #Clean up
    delete_file = [
        'exon_1.bed', 'exon_2.bed', 'exon_1_merged.bed', 'exon_2_merged.bed',
        'exon_1_2_intersect.bed'
    ]
    for f in delete_file:
        ## Remove directly instead of shelling out to "rm": safe for paths
        ## with spaces or shell metacharacters.  Cleanup stays best-effort.
        try:
            os.remove(args.output_dir + "/" + f)
        except OSError as err:
            sys.stderr.write("WARN: could not remove {0}: {1}\n".format(
                f, err))
Ejemplo n.º 46
0
def main():
    """Convert a GFF3 annotation into GenBank flat file output.

    Command-line entry point.  The GFF3 given with ``-i`` is parsed with
    ``biocodegff.get_gff3_features``; when ``-g`` is passed the FASTA is
    attached to the assemblies via ``process_assembly_fasta``.  For every
    assembly one GenBank record (rendered header, FEATURES table with a
    ``source`` entry and all genes, optional ORIGIN sequence, ``//``
    terminator) is written.

    Output destination:

    * ``-od/--output_dir`` given: one ``<prefix><assembly_id>.gbk`` file per
      assembly inside that directory (``-o`` is ignored with a warning).
    * otherwise ``-o/--output_file`` if given, else STDOUT.

    Exits with status 1 when ``--output_dir`` is given but does not exist.
    """
    parser = argparse.ArgumentParser( description='Converts GFF3 into a GenBank flat file')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GFF3 file to be read' )
    # NOTE(review): this help text says -o supersedes --output_dir, but the
    # code below (and its warning) gives --output_dir precedence - confirm
    # which is intended before changing either.
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to a Genbank flat file to be created. Supersedes --output_dir if both are specified.' )
    parser.add_argument('-od', '--output_dir', type=str, required=False, help='Path to an output directory. If this option is specified then each input assembly will be written to a separate GenBank output file, named with the assembly_id.' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-mt', '--molecule_type', type=str, required=False, default='DNA', help='Molecule type' )
    parser.add_argument('-gbd', '--genbank_division', type=str, required=False, default='.', help='GenBank Division (3-letter abbreviation)' )
    parser.add_argument('-md', '--modification_date', type=str, required=False, default='DD-MMM-YYYY', help='The modification date for header in format like 21-JUN-1999' )
    parser.add_argument('-org', '--organism', type=str, required=False, default='.', help='Full organism name (including strain)' )
    parser.add_argument('-str', '--strain', type=str, required=False, help="Only the strain designation, which is written to the FEATURES.source element" )
    parser.add_argument('-d', '--definition', type=str, required=False, default='.', help='Brief description of sequence; includes information such as source organism, gene name/protein name, or some description of the sequence\'s function.' )
    parser.add_argument('-s', '--source', type=str, required=False, default='.', help='Free-format information including an abbreviated form of the organism name, sometimes followed by a molecule type.' )
    parser.add_argument('-t', '--taxon_id', type=int, required=False, help='NCBI taxon ID, if known' )
    parser.add_argument('-l', '--lineage', type=str, required=False, default='Unknown', help='Semicolon-delimited lineage of the organism e.g., "Eukaryota; Alveolata; Apicomplexa; Aconoidasida; Piroplasmida; Theileriidae; Theileria"' )
    parser.add_argument('-seq', '--include_sequence', action='store_true', help='Include sequence (if present) in the output GenBank flat file(s).' )
    parser.add_argument('-p', '--locus_id_prefix', required=False, default='', help='Prefix to add to the GenBank LOCUS id in the output GenBank flat file(s).' )
    args = parser.parse_args()

    # check that output directory exists
    if args.output_dir is not None:
        if not os.path.isdir(args.output_dir):
            sys.stderr.write("FATAL: the specified output directory (" + args.output_dir + ") does not exist\n")
            sys.exit(1)

    # line-wrap lineage to stay below 79 character GenBank flat file width
    lineage = biocodegenbank.line_wrap_lineage_string( args.lineage )

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )
    ofh = sys.stdout
    if args.output_file is not None:
        if args.output_dir is None:
            ofh = open(args.output_file, 'wt')
        else:
            sys.stderr.write("WARN: both -o/--output_file and -od/--output_dir were passed so the former will be ignored\n")

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        process_assembly_fasta(assemblies, args.genome_fasta)

    def write_record(fh, locus_id, assembly):
        # Writes one complete GenBank record for a single assembly to fh.
        context = { 'locus':locus_id, 'molecule_size':assembly.length, 'molecule_type':args.molecule_type,
                    'division':args.genbank_division, 'modification_date':args.modification_date,
                    'accession':'.', 'version':'.',
                    'source':args.source, 'definition':args.definition, 'organism':args.organism,
                    'lineage':lineage
        }
        header = TEMPLATE_ENVIRONMENT.get_template('genbank_flat_file_header.template').render(context)
        fh.write(header)
        fh.write("\nFEATURES             Location/Qualifiers\n")
        fh.write("     source          1..{0}\n".format(assembly.length))
        fh.write("                     /organism=\"{0}\"\n".format(args.organism))
        fh.write("                     /mol_type=\"genomic DNA\"\n")

        if args.strain is not None:
            fh.write("                     /strain=\"{0}\"\n".format(args.strain))

        if args.taxon_id is not None:
            fh.write("                     /db_xref=\"taxon:{0}\"\n".format(args.taxon_id))

        for gene in assembly.genes():
            biocodegenbank.print_biogene( gene=gene, fh=fh, on=assembly )

        if args.include_sequence:
            fh.write("ORIGIN\n")
            biocodegenbank.print_sequence( seq=assembly.residues, fh=fh )

        fh.write("//\n")

    try:
        for assembly_id in assemblies:
            locus_id = args.locus_id_prefix + assembly_id
            assembly = assemblies[assembly_id]

            if args.output_dir is not None:
                # there may be multiple output files; each one is closed as
                # soon as its record is complete, even if writing fails
                ofn = args.output_dir + "/" + locus_id + ".gbk"
                with open(ofn, 'wt') as assembly_fh:
                    write_record(assembly_fh, locus_id, assembly)
            else:
                write_record(ofh, locus_id, assembly)
    finally:
        # there is only one shared output; close it only when it is a file
        # we opened ourselves - STDOUT must stay usable for the caller
        if ofh is not sys.stdout:
            ofh.close()
Ejemplo n.º 47
0
def main():
    """Report how many bases of each reference sequence are covered by evidence.

    Command-line arguments:
        evidence_files: one or more evidence files; the extension must be one
            of bed, gff3, pileup or sam (only pileup and sam are parsed).
        -r/--reference: reference GFF3 given as FILE:TYPE.
        -f/--fasta: reference FASTA file.
        -o/--output_file: optional output path; STDOUT is used otherwise.

    Output is one tab-delimited line per FASTA sequence: the sequence ID, its
    length, and the number of positions whose coverage is greater than zero.

    Raises:
        Exception: if an evidence file has an unsupported extension, or the
            reference is not given as a .gff3 path followed by ':TYPE'.
    """
    parser = argparse.ArgumentParser( description='Provides coverage information for features in a GFF3 file')

    ## output file to be written
    parser.add_argument('evidence_files', metavar='N', type=str, nargs='+', help='Path to one or more evidence files, separated by spaces' )
    parser.add_argument('-r', '--reference', type=str, required=True, help='Input path to the reference GFF3 file. So we know what feature type to report on, format should be like FILE:TYPE' )
    parser.add_argument('-f', '--fasta', type=str, required=True, help='Input path to the reference FASTA file.' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional path to an output file to be created, else prints on STDOUT' )
    args = parser.parse_args()

    ## parse the fasta
    fasta = biocodeutils.fasta_dict_from_file(args.fasta)

    ####################################################
    ## Sanity checks

    allowed_extensions = ['bed', 'gff3', 'pileup', 'sam']
    for ev_file in args.evidence_files:
        if not ev_file.endswith(tuple(allowed_extensions)):
            raise Exception("ERROR: Evidence file passed with unsupported file extension: {0}.  Supported extensions are {1}".format(ev_file, allowed_extensions))

    ## The input file should be defined as $path:$feattype
    if ':' not in args.reference:
        raise Exception("ERROR: input_file must be like /path/to/some.gff3:mRNA")

    ref_file_parts = args.reference.split(':')
    print("DEBUG: part count: {0}".format(len(ref_file_parts)))

    if ref_file_parts[0].endswith('.gff3'):
        # The parsed reference is not used below; the call is kept so that an
        # unreadable reference GFF3 still fails here.
        (ref_assemblies, ref_features) = biocodegff.get_gff3_features( ref_file_parts[0] )
    else:
        # The reference is passed with -r; the old message named a
        # non-existent -i option.
        raise Exception("ERROR: Expected input file (-r) to have a gff3 extension, got {0}".format(ref_file_parts[0]))

    ####################################################
    ## Initialize the coverage arrays

    fasta_cov = dict()
    for seq_id in fasta:
        # create a list of 0s the length of the molecule
        fasta_cov[seq_id] = [0] * len(fasta[seq_id]['s'])

    ####################################################
    ## Now parse the evidence files

    for ev_file in args.evidence_files:
        if ev_file.endswith('pileup'):
            parse_pileup(fasta_cov, ev_file)
        elif ev_file.endswith('sam'):
            parse_sam(fasta_cov, ev_file)
        else:
            print("INFO: ignoring evidence file {0} because code to handle its file type isn't currently implemented".format(ev_file))

    ## open the output file only once all inputs were validated and parsed,
    ## so a bad invocation does not truncate an existing report
    if args.output_file is None:
        fout = codecs.getwriter('utf8')(sys.stdout.buffer)
    else:
        fout = open(args.output_file, "w")

    try:
        for seq_id in fasta_cov:
            # Count positions by their coverage VALUE.  The previous code used
            # each coverage value as an index back into the list, which
            # miscounted and could raise IndexError.
            covered_bases = sum(1 for depth in fasta_cov[seq_id] if depth > 0)

            fout.write("{0}\t{1}\t{2}\n".format(seq_id, len(fasta[seq_id]['s']), covered_bases))
    finally:
        # never close the STDOUT wrapper, only a file we opened ourselves
        if args.output_file is not None:
            fout.close()
Ejemplo n.º 48
0
def main():
    """Report mRNAs whose CDS lacks a terminal stop and scan for the next one.

    For every mRNA in the input GFF3 the CDS residues are translated.  When
    the translation does not end with '*', the genomic sequence is walked one
    codon at a time from the end of the CDS towards the end of the mRNA,
    printing each codon visited, until a stop codon is found or the mRNA
    boundary is passed.  Totals are printed at the end.

    Command-line arguments:
        -i/--input_file: path to the input GFF3.
        -g/--genome_fasta: optional genome FASTA, for GFF3 files without
            embedded sequence.
        -o/--output_gff: parsed, but not used anywhere in this function.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence to report/correct phase columns.'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0

    # Stop codons as read on the forward strand, and the same codons as they
    # appear in forward-strand residues when the gene lies on the reverse
    # strand (i.e. their reverse complements).
    # NOTE(review): assumes assembly.residues holds the forward strand - confirm.
    forward_stop_codons = ['TAG', 'TAA', 'TGA']
    reverse_stop_codons = ['CTA', 'TTA', 'TCA']

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id,
                                                   assembly.length))
        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = biocodeutils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(
                    mRNA.id, gene.id))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())

                if mRNA_loc.strand == 1:
                    CDS_pos = CDSs[-1].location_on(assembly).fmax
                    mRNA_limit = mRNA_loc.fmax
                    codon_step_size = 3
                    stop_codons = forward_stop_codons
                else:
                    CDS_pos = CDSs[0].location_on(assembly).fmin
                    mRNA_limit = mRNA_loc.fmin
                    codon_step_size = -3
                    stop_codons = reverse_stop_codons

                print("\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(
                    mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos),
                      end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                if codon_step_size < 0:
                    CDS_pos += codon_step_size

                while True:
                    past_limit = (codon_step_size < 0
                                  and CDS_pos < mRNA_limit) or (
                                      codon_step_size > 0
                                      and CDS_pos > mRNA_limit)
                    if past_limit:
                        print(" Reached the mRNA limit")
                        break

                    next_codon = assembly.residues[CDS_pos:CDS_pos + 3]
                    print(".{0}({1})".format(next_codon, CDS_pos), end='')

                    # Compare case-insensitively so soft-masked (lower-case)
                    # sequence is still recognised.
                    if next_codon.upper() in stop_codons:
                        new_stop_found = True
                        print(" Found a stop")
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with terminal stops: {0}".format(mRNAs_with_terminal_stops))
def main():
    """Combine several evidence sources into a consensus functional annotation.

    Polypeptides are loaded from the input protein FASTA, each evidence type
    passed on the command line (HMM, BLAST/RAPSEARCH2 vs SWISS-PROT, TrEMBL,
    KEGG, UniRef100, TMHMM) is parsed into them, and the result is written in
    the requested format ("tab", "fasta" or "gff3") to --output_file or
    STDOUT.  A "<output_file>.sources.log" file is written alongside.

    Raises:
        Exception: when an evidence list is passed without the SQLite3
            database or FASTA it depends on, or when --export_organism_names
            is used without --output_file.
    """
    parser = argparse.ArgumentParser( description='Parses multiple sources of evidence to generate a consensus functional annotation')

    ## output file to be written
    parser.add_argument('-f', '--input_fasta', type=str, required=True, help='Protein FASTA file of source molecules' )
    parser.add_argument('-m', '--hmm_htab_list', type=str, required=False, help='List of htab files from hmmpfam3' )
    parser.add_argument('-bs', '--blast_sprot_btab_list', type=str, required=False, help='List of btab files from BLAST against UniProtKB/SWISS-PROT' )
    parser.add_argument('-rs', '--rapsearch_sprot_btab_list', type=str, required=False, help='List of m8 files from RAPSEARCH2 against UniProtKB/SWISS-PROT' )
    parser.add_argument('-bt', '--blast_trembl_btab_list', type=str, required=False, help='List of btab files from BLAST against UniProtKB/Trembl' )
    parser.add_argument('-bk', '--blast_kegg_btab_list', type=str, required=False, help='List of btab files from BLAST against KEGG' )
    parser.add_argument('-bu100', '--blast_uniref100_btab_list', type=str, required=False, help='List of btab files from BLAST against UniRef100' )
    parser.add_argument('-ru100', '--rapsearch_uniref100_btab_list', type=str, required=False, help='List of m8 files from RAPSEARCH2 against UniRef100' )
    parser.add_argument('-u100f', '--uniref100_fasta', type=str, required=False, help='Only required if also passing RAPSEARCH2 against UniRef100 evidence' )
    parser.add_argument('-tm', '--tmhmm_raw_list', type=str, required=False, help='List of raw files from a tmhmm search' )
    parser.add_argument('-d', '--hmm_db', type=str, required=False, help='SQLite3 db with HMM information' )
    parser.add_argument('-u', '--uniprot_sprot_db', type=str, required=False, help='SQLite3 db with UNIPROT/SWISSPROT information' )
    parser.add_argument('-ur', '--uniref_db', type=str, required=False, help='SQLite3 db with UNIREF information' )
    parser.add_argument('-a', '--format', type=str, required=False, default='tab', help='Output format.  Current options are: "tab", "fasta", "gff3"' )
    parser.add_argument('-s', '--source_gff', type=str, required=False, help='Source GFF file from which proteins were derived.  Required if you want to export any format other than tab-delimited.' )
    parser.add_argument('-e', '--blast_eval_cutoff', type=float, required=False, default=1e-5, help='Skip BLAST hits unless they have an E-value at least as low as this' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    parser.add_argument('-r', '--organism_table', type=str, required=False, help='Optional table with counts of organism frequency based on top BLAST match for each protein' )
    parser.add_argument('-g', '--genomic_fasta', type=str, required=False, help='If passed, the genomic FASTA sequence will be included in the exported GFF3')
    parser.add_argument('-eon', '--export_organism_names', help='If passed, includes organism names from top BLAST hit into 9th column when available.  Mostly useful for metagenomic samples.', action='store_true')
    args = parser.parse_args()

    check_arguments(args)

    # If --rapsearch_uniref100_btab_list passed, --uniref100_fasta is required
    if args.rapsearch_uniref100_btab_list is not None:
        if args.uniref100_fasta is None:
            raise Exception("ERROR: --uniref100_fasta required if --rapsearch_uniref100_btab_list is passed")

    sources_log_fh = open("{0}.sources.log".format(args.output_file), 'wt')

    # this is a dict of biothings.Polypeptide objects
    polypeptides = initialize_polypeptides( sources_log_fh, args.input_fasta )

    # Keyed on polypeptide ID (from the FASTA, which is actually the mRNA gff feature ID), the
    #  values here are the organism name for the top BLAST match of each
    polypeptide_blast_org = dict()

    # get source structural annotation, if necessary:
    if args.source_gff is not None:
        print("INFO: parsing source GFF")
        (assemblies, features) = biocodegff.get_gff3_features( args.source_gff )

    if args.hmm_htab_list is not None:
        # Validate before connecting: sqlite3.connect(None) fails with an
        # unhelpful TypeError, which made this message unreachable.
        if args.hmm_db is None:
            raise Exception("ERROR: You specified HMM results but not the db with the -d option")

        # connection to the HMM-associated SQLite3 database
        hmm_db_conn = sqlite3.connect(args.hmm_db)
        hmm_db_curs = hmm_db_conn.cursor()

        print("INFO: parsing HMM evidence")
        parse_hmm_evidence( sources_log_fh, polypeptides, args.hmm_htab_list, hmm_db_curs )
        hmm_db_curs.close()
        hmm_db_conn.close()

    if args.blast_sprot_btab_list is not None:
        if args.uniprot_sprot_db is None:
            raise Exception("ERROR: You specified BLAST evidence vs UnitProt/SwissProt results but not the db with the -u option")

        # connection to the UniProt_Sprot SQLite3 database
        usp_db_conn = sqlite3.connect(args.uniprot_sprot_db)
        usp_db_curs = usp_db_conn.cursor()
        print("INFO: parsing BLAST (SWISS-PROT) evidence")
        parse_sprot_blast_evidence( sources_log_fh, polypeptides, polypeptide_blast_org, args.blast_sprot_btab_list, usp_db_curs, args.blast_eval_cutoff, 'blast' )
        usp_db_curs.close()
        usp_db_conn.close()

    if args.rapsearch_sprot_btab_list is not None:
        if args.uniprot_sprot_db is None:
            raise Exception("ERROR: You specified RAPSEARCH2 evidence vs UnitProt/SwissProt results but not the db with the -u option")

        # connection to the UniProt_Sprot SQLite3 database
        usp_db_conn = sqlite3.connect(args.uniprot_sprot_db)
        usp_db_curs = usp_db_conn.cursor()
        print("INFO: parsing RAPSEARCH2 (SWISS-PROT) evidence")
        parse_sprot_blast_evidence( sources_log_fh, polypeptides, polypeptide_blast_org, args.rapsearch_sprot_btab_list, usp_db_curs, args.blast_eval_cutoff, 'rapsearch2' )
        usp_db_curs.close()
        usp_db_conn.close()

    if args.blast_trembl_btab_list is not None:
        print("INFO: parsing BLAST (TrEMBL) evidence")
        parse_trembl_blast_evidence(polypeptides, args.blast_trembl_btab_list, args.blast_eval_cutoff)

    if args.blast_kegg_btab_list is not None:
        print("INFO: parsing BLAST (KEGG) evidence")
        parse_kegg_blast_evidence(sources_log_fh, polypeptides, args.blast_kegg_btab_list, args.blast_eval_cutoff)

    if args.blast_uniref100_btab_list is not None:
        # same fail-early check the other database-backed sources perform
        if args.uniref_db is None:
            raise Exception("ERROR: You specified BLAST evidence vs UniRef100 results but not the db with the -ur option")

        print("INFO: parsing BLAST (UniRef100) evidence")
        # connection to the UniRef SQLite3 database
        uniref_db_conn = sqlite3.connect(args.uniref_db)
        uniref_db_curs = uniref_db_conn.cursor()
        parse_uniref100_blast_evidence(sources_log_fh, polypeptides, args.blast_uniref100_btab_list, uniref_db_curs, args.blast_eval_cutoff, 'blast', args.uniref100_fasta)
        uniref_db_curs.close()
        uniref_db_conn.close()

    if args.rapsearch_uniref100_btab_list is not None:
        if args.uniref_db is None:
            raise Exception("ERROR: You specified RAPSEARCH2 evidence vs UniRef100 results but not the db with the -ur option")

        print("INFO: parsing RAPSEARCH2 (UniRef100) evidence")
        # connection to the UniRef SQLite3 database
        uniref_db_conn = sqlite3.connect(args.uniref_db)
        uniref_db_curs = uniref_db_conn.cursor()
        parse_uniref100_blast_evidence(sources_log_fh, polypeptides, args.rapsearch_uniref100_btab_list, uniref_db_curs, args.blast_eval_cutoff, 'rapsearch2', args.uniref100_fasta)
        uniref_db_curs.close()
        uniref_db_conn.close()

    if args.tmhmm_raw_list is not None:
        print("INFO: parsing TMHMM evidence")
        parse_tmhmm_evidence(sources_log_fh, polypeptides, args.tmhmm_raw_list)

    # Nothing below is handed the sources log, so it can be closed here.
    sources_log_fh.close()

    ## output will either be a file or STDOUT
    print("INFO: writing output")
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    if args.format == 'tab':
        write_tab_results(fout, polypeptides)
    elif args.format == 'fasta':
        write_fasta_results(fout, polypeptides)
    elif args.format == 'gff3':
        write_gff3_results(fout, polypeptides, assemblies, features, args.genomic_fasta)

    # Only close a file we opened; closing sys.stdout would break any later
    # print() in this process.
    if fout is not sys.stdout:
        fout.close()

    ## There isn't a method in biocodegff3 to add arbitrary key=value pairs.  So we have to cheat here.
    if args.export_organism_names is True:
        if args.output_file:
            append_organism_names_to_gff(args.output_file, polypeptide_blast_org)
        else:
            raise Exception("ERROR: an --output_file must be specified when using the --export_organism_names option.")

    if args.organism_table is not None:
        create_organism_table(args.organism_table, polypeptide_blast_org)
def main():
    """Exercise the biothings feature comparison operators on a test GFF3.

    Loads 'biothings_coordinate_comparisons.data' from the directory holding
    this script and runs one positive and one negative check for each of
    <, >, <=, >=, overlaps_with() and overlap_size_with().  Each check prints
    an "INFO: ... successful" line when the comparison behaves as expected
    and an "ERROR: ... unsuccessful" line when it does not.
    """
    bin_dir = os.path.abspath(os.path.dirname(__file__))
    test_gff_file = bin_dir + '/biothings_coordinate_comparisons.data'

    (assemblies, features) = biocodegff.get_gff3_features( test_gff_file )

    gene_0002 = features['TP03_0002']
    gene_0010 = features['TP03_0010']
    gene_0012 = features['TP03_0012']
    polypeptide = features['TP03_0012.t01_polypeptide']

    ###########################################################################################

    if gene_0010 < polypeptide:
        print("INFO: < positive check successful")
    else:
        print("ERROR: < check unsuccessful")

    if gene_0012 < polypeptide:
        print("ERROR: < check unsuccessful")
    else:
        print("INFO: < negative check successful")

    ###########################################################################################

    if gene_0012 > gene_0010:
        print("INFO: > positive check successful")
    else:
        print("ERROR: > check unsuccessful")

    if gene_0010 > polypeptide:
        print("ERROR: > check unsuccessful")
    else:
        print("INFO: > negative check successful")

    ###########################################################################################

    if features['TP03_0012.t01_exon-auto15079'] <= polypeptide:
        print("INFO: <= positive check successful")
    else:
        print("ERROR: <= check unsuccessful")

    if gene_0010 <= gene_0012:
        print("ERROR: <= check unsuccessful")
    else:
        print("INFO: <= negative check successful")

    ###########################################################################################

    if features['TP03_0012.t01_exon-auto15085'] >= polypeptide:
        print("INFO: >= positive check successful")
    else:
        print("ERROR: >= check unsuccessful")

    if gene_0010 >= gene_0012:
        print("ERROR: >= check unsuccessful")
    else:
        print("INFO: >= negative check successful")

    ###########################################################################################

    if features['TP03_0012.t01_exon-auto15079'].overlaps_with(polypeptide):
        print("INFO: overlaps_with() positive check successful")
    else:
        print("ERROR: overlaps_with() positive check unsuccessful")

    if gene_0002.overlaps_with(gene_0010):
        print("ERROR: overlaps_with() negative check unsuccessful")
    else:
        print("INFO: overlaps_with() negative check successful")

    ###########################################################################################
    overlap_size = polypeptide.overlap_size_with(features['TP03_0012.t01_CDS-auto15079'])

    if overlap_size == 224:
        print("INFO: overlap_size_with() positive check successful")
    else:
        print("ERROR: overlap_size_with() positive check unsuccessful (overlap returned: {0})".format(overlap_size))

    # Negative check: an overlap of 224 here is the failure case.  The
    # INFO/ERROR labels were previously swapped, so a passing run printed
    # "ERROR: ... successful".
    if polypeptide.overlap_size_with(features['TP03_0012.t01_CDS-auto15085']) == 224:
        print("ERROR: overlap_size_with() negative check unsuccessful")
    else:
        print("INFO: overlap_size_with() negative check successful")
Ejemplo n.º 51
0
def _feature_key(asm_id, loc):
    """Build the 'assembly:fmin:fmax:strand' string identifying a feature by position."""
    return asm_id + ":" + str(loc.fmin) + ":" + str(loc.fmax) + ":" + str(loc.strand)


def _split_feature_key(key):
    """Split a key made by _feature_key() into (chrom, start, stop, strand).

    Splitting from the right keeps an assembly ID that contains ':' intact.
    """
    chrom, start, stop, strand = key.rsplit(':', 3)
    return chrom, int(start), int(stop), strand


def _collect_feature_keys(asm_id, assembly, genes, gene_keys, mrna_keys, exon_keys):
    """Record the position keys of all genes, mRNAs and exons of one assembly.

    gene_keys, mrna_keys and exon_keys are ordered mappings used as ordered
    sets (key -> None): each distinct position is kept once, in first-seen
    order.  Returns the set of exon keys found on this assembly.
    """
    asm_exons = set()
    for gene in sorted(genes):
        gene_keys[_feature_key(asm_id, gene.location_on(assembly))] = None

        for mrna in sorted(gene.mRNAs()):
            mrna_keys[_feature_key(asm_id, mrna.location_on(assembly))] = None

            for exon in sorted(mrna.exons()):
                key = _feature_key(asm_id, exon.location_on(assembly))
                exon_keys[key] = None
                asm_exons.add(key)
    return asm_exons


def _write_exon_bed(path, exon_keys):
    """Write one 6-column BED line (name 'exon', score 0) per exon key."""
    with open(path, 'w') as bed:
        for key in exon_keys:
            chrom, start, stop, strand = _split_feature_key(key)
            bed_strand = "+" if strand == str(1) else "-"
            bed.write(chrom+"\t"+str(start)+"\t"+str(stop)+"\texon\t"+str(0)+"\t"+bed_strand+"\n")


def _run_bedtools(cmd, out_path):
    """Run a bedtools command (argument list) and capture its STDOUT in out_path.

    The equivalent shell command line is printed first.  A non-zero exit
    status raises subprocess.CalledProcessError instead of leaving an empty
    result file behind.
    """
    import subprocess

    print(" ".join(cmd) + " >" + out_path)
    with open(out_path, 'w') as out_fh:
        subprocess.run(cmd, stdout=out_fh, check=True)


def _sum_interval_lengths(bed_path):
    """Return the sum of (end - start) over all lines of a BED file."""
    total = 0
    with open(bed_path, 'r') as bed:
        for line in bed:
            cols = line.split("\t")
            total += int(cols[2]) - int(cols[1])
    return total


def _percent(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    if not whole:
        return 0.0
    return (part / whole) * 100


def _count_overlaps(query_keys, target_keys):
    """Classify every query gene by the number of target genes it overlaps.

    Only targets on the same assembly and strand are considered.  Returns a
    tuple (no_overlap, one_overlap, multiple_overlaps) of query gene counts.
    """
    targets = [_split_feature_key(key) for key in target_keys]
    none = single = multiple = 0

    for key in query_keys:
        q_chrom, q_start, q_stop, q_strand = _split_feature_key(key)
        hits = 0

        for t_chrom, t_start, t_stop, t_strand in targets:
            if t_chrom != q_chrom or t_strand != q_strand:
                continue
            # NOTE(review): stopping here assumes targets of one assembly and
            # strand are stored by ascending start - confirm against the sort
            # order of the gene objects.
            if t_start > q_stop:
                break
            if q_start <= t_stop:
                hits += 1

        if hits == 0:
            none += 1
        elif hits == 1:
            single += 1
        else:
            multiple += 1

    return none, single, multiple


def process_files(args):
    """Compare a predicted annotation with a known one and report SN / PPV.

    Genes, mRNAs and exons of args.annotation_1 (known) and args.annotation_2
    (predicted) are reduced to unique 'assembly:fmin:fmax:strand' keys.  A
    predicted exon is true when the known annotation has an exon with the
    same key; a predicted mRNA is true when all of its exons are true; a
    predicted gene is true when at least one of its mRNAs is true.
    Base-level numbers come from running bedtools merge / intersect on BED
    files written to args.output_dir.

    The table is printed to STDOUT and its Gene, mRNA and Exon rows are
    written to <output_dir>/summary.txt (the Base row goes to STDOUT only),
    followed on STDOUT by counts of new, merged, split and missing genes.

    Args:
        args: object with the attributes annotation_1, annotation_2 (GFF3
            paths) and output_dir (an existing directory).

    Raises:
        SystemExit: if args.output_dir does not exist.
        subprocess.CalledProcessError: if a bedtools command fails.
    """
    from collections import OrderedDict

    # Fail before doing any work; this check used to run only after several
    # files had already been written into the directory.
    if not (os.path.exists(args.output_dir)) :
        sys.exit("Directory does not exist.")

    (assemblies_1, features_1) = biocodegff.get_gff3_features(args.annotation_1)
    (assemblies_2, features_2) = biocodegff.get_gff3_features(args.annotation_2)

    # Ordered "sets" of unique feature keys: a_* for the known annotation,
    # p_* for the predicted one.
    a_gene, a_mrna, a_exons = OrderedDict(), OrderedDict(), OrderedDict()
    p_gene, p_mrna, p_exons = OrderedDict(), OrderedDict(), OrderedDict()

    exon_pred_all = set()      # predicted exons that match a known exon
    gene_true = set()
    mrna_true = set()
    shared_asm_ids = set()     # assemblies present in both annotations

    for asm_id in assemblies_1:
        assembly_1 = assemblies_1[asm_id]
        anno_exons = _collect_feature_keys(asm_id, assembly_1, assembly_1.genes(),
                                           a_gene, a_mrna, a_exons)

        assembly_2 = assemblies_2.get(asm_id)
        if assembly_2 is None:
            # assembly missing from the predicted file, nothing to compare
            continue

        shared_asm_ids.add(asm_id)
        genes_2 = assembly_2.genes()
        pred_exons = _collect_feature_keys(asm_id, assembly_2, genes_2,
                                           p_gene, p_mrna, p_exons)

        # true exons: identical key in both annotations
        exon_pred_all |= pred_exons & anno_exons

        for gene_2 in sorted(genes_2):
            gene_key = _feature_key(asm_id, gene_2.location_on(assembly_2))

            # a position already counted as a true gene is not counted twice
            if gene_key in gene_true:
                continue

            gene_has_true_mrna = False

            for mrna_2 in sorted(gene_2.mRNAs()):
                mrna_key = _feature_key(asm_id, mrna_2.location_on(assembly_2))

                if mrna_key in mrna_true:
                    continue

                mrna_exons = set()
                for exon_2 in mrna_2.exons():
                    mrna_exons.add(_feature_key(asm_id, exon_2.location_on(assembly_2)))

                # Decide once ALL exons are known.  This test used to run
                # inside the exon loop, so a matching first exon was enough to
                # call the whole mRNA true.  An mRNA without exons is never true.
                if mrna_exons and mrna_exons <= exon_pred_all:
                    mrna_true.add(mrna_key)
                    gene_has_true_mrna = True

            if gene_has_true_mrna:
                gene_true.add(gene_key)

    # Assemblies that exist only in the predicted annotation still count
    # towards the predicted totals.
    for asm_id in assemblies_2:
        if asm_id in shared_asm_ids:
            continue
        assembly_2 = assemblies_2[asm_id]
        _collect_feature_keys(asm_id, assembly_2, assembly_2.genes(),
                              p_gene, p_mrna, p_exons)

    # Base-level comparison via bedtools
    exon2_bed = args.output_dir + '/exon_2.bed'
    _write_exon_bed(exon2_bed, p_exons)
    out2 = args.output_dir + '/exon_2_merged.bed'
    _run_bedtools(["bedtools", "merge", "-nms", "-scores", "sum", "-i", exon2_bed, "-s"], out2)

    exon1_bed = args.output_dir + '/exon_1.bed'
    _write_exon_bed(exon1_bed, a_exons)
    out1 = args.output_dir + '/exon_1_merged.bed'
    _run_bedtools(["bedtools", "merge", "-nms", "-scores", "sum", "-i", exon1_bed, "-s"], out1)

    out_intersect = args.output_dir + '/exon_1_2_intersect.bed'
    _run_bedtools(["bedtools", "intersect", "-s", "-wo", "-a", out1, "-b", out2], out_intersect)

    a_base = _sum_interval_lengths(out1)
    p_base = _sum_interval_lengths(out2)

    true_base = 0
    with open(out_intersect, 'r') as true_base_file:
        for line in true_base_file:
            # column 13 of the intersect output is read as the overlap size
            true_base += int(line.split("\t")[12])

    # SN = true / known, PPV = true / predicted, both as percentages
    annotated_gene, predicted_gene, true_pred_gene = len(a_gene), len(p_gene), len(gene_true)
    annotated_mrna, predicted_mrna, true_pred_mrna = len(a_mrna), len(p_mrna), len(mrna_true)
    annotated_exon, predicted_exon, true_pred_exon = len(a_exons), len(p_exons), len(exon_pred_all)

    rows = [
        ("Gene", annotated_gene, predicted_gene, true_pred_gene),
        ("mRNA", annotated_mrna, predicted_mrna, true_pred_mrna),
        ("Exon", annotated_exon, predicted_exon, true_pred_exon),
        ("Base", a_base, p_base, true_base),
    ]
    header = "Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n"
    lines = []
    for label, known, predicted, true_pred in rows:
        fields = [label, known, predicted, true_pred,
                  _percent(true_pred, known), _percent(true_pred, predicted)]
        lines.append("\t".join(str(field) for field in fields))

    print(header)
    for line in lines:
        print(line)

    out_file = args.output_dir + '/summary.txt'
    with open(out_file, 'w') as fout:
        fout.write(header)
        # the summary file holds the Gene, mRNA and Exon rows only
        for line in lines[:3]:
            fout.write(line + "\n")

    new_gene, gene_found, gene_merge = _count_overlaps(p_gene, a_gene)
    gene_missing, gene, gene_split = _count_overlaps(a_gene, p_gene)

    print ("No. of predicted gene overlapping  0 known gene (new gene): ",new_gene)
    print ("No. of predicted gene overlapping > 1 known gene: ",gene_merge)
    print ("No. of predicted gene overlaping 1 known gene : ",gene_found)
    print ("No. of known gene overlapping > 1 predicted gene : ",gene_split)
    print ("No. of known gene overlapping 1 predicted gene : ",gene)
    print ("No. of known gene overlapping 0 predicted gene (gene missing) : ",gene_missing)
def main():
    '''
    This script reports statistics on the areas of a genome where features aren't - introns and
    intergenic space.  Pass a valid GFF3 file (along with FASTA data) and get a report like this:

    Molecule count: 9

    Gene count: 4171
    Intergenic space count: 4061
    Average intergenic space distance: 361.7 bp
    Median intergenic space distance: 245 bp
    Minimum intergenic space distance: 0 bp
    Maximum intergenic space distance: 6272 bp

    Intron count: 10533
    Intron space count: 989024
    Average intron size: 93.9 bp
    Median intron size: 63 bp
    Minimum intron size: 2 bp
    Maximum intron size: 1676 bp


    Optionally, you can pass the path to a PNG file to be created using the --histogram parameter,
    which will generate a size distribution histogram with two overlaying plots - one representing
    the distribution of intergenic region sizes and the other the intron lengths.  Because these
    can often have long tails, you can limit both the Y- and X-axes values with the --ylimit and
    --xlimit options, respectively.

    FASTA:
    If your FASTA isn't embedded at the end of your GFF3 file after a ##FASTA directive you'll need
    to specify the --fasta option in this script and pass it as a separate file.

    Definitions:
    Intergenic space was a little ambiguous to me as I started writing this.  Does one count the space from
    the beginning of the contig until the first gene, or only between them?  What about short contigs which
    have no annotated genes at all?  From the Sequence Ontology:

    SO:0000605: A region containing or overlapping no genes that is bounded on either side by a gene, or
    bounded by a gene and the end of the chromosome.

    To my reading, this includes contig ends but not gene-less contigs.  To that end, I include the
    former in intergenic space reporting but include the latter as a separate statistic.

    Behaviour notes:
    - A molecule with no genes contributes only to the "Empty molecule bases" figure.
    - Overlapping (including fully nested) genes are merged for the purpose of measuring
      intergenic distances; distances are measured from the right-most gene end seen so far.
    - If no introns are found the intron size statistics are skipped rather than failing.

    Raises:
        Exception: if any assembly has an undefined or zero length (usually meaning that no
        FASTA data was embedded in the GFF3 and --fasta was not supplied).

    Author: Joshua Orvis (jorvis AT gmail)
    '''
    parser = argparse.ArgumentParser( description='Reports statistics on the intron and intergenic space sizes of the genes annotated in a GFF3 file.')

    ## input annotation and optional histogram / FASTA settings
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='GFF3 file of a reference annotation' )
    parser.add_argument('-g', '--histogram', type=str, required=False, help='Optional path to a histogram of intron/intergenic space size distribution to be created (PNG)' )
    parser.add_argument('-x', '--xlimit', type=int, required=False, help='Use this if you want to limit the X-axis of the histogram (feature length)' )
    parser.add_argument('-y', '--ylimit', type=int, required=False, help='Use this if you want to limit the Y-axis of the histogram (feature count)' )
    parser.add_argument('-f', '--fasta', type=str, required=False, help='Required if you don\'t have GFF3 with embedded FASTA')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_gff3 )

    ## attach residues (and therefore lengths) from the external FASTA file, if one was given
    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file( args.fasta )
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## things to keep stats on and report
    total_molecule_count = len(assemblies)
    total_gene_count = 0

    ## this number is NOT just the total genes N - 1, since there can be multiple molecules
    #   genes can overlap, etc.
    total_intergenic_space_count = 0

    total_intergenic_space_residues = 0
    intergenic_distances = list()

    total_contig_residues = 0
    empty_contig_residues = 0

    total_intron_count = 0
    total_intron_residues = 0
    intron_sizes = list()

    ############################
    ## Calculation section
    ############################

    for asm_id in assemblies:
        assembly = assemblies[asm_id]
        # NOTE(review): the intergenic walk below assumes sorted() orders genes by their
        #  start coordinate on the assembly - confirm against the gene class ordering.
        genes = sorted(assembly.genes())
        total_gene_count += len(genes)

        # we should have a length here
        if assembly.length is None or assembly.length == 0:
            raise Exception("ERROR: Detected assembly with undefined or 0 length: {0}".format(assembly.id))

        ## gene-less molecules are tracked separately.  This must look at the genes of THIS
        #   assembly; the running total stays non-zero after the first annotated molecule.
        if not genes:
            empty_contig_residues += assembly.length
            continue

        total_contig_residues += assembly.length

        ## right-most gene end (fmax) seen so far on this molecule; None until the first gene
        covered_to = None

        for gene in genes:
            gene_loc = gene.location_on(assembly)

            if covered_to is None:
                # first gene: count the bases from the start of the molecule to it
                intergenic_distance = gene_loc.fmin
                total_intergenic_space_count += 1
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)
                covered_to = gene_loc.fmax

            elif gene_loc.fmin >= covered_to:
                # no overlap with anything seen so far, so there is a gap to record
                intergenic_distance = gene_loc.fmin - covered_to
                total_intergenic_space_count += 1
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)
                covered_to = gene_loc.fmax

            elif gene_loc.fmax > covered_to:
                # overlapping gene which reaches further right: extend the covered region
                covered_to = gene_loc.fmax

            # (a gene nested entirely within the covered region changes nothing)

            for mRNA in gene.mRNAs():
                introns = mRNA.introns( on=assembly )

                for intron in sorted(introns):
                    total_intron_count += 1
                    intron_loc = intron.location_on(assembly)
                    intron_size = intron_loc.fmax - intron_loc.fmin

                    if intron_size < 0:
                        print("\tWARN: Intron size ({1}) < 0 reported in gene {0}".format(gene.id, intron_size))

                    intron_sizes.append(intron_size)
                    total_intron_residues += intron_size

        ## finally, the space between the right-most gene end and the end of the molecule
        intergenic_distance = assembly.length - covered_to
        total_intergenic_space_count += 1
        total_intergenic_space_residues += intergenic_distance
        intergenic_distances.append(intergenic_distance)

    ## kept as (possibly empty) sorted lists so the optional histogram can always use them
    intergenic_distances = sorted(intergenic_distances)
    intron_sizes = sorted(intron_sizes)

    if total_intergenic_space_count == 0:
        avg_intergenic_space_dist = None
        median_int_space_dist = None
    else:
        avg_intergenic_space_dist = total_intergenic_space_residues / total_intergenic_space_count
        median_int_space_dist = intergenic_distances[ len(intergenic_distances) // 2 ]

    ## annotation without introns is legitimate, so don't divide by zero here
    if total_intron_count == 0:
        avg_intron_size = None
        median_intron_size = None
    else:
        avg_intron_size = total_intron_residues / total_intron_count
        median_intron_size = intron_sizes[ len(intron_sizes) // 2 ]

    ############################
    ## Reporting section
    ############################

    print("\nMolecule count: {0}".format(total_molecule_count))
    print("Gene count: {0}".format(total_gene_count) )

    print("\nTotal molecule bases: {0} bp".format(total_contig_residues) )
    print("Empty molecule bases: {0} bp".format(empty_contig_residues) )

    if total_intergenic_space_count > 0:
        print("Intergenic space count: {0}".format(total_intergenic_space_count) )
        print("Average intergenic space distance: {0:.1f} bp".format(avg_intergenic_space_dist) )
        print("Median intergenic space distance: {0} bp".format(median_int_space_dist) )
        print("Minimum intergenic space distance: {0} bp".format(intergenic_distances[0]) )
        print("Maximum intergenic space distance: {0} bp\n".format(intergenic_distances[-1]) )
    else:
        print("There were no intergenic spaces found.  This might mean there were no molecules with at least 2 genes.")

    print("Intron count: {0}".format(total_intron_count) )
    print("Intron space count: {0}".format(total_intron_residues) )

    if total_intron_count > 0:
        print("Average intron size: {0:.1f} bp".format(avg_intron_size) )
        print("Median intron size: {0} bp".format(median_intron_size) )
        print("Minimum intron size: {0} bp".format(intron_sizes[0]) )
        print("Maximum intron size: {0} bp\n".format(intron_sizes[-1]) )
    else:
        print("There were no introns found.\n")

    ############################
    ## Graphics section (optional)
    ############################
    if args.histogram is not None:
        # imported here so matplotlib is only required when a histogram is requested
        import matplotlib.pyplot as plt

        plt.xlabel('length (bp)')
        plt.ylabel('count')
        plt.title('Distribution of intron size and intergenic distances')

        # only plot the series which actually have data
        if intergenic_distances:
            plt.hist(intergenic_distances, bins=50, histtype='stepfilled', color='b', label='Intergenic distances' )

        if intron_sizes:
            plt.hist(intron_sizes, bins=50, histtype='stepfilled', color='r', alpha=0.5, label='Intron sizes' )

        if args.xlimit is not None:
            plt.xlim([0, args.xlimit])

        if args.ylimit is not None:
            plt.ylim([0, args.ylimit])

        plt.legend(loc='best')
        plt.savefig(args.histogram)