Python print_gff3_from_assemblies 예제들, biocodegff.print_gff3_from_assemblies Python 예제들

예제 #1

0

파일 보기

파일: custom_merge_annotation_col9.py 프로젝트: 452990729/biocode

def main():
    """Cross-reference a reference annotation against a query annotation.

    Parses the command line, loads both GFF3 files, and for every reference
    gene that has at least one polypeptide adds an
    ``overlaps_old_locusTagID:<query gene id>`` dbxref to the annotation of
    its first polypeptide for each query gene on the same assembly that
    ``overlaps_with`` reports as overlapping.  The (now modified) reference
    assemblies are then written as GFF3 to the requested output path.

    Assemblies missing from the query set and genes without polypeptides are
    skipped with a WARN message on stdout.
    """
    # NOTE(review): this description talks about polycistronic transcripts,
    # which does not match what the code below does -- confirm and reword.
    parser = argparse.ArgumentParser( description='Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies')

    parser.add_argument('-r', '--reference_file', type=str, required=True, help='GFF3 file of a reference annotation' )
    parser.add_argument('-q', '--query_file', type=str, required=True, help='GFF3 file with alternative annotation (such as an RNA-seq assemby)' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    # Only the assembly collections are needed; the second returned element
    # is not used by this script.
    ref_assemblies, _ = biocodegff.get_gff3_features(args.reference_file)
    qry_assemblies, _ = biocodegff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print("WARN: expected to find assembly_id {0} in both reference and query sets".format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            overlaps = []
            polypeptides = ref_gene.polypeptides()

            if not polypeptides:
                print("WARN: skipped gene {0} because it has no polypeptides".format(ref_gene.id))
                continue

            # The dbxrefs are attached to the first polypeptide's annotation;
            # reuse the list fetched above instead of querying the gene again.
            ref_annot = polypeptides[0].annotation

            for qry_gene in qry_assemblies[assembly_id].genes():
                overlap = ref_gene.overlaps_with(qry_gene)

                if overlap:
                    overlaps.append(overlap)
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(qry_gene.id))

            if overlaps:
                print("INFO: ref_gene {0} had {1} overlaps".format(ref_gene.id, len(overlaps)))

    # Use a context manager so the output is flushed and closed even if
    # writing the GFF3 raises.
    with open(args.output_file, 'w') as ofh:
        biocodegff.print_gff3_from_assemblies(assemblies=ref_assemblies, ofh=ofh)

예제 #2

0

파일 보기

파일: custom_merge_annotation_col9.py 프로젝트: yuzhenpeng/biocode

def main():
    """Cross-reference a reference annotation against a query annotation.

    Parses the command line, loads both GFF3 files, and for every reference
    gene that has at least one polypeptide adds an
    ``overlaps_old_locusTagID:<query gene id>`` dbxref to the annotation of
    its first polypeptide for each query gene on the same assembly that
    ``overlaps_with`` reports as overlapping.  The (now modified) reference
    assemblies are then written as GFF3 to the requested output path.

    Assemblies missing from the query set and genes without polypeptides are
    skipped with a WARN message on stdout.
    """
    # NOTE(review): this description talks about polycistronic transcripts,
    # which does not match what the code below does -- confirm and reword.
    parser = argparse.ArgumentParser(
        description=
        'Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies'
    )

    parser.add_argument('-r',
                        '--reference_file',
                        type=str,
                        required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument(
        '-q',
        '--query_file',
        type=str,
        required=True,
        help=
        'GFF3 file with alternative annotation (such as an RNA-seq assemby)')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    # Only the assembly collections are needed; the second returned element
    # is not used by this script.
    ref_assemblies, _ = biocodegff.get_gff3_features(args.reference_file)
    qry_assemblies, _ = biocodegff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print(
                "WARN: expected to find assembly_id {0} in both reference and query sets"
                .format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            overlaps = []
            polypeptides = ref_gene.polypeptides()

            if not polypeptides:
                print("WARN: skipped gene {0} because it has no polypeptides".
                      format(ref_gene.id))
                continue

            # The dbxrefs are attached to the first polypeptide's annotation;
            # reuse the list fetched above instead of querying the gene again.
            ref_annot = polypeptides[0].annotation

            for qry_gene in qry_assemblies[assembly_id].genes():
                overlap = ref_gene.overlaps_with(qry_gene)

                if overlap:
                    overlaps.append(overlap)
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(
                        qry_gene.id))

            if overlaps:
                print("INFO: ref_gene {0} had {1} overlaps".format(
                    ref_gene.id, len(overlaps)))

    # Use a context manager so the output is flushed and closed even if
    # writing the GFF3 raises.
    with open(args.output_file, 'w') as ofh:
        biocodegff.print_gff3_from_assemblies(assemblies=ref_assemblies,
                                              ofh=ofh)

예제 #3

0

파일 보기

파일: assign_functional_annotation.py 프로젝트: jorvis/Attributor

def main():
    """Assign functional annotation to polypeptides from configured evidence.

    Reads the YAML configuration named on the command line, initializes the
    polypeptide collection from the configured FASTA file, then walks the
    labels in the configuration's ``order`` section and applies the matching
    evidence handler (HMMer3_htab, RAPSearch2_m8, TMHMM or
    lipoprotein_motif_bsml) for each.  After final checks the result is
    written as ``<output_base>.faa`` (``fasta``) or ``<output_base>.gff3``
    (``gff3``), and a ``<output_base>.sources.log`` file is written alongside.

    Raises:
        Exception: if a label in ``order`` has no entry in the evidence
            section, or if an evidence entry has an unsupported type.
    """
    parser = argparse.ArgumentParser( description='Assigns functional annotation based on user-configurable evidence tiers')

    parser.add_argument('-c', '--config_file', type=str, required=True, help='Configuration file for annotation' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name/path of output files to be created' )
    parser.add_argument('-f', '--output_format', type=str, required=False, default='gff3', help='Desired output format' )
    args = parser.parse_args()

    # The sources log is handed to most helpers below, so keep it open for
    # the whole run and let the context manager close it on any exit path.
    with open("{0}.sources.log".format(args.output_base), 'wt') as sources_log_fh:
        with open(args.config_file) as config_fh:
            # NOTE(review): the original called yaml.load() without a Loader,
            # which can construct arbitrary Python objects and is rejected by
            # PyYAML >= 6.  safe_load() is used instead; confirm that no
            # configuration file relies on Python-specific YAML tags.
            configuration = yaml.safe_load(config_fh.read())

        check_configuration(configuration, args)
        evidence = parse_evidence_config(configuration)
        default_product_name = configuration['general']['default_product_name']

        # stores any active SQLite3 db connections
        db_conn = dict()

        try:
            # this is a dict of biothings.Polypeptide objects
            polypeptides = initialize_polypeptides(sources_log_fh, configuration['input']['polypeptide_fasta'], default_product_name)

            for label in configuration['order']:
                if label not in evidence:
                    raise Exception("ERROR: There is a label '{0}' in the 'order' section of the conf file that isn't present in the 'evidence' section".format(label))

                ev_config = evidence[label]
                ev_type = ev_config['type']

                if ev_type == 'HMMer3_htab':
                    index_conn, ev_db_conn = get_or_create_db_connections(type_ev='hmm_ev', configuration=configuration,
                                                 evidence=evidence, label=label, db_conn=db_conn, output_base=args.output_base)
                    index_conn.isolation_level = None
                    apply_hmm_evidence(polypeptides=polypeptides, ev_conn=ev_db_conn, config=configuration,
                                       ev_config=ev_config, label=label, index_conn=index_conn, log_fh=sources_log_fh)

                elif ev_type == 'RAPSearch2_m8':
                    index_conn, ev_db_conn = get_or_create_db_connections(type_ev='blast_ev', configuration=configuration,
                                                 evidence=evidence, label=label, db_conn=db_conn, output_base=args.output_base)
                    index_conn.isolation_level = None
                    apply_blast_evidence(polypeptides=polypeptides, ev_conn=ev_db_conn, config=configuration,
                                         ev_config=ev_config, label=label, index_conn=index_conn, log_fh=sources_log_fh)

                elif ev_type == 'TMHMM':
                    index_conn, ev_db_conn = get_or_create_db_connections(type_ev='tmhmm_ev', configuration=configuration,
                                                 evidence=evidence, label=label, db_conn=db_conn, output_base=args.output_base)
                    apply_tmhmm_evidence(polypeptides=polypeptides, ev_conn=ev_db_conn, config=configuration,
                                         ev_config=ev_config, label=label, log_fh=sources_log_fh)

                elif ev_type == 'lipoprotein_motif_bsml':
                    index_conn, ev_db_conn = get_or_create_db_connections(type_ev='lipoprotein_motif_ev', configuration=configuration,
                                                 evidence=evidence, label=label, db_conn=db_conn, output_base=args.output_base)
                    apply_lipoprotein_motif_evidence(polypeptides=polypeptides, ev_conn=ev_db_conn, config=configuration,
                                                     ev_config=ev_config, label=label, log_fh=sources_log_fh)

                else:
                    raise Exception("ERROR: Unsupported evidence type '{0}' with label '{1}' in configuration file".format(ev_type, label))
        finally:
            # close all db connections, including when evidence processing fails
            for conn in db_conn.values():
                conn.close()

        perform_final_checks(polypeptides=polypeptides, config=configuration, log_fh=sources_log_fh)

        # Write the output
        polyset = biothings.PolypeptideSet()
        polyset.load_from_dict(polypeptides)

        if args.output_format == 'fasta':
            polyset.write_fasta(path="{0}.faa".format(args.output_base))
        elif args.output_format == 'gff3':
            ## parse input GFF
            (assemblies, ref_features) = biocodegff.get_gff3_features( configuration['input']['gff3'] )

            ## merge annotation with polypeptide collection
            biocodegff.add_annotation(features=ref_features, polypeptide_set=polyset)

            ## print the new GFF; the context manager flushes and closes it
            with open("{0}.gff3".format(args.output_base), 'wt') as gff3_fh:
                biocodegff.print_gff3_from_assemblies(assemblies=assemblies, ofh=gff3_fh)