def main():
    """Tag reference genes with dbxrefs naming the query genes they overlap.

    Both GFF3 files are parsed with ``biocodegff.get_gff3_features``.  For
    every gene on a reference assembly, each gene on the same-named query
    assembly is tested with ``ref_gene.overlaps_with()``.  For every truthy
    result a dbxref of the form ``overlaps_old_locusTagID:<query gene id>`` is
    added to the annotation of the reference gene's first polypeptide.  The
    reference assemblies are then written to the output file as GFF3.

    Reference assemblies absent from the query set, and reference genes with
    no polypeptides, are skipped with a WARN line on stdout.
    """
    # NOTE(review): this description talks about polycistronic transcripts,
    # but the code below only records gene overlaps as dbxrefs -- it looks
    # copied from another script; confirm before changing user-facing text.
    parser = argparse.ArgumentParser( description='Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies')

    # command-line options: two input GFF3 files and the output path
    parser.add_argument('-r', '--reference_file', type=str, required=True, help='GFF3 file of a reference annotation' )
    parser.add_argument('-q', '--query_file', type=str, required=True, help='GFF3 file with alternative annotation (such as an RNA-seq assemby)' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    # only the assemblies (first element of each returned pair) are used here
    ref_assemblies, _ = biocodegff.get_gff3_features(args.reference_file)
    qry_assemblies, _ = biocodegff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print("WARN: expected to find assembly_id {0} in both reference and query sets".format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".format(ref_gene.id))
                continue

            # dbxrefs are attached to the annotation of the first polypeptide
            ref_annot = polypeptides[0].annotation
            overlap_count = 0

            for qry_gene in qry_assemblies[assembly_id].genes():
                if ref_gene.overlaps_with(qry_gene):
                    overlap_count += 1
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(qry_gene.id))

            if overlap_count > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(ref_gene.id, overlap_count))

    # context manager guarantees the output is flushed and closed, even if
    # the GFF3 writer raises part-way through
    with open(args.output_file, 'w') as ofh:
        biocodegff.print_gff3_from_assemblies(assemblies=ref_assemblies, ofh=ofh)
def main():
    """Tag reference genes with dbxrefs naming the query genes they overlap.

    Both GFF3 files are parsed with ``biocodegff.get_gff3_features``.  For
    every gene on a reference assembly, each gene on the same-named query
    assembly is tested with ``ref_gene.overlaps_with()``.  For every truthy
    result a dbxref of the form ``overlaps_old_locusTagID:<query gene id>`` is
    added to the annotation of the reference gene's first polypeptide.  The
    reference assemblies are then written to the output file as GFF3.

    Reference assemblies absent from the query set, and reference genes with
    no polypeptides, are skipped with a WARN line on stdout.
    """
    # NOTE(review): this description talks about polycistronic transcripts,
    # but the code below only records gene overlaps as dbxrefs -- it looks
    # copied from another script; confirm before changing user-facing text.
    parser = argparse.ArgumentParser(
        description=
        'Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies'
    )

    # command-line options: two input GFF3 files and the output path
    parser.add_argument('-r',
                        '--reference_file',
                        type=str,
                        required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument(
        '-q',
        '--query_file',
        type=str,
        required=True,
        help=
        'GFF3 file with alternative annotation (such as an RNA-seq assemby)')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    # only the assemblies (first element of each returned pair) are used here
    ref_assemblies, _ = biocodegff.get_gff3_features(args.reference_file)
    qry_assemblies, _ = biocodegff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print(
                "WARN: expected to find assembly_id {0} in both reference and query sets"
                .format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".
                      format(ref_gene.id))
                continue

            # dbxrefs are attached to the annotation of the first polypeptide
            ref_annot = polypeptides[0].annotation
            overlap_count = 0

            for qry_gene in qry_assemblies[assembly_id].genes():
                if ref_gene.overlaps_with(qry_gene):
                    overlap_count += 1
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(
                        qry_gene.id))

            if overlap_count > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(
                    ref_gene.id, overlap_count))

    # context manager guarantees the output is flushed and closed, even if
    # the GFF3 writer raises part-way through
    with open(args.output_file, 'w') as ofh:
        biocodegff.print_gff3_from_assemblies(assemblies=ref_assemblies,
                                              ofh=ofh)
def main():
    """Assign functional annotation to polypeptides from tiered evidence.

    Reads the YAML configuration named by ``--config_file``, initializes the
    polypeptide collection from ``configuration['input']['polypeptide_fasta']``
    and then applies each evidence label listed in ``configuration['order']``
    in that order.  Supported evidence types are ``HMMer3_htab``,
    ``RAPSearch2_m8``, ``TMHMM`` and ``lipoprotein_motif_bsml``.

    Output is written next to ``--output_base``: always
    ``<output_base>.sources.log``, plus ``<output_base>.faa`` when the output
    format is ``fasta`` or ``<output_base>.gff3`` when it is ``gff3`` (the
    default).  Any other format value produces no annotation output here.

    Raises:
        Exception: if a label in the 'order' section has no entry in the
            'evidence' section, or if an evidence entry has an unsupported
            type.
    """
    parser = argparse.ArgumentParser( description='Assigns functional annotation based on user-configurable evidence tiers')

    # command-line options: configuration input plus output location/format
    parser.add_argument('-c', '--config_file', type=str, required=True, help='Configuration file for annotation' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name/path of output files to be created' )
    parser.add_argument('-f', '--output_format', type=str, required=False, default='gff3', help='Desired output format' )
    args = parser.parse_args()

    # evidence 'type' value in the config -> type_ev passed to
    # get_or_create_db_connections
    ev_db_types = {
        'HMMer3_htab': 'hmm_ev',
        'RAPSearch2_m8': 'blast_ev',
        'TMHMM': 'tmhmm_ev',
        'lipoprotein_motif_bsml': 'lipoprotein_motif_ev',
    }

    # the log stays open for the whole run and is closed on any exit path
    with open("{0}.sources.log".format(args.output_base), 'wt') as sources_log_fh:
        # safe_load replaces the original bare yaml.load(): without an
        # explicit Loader that call can construct arbitrary Python objects on
        # old PyYAML releases and is rejected outright by PyYAML >= 6.
        with open(args.config_file) as config_fh:
            configuration = yaml.safe_load(config_fh)

        check_configuration(configuration, args)
        evidence = parse_evidence_config(configuration)
        default_product_name = configuration['general']['default_product_name']

        # stores any active SQLite3 db connections
        db_conn = dict()

        # this is a dict of biothings.Polypeptide objects
        polypeptides = initialize_polypeptides(sources_log_fh, configuration['input']['polypeptide_fasta'], default_product_name)

        try:
            for label in configuration['order']:
                if label not in evidence:
                    raise Exception("ERROR: There is a label '{0}' in the 'order' section of the conf file that isn't present in the 'evidence' section".format(label))

                ev_type = evidence[label]['type']

                # reject unknown types before any db connection is created
                if ev_type not in ev_db_types:
                    raise Exception("ERROR: Unsupported evidence type '{0}' with label '{1}' in configuration file".format(ev_type, label))

                index_conn, ev_db_conn = get_or_create_db_connections(type_ev=ev_db_types[ev_type], configuration=configuration,
                                                                      evidence=evidence, label=label,
                                                                      db_conn=db_conn, output_base=args.output_base)

                if ev_type == 'HMMer3_htab':
                    index_conn.isolation_level = None
                    apply_hmm_evidence(polypeptides=polypeptides, ev_conn=ev_db_conn, config=configuration,
                                       ev_config=evidence[label], label=label, index_conn=index_conn,
                                       log_fh=sources_log_fh)
                elif ev_type == 'RAPSearch2_m8':
                    index_conn.isolation_level = None
                    apply_blast_evidence(polypeptides=polypeptides, ev_conn=ev_db_conn, config=configuration,
                                         ev_config=evidence[label], label=label, index_conn=index_conn,
                                         log_fh=sources_log_fh)
                elif ev_type == 'TMHMM':
                    # index_conn is not used for this evidence type
                    apply_tmhmm_evidence(polypeptides=polypeptides, ev_conn=ev_db_conn, config=configuration,
                                         ev_config=evidence[label], label=label, log_fh=sources_log_fh)
                else:
                    # only 'lipoprotein_motif_bsml' remains after the check above;
                    # index_conn is not used for this evidence type either
                    apply_lipoprotein_motif_evidence(polypeptides=polypeptides, ev_conn=ev_db_conn,
                                                     config=configuration, ev_config=evidence[label],
                                                     label=label, log_fh=sources_log_fh)
        finally:
            # close all db connections, including when evidence processing fails
            for conn_label in db_conn:
                db_conn[conn_label].close()

        perform_final_checks(polypeptides=polypeptides, config=configuration, log_fh=sources_log_fh)

        # Write the output
        polyset = biothings.PolypeptideSet()
        polyset.load_from_dict(polypeptides)

        if args.output_format == 'fasta':
            polyset.write_fasta(path="{0}.faa".format(args.output_base))
        elif args.output_format == 'gff3':
            ## parse input GFF
            (assemblies, ref_features) = biocodegff.get_gff3_features( configuration['input']['gff3'] )

            ## merge annotation with polypeptide collection
            biocodegff.add_annotation(features=ref_features, polypeptide_set=polyset)

            ## print the new GFF; the handle is closed as soon as writing ends
            with open("{0}.gff3".format(args.output_base), 'wt') as gff3_fh:
                biocodegff.print_gff3_from_assemblies(assemblies=assemblies, ofh=gff3_fh)