def main(): # parse options and arguments parser = argparse.ArgumentParser( prog='referenceseeker', formatter_class=argparse.RawDescriptionHelpFormatter, description='Rapid determination of appropriate reference genomes.', epilog= "Citation:\n%s\n\nGitHub:\nhttps://github.com/oschwengers/referenceseeker" % rc.CITATION, add_help=False) parser.add_argument('db', metavar='<database>', help='ReferenceSeeker database path') group_workflow = parser.add_argument_group( 'Filter options / thresholds', 'These options control the filtering and alignment workflow.') group_workflow.add_argument( '--crg', '-r', action='store', type=int, default=100, help= 'Max number of candidate reference genomes to pass kmer prefilter (default = 100)' ) group_workflow.add_argument('--ani', '-a', action='store', type=float, default=0.95, help='ANI threshold (default = 0.95)') group_workflow.add_argument( '--conserved-dna', '-c', action='store', dest='conserved_dna', type=float, default=0.69, help='Conserved DNA threshold (default = 0.69)') group_workflow.add_argument( '--unfiltered', '-u', action='store_true', help= 'Set kmer prefilter to extremely conservative values and skip species level ANI cutoffs (ANI >= 0.95 and conserved DNA >= 0.69' ) group_workflow.add_argument( '--bidirectional', '-b', action='store_true', help='Compute bidirectional ANI/conserved DNA values (default = False)' ) group_runtime = parser.add_argument_group('Runtime & auxiliary options') group_runtime.add_argument('--help', '-h', action='help', help='Show this help message and exit') group_runtime.add_argument('--version', '-V', action='version', version='%(prog)s ' + referenceseeker.__version__) group_runtime.add_argument('--verbose', '-v', action='store_true', help='Print verbose information') group_runtime.add_argument( '--threads', '-t', action='store', type=int, default=mp.cpu_count(), help='Number of used threads (default = number of available CPU cores)' ) subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help') # add "single" sub-command option parser_single = subparsers.add_parser( 'single', help='start reference genome search for single genome') parser_single.add_argument('--genome', "-g", metavar='<genome>', action='store', help='target draft genome in fasta format') parser_single.add_argument( '--n_mash_results', '-n', action='store', default=100, help= "Define the number of mash results that will be used as reference-candidates" ) # add "cohort" sub-command option parser_cohort = subparsers.add_parser( 'cohort', help='start reference genome search for genome cohort') parser_cohort.add_argument( '--cohort_genomes', '-cg', metavar='<genome>', action='store', nargs="*", help= 'Target draft genomes or directory with all draft genomes in fasta format' ) parser_cohort.add_argument( '--algorithm', '-a', action='store', default="product", help='Choose algorithm to calculate best fitting reference genome.') parser_cohort.add_argument( '--n_mash_results', '-n', action='store', default=100, help= "Define the number of mash results that will be used as reference-candidates" ) args = parser.parse_args() # setup global configuration config = util.setup_configuration(args) util.test_binaries(config) # check database parameters try: config['db_path'] = util.check_path(args.db) except FileNotFoundError: sys.exit('ERROR: database directory is not readable!') except PermissionError: sys.exit('ERROR (permission): database directory is not accessible') except OSError: sys.exit('ERROR: database directory is empty') # print verbose information if args.verbose: print("ReferenceSeeker v%s" % referenceseeker.__version__) print('Options, parameters and arguments:') print("\tuse bundled binaries: %s" % str(config['bundled-binaries'])) print("\tdb path: %s" % str(config['db_path'])) print("\tgenome path: %s" % str(config['genome_path'])) print("\ttmp path: %s" % str(config['tmp'])) print("\tunfiltered: %s" % str(config['unfiltered'])) print("\tbidirectional: %s" % str(config['bidirectional'])) print("\tANI: %0.2f" % config['ani']) print("\tconserved DNA: %0.2f" % config['conserved_dna']) print("\t# CRG: %d" % config['crg']) print("\t# threads: %d" % config['threads']) if args.subcommand == 'single': single.single(args, config) elif args.subcommand == 'cohort': cohort.cohort(args, config) else: parser.print_help() sys.exit("Error: no subcommand provided!")
def main(): # setup path and test if necessary 3rd party executables are available config = {} util.set_path(config) util.test_binaries(config) # parse options and arguments parser = argparse.ArgumentParser( prog='referenceseeker_db', formatter_class=argparse.RawDescriptionHelpFormatter, description='Rapid determination of appropriate reference genomes.', epilog= "Citation:\n%s\n\nGitHub:\nhttps://github.com/oschwengers/referenceseeker" % rc.CITATION, add_help=False) # add common options group_runtime = parser.add_argument_group('Runtime & auxiliary options') group_runtime.add_argument('--help', '-h', action='help', help='Show this help message and exit') group_runtime.add_argument('--version', '-V', action='version', version='%(prog)s ' + referenceseeker.__version__) subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help') # add init sub-command options parser_init = subparsers.add_parser('init', help='Initialize a new database') parser_init.add_argument( '--output', '-o', action='store', default=Path.cwd(), help='output directory (default = current working directory)') parser_init.add_argument('--db', '-d', action='store', required=True, help='Name of the new ReferenceSeeker database') # add import sub-command options parser_import = subparsers.add_parser('import', help='Add a new genome to database') parser_import.add_argument('--db', '-d', action='store', required=True, help='ReferenceSeeker database path') parser_import.add_argument('--genome', '-g', action='store', required=True, help='Genome path [Fasta, GenBank, EMBL]') parser_import.add_argument( '--id', '-i', action='store', default=None, help='Unique genome identifier (default sequence id of first record)') parser_import.add_argument( '--taxonomy', '-t', action='store', type=int, default=12908, help='Taxonomy ID (default = 12908 [unclassified sequences])') parser_import.add_argument( '--status', '-s', action='store', choices=['complete', 'chromosome', 'scaffold', 'contig'], default='contig', help='Assembly level (default = contig)') parser_import.add_argument('--organism', '-o', action='store', default='', help='Organism name (default = "")') args = parser.parse_args() if (args.subcommand == 'init'): init(args) elif (args.subcommand == 'import'): import_genome(config, args) else: parser.print_help() sys.exit("Error: no subcommand provided!")
def main(): # parse options and arguments parser = argparse.ArgumentParser( prog='referenceseeker', formatter_class=argparse.RawDescriptionHelpFormatter, description='Rapid determination of appropriate reference genomes.', epilog= "Citation:\n%s\n\nGitHub:\nhttps://github.com/oschwengers/referenceseeker" % rc.CITATION, add_help=False) parser.add_argument('db', metavar='<database>', help='ReferenceSeeker database path') parser.add_argument('genome', metavar='<genome>', help='target draft genome in fasta format') group_workflow = parser.add_argument_group( 'Filter options / thresholds', 'These options control the filtering and alignment workflow.') group_workflow.add_argument( '--crg', '-r', action='store', type=int, default=100, help= 'Max number of candidate reference genomes to pass kmer prefilter (default = 100)' ) group_workflow.add_argument('--ani', '-a', action='store', type=float, default=0.95, help='ANI threshold (default = 0.95)') group_workflow.add_argument( '--conserved-dna', '-c', action='store', dest='conserved_dna', type=float, default=0.69, help='Conserved DNA threshold (default = 0.69)') group_workflow.add_argument( '--unfiltered', '-u', action='store_true', help= 'Set kmer prefilter to extremely conservative values and skip species level ANI cutoffs (ANI >= 0.95 and conserved DNA >= 0.69' ) group_workflow.add_argument( '--bidirectional', '-b', action='store_true', help='Compute bidirectional ANI/conserved DNA values (default = False)' ) group_runtime = parser.add_argument_group('Runtime & auxiliary options') group_runtime.add_argument('--help', '-h', action='help', help='Show this help message and exit') group_runtime.add_argument('--version', '-V', action='version', version='%(prog)s ' + referenceseeker.__version__) group_runtime.add_argument('--verbose', '-v', action='store_true', help='Print verbose information') group_runtime.add_argument( '--threads', '-t', action='store', type=int, default=mp.cpu_count(), help='Number of used threads (default = number of available CPU cores)' ) args = parser.parse_args() # setup global configuration config = util.setup_configuration(args) util.test_binaries(config) # check parameters db_path = Path(args.db) if (not os.access(str(db_path), os.R_OK)): sys.exit('ERROR: database directory not readable!') db_path = db_path.resolve() config['db_path'] = db_path genome_path = Path(args.genome) if (not os.access(str(genome_path), os.R_OK)): sys.exit('ERROR: genome file not readable!') if (genome_path.stat().st_size == 0): sys.exit('ERROR: genome file (%s) is empty!' % genome_path) genome_path = genome_path.resolve() config['genome_path'] = genome_path # print verbose information if (args.verbose): print("ReferenceSeeker v%s" % referenceseeker.__version__) print('Options, parameters and arguments:') print("\tuse bundled binaries: %s" % str(config['bundled-binaries'])) print("\tdb path: %s" % str(config['db_path'])) print("\tgenome path: %s" % str(config['genome_path'])) print("\ttmp path: %s" % str(config['tmp'])) print("\tunfiltered: %s" % str(config['unfiltered'])) print("\tbidirectional: %s" % str(config['bidirectional'])) print("\tANI: %0.2f" % config['ani']) print("\tconserved DNA: %0.2f" % config['conserved_dna']) print("\t# CRG: %d" % config['crg']) print("\t# threads: %d" % config['threads']) # calculate genome distances via Mash if (args.verbose): print('\nEstimate genome distances...') mash_output_path = config['tmp'].joinpath('mash.out') mash.run_mash(config, mash_output_path) # extract hits and store dist screened_ref_genome_ids, mash_distances = mash.parse_mash_results( config, mash_output_path) if (args.verbose): print("\tscreened %d potential reference genome(s)" % len(screened_ref_genome_ids)) # reduce Mash output to best hits (args.crg) if (len(screened_ref_genome_ids) > args.crg): if (args.verbose): print("\treduce to best %d hits..." % args.crg) tmp_screened_ref_genome_ids = sorted(screened_ref_genome_ids, key=lambda k: mash_distances[k]) screened_ref_genome_ids = tmp_screened_ref_genome_ids[:args.crg] # get genomes from RefSeq by accessions ref_genomes = util.read_reference_genomes(config) screened_ref_genomes = { k: v for k, v in ref_genomes.items() if k in screened_ref_genome_ids } # build dna fragments dna_fragments_path = config['tmp'].joinpath('dna-fragments.fasta') dna_fragments = util.build_dna_fragments(genome_path, dna_fragments_path) # align query fragments to reference genomes and compute ANI/conserved DNA results = {} if (args.verbose): print('\nCompute ANIs...') with cf.ThreadPoolExecutor(max_workers=args.threads) as tpe: futures = [] for id, ref_genome in screened_ref_genomes.items(): futures.append( tpe.submit(rani.align_query_genome, config, dna_fragments_path, dna_fragments, id)) for f in futures: ref_genome_id, ani, conserved_dna = f.result() results[ref_genome_id] = [(ani, conserved_dna)] # align reference genomes fragments to query genome and compute ANI/conserved DNA if (args.bidirectional): if (args.verbose): print('\nCompute reverse ANIs...') with cf.ProcessPoolExecutor(args.threads) as ppe: futures = [] for id, ref_genome in screened_ref_genomes.items(): futures.append( ppe.submit(rani.align_reference_genome, config, genome_path, id)) for f in futures: ref_genome_id, ani, conserved_dna = f.result() result = results[ref_genome_id] result.append((ani, conserved_dna)) # remove tmp dir shutil.rmtree(str(config['tmp'])) # filter and sort results filtered_reference_ids = [] for ref_genome_id, result in results.items(): if (args.unfiltered): filtered_reference_ids.append(ref_genome_id) else: if (args.bidirectional): query_ref = result[0] ref_query = result[1] if ((query_ref[0] >= config['ani']) and (query_ref[1] >= config['conserved_dna']) and (ref_query[0] >= config['ani']) and (ref_query[1] >= config['conserved_dna'])): filtered_reference_ids.append(ref_genome_id) else: (ani, conserved_dna) = result[0] if ((conserved_dna >= config['conserved_dna']) and (ani >= config['ani'])): filtered_reference_ids.append(ref_genome_id) # sort and print results according to ANI * conserved DNA values if (args.bidirectional): filtered_reference_ids = sorted(filtered_reference_ids, key=lambda k: (results[k][0][0] * results[k][0][1] * results[k][1][0] * results[k][1][1]), reverse=True) if (args.verbose): print('') print( '#ID\tMash Distance\tQR ANI\tQR Con. DNA\tRQ ANI\tRQ Con. DNA\tTaxonomy ID\tAssembly Status\tOrganism' ) for id in filtered_reference_ids: # print results to STDOUT ref_genome = ref_genomes[id] result = results[id] print( '%s\t%1.5f\t%2.2f\t%2.2f\t%2.2f\t%2.2f\t%s\t%s\t%s' % (id, mash_distances[id], result[0][0] * 100, result[0][1] * 100, result[1][0] * 100, result[1][1] * 100, ref_genome['tax'], ref_genome['status'], ref_genome['name'])) else: filtered_reference_ids = sorted(filtered_reference_ids, key=lambda k: (results[k][0][0] * results[k][0][1]), reverse=True) if (args.verbose): print('') print( '#ID\tMash Distance\tANI\tCon. DNA\tTaxonomy ID\tAssembly Status\tOrganism' ) for id in filtered_reference_ids: # print results to STDOUT ref_genome = ref_genomes[id] result = results[id][0] print( '%s\t%1.5f\t%2.2f\t%2.2f\t%s\t%s\t%s' % (id, mash_distances[id], result[0] * 100, result[1] * 100, ref_genome['tax'], ref_genome['status'], ref_genome['name']))