def get_options():
    """Parse and validate the command line of reconstruct_graph_from_fasta.py.

    The process is terminated (after printing a message to stdout) when

    * the input file or both output targets (``-o`` / ``--out-kg``) are missing,
    * ``-c`` is not one of ``auto`` / ``yes`` / ``no``,
    * the overlap given by ``-L`` is an even number.

    :return: the ``argparse`` namespace holding the validated options.
    """
    parser = ArgumentParser(
        description="This script uses an naive De Bruijn approach to convert sequence back into an "
                    "assembly graph file, such as a gfa (Graphical Fragment Assembly) or a fastg file.",
        usage="reconstruct_graph_from_fasta.py -i fasta_file -o out.gfa")
    parser.add_argument("-i", dest="input", help="Input fasta file.")
    parser.add_argument(
        "-o", dest="output", default="",
        help="Output graph file. The output format is GFA by default, but FASTG only when "
             "indicated with postfix '.fastg'.")
    parser.add_argument(
        "-L", "--overlap", dest="overlap", default=55, type=int,
        help="overlap for reconstructing De Bruijn graph. Default:%(default)s")
    parser.add_argument(
        "-c", "--circular", dest="circular", default="auto",
        help="Sequences in input fasta file are all circular (yes/no/auto). "
             "The auto mode enables detection by checking the existence of '(circular)' in "
             "the end of the header of each sequence. Default:%(default)s")
    parser.add_argument(
        "--single-chain", dest="single_chain", default=False, action="store_true",
        help="The input sequence(s) was by default treated as DNA double-chain with its complementary "
             "sequence. Choose this flag to turn off.")
    parser.add_argument("--out-kg", dest="out_kg", default="", help="Output kmer node graph.")
    parser.add_argument(
        "-v", "--version", action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()

    # At least one output target and the input file are mandatory.
    if not ((options.output or options.out_kg) and options.input):
        parser.print_help()
        sys.stdout.write("Insufficient arguments!\n")
        sys.exit()
    elif options.circular not in ("auto", "yes", "no"):
        sys.stdout.write(
            "Illegal -c input! circular mode must be one of yes/no/auto!\n")
        sys.exit()
    elif options.overlap % 2 == 0:
        # Previously this branch only printed the message and fell through to
        # `return options`, so an even overlap was silently accepted.  The
        # message also pointed at a "-k" flag that this parser does not define.
        sys.stdout.write("Illegal -L input! overlap must be an odd number!\n")
        sys.exit()
    return options
def require_commands():
    """Parse the command line of the annotation-checking script.

    One or more query ``*.gb`` files are taken as positional arguments.  A
    reference must be supplied either as a GenBank file (``-r``) or as a fasta
    file (``-d``); when neither is given the help text is printed and the
    process exits instead of returning unusable options.

    :return: ``(options, options.query_gb)`` - the parsed namespace and the
        list of query files.
    """
    usage = ("python this_script.py Query.gb -r Reference.gb"
             "\n\nThis script only checks the mainly check the reliability of automatically annotated tRNA and CDS."
             "\n")
    parser = ArgumentParser(usage=usage)

    group_need = parser.add_argument_group("NECESSARY OPTIONS")
    group_need.add_argument(
        'query_gb', metavar='query', type=str, nargs='+',
        help='Input a list of *.gb files as the query (split the files by spaces).')
    group_need.add_argument('-r', dest='reference_gb', help='input reference *.gb file')

    group_alternation = parser.add_argument_group("ALTERNATION of NECESSARY OPTIONS")
    group_alternation.add_argument(
        '-d', dest='reference_fasta',
        help='input reference fasta file exported exported by "Extract Annotations"-"Export"-"Selected Documents"-fasta in Geneious, remember to choose "Replace spaces in sequence name with underscores"')

    group_optional = parser.add_argument_group("OPTIONAL OPTIONS")
    group_optional.add_argument(
        '--t-ends', dest='ends_trna', type=int, default=10,
        help='Default=10. The length to check at the both ends of tRNA.')
    group_optional.add_argument(
        '--c-ends', dest='ends_cds', type=int,
        help='Default:not activated. Activate this calculation and assign the length to check at the both ends of CDS.')
    group_optional.add_argument(
        '--a-ends', dest='ends_all', type=int,
        help='Default:not activated. Activate this calculation and assign the length to check at the both ends of annotated all regions.')
    group_optional.add_argument(
        '--l-threshold', dest='length', type=float, default=0.9,
        help='Default=0.9. Length threshold to report warning.')
    group_optional.add_argument(
        '--similarity', dest='enable_similarity', default=False, action='store_true',
        help='Default=False. Choose to enable similarity calculation.')
    group_optional.add_argument(
        '--s-threshold', dest='similarity', type=float, default=0.9,
        help='Default=0.9. Similarity threshold to report warning. Should be < length threshold.')
    parser.add_argument(
        "-v", "--version", action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))

    options = parser.parse_args()
    # Same truth table as the original `a and b or c`, with the precedence
    # spelled out: (queries and a GenBank reference) or a fasta reference.
    has_required = (len(options.query_gb) and options.reference_gb) or options.reference_fasta
    if not has_required:
        parser.print_help()
        # The original printed the help and then returned the incomplete
        # options anyway; stop here so the caller never runs without a reference.
        parser.exit(status=2, message="\nInsufficient arguments!\n")
    return options, options.query_gb
def get_options(description):
    """Read the plastome_arch_info.py command line.

    :param description: text shown as the description in the ``--help`` output.
    :return: ``(options, options.sequences)``; ``options.valid_bases`` has been
        converted from a string into a set of single characters.
    :raises IOError: if any of the given sequence paths is not an existing file.
    """
    cli = ArgumentParser(
        usage="plastome_arch_info.py fasta_format_sequence_file(s)",
        description=description)
    cli.add_argument(
        "sequences", nargs="+", type=str, metavar="sequences",
        help="Input fasta format sequences (split the files by spaces).")
    cli.add_argument("-o", help="output file.", dest="output")
    cli.add_argument(
        "-r", type=int, default=5000, dest="min_ir_length",
        help="The minimum repeat length treated as the IR region of plastome. Default: [%(default)s]")
    cli.add_argument(
        "-v", default="ATGCRMYKHBDVatgcrmykhbdv", dest="valid_bases",
        help="Valid bases. Default: ATGCRMYKHBDVatgcrmykhbdv")
    version_text = "GetOrganelle v{version}".format(version=get_versions())
    cli.add_argument("--version", action="version", version=version_text)

    options = cli.parse_args()

    # Guard clause: nothing to work on.
    if len(options.sequences) == 0:
        cli.print_help()
        sys.exit()

    # Every input must be an existing regular file.
    for fasta_path in options.sequences:
        if not os.path.isfile(fasta_path):
            raise IOError(fasta_path + " not found/valid!")

    options.valid_bases = set(options.valid_bases)
    return options, options.sequences
def get_options():
    """Read the summary_get_organelle_output.py command line.

    Prints the help text plus "Insufficient arguments!" and exits when the
    output file (``-o``) or the list of sample folders is missing.

    :return: the parsed ``argparse`` namespace.
    """
    cli = ArgumentParser(
        usage="summary_get_organelle_output.py list_of_folders -o tab_file")
    cli.add_argument(
        "sample_folders", nargs="+", type=str, metavar="output",
        help="Input a list of folders generated by get_organelle_from_reads.py."
             "Please split the files by spaces.")
    cli.add_argument("-o", help="Output csv file.", dest="output")
    # A "--verbose" switch existed here once; it is currently disabled.
    version_text = "GetOrganelle v{version}".format(version=get_versions())
    cli.add_argument("-v", "--version", action="version", version=version_text)

    options = cli.parse_args()

    have_everything = options.output and len(options.sample_folders)
    if not have_everything:
        cli.print_help()
        sys.stdout.write("Insufficient arguments!\n")
        sys.exit()
    return options
def require_commands():
    """Parse the command line and publish the result as the global ``options``.

    Nothing is returned; callers read the module-level ``options``.  The
    process exits when parsing raises an ``Exception`` or when either the
    fastg file (``-g``) or the reference (``-f``) is missing.
    """
    global options
    script_name = os.path.basename(__file__)
    cli = ArgumentParser(
        usage="python {} -g input.fastg -f refernce.fasta".format(script_name))

    cli.add_argument('-g', type=str, dest='in_fastg_file',
                     help='followed by your input fastg file')
    cli.add_argument('-f', type=str, dest='reference_fa_base',
                     help='followed by Fasta index format')
    cli.add_argument(
        '--keep-temp', action='store_true', default=False, dest='keep_temp',
        help='Choose to disable deleting temp files produced by blast and this script')
    cli.add_argument('--bt', type=float, default=0.60, dest='blast_hits_threshold',
                     help='Default: 0.60')
    cli.add_argument('--max-gap', type=int, default=1500, dest='max_gap_to_add',
                     help='Default: 1500')
    cli.add_argument(
        '--con-all', action='store_true', default=False, dest='connect_inner_contig',
        help='Choose to activate connecting all possible contigs. Default: False')
    cli.add_argument('--depth', type=float, default=1.0, dest='depth_to_connect',
                     help='Default: 1.0')
    blast_help = ("Assign the path to BLAST binary files if not added to the path. "
                  "Default: try GetOrganelleDep/" + SYSTEM_NAME + "/ncbi-blast first, then $PATH")
    cli.add_argument("--which-blast", default="", dest="which_blast", help=blast_help)
    # Disabled options, kept for reference only:
    #   --merge-overlaps, --min-os (default 0.9), --min-ol (default 15)
    version_text = "GetOrganelle v{version}".format(version=get_versions())
    cli.add_argument("-v", "--version", action="version", version=version_text)

    try:
        options = cli.parse_args()
    except Exception as parse_problem:
        sys.stdout.write('\n######################################' + str(parse_problem))
        sys.stdout.write('\n"-h" for more usage')
        exit()

    # Reached only when parsing succeeded (the handler above always exits).
    if not options.in_fastg_file or not options.reference_fa_base:
        sys.stdout.write(
            "\n######################################\nInsufficient arguments!")
        sys.stdout.write("\n\"-h\" for more usage")
        exit()
os.path.join(SEQ_NAME, "VERSION")] } if os.path.isdir(DEP_DIR) and os.path.isfile( os.path.join(DEP_DIR, "__init__.py")): PACKAGES.append(DEP_NAME) PACKAGE_DATA[DEP_NAME] = [ this_file for this_file in get_recursive_files( target_dir=os.path.join(DEP_DIR, SYSTEM_NAME), start_from=DEP_DIR, exclude_files=EXCLUDE_SHARE_SPADES_PATHS) ] if not in_situ: setup( name="GetOrganelle", version=get_versions(), description= "a fast and versatile toolkit for accurate de novo assembly of organelle genomes.", author="Jian-Jun Jin", author_email="*****@*****.**", url="http://github.com/Kinggerm/GetOrganelle", license="GNU General Public License, version 3", packages=PACKAGES, platforms="linux/MacOS", scripts=scripts_to_install, # relative path to each package package_data=PACKAGE_DATA, install_requires=install_dependencies, zip_safe=False) if keep_temp: for temp_dir_or_files in ("build", "dist", "*.pyc", "*.tgz",
def main():
    """Run the disentangling workflow for one assembly graph and log the outcome.

    Builds the title banner, obtains ``options`` and a log handler from
    ``get_options`` and calls the nested ``disentangle_circular_assembly``
    (wrapped by ``set_time_limit(options.time_limit)``).  ``IOError`` is
    re-raised; ``KeyError`` and any other ``Exception`` are reported through
    the log handler instead of propagating.  ``logging.shutdown()`` is called
    at the end of every run that did not re-raise.
    """
    time0 = time.time()
    print_title = "GetOrganelle v" + str(get_versions()) + \
                  "\n\nThis is a script for extracting organelle genomes" \
                  " from slim_fastg.py-produced files (csv & fastg). " + \
                  "\nBy [email protected]\n\n"
    options, log_handler = get_options(print_title)

    @set_time_limit(options.time_limit)
    def disentangle_circular_assembly(fastg_file, tab_file, prefix, weight_factor, type_factor, mode="embplant_pt",
                                      log_hard_cov_threshold=10., expected_max_size=inf, expected_min_size=0,
                                      contamination_depth=3., contamination_similarity=5.,
                                      degenerate=True, degenerate_depth=1.5, degenerate_similarity=1.5,
                                      min_sigma_factor=0.1, max_copy_in=10, only_max_cov=True,
                                      keep_temp=False, acyclic_allowed=False,
                                      verbose=False, inner_logging=None, debug=False):
        """Select the target graph(s) from ``fastg_file`` and export their paths.

        For every result returned by ``Assembly.find_target_graph`` this writes
        ``<prefix>.graph<N><tag>.<M>.path_sequence.fasta`` for each exported
        path and ``<prefix>.graph<N>.selected_graph.gfa`` for the graph itself.
        At most ``options.max_paths_num`` paths are exported per graph.  When
        ``options.resume`` is set and ``<prefix>.graph1.selected_graph.gfa``
        already exists, nothing is recomputed.

        Messages go to ``inner_logging`` when it is given, otherwise to stdout.
        Most other keyword arguments are forwarded to ``find_target_graph``.
        """
        if options.resume and os.path.exists(prefix + ".graph1.selected_graph.gfa"):
            if inner_logging:
                inner_logging.info(">>> Result graph existed!")
            else:
                sys.stdout.write(">>> Result graph existed!\n")
        else:
            time_a = time.time()
            if inner_logging:
                inner_logging.info(">>> Parsing " + fastg_file + " ..")
            else:
                sys.stdout.write("Parsing " + fastg_file + " ..\n")
            input_graph = Assembly(fastg_file, min_cov=options.min_cov, max_cov=options.max_cov)
            time_b = time.time()
            if inner_logging:
                inner_logging.info(">>> Parsing input fastg file finished: " + str(round(time_b - time_a, 4)) + "s")
            else:
                sys.stdout.write("\n>>> Parsing input fastg file finished: " + str(round(time_b - time_a, 4)) + "s\n")
            temp_graph = prefix + ".temp.fastg" if keep_temp else None

            copy_results = input_graph.find_target_graph(
                tab_file, database_name=mode, mode=mode,
                type_factor=type_factor,
                weight_factor=weight_factor,
                log_hard_cov_threshold=log_hard_cov_threshold,
                contamination_depth=contamination_depth,
                contamination_similarity=contamination_similarity,
                degenerate=degenerate,
                degenerate_depth=degenerate_depth,
                degenerate_similarity=degenerate_similarity,
                expected_max_size=expected_max_size,
                expected_min_size=expected_min_size,
                max_contig_multiplicity=max_copy_in,
                only_keep_max_cov=only_max_cov,
                min_sigma_factor=min_sigma_factor,
                temp_graph=temp_graph,
                broken_graph_allowed=acyclic_allowed,
                verbose=verbose,
                log_handler=inner_logging,
                debug=debug)
            time_c = time.time()
            if inner_logging:
                inner_logging.info(">>> Detecting target graph finished: " + str(round(time_c - time_b, 4)) + "s")
                if len(copy_results) > 1:
                    inner_logging.info(str(len(copy_results)) + " set(s) of graph detected.")
            else:
                sys.stdout.write("\n\n>>> Detecting target graph finished: " + str(round(time_c - time_b, 4)) + "s\n")
                if len(copy_results) > 1:
                    sys.stdout.write(str(len(copy_results)) + " set(s) of graph detected.\n")

            degenerate_base_used = False
            if acyclic_allowed:
                # Broken (acyclic) graphs: each "path" is a list of contigs.
                for go_res, copy_res in enumerate(copy_results, 1):
                    broken_graph = copy_res["graph"]
                    count_path = 0

                    these_paths = broken_graph.get_all_paths(mode=mode, log_handler=inner_logging)
                    # reducing paths
                    if len(these_paths) > options.max_paths_num:
                        this_warn_str = "Only exporting " + str(options.max_paths_num) + " out of all " + \
                                        str(len(these_paths)) + " possible paths. (see '--max-paths-num' to change it.)"
                        if inner_logging:
                            inner_logging.warning(this_warn_str)
                        else:
                            sys.stdout.write("Warning: " + this_warn_str + "\n")
                        these_paths = these_paths[:options.max_paths_num]

                    # exporting paths, reporting results
                    for this_paths, other_tag in these_paths:
                        count_path += 1
                        all_contig_str = []
                        contigs_are_circular = []
                        for go_contig, this_p_part in enumerate(this_paths):
                            this_contig = broken_graph.export_path(this_p_part)
                            if DEGENERATE_BASES & set(this_contig.seq):
                                degenerate_base_used = True
                            contigs_are_circular.append(this_contig.label.endswith("(circular)"))
                            if len(this_paths) == 1 and contigs_are_circular[-1]:
                                all_contig_str.append(this_contig.fasta_str())
                            else:
                                all_contig_str.append(">contig_" + str(go_contig + 1) + "--" + this_contig.label +
                                                      "\n" + this_contig.seq + "\n")
                        if len(all_contig_str) == 1 and set(contigs_are_circular) == {True}:
                            # print ir stat (first path of a plastome only)
                            if count_path == 1 and mode == "embplant_pt":
                                detect_seq = broken_graph.export_path(this_paths[0]).seq
                                ir_stats = detect_plastome_architecture(detect_seq, 1000)
                                print_str = "Detecting large repeats (>1000 bp) in PATH1 with " + ir_stats[-1] + \
                                            ", Total:LSC:SSC:Repeat(bp) = " + str(len(detect_seq)) + ":" + \
                                            ":".join([str(len_val) for len_val in ir_stats[:3]])
                                if inner_logging:
                                    inner_logging.info(print_str)
                                else:
                                    sys.stdout.write(print_str + "\n")
                        out_fasta = prefix + ".graph" + str(go_res) + other_tag + "." + str(count_path) + \
                            ".path_sequence.fasta"
                        # Context manager so the handle is flushed and closed deterministically.
                        with open(out_fasta, "w") as out_handle:
                            out_handle.write("\n".join(all_contig_str))
                    broken_graph.write_to_gfa(prefix + ".graph" + str(go_res) + ".selected_graph.gfa")
            else:
                for go_res, copy_res in enumerate(copy_results, 1):
                    idealized_graph = copy_res["graph"]
                    # should add making one-step-inversion pairs for paths,
                    # which would be used to identify existence of a certain isomer using mapping information
                    count_path = 0

                    these_paths = idealized_graph.get_all_circular_paths(
                        mode=mode, log_handler=inner_logging, reverse_start_direction_for_pt=options.reverse_lsc)
                    # reducing paths
                    if len(these_paths) > options.max_paths_num:
                        this_warn_str = "Only exporting " + str(options.max_paths_num) + " out of all " + \
                                        str(len(these_paths)) + " possible paths. (see '--max-paths-num' to change it.)"
                        if inner_logging:
                            inner_logging.warning(this_warn_str)
                        else:
                            sys.stdout.write("Warning: " + this_warn_str + "\n")
                        these_paths = these_paths[:options.max_paths_num]

                    # exporting paths, reporting results
                    for this_path, other_tag in these_paths:
                        count_path += 1
                        this_seq_obj = idealized_graph.export_path(this_path)
                        if DEGENERATE_BASES & set(this_seq_obj.seq):
                            degenerate_base_used = True
                        out_fasta = prefix + ".graph" + str(go_res) + other_tag + "." + str(count_path) + \
                            ".path_sequence.fasta"
                        with open(out_fasta, "w") as out_handle:
                            out_handle.write(this_seq_obj.fasta_str())

                        # print ir stat (first path of a plastome only)
                        if count_path == 1 and mode == "embplant_pt":
                            detect_seq = this_seq_obj.seq
                            ir_stats = detect_plastome_architecture(detect_seq, 1000)
                            print_str = "Detecting large repeats (>1000 bp) in PATH1 with " + ir_stats[-1] + \
                                        ", Total:LSC:SSC:Repeat(bp) = " + str(len(detect_seq)) + ":" + \
                                        ":".join([str(len_val) for len_val in ir_stats[:3]])
                            if inner_logging:
                                inner_logging.info(print_str)
                            else:
                                sys.stdout.write(print_str + "\n")
                    idealized_graph.write_to_gfa(prefix + ".graph" + str(go_res) + ".selected_graph.gfa")

            if degenerate_base_used:
                # inner_logging defaults to None; every other message in this
                # function falls back to stdout, so this one must as well.
                if inner_logging:
                    inner_logging.warning("Degenerate base(s) used!")
                else:
                    sys.stdout.write("Warning: Degenerate base(s) used!\n")
            time_d = time.time()
            if inner_logging:
                inner_logging.info(">>> Solving and unfolding graph finished: " + str(round(time_d - time_c, 4)) + "s")
            else:
                sys.stdout.write("\n\n>>> Solving and unfolding graph finished: " +
                                 str(round(time_d - time_c, 4)) + "s\n")

    try:
        disentangle_circular_assembly(options.fastg_file, options.tab_file,
                                      os.path.join(options.output_directory, options.prefix),
                                      type_factor=options.type_factor,
                                      mode=options.mode,
                                      weight_factor=options.weight_factor,
                                      log_hard_cov_threshold=options.depth_factor,
                                      contamination_depth=options.contamination_depth,
                                      contamination_similarity=options.contamination_similarity,
                                      degenerate=options.degenerate,
                                      degenerate_depth=options.degenerate_depth,
                                      degenerate_similarity=options.degenerate_similarity,
                                      expected_max_size=options.expected_max_size,
                                      expected_min_size=options.expected_min_size,
                                      min_sigma_factor=options.min_sigma_factor,
                                      max_copy_in=options.max_multiplicity,
                                      only_max_cov=options.only_keep_max_cov,
                                      acyclic_allowed=options.acyclic_allowed,
                                      keep_temp=options.keep_temp_graph,
                                      inner_logging=log_handler,
                                      verbose=options.verbose,
                                      debug=options.debug)
        log_handler = simple_log(logging.getLogger(), options.output_directory, options.prefix + ".disentangle.")
        log_handler.info('\nTotal cost: ' + str(round(time.time() - time0, 4)) + 's\n')
    except IOError:
        # I/O problems are left to the caller; bare raise keeps the traceback untouched.
        raise
    except KeyError as e:
        if str(e).strip("'") == options.mode:
            # The requested mode is the missing key.
            log_handler.error(options.mode + " not found in " + str(options.tab_file) + "!")
            log_handler.error("Disentangling failed!")
        else:
            log_handler.exception(str(e))
            log_handler.error("Disentangling failed!")
            if not options.acyclic_allowed:
                log_handler.info("You might try again with '--linear' to export contig(s) "
                                 "instead of circular genome.")
            log_handler = simple_log(log_handler, options.output_directory, options.prefix + ".disentangle.")
            log_handler.info("\nTotal cost " + str(time.time() - time0))
            log_handler.info("Please email [email protected] if you find bugs!\n")
    except Exception as e:
        log_handler.exception(str(e))
        log_handler.error("Disentangling failed!")
        if not options.acyclic_allowed:
            log_handler.info("You might try again with '--linear' to export contig(s) "
                             "instead of circular genome.")
        log_handler = simple_log(log_handler, options.output_directory, options.prefix + ".disentangle.")
        log_handler.info("\nTotal cost " + str(time.time() - time0))
        log_handler.info("Please email [email protected] if you find bugs!\n")
    logging.shutdown()
def require_options():
    """Parse the command line of rm_low_coverage_duplicated_contigs.py.

    Besides parsing, this locates the BLAST binaries: when ``--which-blast``
    is not given, the ``blastn`` under ``GO_DEP_PATH/ncbi-blast`` is tried
    first.  The process exits when ``blastn`` or ``makeblastdb`` is not
    executable from the resulting directory.

    :return: ``(options, options.assemblies)`` - the parsed namespace and the
        list of input FASTG files.
    """
    usage = "Usage: rm_low_coverage_duplicated_contigs.py *.fastg"
    parser = ArgumentParser(usage=usage)
    parser.add_argument(
        'assemblies', metavar='assemblies', type=str, nargs='+',
        help="Input FASTG format assembly graph files (split the files by spaces).")
    # type=float: the defaults are floats, so values given on the command line
    # must be converted too instead of arriving as str.
    parser.add_argument(
        '--cov-t', dest='coverage_threshold', default=0.12, type=float,
        help='With ratio (coverage of query/coverage of subject) below which, '
             'the query would be exposed to discarded. Default: 0.12')
    parser.add_argument(
        '--len-t', dest='length_threshold', default=0.9, type=float,
        help='With overlap (length of hit of query/ length of query) above which, '
             'the query would be exposed to discarded. Default: 0.9')
    parser.add_argument('--blur', dest='blur_bases', default=False, action='store_true',
                        help='Replace hit low-coverage bases with N.')
    parser.add_argument('--keep-temp', dest='keep_temp', default=False, action='store_true',
                        help='Keep temp blast files.')
    parser.add_argument(
        "--which-blast", dest="which_blast", default="",
        help="Assign the path to BLAST binary files if not added to the path.")
    parser.add_argument(
        '-o', dest='output_dir',
        help='Output directory. Default: along with the original file')
    parser.add_argument('-t', '--threads', dest="threads", default=4, type=int,
                        help="Threads of blastn.")
    parser.add_argument(
        "-v", "--version", action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()

    if not options.assemblies:
        parser.print_help()
        sys.stdout.write(
            '\n######################################\nERROR: Insufficient REQUIRED arguments!\n\n')
        exit()

    if not options.which_blast:
        # Prefer the bundled blastn if it exists and actually runs.
        try_this_bin = os.path.join(GO_DEP_PATH, "ncbi-blast", "blastn")
        if os.path.isfile(try_this_bin) and executable(try_this_bin):
            output, err = subprocess.Popen(
                try_this_bin + " -version", stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                shell=True).communicate()
            version_text = output.decode("utf8")
            if "not found" in version_text:
                sys.stdout.write(version_text + "\n")
            else:
                options.which_blast = os.path.split(try_this_bin)[0]

    for blast_tool in ("blastn", "makeblastdb"):
        tool_path = os.path.join(options.which_blast, blast_tool)
        if not executable(tool_path):
            sys.stdout.write(tool_path + " not accessible!")
            exit()

    # No argument of this parser defines `treat_no_hits`, so reading the
    # attribute directly raised AttributeError on every run.  Validate it only
    # if it is actually present on the namespace.
    # NOTE(review): confirm whether a `treat_no_hits` option is meant to exist here.
    treat_no_hits = getattr(options, "treat_no_hits", None)
    if treat_no_hits is not None and treat_no_hits not in ["ex_no_con", "ex_no_hit", "keep_all"]:
        sys.stdout.write(
            '\n\nOption Error: you should choose assign one of "ex_no_con", "ex_no_hit"'
            ' and "keep_all" to variable treat_no_hits\n')
        exit()
    return options, options.assemblies
def get_options():
    """Parse the evaluate_assembly_using_mapping.py command line and set up logging.

    After parsing, this

    * exits when the fasta, the reads (paired ``-1``/``-2`` or unpaired ``-u``)
      or the output folder is missing,
    * creates the output folder if it does not exist,
    * opens a log via ``simple_log`` and records Python, platform, library and
      Bowtie2 version information,
    * resolves the Bowtie2 directory (bundled copy under ``GO_DEP_PATH`` first
      when ``--which-bowtie2`` is not given) and exits if ``bowtie2`` or
      ``bowtie2-build`` is not executable.

    :return: ``(options, log_handler)`` where ``log_handler`` is the object
        returned by ``timed_log``.
    """
    # NOTE(review): the string is passed positionally, which argparse takes as
    # `prog` rather than `usage`; left unchanged to keep the help output as is.
    parser = ArgumentParser(
        "evaluate_assembly_using_mapping.py -f fasta_file -1 RAW_1.fq -2 RAW_2.fq -o output")
    parser.add_argument("-f", dest="fasta", help="input assembly fasta file.")
    parser.add_argument("-1", dest="original_fq_1")
    parser.add_argument("-2", dest="original_fq_2")
    parser.add_argument(
        "-u", dest="unpaired_fq_files", default="",
        help="Input file(s) with unpaired (single-end) reads to be added to the pool. "
             "files could be comma-separated lists such as 'seq1,seq2'.")
    parser.add_argument(
        "-X", "--max-lib-len", dest="max_lib_len", type=int, default=1200,
        help="Corresponding to '-X' option in Bowtie2. Default: %(default)s.")
    parser.add_argument(
        "-c", dest="is_circular", default="auto",
        help="(yes/no/auto) input fasta is circular. "
             "If auto was chosen, the input fasta would be treated as circular when the sequence name "
             "ends with '(circular)'. "
             "Default: auto")
    parser.add_argument("-o", dest="output_base", help="output folder.")
    parser.add_argument("-t", dest="threads", type=int, default=2, help="threads.")
    parser.add_argument("--continue", dest="resume", default=False, action="store_true")
    parser.add_argument(
        "--seed", dest="random_seed", default=12345, type=int,
        help="Seed for random number generator. Default: %(default)s")
    parser.add_argument(
        "--draw", dest="draw_plot", default=False, action="store_true",
        help="Draw density plot using matplotlib, which should be installed.")
    parser.add_argument("--plot-format", dest="plot_format", default="pdf,png",
                        help='Default: pdf,png')
    parser.add_argument("--plot-title", dest="plot_title",
                        help="Default: `the file name of the input fasta`")
    parser.add_argument("--plot-subtitle", dest="plot_subtitle", default="",
                        help="A 4-space indicates a line break. Default: None")
    parser.add_argument("--plot-transparent", dest="plot_transparent", default=False, action="store_true",
                        help="Default: False")
    parser.add_argument("--plot-x-density", dest="plot_x_density", default=12000., type=float,
                        help="Default: %(default)s")
    parser.add_argument(
        "--plot-x-gap-dots", dest="gap_len", default=3000, type=int,
        help="Number of sites added in-between isolated contigs. Default: %(default)s")
    parser.add_argument("--plot-figure-height", dest="figure_height", default=5., type=float,
                        help="Default: %(default)s")
    parser.add_argument("--plot-y-lim", dest="y_lim", type=float,
                        help="Y axis value limit. ")
    parser.add_argument(
        "--plot-font", dest="plot_font", default=None,
        help="For plot of unicode characters in some environments. Use 'Times New Roman','Arial' etc. "
             "Default: %(default)s.")
    # store_false: with default=True and action="store_true" the flag was a
    # no-op and the customized error rate could never be switched off.
    parser.add_argument("--disable-customized-error-rate", dest="customized_error_rate",
                        default=True, action="store_false")
    parser.add_argument(
        "--which-bowtie2", dest="which_bowtie2", default="",
        help="Assign the path to Bowtie2 binary files if not added to the path. "
             "Default: try GetOrganelleDep/" + SYSTEM_NAME + "/bowtie2 first, then $PATH")
    parser.add_argument("--bowtie2-mode", dest="bowtie2_mode", default="--sensitive",
                        help="Default: %(default)s")
    parser.add_argument("--bowtie2-options", dest="other_bowtie2_options",
                        default="--no-discordant --dovetail",
                        help="Default: %(default)s")
    parser.add_argument(
        "--stat-mode", dest="stat_mode", default="best",
        help="Statistical mode for counting multiple hits of a single read: best/all. "
             "The all mode is meaningful only when '-k <INT>' was included in '--bowtie2-options'. "
             "Default: %(default)s")
    parser.add_argument("--debug", dest="debug_mode", default=False, action="store_true",
                        help="Turn on debug mode.")
    parser.add_argument(
        "-v", "--version", action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()

    # Required: a fasta, reads (a pair or unpaired files) and an output folder.
    if not (options.fasta
            and ((options.original_fq_1 and options.original_fq_2) or options.unpaired_fq_files)
            and options.output_base):
        sys.stderr.write("Insufficient arguments!\n")
        sys.exit()
    if not os.path.isdir(options.output_base):
        os.mkdir(options.output_base)
    if options.debug_mode:
        log_level = "DEBUG"
    else:
        log_level = "INFO"
    assert options.stat_mode in ("best", "all")
    log_handler = simple_log(logging.getLogger(), options.output_base, "", log_level=log_level)
    log_handler.info("")
    log_handler.info("Python " + str(sys.version).replace("\n", " "))
    log_handler.info("PLATFORM: " + " ".join(platform.uname()))

    # log versions of python libs
    lib_versions_info = []
    if options.draw_plot:
        try:
            import matplotlib
        except ImportError:
            # Deliberately ignored here: only the version line is skipped.
            pass
        else:
            lib_versions_info.append("matplotlib " + matplotlib.__version__)
    lib_versions_info.append("GetOrganelleLib " + GetOrganelleLib.__version__)
    log_handler.info("PYTHON LIBS: " + "; ".join(lib_versions_info))

    # log versions of dependencies
    dep_versions_info = []
    if not options.which_bowtie2:
        # Prefer the bundled Bowtie2 when it is present and executable.
        try_this_bin = os.path.join(GO_DEP_PATH, "bowtie2", "bowtie2")
        if os.path.isfile(try_this_bin) and executable(try_this_bin):
            options.which_bowtie2 = os.path.split(try_this_bin)[0]
    if not executable(os.path.join(options.which_bowtie2, "bowtie2")):
        log_handler.error(os.path.join(options.which_bowtie2, "bowtie2") + " not accessible!")
        exit()
    else:
        output, err = subprocess.Popen(
            os.path.join(options.which_bowtie2, "bowtie2") + " --version",
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True).communicate()
        this_lines = output.decode("utf8").split("\n")[:3]
        # Last whitespace-separated token of the first output line.
        dep_versions_info.append("Bowtie2 " + this_lines[0].split()[-1].strip())
    if not executable(os.path.join(options.which_bowtie2, "bowtie2-build") + " --large-index"):
        log_handler.error(os.path.join(options.which_bowtie2, "bowtie2-build") + " not accessible!")
        exit()
    log_handler.info("DEPENDENCIES: " + "; ".join(dep_versions_info))
    log_handler.info("WORKING DIR: " + os.getcwd())
    # Record the command line, quoting arguments that contain spaces.
    log_handler.info(" ".join(
        ["\"" + arg + "\"" if " " in arg else arg for arg in sys.argv]) + "\n")
    log_handler = timed_log(log_handler, options.output_base, "", log_level=log_level)
    return options, log_handler
def main():
    """Run the disentangling workflow for one assembly graph and log the outcome.

    Builds the title banner, obtains ``options`` and a log handler from
    ``get_options`` and calls the nested ``disentangle_circular_assembly``
    (wrapped by ``set_time_limit(options.time_limit)``).  Any ``Exception``
    raised by it is reported through the log handler instead of propagating,
    and ``logging.shutdown()`` is called at the end.
    """
    time0 = time.time()
    print_title = "GetOrganelle v" + str(get_versions()) + \
                  "\n\nThis is a script for extracting organelle genomes" \
                  " from slim_fastg.py-produced files (csv & fastg). " + \
                  "\nBy [email protected]\n\n"
    options, log_handler = get_options(print_title)

    @set_time_limit(options.time_limit)
    def disentangle_circular_assembly(fastg_file, tab_file, prefix, weight_factor, type_factor, mode="embplant_pt",
                                      log_hard_cov_threshold=10., expected_max_size=inf, expected_min_size=0,
                                      contamination_depth=3., contamination_similarity=5.,
                                      degenerate=True, degenerate_depth=1.5, degenerate_similarity=1.5,
                                      min_sigma_factor=0.1,
                                      only_max_c=True, keep_temp=False, acyclic_allowed=False,
                                      verbose=False, log_handler=None, debug=False):
        """Select the target graph(s) from ``fastg_file`` and export their paths.

        For every result returned by ``Assembly.find_target_graph`` this writes
        ``<prefix>.graph<N><tag>.<M>.path_sequence.fasta`` for each path and
        ``<prefix>.graph<N>.selected_graph.gfa`` for the graph itself.  When
        ``options.resume`` is set and ``<prefix>.graph1.selected_graph.gfa``
        already exists, nothing is recomputed.

        Messages go to ``log_handler`` when it is given, otherwise to stdout.
        Most other keyword arguments are forwarded to ``find_target_graph``.
        """
        if options.resume and os.path.exists(prefix + ".graph1.selected_graph.gfa"):
            if log_handler:
                log_handler.info(">>> Result graph existed!")
            else:
                sys.stdout.write(">>> Result graph existed!\n")
        else:
            time_a = time.time()
            if log_handler:
                log_handler.info(">>> Parsing " + fastg_file + " ..")
            else:
                sys.stdout.write("Parsing " + fastg_file + " ..\n")
            input_graph = Assembly(fastg_file, min_cov=options.min_cov, max_cov=options.max_cov)
            time_b = time.time()
            if log_handler:
                log_handler.info(">>> Parsing input fastg file finished: " + str(round(time_b - time_a, 4)) + "s")
            else:
                sys.stdout.write("\n>>> Parsing input fastg file finished: " + str(round(time_b - time_a, 4)) + "s\n")
            temp_graph = prefix + ".temp.fastg" if keep_temp else None

            copy_results = input_graph.find_target_graph(
                tab_file, mode=mode,
                type_factor=type_factor,
                weight_factor=weight_factor,
                log_hard_cov_threshold=log_hard_cov_threshold,
                contamination_depth=contamination_depth,
                contamination_similarity=contamination_similarity,
                degenerate=degenerate,
                degenerate_depth=degenerate_depth,
                degenerate_similarity=degenerate_similarity,
                expected_max_size=expected_max_size,
                expected_min_size=expected_min_size,
                only_keep_max_cov=only_max_c,
                min_sigma_factor=min_sigma_factor,
                temp_graph=temp_graph,
                broken_graph_allowed=acyclic_allowed,
                verbose=verbose,
                log_handler=log_handler,
                debug=debug)
            time_c = time.time()
            if log_handler:
                log_handler.info(">>> Detecting target graph finished: " + str(round(time_c - time_b, 4)) + "s")
                if len(copy_results) > 1:
                    log_handler.info(str(len(copy_results)) + " set(s) of graph detected.")
            else:
                sys.stdout.write("\n\n>>> Detecting target graph finished: " + str(round(time_c - time_b, 4)) + "s\n")
                if len(copy_results) > 1:
                    sys.stdout.write(str(len(copy_results)) + " set(s) of graph detected.\n")

            degenerate_base_used = False
            if acyclic_allowed:
                # Broken (acyclic) graphs: each "path" is a list of contigs.
                for go_res, copy_res in enumerate(copy_results, 1):
                    broken_graph = copy_res["graph"]
                    count_path = 0
                    for this_paths, other_tag in broken_graph.get_all_paths(mode=mode, log_handler=log_handler):
                        count_path += 1
                        all_contig_str = []
                        contigs_are_circular = []
                        for go_contig, this_p_part in enumerate(this_paths):
                            this_contig = broken_graph.export_path(this_p_part)
                            if DEGENERATE_BASES & set(this_contig.seq):
                                degenerate_base_used = True
                            contigs_are_circular.append(this_contig.label.endswith("(circular)"))
                            if len(this_paths) == 1 and contigs_are_circular[-1]:
                                all_contig_str.append(this_contig.fasta_str())
                            else:
                                all_contig_str.append(">contig_" + str(go_contig + 1) + "--" + this_contig.label +
                                                      "\n" + this_contig.seq + "\n")
                        out_fasta = prefix + ".graph" + str(go_res) + other_tag + "." + str(count_path) + \
                            ".path_sequence.fasta"
                        # Context manager so the handle is flushed and closed deterministically.
                        with open(out_fasta, "w") as out_handle:
                            out_handle.write("\n".join(all_contig_str))
                    broken_graph.write_to_gfa(prefix + ".graph" + str(go_res) + ".selected_graph.gfa")
            else:
                for go_res, copy_res in enumerate(copy_results, 1):
                    idealized_graph = copy_res["graph"]
                    # should add making one-step-inversion pairs for paths,
                    # which would be used to identify existence of a certain isomer using mapping information
                    count_path = 0
                    for this_path, other_tag in idealized_graph.get_all_circular_paths(mode=mode,
                                                                                       log_handler=log_handler):
                        count_path += 1
                        this_seq_obj = idealized_graph.export_path(this_path)
                        if DEGENERATE_BASES & set(this_seq_obj.seq):
                            degenerate_base_used = True
                        out_fasta = prefix + ".graph" + str(go_res) + other_tag + "." + str(count_path) + \
                            ".path_sequence.fasta"
                        with open(out_fasta, "w") as out_handle:
                            out_handle.write(this_seq_obj.fasta_str())
                    idealized_graph.write_to_gfa(prefix + ".graph" + str(go_res) + ".selected_graph.gfa")

            if degenerate_base_used:
                # log_handler defaults to None; every other message in this
                # function falls back to stdout, so this one must as well.
                if log_handler:
                    log_handler.warning("Degenerate base(s) used!")
                else:
                    sys.stdout.write("Warning: Degenerate base(s) used!\n")
            time_d = time.time()
            if log_handler:
                log_handler.info(">>> Solving and unfolding graph finished: " + str(round(time_d - time_c, 4)) + "s")
            else:
                sys.stdout.write("\n\n>>> Solving and unfolding graph finished: " +
                                 str(round(time_d - time_c, 4)) + "s\n")

    try:
        disentangle_circular_assembly(options.fastg_file, options.tab_file,
                                      os.path.join(options.output_directory, options.prefix),
                                      type_factor=options.type_factor,
                                      mode=options.mode,
                                      weight_factor=options.weight_factor,
                                      log_hard_cov_threshold=options.depth_factor,
                                      contamination_depth=options.contamination_depth,
                                      contamination_similarity=options.contamination_similarity,
                                      degenerate=options.degenerate,
                                      degenerate_depth=options.degenerate_depth,
                                      degenerate_similarity=options.degenerate_similarity,
                                      expected_max_size=options.expected_max_size,
                                      expected_min_size=options.expected_min_size,
                                      min_sigma_factor=options.min_sigma_factor,
                                      only_max_c=options.only_keep_max_cov,
                                      acyclic_allowed=options.acyclic_allowed,
                                      keep_temp=options.keep_temp_graph,
                                      log_handler=log_handler,
                                      verbose=options.verbose,
                                      debug=options.debug)
        log_handler = simple_log(logging.getLogger(), options.output_directory, options.prefix + ".disentangle.")
        log_handler.info('\nTotal cost: ' + str(round(time.time() - time0, 4)) + 's\n')
    except Exception as e:
        # Top-level boundary: report the failure through the log, do not propagate.
        if options.debug:
            log_handler.exception("")
        else:
            log_handler.exception(str(e))
        log_handler.exception("Disentangling failed!")
        if not options.acyclic_allowed:
            log_handler.info("You might try again with '--linear' to export contig(s) instead of circular genome.")
        log_handler = simple_log(log_handler, options.output_directory, options.prefix + ".disentangle.")
        log_handler.info("\nTotal cost " + str(time.time() - time0))
        log_handler.info("Please email [email protected] if you find bugs!\n")
    logging.shutdown()
def get_options(description):
    """Parse and validate the command line of get_organelle_config.py.

    Besides parsing, this function
      * redirects the module-level database paths when ``--config-dir`` is given
        and creates the database directories if they are missing,
      * handles ``--list`` (prints the VERSION records and exits),
      * detects Bowtie2/BLAST and exits if either is unavailable,
      * normalizes ``-a``/``-r`` into lists of organelle types,
      * resolves ``options.db_version`` ("customized" for ``--use-local``,
        otherwise the requested or the latest online version).

    :param description: text used as the ArgumentParser description.
    :return: the validated ``options`` namespace.
    """
    global _GO_PATH, _LBL_DB_PATH, _SEQ_DB_PATH

    parser = ArgumentParser(
        description=description,
        usage="get_organelle_config.py -a embplant_pt,embplant_mt")
    parser.add_argument(
        "-a", "--add", dest="add_organelle_type",
        help="Add database for organelle type(s). Followed by any of all/" +
             "/".join(ORGANELLE_TYPE_LIST) +
             " or multiple types joined by comma such as "
             "embplant_pt,embplant_mt,fungus_mt.")
    parser.add_argument(
        "--use-version", dest="db_version", default="latest",
        help="The version of database to add. "
             "Find more versions at github.com/Kinggerm/GetOrganelleDB. "
             "Default: %(default)s")
    parser.add_argument(
        "-r", "--rm", dest="rm_organelle_type",
        help="Remove local database(s) for organelle type(s). Followed by any of all/" +
             "/".join(ORGANELLE_TYPE_LIST) +
             " or multiple types joined by comma "
             "such as embplant_pt,embplant_mt.")
    parser.add_argument(
        "--update", dest="update", default=False, action="store_true",
        help="Update local databases to the latest online version, or the local version "
             "if \"--use-local LOCAL_DB_PATH\" provided.")
    parser.add_argument(
        "--config-dir", dest="get_organelle_path", default=None,
        help="The directory where the default databases were placed. "
             "The default value also can be changed by adding 'export GETORG_PATH=your_favor' "
             "to the shell script (e.g. ~/.bash_profile or ~/.bashrc) "
             "Default: " + GO_PATH)
    parser.add_argument(
        "--use-local", dest="use_local",
        help="Input a path. This local database path must include subdirectories "
             "LabelDatabase and SeedDatabase, under which there is the fasta file(s) named by the "
             "organelle type you want add, such as fungus_mt.fasta. ")
    parser.add_argument(
        "--clean", dest="clean", default=False, action="store_true",
        help="Remove all configured database files (==\"--rm all\").")
    parser.add_argument(
        "--list", dest="list_available", default=False, action="store_true",
        help="List configured databases checking and exit. ")
    parser.add_argument(
        "--check", dest="check", default=False, action="store_true",
        help="Check configured database files and exit. ")
    parser.add_argument(
        "--db-type", dest="db_type", default="both",
        help="The database type (seed/label/both). Default: %(default)s")
    parser.add_argument(
        "--which-blast", dest="which_blast", default="",
        help="Assign the path to BLAST binary files if not added to the path. "
             "Default: try \"" + os.path.realpath("GetOrganelleDep") + "/" + SYSTEM_NAME +
             "/ncbi-blast\" first, then $PATH")
    parser.add_argument(
        "--which-bowtie2", dest="which_bowtie2", default="",
        help="Assign the path to Bowtie2 binary files if not added to the path. "
             "Default: try \"" + os.path.realpath("GetOrganelleDep") + "/" + SYSTEM_NAME +
             "/bowtie2\" first, then $PATH")
    parser.add_argument(
        "--verbose", dest="verbose", default=False, action="store_true",
        help="verbose output to the screen. Default: %(default)s")
    parser.add_argument(
        "-v", "--version", action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()

    # Explicit check instead of `assert`, which is stripped when running with -O.
    if options.db_type not in ("seed", "label", "both"):
        sys.stdout.write("Illegal --db-type input! Database type must be one of seed/label/both!\n")
        sys.exit()

    if options.get_organelle_path:
        _GO_PATH = os.path.expanduser(options.get_organelle_path)
        # Always follow the user-chosen directory, even if it does not exist yet:
        # it is created right below, and the databases must live inside it.
        _LBL_DB_PATH = os.path.join(_GO_PATH, LBL_NAME)
        _SEQ_DB_PATH = os.path.join(_GO_PATH, SEQ_NAME)
    # check directories
    if not os.path.isdir(_GO_PATH):
        os.mkdir(_GO_PATH)
    if not os.path.isdir(_LBL_DB_PATH):
        os.mkdir(_LBL_DB_PATH)
    if not os.path.isdir(_SEQ_DB_PATH):
        os.mkdir(_SEQ_DB_PATH)

    # only print
    if options.list_available:
        if options.db_type in ("seed", "both"):
            version_file = os.path.join(_SEQ_DB_PATH, "VERSION")
            if os.path.isfile(version_file):
                with open(version_file) as open_version:
                    for line in open_version:
                        db_type, db_version, db_hash = line.strip().split("\t")
                        db_version = find_version(db_type, db_hash, SEED_DB_HASH, db_version)
                        sys.stdout.write(db_type + " Seed Database:\t" + db_version + "\t" + db_hash + "\n")
        if options.db_type in ("label", "both"):
            version_file = os.path.join(_LBL_DB_PATH, "VERSION")
            if os.path.isfile(version_file):
                with open(version_file) as open_version:
                    for line in open_version:
                        db_type, db_version, db_hash = line.strip().split("\t")
                        db_version = find_version(db_type, db_hash, LABEL_DB_HASH, db_version)
                        sys.stdout.write(db_type + " Label Database:\t" + db_version + "\t" + db_hash + "\n")
        sys.exit()

    sys.stdout.write("\nPython " + str(sys.version).replace("\n", " ") + "\n")
    options.which_bowtie2 = detect_bowtie2_path(options.which_bowtie2, GO_DEP_PATH)
    options.which_blast = detect_blast_path(options.which_blast, GO_DEP_PATH)
    bowtie2_v = detect_bowtie2_version(options.which_bowtie2)
    if bowtie2_v.endswith("N/A"):
        sys.stdout.write("ERROR: Bowtie2 is not available!\n")
        sys.exit()
    blast_v = detect_blast_version(options.which_blast)
    if blast_v.endswith("N/A"):
        sys.stdout.write("ERROR: Blast is not available!\n")
        sys.exit()
    sys.stdout.write("DEPENDENCIES: " + "; ".join([bowtie2_v, blast_v]) + "\n")
    sys.stdout.write("WORKING DIR: " + os.getcwd() + "\n")
    sys.stdout.write(" ".join(
        ["\"" + arg + "\"" if " " in arg else arg for arg in sys.argv]) + "\n\n")

    if not (options.add_organelle_type or options.rm_organelle_type or options.update or options.clean):
        parser.print_help()
        sys.stdout.write("Insufficient arguments!\n")
        sys.exit()

    # At most one of the four actions may be requested per run.
    mutually_exclusive_options = [(options.add_organelle_type, "adding"),
                                  (options.rm_organelle_type, "removing"),
                                  (options.update, "updating"),
                                  (options.clean, "cleaning")]
    requested_actions = [config_name for config_mode, config_name in mutually_exclusive_options
                         if config_mode]
    if len(requested_actions) > 1:
        sys.stdout.write(requested_actions[0] + " and " + requested_actions[1] +
                         " are mutually exclusive!\n")
        sys.exit()

    if options.add_organelle_type:
        options.add_organelle_type = options.add_organelle_type.split(",")
        for sub_type in options.add_organelle_type:
            if sub_type == "all":
                options.add_organelle_type = list(ORGANELLE_TYPE_LIST)
                break
            elif sub_type not in ORGANELLE_TYPE_SET:
                sys.stdout.write("Illegal 'adding' type: " + sub_type + "! "
                                 "Types must be one of all/" + "/".join(ORGANELLE_TYPE_LIST) + "!\n")
                sys.exit()
    if options.rm_organelle_type:
        options.rm_organelle_type = options.rm_organelle_type.split(",")
        for sub_type in options.rm_organelle_type:
            if sub_type == "all":
                # "--rm all" is an alias of "--clean".
                options.clean = True
                break
            elif sub_type not in ORGANELLE_TYPE_SET:
                sys.stdout.write("Illegal 'removing' type: " + sub_type + "! "
                                 "Types must be one of all/" + "/".join(ORGANELLE_TYPE_LIST) + "!\n")
                sys.exit()

    if options.use_local:
        if not os.path.isdir(options.use_local):
            raise NotADirectoryError(options.use_local)
        if options.add_organelle_type:
            # Both the seed and the label fasta must exist for every type to add.
            for sub_type in options.add_organelle_type:
                this_fas_f = os.path.join(options.use_local, SEQ_NAME, sub_type + ".fasta")
                if not os.path.isfile(this_fas_f):
                    sys.stdout.write("File " + this_fas_f + " not available!\n")
                    sys.exit()
                this_fas_f = os.path.join(options.use_local, LBL_NAME, sub_type + ".fasta")
                if not os.path.isfile(this_fas_f):
                    sys.stdout.write("File " + this_fas_f + " not available!\n")
                    sys.exit()
        options.db_version = "customized"
        sys.stdout.write("Use local database: " + options.use_local + "\n")
    else:
        if options.update:
            options.db_version = "latest"
        if options.db_version == "latest":
            # Ask the remote repository which version "latest" currently is.
            remote_quest = get_static_html_context(
                VERSION_URLS[0], verbose=options.verbose, alternative_url_list=VERSION_URLS[1:])
            if remote_quest["status"]:
                options.db_version = remote_quest["content"].strip()
            else:
                sys.stderr.write("Error: " + remote_quest["info"] + "\n")
                sys.stderr.write("Please check your connection to github/gitee!\n")
                sys.stdout.write(
                    "\nYou can download the database files from www.github.com/Kinggerm/GetOrganelleDB "
                    "and install it from from local (flag --use-local)\n")
                sys.exit()
        if options.db_version not in SEED_DB_HASH or options.db_version not in LABEL_DB_HASH:
            sys.stderr.write(
                "GetOrganelle v{} does not support Database v{}\n".format(
                    get_versions(), options.db_version) +
                "Please upgrade GetOrganelle (recommended) "
                "or degrade the Database version (not recommended; --use-version)\n")
            sys.exit()
    return options
def _install_local_db(db, sub_o_type, local_root, skip_unchanged):
    """Install one organelle database from a local database directory.

    :param db: per-kind settings dict built in main() (keys: name, path, sub_dir,
        hashes, urls, existing, version_f, initialize).
    :param sub_o_type: organelle type, used as the fasta base name.
    :param local_root: the ``--use-local`` directory.
    :param skip_unchanged: if True and the local fasta has the same sha256 as the
        recorded one, only (re)initialize without overwriting.
    """
    target_output = os.path.join(db["path"], sub_o_type + ".fasta")
    update_to_fa = os.path.join(local_root, db["sub_dir"], sub_o_type + ".fasta")
    if not os.path.exists(update_to_fa):
        sys.stdout.write("Warning: " + update_to_fa + " not available!\n")
        return
    new_hash_val = cal_f_sha256(update_to_fa)
    if skip_unchanged and new_hash_val == db["existing"][sub_o_type]["sha256"]:
        db["initialize"](target_output, False)
        return
    db["existing"][sub_o_type] = \
        {"version": find_version(sub_o_type, new_hash_val, db["hashes"]), "sha256": new_hash_val}
    # No copy needed when the local source directory already is the database directory.
    if os.path.realpath(os.path.split(update_to_fa)[0]) != os.path.realpath(db["path"]):
        copy(update_to_fa, db["path"])
    db["initialize"](target_output, True)


def _install_remote_db(db, sub_o_type, db_version, time_out, verbose):
    """Download and initialize one organelle database; return True on success.

    On download failure a message is printed, the version record in
    ``db["existing"]`` is left untouched and False is returned.
    """
    target_output = os.path.join(db["path"], sub_o_type + ".fasta")
    these_urls = [sub_url.format(db_version, sub_o_type) for sub_url in db["urls"]]
    check_sha256 = db["hashes"][db_version][sub_o_type]["sha256"]
    status = download_file_with_progress(
        remote_url=these_urls[0], output_file=target_output, sha256_v=check_sha256,
        timeout=time_out, alternative_url_list=these_urls[1:], verbose=verbose)
    if not status["status"]:
        sys.stdout.write("Installing %s %s Database failed: %s\n"
                         % (sub_o_type, db["name"], status["info"]))
        return False
    db["initialize"](target_output, True)
    db["existing"][sub_o_type] = {"version": db_version, "sha256": check_sha256}
    return True


def main():
    """Entry point of get_organelle_config.py: clean, remove, update or add databases.

    The seed database (if selected by --db-type) is always processed before the
    label database. The VERSION file of a database kind is rewritten after each
    organelle type is handled; a failed download skips that rewrite.
    """
    time_start = time.time()
    description = "get_organelle_config.py " + get_versions() + \
                  " is used for setting up default GetOrganelle database."
    options = get_options(description=description)
    existing_seed_db, existing_label_db = get_current_db_versions(
        options.db_type, seq_db_path=_SEQ_DB_PATH, lbl_db_path=_LBL_DB_PATH,
        clean_mode=options.clean, check_hash=options.check)
    time_out = 100000

    def init_seed(fasta_f, overwrite):
        initialize_seed_database(
            which_bowtie2=options.which_bowtie2, fasta_f=fasta_f,
            overwrite=overwrite, verbose=options.verbose)

    def init_label(fasta_f, overwrite):
        initialize_notation_database(
            which_blast=options.which_blast, fasta_f=fasta_f,
            overwrite=overwrite, verbose=options.verbose)

    # One settings dict per selected database kind, so that seed and label share
    # a single code path and cannot be mixed up.
    db_kinds = []
    if options.db_type in ("seed", "both"):
        db_kinds.append({"name": "Seed", "path": _SEQ_DB_PATH, "sub_dir": SEQ_NAME,
                         "hashes": SEED_DB_HASH, "urls": seed_url_temp,
                         "existing": existing_seed_db,
                         "version_f": os.path.join(_SEQ_DB_PATH, "VERSION"),
                         "initialize": init_seed})
    if options.db_type in ("label", "both"):
        db_kinds.append({"name": "Label", "path": _LBL_DB_PATH, "sub_dir": LBL_NAME,
                         "hashes": LABEL_DB_HASH, "urls": label_url_temp,
                         "existing": existing_label_db,
                         "version_f": os.path.join(_LBL_DB_PATH, "VERSION"),
                         "initialize": init_label})

    # Case 1: remove every configured database and its VERSION file
    if options.clean:
        for db in db_kinds:
            for rm_o_type in sorted(db["existing"]):
                rm_files(db["path"], file_name_prefix=rm_o_type)
            if os.path.isfile(db["version_f"]):
                os.remove(db["version_f"])

    # Case 2: remove selected databases
    # NOTE(review): "--rm all" sets options.clean and still reaches this branch with
    # the literal type "all"; confirm that rewriting VERSION here is intended.
    if options.rm_organelle_type:
        for db in db_kinds:
            for rm_o_type in options.rm_organelle_type:
                if rm_o_type in db["existing"]:
                    rm_files(db["path"], file_name_prefix=rm_o_type)
                    del db["existing"][rm_o_type]
                else:
                    sys.stdout.write("Warning: " + rm_o_type + " " + db["name"] +
                                     " Database not found!\n")
                write_version_file(version_dict=db["existing"], output_to_file=db["version_f"])

    # Case 3: update the already installed databases
    if options.update:
        for db in db_kinds:
            for sub_o_type in ORGANELLE_TYPE_LIST:
                if sub_o_type in db["existing"]:
                    if options.use_local:
                        _install_local_db(db, sub_o_type, options.use_local, skip_unchanged=True)
                    elif db["existing"][sub_o_type]["version"] == options.db_version:
                        # Already at the requested version: make sure it is initialized.
                        db["initialize"](os.path.join(db["path"], sub_o_type + ".fasta"), False)
                    elif not _install_remote_db(db, sub_o_type, options.db_version,
                                                time_out, options.verbose):
                        continue
                write_version_file(version_dict=db["existing"], output_to_file=db["version_f"])

    # Case 4: add new databases
    if options.add_organelle_type:
        for db in db_kinds:
            for sub_o_type in options.add_organelle_type:
                if options.use_local:
                    _install_local_db(db, sub_o_type, options.use_local, skip_unchanged=False)
                elif not _install_remote_db(db, sub_o_type, options.db_version,
                                            time_out, options.verbose):
                    continue
                write_version_file(version_dict=db["existing"], output_to_file=db["version_f"])

    sys.stdout.write("\nTotal cost: %.2f s\n" % (time.time() - time_start))
def get_options(print_title):
    """Parse the command line of disentangle_organelle_assembly.py and set up logging.

    Exits after printing the missing option(s) if any of -g/-t/-o/-F is absent.
    Otherwise creates the output directory if needed, opens the log, resolves the
    mode-dependent default of --expected-max-size and seeds both random generators.

    :param print_title: text written as the first log record.
    :return: tuple ``(options, log_handler)``.
    """
    parser = ArgumentParser(
        "disentangle_organelle_assembly.py -F embplant_pt -g input.fastg -t input.tab -o output_dir")
    parser.add_argument("-g", dest="fastg_file", help="input fastg format file.")
    parser.add_argument(
        "-t", dest="tab_file",
        help="input tab format file (*.csv; the postfix 'csv' was in conformity with Bandage) "
             "produced by slim_graph.py.")
    parser.add_argument("-o", dest="output_directory", help="output directory.")
    parser.add_argument(
        "-F", dest="mode",
        help="organelle type: embplant_pt/other_pt/embplant_mt/embplant_nr/animal_mt/fungus_mt/fungus_nr/anonym.")
    parser.add_argument(
        "--linear", dest="acyclic_allowed", default=False, action="store_true",
        help="By default, this script would only disentangle the circular graph (the complete circular "
             "organelle genome), and would directly give up linear/broken graphs. Choose this option "
             "to try for linear/broken cases.")
    parser.add_argument(
        "--weight-f", dest="weight_factor", type=float, default=100.0,
        help="weight factor for excluding non-target contigs. Default:%(default)s")
    parser.add_argument(
        "--depth-f", dest="depth_factor", type=float, default=10.,
        help="Depth factor for excluding non-target contigs. Default:%(default)s")
    parser.add_argument(
        "--type-f", dest="type_factor", type=float, default=3.,
        help="Type factor for identifying genome type tag. Default:%(default)s")
    parser.add_argument(
        "--contamination-depth", dest="contamination_depth", default=3., type=float,
        help="Depth factor for confirming contaminating contigs. Default:%(default)s")
    parser.add_argument(
        "--contamination-similarity", dest="contamination_similarity", default=0.9, type=float,
        help="Similarity threshold for confirming contaminating contigs. Default:%(default)s")
    parser.add_argument(
        "--no-degenerate", dest="degenerate", default=True, action="store_false",
        help="Disable making consensus from parallel contig based on nucleotide degenerate table.")
    parser.add_argument(
        "--degenerate-depth", dest="degenerate_depth", default=1.5, type=float,
        help="Depth factor for confirming parallel contigs. Default:%(default)s")
    parser.add_argument(
        "--degenerate-similarity", dest="degenerate_similarity", default=0.98, type=float,
        help="Similarity threshold for confirming parallel contigs. Default:%(default)s")
    # The default is None so that "not given" can be told apart from any explicit
    # value; the mode-dependent default is filled in after parsing.
    parser.add_argument(
        "--expected-max-size", dest="expected_max_size", default=None, type=int,
        help="Expected maximum target genome size. Default: 200000 (-F embplant_pt/fungus_mt), "
             "25000 (-F embplant_nr/animal_mt/fungus_nr), 600000 (-F embplant_mt/other_pt)")
    parser.add_argument(
        "--expected-min-size", dest="expected_min_size", default=10000, type=int,
        help="Expected mininum target genome size. Default: %(default)s")
    parser.add_argument(
        "--reverse-lsc", dest="reverse_lsc", default=False, action="store_true",
        help="For '-F embplant_pt' with complete circular result, "
             "by default, the direction of the starting contig (usually "
             "the LSC contig) is determined as the direction with less ORFs. Choose this option "
             "to reverse the direction of the starting contig when result is circular. "
             "Actually, both directions are biologically equivalent to each other. The "
             "reordering of the direction is only for easier downstream analysis.")
    parser.add_argument(
        "--max-paths-num", dest="max_paths_num", default=1000, type=int,
        help="Repeats would dramatically increase the number of potential isomers (paths). "
             "This option was used to export a certain amount of paths out of all possible paths "
             "per assembly graph. Default: %(default)s")
    parser.add_argument(
        "--keep-all-polymorphic", dest="only_keep_max_cov", default=True, action="store_false",
        help="By default, this script would pick the contig with highest coverage among all parallel "
             "(polymorphic) contigs when degenerating was not applicable. "
             "Choose this flag to export all combinations.")
    parser.add_argument(
        "--min-sigma", dest="min_sigma_factor", type=float, default=0.1,
        help="Minimum deviation factor for excluding non-target contigs. Default:%(default)s")
    parser.add_argument(
        "--min-depth", dest="min_cov", type=float, default=0.,
        help="Minimum coverage for a contig to be included in disentangling. Default:%(default)s")
    parser.add_argument(
        "--max-depth", dest="max_cov", type=float, default=inf,
        help="Maximum coverage for a contig to be included in disentangling. Default:%(default)s")
    parser.add_argument(
        "--max-multiplicity", dest="max_multiplicity", type=int, default=8,
        help="Maximum multiplicity of contigs for disentangling genome paths. "
             "Should be 1~12. Default:%(default)s")
    parser.add_argument(
        "--prefix", dest="prefix", default="target",
        help="Prefix of output files inside output directory. Default:%(default)s")
    parser.add_argument(
        "--keep-temp", dest="keep_temp_graph", default=False, action="store_true",
        help="export intermediate graph file.")
    parser.add_argument(
        "--time-limit", dest="time_limit", default=3600, type=int,
        help="time limit for the disentangling process. Default:%(default)s")
    parser.add_argument(
        "--random-seed", dest="random_seed", default=12345, type=int,
        help="Random seed (only for disentangling at this moment). Default: %(default)s")
    parser.add_argument(
        "--continue", dest="resume", default=False, action="store_true",
        help="continue mode.")
    parser.add_argument(
        "-v", "--version", action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    parser.add_argument(
        "--verbose", dest="verbose", default=False, action="store_true",
        help="verbose logging.")
    parser.add_argument(
        "--debug", dest="debug", default=False, action="store_true",
        help="for debug.")
    options = parser.parse_args()

    required_options = (("-g", options.fastg_file), ("-t", options.tab_file),
                        ("-o", options.output_directory), ("-F", options.mode))
    missing_flags = [flag for flag, value in required_options if value is None]
    if missing_flags:
        for flag in missing_flags:
            sys.stdout.write("Missing option \"" + flag + "\"!\n")
        sys.stdout.write("Insufficient arguments!\n")
        sys.exit()

    assert 12 >= options.max_multiplicity >= 1
    assert options.max_paths_num > 0
    if options.output_directory and not os.path.exists(options.output_directory):
        os.mkdir(options.output_directory)
    log_handler = simple_log(logging.getLogger(), options.output_directory,
                             options.prefix + ".disentangle.")
    log_handler.info(print_title)
    log_handler.info(" ".join(
        ["\"" + arg + "\"" if " " in arg else arg for arg in sys.argv]) + "\n")
    log_handler = timed_log(log_handler, options.output_directory,
                            options.prefix + ".disentangle.")

    if options.expected_max_size is None:
        # Not given on the command line (in any spelling): use the mode-dependent default.
        options.expected_max_size = 200000
        if options.mode in ("embplant_mt", "other_pt"):
            options.expected_max_size *= 3
        elif options.mode in ("embplant_nr", "animal_mt", "fungus_nr"):
            # Floor division keeps the value an int, matching type=int of the option.
            options.expected_max_size //= 8

    random.seed(options.random_seed)
    np.random.seed(options.random_seed)
    return options, log_handler
def get_options():
    """Parse the command line of round_statistics.py and set up logging.

    Exits if any of -f/-d/-i/-o is missing, or if bowtie2 / bowtie2-build cannot
    be executed. Creates the output folder if it does not exist.

    :return: tuple ``(options, log_handler)``.
    """
    parser = ArgumentParser(
        "round_statistics.py -f fasta_file -d output_per_round_folder -i Initial_mapped.fq -o output")
    parser.add_argument("-f", dest="fasta", help="input fasta file.")
    parser.add_argument("-d", dest="output_per_round_dir", help="output per round directory.")
    parser.add_argument("-i", dest="initial_mapped", help="seed fastq.")
    parser.add_argument("-o", dest="output_base", help="output folder.")
    parser.add_argument("-R", dest="round", type=int,
                        help="rounds to check. default:automatic stop!")
    parser.add_argument("-t", dest="threads", type=int, default=2, help="threads.")
    parser.add_argument(
        "--which-bowtie2", dest="which_bowtie2", default="",
        help="Assign the path to Bowtie2 binary files if not added to the path. "
             "Default: try GetOrganelleDep/" + SYSTEM_NAME + "/bowtie2 first, then $PATH")
    parser.add_argument(
        '--random-seed', dest="random_seed", type=int, default=12345,
        help="seed for random generator for bowtie2. Default: %(default)s")
    parser.add_argument(
        "--threshold", dest="threshold", default="0,10",
        help="sites with coverage above the threshold would be marked as covered. default: %(default)s")
    parser.add_argument("--continue", dest="resume", default=False, action="store_true")
    parser.add_argument("--keep-temp", dest="keep_temp", default=False, action="store_true")
    parser.add_argument(
        "--draw", dest="draw_plot", default=False, action="store_true",
        help="Draw density plot using matplotlib, which should be installed.")
    parser.add_argument("--max-coverage-tick", dest="max_cov_tick")
    # nargs="?" with const=True lets a bare "--debug" switch debug mode on, while
    # the former "--debug VALUE" spelling keeps working.
    parser.add_argument("--debug", dest="debug", nargs="?", const=True, help="Debug mode.")
    parser.add_argument(
        "-v", "--version", action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()

    if not (options.fasta and options.initial_mapped and options.output_base
            and options.output_per_round_dir):
        sys.stderr.write("Insufficient arguments!\n")
        sys.exit()
    if not os.path.isdir(options.output_base):
        os.mkdir(options.output_base)

    log_level = "DEBUG" if options.debug else "INFO"
    log_handler = simple_log(logging.getLogger(), options.output_base, "", log_level=log_level)
    log_handler.info("")
    log_handler.info(" ".join(
        ["\"" + arg + "\"" if " " in arg else arg for arg in sys.argv]) + "\n")

    if not options.which_bowtie2:
        # Prefer the bundled binary when it exists and runs.
        try_this_bin = os.path.join(GO_DEP_PATH, "bowtie2", "bowtie2")
        if os.path.isfile(try_this_bin) and executable(try_this_bin):
            options.which_bowtie2 = os.path.split(try_this_bin)[0]
    bowtie2_bin = os.path.join(options.which_bowtie2, "bowtie2")
    if not executable(bowtie2_bin):
        log_handler.error(bowtie2_bin + " not accessible!")
        sys.exit()
    bowtie2_build_bin = os.path.join(options.which_bowtie2, "bowtie2-build")
    if not executable(bowtie2_build_bin + " --large-index"):
        log_handler.error(bowtie2_build_bin + " not accessible!")
        sys.exit()

    log_handler = timed_log(log_handler, options.output_base, "", log_level=log_level)
    return options, log_handler
default="", help="Assign the path to Bowtie2 binary files if not added to the path. " "Default: try GetOrganelleDep/" + SYSTEM_NAME + "/bowtie2 first, then $PATH") parser.add_argument( "--which-spades", dest="which_spades", default="", help="Assign the path to SPAdes binary files if not added to the path. " "Default: try GetOrganelleDep/" + SYSTEM_NAME + "/SPAdes first, then $PATH") parser.add_argument( "-v", "--version", action="version", version="GetOrganelle v{version}".format(version=get_versions())) options = parser.parse_args() if not (options.seed_dir and options.fastq_file_1 and options.fastq_file_2 and options.output_sh_file): parser.print_help() sys.stdout.write('\nERROR: Insufficient arguments!\n') exit() if options.fastq_file_1 == options.fastq_file_2: raise IOError("1st fastq file should NOT be the same with 2nd fastq file!") if not options.which_bowtie2: try_this_bin = os.path.join(GO_DEP_PATH, "bowtie2", "bowtie2") if os.path.isfile(try_this_bin) and executable(try_this_bin): options.which_bowtie2 = os.path.split(try_this_bin)[0] if not options.which_spades: try_this_bin = os.path.join(GO_DEP_PATH, "SPAdes", "bin", "spades.py") if os.path.isfile(try_this_bin) and executable(try_this_bin):