def get_options():
    """Parse and validate the command line of reconstruct_graph_from_fasta.py.

    Returns:
        The argparse namespace holding ``input``, ``output``, ``overlap``,
        ``circular``, ``single_chain`` and ``out_kg``.

    The process exits (after writing a message to stdout) when the input or
    both outputs are missing, when ``-c`` is not one of yes/no/auto, or when
    the overlap is an even number.
    """
    parser = ArgumentParser(
        description=
        "This script uses an naive De Bruijn approach to convert sequence back into an "
        "assembly graph file, such as a gfa (Graphical Fragment Assembly) or a fastg file.",
        usage="reconstruct_graph_from_fasta.py -i fasta_file -o out.gfa")
    parser.add_argument("-i", dest="input", help="Input fasta file.")
    parser.add_argument(
        "-o",
        dest="output",
        default="",
        help=
        "Output graph file. The output format is GFA by default, but FASTG only when "
        "indicated with postfix '.fastg'.")
    parser.add_argument(
        "-L",
        "--overlap",
        dest="overlap",
        default=55,
        type=int,
        help="overlap for reconstructing De Bruijn graph. Default:%(default)s")
    parser.add_argument(
        "-c",
        "--circular",
        dest="circular",
        default="auto",
        help="Sequences in input fasta file are all circular (yes/no/auto). "
        "The auto mode enables detection by checking the existence of '(circular)' in "
        "the end of the header of each sequence. Default:%(default)s")
    parser.add_argument(
        "--single-chain",
        dest="single_chain",
        default=False,
        action="store_true",
        help=
        "The input sequence(s) was by default treated as DNA double-chain with its complementary "
        "sequence. Choose this flag to turn off.")
    parser.add_argument("--out-kg",
                        dest="out_kg",
                        default="",
                        help="Output kmer node graph.")
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()
    # An input plus at least one of the two output targets is mandatory.
    if not ((options.output or options.out_kg) and options.input):
        parser.print_help()
        sys.stdout.write("Insufficient arguments!\n")
        sys.exit()
    elif options.circular not in ("auto", "yes", "no"):
        sys.stdout.write(
            "Illegal -c input! circular mode must be one of yes/no/auto!\n")
        sys.exit()
    elif options.overlap % 2 == 0:
        # Abort here as the other validation branches do; previously the
        # message was printed and the invalid even overlap was still returned.
        sys.stdout.write(
            "Illegal -L input! overlap must be an odd number!\n")
        sys.exit()
    return options
def require_commands():
    """Parse the command line of the annotation-checking script.

    Returns:
        A tuple ``(options, options.query_gb)``: the argparse namespace and
        the list of query *.gb files taken from the positional arguments.

    The process prints the help text and exits when neither a reference
    GenBank file (``-r``) nor a reference fasta file (``-d``) is supplied.
    """
    usage = "python this_script.py Query.gb -r Reference.gb" \
            "\n\nThis script only checks the mainly check the reliability of automatically annotated tRNA and CDS." \
            "\n"
    parser = ArgumentParser(usage=usage)
    group_need = parser.add_argument_group("NECESSARY OPTIONS")
    group_need.add_argument('query_gb', metavar='query', type=str, nargs='+',
                            help='Input a list of *.gb files as the query (split the files by spaces).')
    group_need.add_argument('-r', dest='reference_gb', help='input reference *.gb file')
    group_alternation = parser.add_argument_group("ALTERNATION of NECESSARY OPTIONS")
    group_alternation.add_argument('-d', dest='reference_fasta', help='input reference fasta file exported exported by "Extract Annotations"-"Export"-"Selected Documents"-fasta in Geneious, remember to choose "Replace spaces in sequence name with underscores"')
    group_optional = parser.add_argument_group("OPTIONAL OPTIONS")
    group_optional.add_argument('--t-ends', dest='ends_trna', help='Default=10. The length to check at the both ends of tRNA.', type=int, default=10)
    group_optional.add_argument('--c-ends', dest='ends_cds', help='Default:not activated. Activate this calculation and assign the length to check at the both ends of CDS.', type=int)
    group_optional.add_argument('--a-ends', dest='ends_all', help='Default:not activated. Activate this calculation and assign the length to check at the both ends of annotated all regions.', type=int)
    group_optional.add_argument('--l-threshold', dest='length', help='Default=0.9. Length threshold to report warning.', type=float, default=0.9)
    group_optional.add_argument('--similarity', dest='enable_similarity', help='Default=False. Choose to enable similarity calculation.', default=False, action='store_true')
    group_optional.add_argument('--s-threshold', dest='similarity', help='Default=0.9. Similarity threshold to report warning. Should be < length threshold.', type=float, default=0.9)
    parser.add_argument("-v", "--version", action="version",
                        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()
    # Same grouping as the original "a and b or c", written out explicitly:
    # queries together with a GenBank reference, or a fasta reference.
    has_reference = (len(options.query_gb) and options.reference_gb) \
        or options.reference_fasta
    if not has_reference:
        parser.print_help()
        # Stop here instead of returning options that lack any reference;
        # ArgumentParser.exit() terminates the process with status 0.
        parser.exit()
    return options, options.query_gb
Example #3
0
def get_options(description):
    """Parse the command line of plastome_arch_info.py.

    Args:
        description: text shown as the description in the generated help.

    Returns:
        A tuple ``(options, options.sequences)``; ``options.valid_bases`` is
        converted from the given string into a set of single characters.

    Raises:
        IOError: if any of the given sequence paths is not an existing file.
    """
    arg_parser = ArgumentParser(
        description=description,
        usage="plastome_arch_info.py fasta_format_sequence_file(s)")
    arg_parser.add_argument(
        "sequences",
        metavar="sequences",
        type=str,
        nargs="+",
        help="Input fasta format sequences (split the files by spaces).")
    arg_parser.add_argument("-o", dest="output", help="output file.")
    arg_parser.add_argument(
        "-r",
        dest="min_ir_length",
        default=5000,
        type=int,
        help="The minimum repeat length treated as the IR region of plastome. Default: [%(default)s]")
    arg_parser.add_argument(
        "-v",
        dest="valid_bases",
        default="ATGCRMYKHBDVatgcrmykhbdv",
        help="Valid bases. Default: ATGCRMYKHBDVatgcrmykhbdv")
    arg_parser.add_argument(
        "--version",
        action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    parsed = arg_parser.parse_args()

    # Guard clause: nothing to work on, show the help and leave.
    if not len(parsed.sequences):
        arg_parser.print_help()
        sys.exit()

    # Every listed sequence file has to exist before anything else happens.
    for seq_path in parsed.sequences:
        if not os.path.isfile(seq_path):
            raise IOError(seq_path + " not found/valid!")

    # Store the valid bases as a set of individual characters.
    parsed.valid_bases = set(parsed.valid_bases)
    return parsed, parsed.sequences
def get_options():
    """Parse the command line of summary_get_organelle_output.py.

    Returns:
        The argparse namespace with ``sample_folders`` (list of folders) and
        ``output`` (path given with ``-o``).

    Prints the help plus "Insufficient arguments!" and exits when ``-o`` or
    the folder list is missing.
    """
    arg_parser = ArgumentParser(
        usage="summary_get_organelle_output.py list_of_folders -o tab_file")
    arg_parser.add_argument(
        "sample_folders", metavar="output", type=str, nargs="+",
        help="Input a list of folders generated by get_organelle_from_reads.py."
             "Please split the files by spaces.")
    arg_parser.add_argument("-o", dest="output", help="Output csv file.")
    # parser.add_argument("--verbose", dest="verbose", default=False, action="store_true",
    #                     help="Verbose style.")
    arg_parser.add_argument(
        "-v", "--version", action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    parsed = arg_parser.parse_args()

    # Happy path first: both an output target and at least one folder given.
    if parsed.output and len(parsed.sample_folders):
        return parsed

    arg_parser.print_help()
    sys.stdout.write("Insufficient arguments!\n")
    sys.exit()
    return parsed
def require_commands():
    """Parse the command line into the module-level ``options`` namespace.

    Nothing is returned; the parsed result is stored in the global
    ``options``. When the fastg file (``-g``) or the reference (``-f``) is
    missing, or when parsing raises an Exception, a short message is written
    to stdout and ``exit()`` is called.
    """
    global options
    script_name = str(os.path.basename(__file__))
    usage_line = 'python ' + script_name + ' -g input.fastg -f refernce.fasta'
    cli = ArgumentParser(usage=usage_line)
    cli.add_argument(
        '-g', dest='in_fastg_file', type=str,
        help='followed by your input fastg file')
    cli.add_argument(
        '-f', dest='reference_fa_base', type=str,
        help='followed by Fasta index format')
    cli.add_argument(
        '--keep-temp', dest='keep_temp', default=False, action='store_true',
        help='Choose to disable deleting temp files produced by blast and this script')
    cli.add_argument(
        '--bt', dest='blast_hits_threshold', default=0.60, type=float,
        help='Default: 0.60')
    cli.add_argument(
        '--max-gap', dest='max_gap_to_add', default=1500, type=int,
        help='Default: 1500')
    cli.add_argument(
        '--con-all', dest='connect_inner_contig', default=False,
        action='store_true',
        help='Choose to activate connecting all possible contigs. Default: False')
    cli.add_argument(
        '--depth', dest='depth_to_connect', default=1.0, type=float,
        help='Default: 1.0')
    blast_help = "Assign the path to BLAST binary files if not added to the path. " \
                 "Default: try GetOrganelleDep/" + SYSTEM_NAME + \
                 "/ncbi-blast first, then $PATH"
    cli.add_argument(
        "--which-blast", dest="which_blast", default="", help=blast_help)
    # parser.add_argument('--merge-overlaps', default=False, action='store_true', help='Choose to activate automatically merging overlapping contigs')
    # parser.add_argument('--min-os', dest='min_overlap_similarity', default=0.9, help='The similarity threshold to merge overlapping contigs. Default: 0.9', type=float)
    # parser.add_argument('--min-ol', dest='min_overlap_length', default=15, help='The length threshold to merge overlapping contigs. Default: 15', type=int)
    cli.add_argument(
        "-v", "--version", action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))

    try:
        options = cli.parse_args()
    except Exception as parse_err:
        sys.stdout.write(
            '\n######################################' + str(parse_err))
        sys.stdout.write('\n"-h" for more usage')
        exit()

    # Reached only after a successful parse: both inputs are mandatory.
    if not options.in_fastg_file or not options.reference_fa_base:
        sys.stdout.write(
            "\n######################################\nInsufficient arguments!")
        sys.stdout.write("\n\"-h\" for more usage")
        exit()
Example #6
0
     os.path.join(SEQ_NAME, "VERSION")]
}
# Register the bundled dependency package, together with the files found
# under its SYSTEM_NAME sub-directory, only when DEP_DIR is a directory that
# contains an __init__.py.
if os.path.isdir(DEP_DIR):
    if os.path.isfile(os.path.join(DEP_DIR, "__init__.py")):
        PACKAGES.append(DEP_NAME)
        PACKAGE_DATA[DEP_NAME] = list(
            get_recursive_files(
                target_dir=os.path.join(DEP_DIR, SYSTEM_NAME),
                start_from=DEP_DIR,
                exclude_files=EXCLUDE_SHARE_SPADES_PATHS))

if not in_situ:
    setup(
        name="GetOrganelle",
        version=get_versions(),
        description=
        "a fast and versatile toolkit for accurate de novo assembly of organelle genomes.",
        author="Jian-Jun Jin",
        author_email="*****@*****.**",
        url="http://github.com/Kinggerm/GetOrganelle",
        license="GNU General Public License, version 3",
        packages=PACKAGES,
        platforms="linux/MacOS",
        scripts=scripts_to_install,
        # relative path to each package
        package_data=PACKAGE_DATA,
        install_requires=install_dependencies,
        zip_safe=False)
    if keep_temp:
        for temp_dir_or_files in ("build", "dist", "*.pyc", "*.tgz",
def main():
    """Disentangle an assembly graph and export the resulting path sequences.

    Reads the options through ``get_options``, runs the (time-limited)
    disentangling routine on ``options.fastg_file`` / ``options.tab_file`` and
    writes, per detected graph, the selected graph as GFA and every exported
    path as a fasta file under the output prefix. Failures other than IOError
    are logged through the log handler instead of being propagated.
    """
    time0 = time.time()
    print_title = "GetOrganelle v" + str(get_versions()) + \
                  "\n\nThis is a script for extracting organelle genomes" \
                  " from slim_fastg.py-produced files (csv & fastg). " + \
                  "\nBy [email protected]\n\n"
    options, log_handler = get_options(print_title)

    @set_time_limit(options.time_limit)
    def disentangle_circular_assembly(fastg_file,
                                      tab_file,
                                      prefix,
                                      weight_factor,
                                      type_factor,
                                      mode="embplant_pt",
                                      log_hard_cov_threshold=10.,
                                      expected_max_size=inf,
                                      expected_min_size=0,
                                      contamination_depth=3.,
                                      contamination_similarity=5.,
                                      degenerate=True,
                                      degenerate_depth=1.5,
                                      degenerate_similarity=1.5,
                                      min_sigma_factor=0.1,
                                      max_copy_in=10,
                                      only_max_cov=True,
                                      keep_temp=False,
                                      acyclic_allowed=False,
                                      verbose=False,
                                      inner_logging=None,
                                      debug=False):
        """Find the target graph(s) in fastg_file and export their paths.

        Messages go to ``inner_logging`` when it is given and to stdout
        otherwise. With ``acyclic_allowed`` every path may consist of several
        contigs; without it only circular paths are exported.
        """

        def limit_paths(candidate_paths):
            # Cap the number of exported paths at options.max_paths_num and
            # warn when some of the paths are dropped.
            if len(candidate_paths) > options.max_paths_num:
                this_warn_str = "Only exporting " + str(options.max_paths_num) + " out of all " + \
                                str(len(candidate_paths)) + " possible paths. (see '--max-paths-num' to change it.)"
                if inner_logging:
                    inner_logging.warning(this_warn_str)
                else:
                    sys.stdout.write("Warning: " + this_warn_str + "\n")
                candidate_paths = candidate_paths[:options.max_paths_num]
            return candidate_paths

        def report_large_repeats(detect_seq):
            # Report the large-repeat statistics of the first exported path.
            ir_stats = detect_plastome_architecture(detect_seq, 1000)
            print_str = "Detecting large repeats (>1000 bp) in PATH1 with " + ir_stats[-1] + \
                        ", Total:LSC:SSC:Repeat(bp) = " + str(len(detect_seq)) + ":" + \
                        ":".join([str(len_val) for len_val in ir_stats[:3]])
            if inner_logging:
                inner_logging.info(print_str)
            else:
                sys.stdout.write(print_str + "\n")

        if options.resume and os.path.exists(prefix +
                                             ".graph1.selected_graph.gfa"):
            # A previous run already produced the result; nothing to redo.
            if inner_logging:
                inner_logging.info(">>> Result graph existed!")
            else:
                sys.stdout.write(">>> Result graph existed!\n")
        else:
            time_a = time.time()
            if inner_logging:
                inner_logging.info(">>> Parsing " + fastg_file + " ..")
            else:
                sys.stdout.write("Parsing " + fastg_file + " ..\n")
            input_graph = Assembly(fastg_file,
                                   min_cov=options.min_cov,
                                   max_cov=options.max_cov)
            time_b = time.time()
            if inner_logging:
                inner_logging.info(">>> Parsing input fastg file finished: " +
                                   str(round(time_b - time_a, 4)) + "s")
            else:
                sys.stdout.write("\n>>> Parsing input fastg file finished: " +
                                 str(round(time_b - time_a, 4)) + "s\n")
            temp_graph = prefix + ".temp.fastg" if keep_temp else None

            copy_results = input_graph.find_target_graph(
                tab_file,
                database_name=mode,
                mode=mode,
                type_factor=type_factor,
                weight_factor=weight_factor,
                log_hard_cov_threshold=log_hard_cov_threshold,
                contamination_depth=contamination_depth,
                contamination_similarity=contamination_similarity,
                degenerate=degenerate,
                degenerate_depth=degenerate_depth,
                degenerate_similarity=degenerate_similarity,
                expected_max_size=expected_max_size,
                expected_min_size=expected_min_size,
                max_contig_multiplicity=max_copy_in,
                only_keep_max_cov=only_max_cov,
                min_sigma_factor=min_sigma_factor,
                temp_graph=temp_graph,
                broken_graph_allowed=acyclic_allowed,
                verbose=verbose,
                log_handler=inner_logging,
                debug=debug)
            time_c = time.time()
            if inner_logging:
                inner_logging.info(">>> Detecting target graph finished: " +
                                   str(round(time_c - time_b, 4)) + "s")
                if len(copy_results) > 1:
                    inner_logging.info(
                        str(len(copy_results)) + " set(s) of graph detected.")
            else:
                sys.stdout.write("\n\n>>> Detecting target graph finished: " +
                                 str(round(time_c - time_b, 4)) + "s\n")
                if len(copy_results) > 1:
                    sys.stdout.write(
                        str(len(copy_results)) +
                        " set(s) of graph detected.\n")

            degenerate_base_used = False
            if acyclic_allowed:
                # go_res is the 1-based index used in the output file names.
                for go_res, copy_res in enumerate(copy_results, 1):
                    broken_graph = copy_res["graph"]
                    count_path = 0

                    these_paths = limit_paths(
                        broken_graph.get_all_paths(mode=mode,
                                                   log_handler=inner_logging))

                    # exporting paths, reporting results
                    for this_paths, other_tag in these_paths:
                        count_path += 1
                        all_contig_str = []
                        contigs_are_circular = []
                        for go_contig, this_p_part in enumerate(this_paths):
                            this_contig = broken_graph.export_path(this_p_part)
                            if DEGENERATE_BASES & set(this_contig.seq):
                                degenerate_base_used = True
                            contigs_are_circular.append(
                                this_contig.label.endswith("(circular)"))
                            if len(this_paths
                                   ) == 1 and contigs_are_circular[-1]:
                                # A single circular contig keeps its own header.
                                all_contig_str.append(this_contig.fasta_str())
                            else:
                                all_contig_str.append(">contig_" +
                                                      str(go_contig + 1) +
                                                      "--" +
                                                      this_contig.label +
                                                      "\n" + this_contig.seq +
                                                      "\n")
                        if len(all_contig_str) == 1 and set(
                                contigs_are_circular) == {True}:
                            # print ir stat
                            if count_path == 1 and mode == "embplant_pt":
                                report_large_repeats(
                                    broken_graph.export_path(
                                        this_paths[0]).seq)
                        out_fasta = prefix + ".graph" + str(go_res) + other_tag + "." + \
                            str(count_path) + ".path_sequence.fasta"
                        # Context manager so the handle is flushed and closed
                        # deterministically.
                        with open(out_fasta, "w") as out_handle:
                            out_handle.write("\n".join(all_contig_str))
                    broken_graph.write_to_gfa(prefix + ".graph" + str(go_res) +
                                              ".selected_graph.gfa")
            else:
                for go_res, copy_res in enumerate(copy_results, 1):
                    idealized_graph = copy_res["graph"]
                    # should add making one-step-inversion pairs for paths,
                    # which would be used to identify existence of a certain isomer using mapping information
                    count_path = 0

                    these_paths = limit_paths(
                        idealized_graph.get_all_circular_paths(
                            mode=mode,
                            log_handler=inner_logging,
                            reverse_start_direction_for_pt=options.reverse_lsc))

                    # exporting paths, reporting results
                    for this_path, other_tag in these_paths:
                        count_path += 1
                        this_seq_obj = idealized_graph.export_path(this_path)
                        if DEGENERATE_BASES & set(this_seq_obj.seq):
                            degenerate_base_used = True
                        out_fasta = prefix + ".graph" + str(go_res) + other_tag + "." + \
                            str(count_path) + ".path_sequence.fasta"
                        with open(out_fasta, "w") as out_handle:
                            out_handle.write(this_seq_obj.fasta_str())
                        # print ir stat
                        if count_path == 1 and mode == "embplant_pt":
                            report_large_repeats(this_seq_obj.seq)
                    idealized_graph.write_to_gfa(prefix + ".graph" +
                                                 str(go_res) +
                                                 ".selected_graph.gfa")
            if degenerate_base_used:
                # inner_logging defaults to None; fall back to stdout like
                # every other message instead of raising AttributeError.
                if inner_logging:
                    inner_logging.warning("Degenerate base(s) used!")
                else:
                    sys.stdout.write("Warning: Degenerate base(s) used!\n")
            time_d = time.time()
            if inner_logging:
                inner_logging.info(
                    ">>> Solving and unfolding graph finished: " +
                    str(round(time_d - time_c, 4)) + "s")
            else:
                sys.stdout.write(
                    "\n\n>>> Solving and unfolding graph finished: " +
                    str(round(time_d - time_c, 4)) + "s\n")

    try:
        disentangle_circular_assembly(
            options.fastg_file,
            options.tab_file,
            os.path.join(options.output_directory, options.prefix),
            type_factor=options.type_factor,
            mode=options.mode,
            weight_factor=options.weight_factor,
            log_hard_cov_threshold=options.depth_factor,
            contamination_depth=options.contamination_depth,
            contamination_similarity=options.contamination_similarity,
            degenerate=options.degenerate,
            degenerate_depth=options.degenerate_depth,
            degenerate_similarity=options.degenerate_similarity,
            expected_max_size=options.expected_max_size,
            expected_min_size=options.expected_min_size,
            min_sigma_factor=options.min_sigma_factor,
            max_copy_in=options.max_multiplicity,
            only_max_cov=options.only_keep_max_cov,
            acyclic_allowed=options.acyclic_allowed,
            keep_temp=options.keep_temp_graph,
            inner_logging=log_handler,
            verbose=options.verbose,
            debug=options.debug)
        log_handler = simple_log(logging.getLogger(), options.output_directory,
                                 options.prefix + ".disentangle.")

        log_handler.info('\nTotal cost: ' +
                         str(round(time.time() - time0, 4)) + 's\n')
    except IOError:
        # I/O problems are left to the caller; re-raise untouched.
        raise
    except KeyError as e:
        # A KeyError naming the chosen mode means the mode is absent from
        # the tab file; anything else is reported as a general failure.
        if str(e).strip("'") == options.mode:
            log_handler.error(options.mode + " not found in " +
                              str(options.tab_file) + "!")
            log_handler.error("Disentangling failed!")
        else:
            log_handler.exception(str(e))
            log_handler.error("Disentangling failed!")
            if not options.acyclic_allowed:
                log_handler.info(
                    "You might try again with '--linear' to export contig(s) "
                    "instead of circular genome.")
            log_handler = simple_log(log_handler, options.output_directory,
                                     options.prefix + ".disentangle.")
            log_handler.info("\nTotal cost " + str(time.time() - time0))
            log_handler.info(
                "Please email [email protected] if you find bugs!\n")
    except Exception as e:
        log_handler.exception(str(e))
        log_handler.error("Disentangling failed!")
        if not options.acyclic_allowed:
            log_handler.info(
                "You might try again with '--linear' to export contig(s) "
                "instead of circular genome.")
        log_handler = simple_log(log_handler, options.output_directory,
                                 options.prefix + ".disentangle.")
        log_handler.info("\nTotal cost " + str(time.time() - time0))
        log_handler.info(
            "Please email [email protected] if you find bugs!\n")
    logging.shutdown()
def require_options():
    """Parse the command line of rm_low_coverage_duplicated_contigs.py.

    Returns:
        A tuple ``(options, options.assemblies)``: the argparse namespace and
        the list of input FASTG files.

    When ``--which-blast`` is not given, the blastn bundled under
    ``GO_DEP_PATH/ncbi-blast`` is tried first. The process exits with a
    message on stdout when no assemblies are given or when ``blastn`` /
    ``makeblastdb`` are not executable at the resolved location.
    """
    usage = "Usage: rm_low_coverage_duplicated_contigs.py *.fastg"
    parser = ArgumentParser(usage=usage)
    parser.add_argument(
        'assemblies',
        metavar='assemblies',
        type=str,
        nargs='+',
        help=
        "Input FASTG format assembly graph files (split the files by spaces).")
    # type=float: without it a value given on the command line would stay a
    # string while the default is a float.
    parser.add_argument(
        '--cov-t',
        dest='coverage_threshold',
        default=0.12,
        type=float,
        help='With ratio (coverage of query/coverage of subject) below which, '
        'the query would be exposed to discarded. Default: 0.12')
    parser.add_argument(
        '--len-t',
        dest='length_threshold',
        default=0.9,
        type=float,
        help=
        'With overlap (length of hit of query/ length of query) above which, '
        'the query would be exposed to discarded. Default: 0.9')
    parser.add_argument('--blur',
                        dest='blur_bases',
                        default=False,
                        action='store_true',
                        help='Replace hit low-coverage bases with N.')
    parser.add_argument('--keep-temp',
                        dest='keep_temp',
                        default=False,
                        action='store_true',
                        help='Keep temp blast files.')
    parser.add_argument(
        "--which-blast",
        dest="which_blast",
        default="",
        help="Assign the path to BLAST binary files if not added to the path.")
    parser.add_argument(
        '-o',
        dest='output_dir',
        help='Output directory. Default: along with the original file')
    parser.add_argument('-t',
                        '--threads',
                        dest="threads",
                        default=4,
                        type=int,
                        help="Threads of blastn.")
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()
    if not options.assemblies:
        parser.print_help()
        sys.stdout.write(
            '\n######################################\nERROR: Insufficient REQUIRED arguments!\n\n'
        )
        sys.exit()
    if not options.which_blast:
        # Prefer the bundled blastn, but only if it actually runs.
        try_this_bin = os.path.join(GO_DEP_PATH, "ncbi-blast", "blastn")
        if os.path.isfile(try_this_bin) and executable(try_this_bin):
            output, err = subprocess.Popen(try_this_bin + " -version",
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.STDOUT,
                                           shell=True).communicate()
            version_report = output.decode("utf8")
            if "not found" in version_report:
                sys.stdout.write(version_report + "\n")
            else:
                options.which_blast = os.path.split(try_this_bin)[0]
    if not executable(os.path.join(options.which_blast, "blastn")):
        sys.stdout.write(
            os.path.join(options.which_blast, "blastn") + " not accessible!")
        sys.exit()
    if not executable(os.path.join(options.which_blast, "makeblastdb")):
        sys.stdout.write(
            os.path.join(options.which_blast, "makeblastdb") +
            " not accessible!")
        sys.exit()
    # This parser defines no treat_no_hits option, so reading it
    # unconditionally raised AttributeError on every run; validate it only
    # when the attribute is present.
    if hasattr(options, "treat_no_hits") and \
            options.treat_no_hits not in ["ex_no_con", "ex_no_hit", "keep_all"]:
        sys.stdout.write(
            '\n\nOption Error: you should choose assign one of "ex_no_con", "ex_no_hit"'
            ' and "keep_all" to variable treat_no_hits\n')
        sys.exit()
    return options, options.assemblies
Example #9
0
def get_options():
    """Parse and validate the command line of evaluate_assembly_using_mapping.py.

    Besides parsing, this function creates the output folder when it is
    missing, sets up logging into that folder, logs interpreter / platform /
    library / dependency versions and resolves the Bowtie2 location.

    Returns:
        tuple: ``(options, log_handler)`` - the parsed argparse namespace
        (``which_bowtie2`` may have been filled in from ``GO_DEP_PATH``) and
        the logger returned by ``timed_log``.

    Raises:
        AssertionError: if ``--stat-mode`` is neither ``best`` nor ``all``.

    The process exits when required arguments are missing or when the
    ``executable`` check rejects ``bowtie2`` / ``bowtie2-build``.
    """
    parser = ArgumentParser(
        "evaluate_assembly_using_mapping.py -f fasta_file -1 RAW_1.fq -2 RAW_2.fq -o output"
    )
    parser.add_argument("-f", dest="fasta", help="input assembly fasta file.")
    parser.add_argument("-1", dest="original_fq_1")
    parser.add_argument("-2", dest="original_fq_2")
    parser.add_argument(
        "-u",
        dest="unpaired_fq_files",
        default="",
        help=
        "Input file(s) with unpaired (single-end) reads to be added to the pool. "
        "files could be comma-separated lists such as 'seq1,seq2'.")
    parser.add_argument(
        "-X",
        "--max-lib-len",
        dest="max_lib_len",
        type=int,
        default=1200,
        help="Corresponding to '-X' option in Bowtie2. Default: %(default)s.")
    parser.add_argument(
        "-c",
        dest="is_circular",
        default="auto",
        help="(yes/no/auto) input fasta is circular. "
        "If auto was chosen, the input fasta would be treated as circular when the sequence name "
        "ends with '(circular)'. "
        "Default: auto")
    parser.add_argument("-o", dest="output_base", help="output folder.")
    parser.add_argument("-t",
                        dest="threads",
                        type=int,
                        default=2,
                        help="threads.")
    parser.add_argument("--continue",
                        dest="resume",
                        default=False,
                        action="store_true")
    parser.add_argument(
        "--seed",
        dest="random_seed",
        default=12345,
        type=int,
        help="Seed for random number generator. Default: %(default)s")
    parser.add_argument(
        "--draw",
        dest="draw_plot",
        default=False,
        action="store_true",
        help="Draw density plot using matplotlib, which should be installed.")
    parser.add_argument("--plot-format",
                        dest="plot_format",
                        default="pdf,png",
                        help='Default: pdf,png')
    parser.add_argument("--plot-title",
                        dest="plot_title",
                        help="Default: `the file name of the input fasta`")
    parser.add_argument("--plot-subtitle",
                        dest="plot_subtitle",
                        default="",
                        help="A 4-space indicates a line break. Default: None")
    parser.add_argument("--plot-transparent",
                        dest="plot_transparent",
                        default=False,
                        action="store_true",
                        help="Default: False")
    parser.add_argument("--plot-x-density",
                        dest="plot_x_density",
                        default=12000.,
                        type=float,
                        help="Default: %(default)s")
    # parser.add_argument("--plot-x-sliding-window", dest="sliding_window_size", default=1, type=int,
    #                   help="Default: %(default)s")
    parser.add_argument(
        "--plot-x-gap-dots",
        dest="gap_len",
        default=3000,
        type=int,
        help=
        "Number of sites added in-between isolated contigs. Default: %(default)s"
    )
    parser.add_argument("--plot-figure-height",
                        dest="figure_height",
                        default=5.,
                        type=float,
                        help="Default: %(default)s")
    parser.add_argument("--plot-y-lim",
                        dest="y_lim",
                        type=float,
                        help="Y axis value limit. ")
    # parser.add_argument("--plot-figure-extra-width", dest="extra_width", default=3., type=float,
    #                   help="Default: %(default)s")
    parser.add_argument(
        "--plot-font",
        dest="plot_font",
        default=None,
        help=
        "For plot of unicode characters in some environments. Use 'Times New Roman','Arial' etc. "
        "Default: %(default)s.")
    # The flag must flip the True default to False; with "store_true" it was
    # a no-op and the customized error rate could never be disabled.
    parser.add_argument("--disable-customized-error-rate",
                        dest="customized_error_rate",
                        default=True,
                        action="store_false")
    parser.add_argument(
        "--which-bowtie2",
        dest="which_bowtie2",
        default="",
        help="Assign the path to Bowtie2 binary files if not added to the path. "
        "Default: try GetOrganelleDep/" + SYSTEM_NAME +
        "/bowtie2 first, then $PATH")
    parser.add_argument("--bowtie2-mode",
                        dest="bowtie2_mode",
                        default="--sensitive",
                        help="Default: %(default)s")
    parser.add_argument("--bowtie2-options",
                        dest="other_bowtie2_options",
                        default="--no-discordant --dovetail",
                        help="Default: %(default)s")
    parser.add_argument(
        "--stat-mode",
        dest="stat_mode",
        default="best",
        help=
        "Statistical mode for counting multiple hits of a single read: best/all. "
        "The all mode is meaningful only when '-k <INT>' was included in '--bowtie2-options'. "
        "Default: %(default)s")
    parser.add_argument("--debug",
                        dest="debug_mode",
                        default=False,
                        action="store_true",
                        help="Turn on debug mode.")
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()

    # Required: the assembly, an output folder, and either a read pair or
    # at least one unpaired read file.
    has_reads = ((options.original_fq_1 and options.original_fq_2)
                 or options.unpaired_fq_files)
    if not (options.fasta and has_reads and options.output_base):
        sys.stderr.write("Insufficient arguments!\n")
        sys.exit()
    if not os.path.isdir(options.output_base):
        os.mkdir(options.output_base)
    log_level = "DEBUG" if options.debug_mode else "INFO"
    assert options.stat_mode in ("best", "all"), \
        "--stat-mode must be one of best/all"
    log_handler = simple_log(logging.getLogger(),
                             options.output_base,
                             "",
                             log_level=log_level)
    log_handler.info("")
    log_handler.info("Python " + str(sys.version).replace("\n", " "))
    log_handler.info("PLATFORM: " + " ".join(platform.uname()))

    # log versions of python libs
    lib_versions_info = []
    if options.draw_plot:
        try:
            import matplotlib
        except ImportError:
            # Best effort: only the version line is skipped here.
            pass
        else:
            lib_versions_info.append("matplotlib " + matplotlib.__version__)
    lib_versions_info.append("GetOrganelleLib " + GetOrganelleLib.__version__)
    log_handler.info("PYTHON LIBS: " + "; ".join(lib_versions_info))

    # log versions of dependencies
    dep_versions_info = []
    if not options.which_bowtie2:
        # Prefer the bundled dependency folder before falling back to $PATH.
        try_this_bin = os.path.join(GO_DEP_PATH, "bowtie2", "bowtie2")
        if os.path.isfile(try_this_bin) and executable(try_this_bin):
            options.which_bowtie2 = os.path.split(try_this_bin)[0]
    bowtie2_bin = os.path.join(options.which_bowtie2, "bowtie2")
    bowtie2_build_bin = os.path.join(options.which_bowtie2, "bowtie2-build")
    if not executable(bowtie2_bin):
        log_handler.error(bowtie2_bin + " not accessible!")
        sys.exit()
    else:
        output, err = subprocess.Popen(
            bowtie2_bin + " --version",
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=True).communicate()
        this_lines = output.decode("utf8").split("\n")[:3]
        # The version is taken as the last token of the first output line.
        dep_versions_info.append("Bowtie2 " +
                                 this_lines[0].split()[-1].strip())
    if not executable(bowtie2_build_bin + " --large-index"):
        log_handler.error(bowtie2_build_bin + " not accessible!")
        sys.exit()
    log_handler.info("DEPENDENCIES: " + "; ".join(dep_versions_info))
    log_handler.info("WORKING DIR: " + os.getcwd())
    # if not executable(os.path.join(options.which_bowtie2, "bowtie2-build-l")):
    #     log_handler.error(os.path.join(options.which_bowtie2, "bowtie2-build-l") + " not accessible!")
    #     exit()
    # Echo the command line, quoting arguments that contain spaces.
    log_handler.info(" ".join(
        ["\"" + arg + "\"" if " " in arg else arg for arg in sys.argv]) + "\n")
    log_handler = timed_log(log_handler,
                            options.output_base,
                            "",
                            log_level=log_level)
    return options, log_handler
def main():
    """Entry point: parse options, disentangle the assembly graph, report timing.

    The work is done by the nested ``disentangle_circular_assembly``, which is
    wrapped by ``set_time_limit(options.time_limit)``. Any exception raised by
    it is logged here instead of being propagated.
    """
    time0 = time.time()
    print_title = "GetOrganelle v" + str(get_versions()) + \
                  "\n\nThis is a script for extracting organelle genomes" \
                  " from slim_fastg.py-produced files (csv & fastg). " + \
                  "\nBy [email protected]\n\n"
    options, log_handler = get_options(print_title)

    @set_time_limit(options.time_limit)
    def disentangle_circular_assembly(fastg_file, tab_file, prefix, weight_factor, type_factor, mode="embplant_pt",
                                      log_hard_cov_threshold=10., expected_max_size=inf, expected_min_size=0,
                                      contamination_depth=3., contamination_similarity=5.,
                                      degenerate=True, degenerate_depth=1.5, degenerate_similarity=1.5,
                                      min_sigma_factor=0.1, only_max_c=True, keep_temp=False, acyclic_allowed=False,
                                      verbose=False, log_handler=None, debug=False):
        """Parse a graph, find the target graph(s) and export their paths.

        For every result of ``Assembly.find_target_graph`` this writes one
        ``<prefix>.graph<N><tag>.<M>.path_sequence.fasta`` per path and one
        ``<prefix>.graph<N>.selected_graph.gfa``. With ``acyclic_allowed``
        the paths come from ``get_all_paths`` (possibly several contigs per
        path); otherwise from ``get_all_circular_paths``.

        Progress goes to ``log_handler`` when given, else to stdout. Nothing
        is recomputed when ``options.resume`` is set and the first result
        graph already exists. Most keyword arguments are forwarded unchanged
        to ``find_target_graph``.
        """
        def path_fasta_name(graph_id, tag, path_id):
            # Output name of the path_id-th path of the graph_id-th result.
            return prefix + ".graph" + str(graph_id) + tag + "." + str(path_id) + ".path_sequence.fasta"

        if options.resume and os.path.exists(prefix + ".graph1.selected_graph.gfa"):
            if log_handler:
                log_handler.info(">>> Result graph existed!")
            else:
                sys.stdout.write(">>> Result graph existed!\n")
            return

        time_a = time.time()
        if log_handler:
            log_handler.info(">>> Parsing " + fastg_file + " ..")
        else:
            sys.stdout.write("Parsing " + fastg_file + " ..\n")
        input_graph = Assembly(fastg_file, min_cov=options.min_cov, max_cov=options.max_cov)
        time_b = time.time()
        if log_handler:
            log_handler.info(">>> Parsing input fastg file finished: " + str(round(time_b - time_a, 4)) + "s")
        else:
            sys.stdout.write("\n>>> Parsing input fastg file finished: " + str(round(time_b - time_a, 4)) + "s\n")
        temp_graph = prefix + ".temp.fastg" if keep_temp else None

        copy_results = input_graph.find_target_graph(tab_file, mode=mode, type_factor=type_factor,
                                                     weight_factor=weight_factor,
                                                     log_hard_cov_threshold=log_hard_cov_threshold,
                                                     contamination_depth=contamination_depth,
                                                     contamination_similarity=contamination_similarity,
                                                     degenerate=degenerate, degenerate_depth=degenerate_depth,
                                                     degenerate_similarity=degenerate_similarity,
                                                     expected_max_size=expected_max_size,
                                                     expected_min_size=expected_min_size,
                                                     only_keep_max_cov=only_max_c,
                                                     min_sigma_factor=min_sigma_factor,
                                                     temp_graph=temp_graph,
                                                     broken_graph_allowed=acyclic_allowed,
                                                     verbose=verbose, log_handler=log_handler,
                                                     debug=debug)
        time_c = time.time()
        if log_handler:
            log_handler.info(">>> Detecting target graph finished: " + str(round(time_c - time_b, 4)) + "s")
            if len(copy_results) > 1:
                log_handler.info(str(len(copy_results)) + " set(s) of graph detected.")
        else:
            sys.stdout.write("\n\n>>> Detecting target graph finished: " + str(round(time_c - time_b, 4)) + "s\n")
            if len(copy_results) > 1:
                sys.stdout.write(str(len(copy_results)) + " set(s) of graph detected.\n")

        degenerate_base_used = False
        if acyclic_allowed:
            for go_res, copy_res in enumerate(copy_results):
                broken_graph = copy_res["graph"]
                all_paths = broken_graph.get_all_paths(mode=mode, log_handler=log_handler)
                for count_path, (this_paths, other_tag) in enumerate(all_paths, start=1):
                    all_contig_str = []
                    for go_contig, this_p_part in enumerate(this_paths):
                        this_contig = broken_graph.export_path(this_p_part)
                        if DEGENERATE_BASES & set(this_contig.seq):
                            degenerate_base_used = True
                        is_circular = this_contig.label.endswith("(circular)")
                        if len(this_paths) == 1 and is_circular:
                            # A single circular contig keeps its own fasta header.
                            all_contig_str.append(this_contig.fasta_str())
                        else:
                            all_contig_str.append(">contig_" + str(go_contig + 1) + "--" + this_contig.label +
                                                  "\n" + this_contig.seq + "\n")
                    # Context manager closes the file even if the write fails.
                    with open(path_fasta_name(go_res + 1, other_tag, count_path), "w") as out_fasta:
                        out_fasta.write("\n".join(all_contig_str))
                broken_graph.write_to_gfa(prefix + ".graph" + str(go_res + 1) + ".selected_graph.gfa")
        else:
            for go_res, copy_res in enumerate(copy_results):
                idealized_graph = copy_res["graph"]
                # should add making one-step-inversion pairs for paths,
                # which would be used to identify existence of a certain isomer using mapping information
                circular_paths = idealized_graph.get_all_circular_paths(mode=mode, log_handler=log_handler)
                for count_path, (this_path, other_tag) in enumerate(circular_paths, start=1):
                    this_seq_obj = idealized_graph.export_path(this_path)
                    if DEGENERATE_BASES & set(this_seq_obj.seq):
                        degenerate_base_used = True
                    with open(path_fasta_name(go_res + 1, other_tag, count_path), "w") as out_fasta:
                        out_fasta.write(this_seq_obj.fasta_str())
                idealized_graph.write_to_gfa(prefix + ".graph" + str(go_res + 1) + ".selected_graph.gfa")
        if degenerate_base_used:
            # log_handler is optional here, as everywhere else in this function.
            if log_handler:
                log_handler.warning("Degenerate base(s) used!")
            else:
                sys.stdout.write("Degenerate base(s) used!\n")
        time_d = time.time()
        if log_handler:
            log_handler.info(">>> Solving and unfolding graph finished: " + str(round(time_d - time_c, 4)) + "s")
        else:
            sys.stdout.write("\n\n>>> Solving and unfolding graph finished: " + str(round(time_d - time_c, 4)) + "s\n")

    try:
        disentangle_circular_assembly(options.fastg_file, options.tab_file,
                                      os.path.join(options.output_directory, options.prefix),
                                      type_factor=options.type_factor,
                                      mode=options.mode,
                                      weight_factor=options.weight_factor,
                                      log_hard_cov_threshold=options.depth_factor,
                                      contamination_depth=options.contamination_depth,
                                      contamination_similarity=options.contamination_similarity,
                                      degenerate=options.degenerate, degenerate_depth=options.degenerate_depth,
                                      degenerate_similarity=options.degenerate_similarity,
                                      expected_max_size=options.expected_max_size,
                                      expected_min_size=options.expected_min_size,
                                      min_sigma_factor=options.min_sigma_factor,
                                      only_max_c=options.only_keep_max_cov, acyclic_allowed=options.acyclic_allowed,
                                      keep_temp=options.keep_temp_graph,
                                      log_handler=log_handler, verbose=options.verbose, debug=options.debug)
        log_handler = simple_log(logging.getLogger(), options.output_directory, options.prefix + ".disentangle.")

        log_handler.info('\nTotal cost: ' + str(round(time.time() - time0, 4)) + 's\n')
    except Exception as e:
        # Top-level boundary: report the failure through the logger.
        if options.debug:
            log_handler.exception("")
        else:
            log_handler.exception(str(e))
        log_handler.exception("Disentangling failed!")
        if not options.acyclic_allowed:
            log_handler.info("You might try again with '--linear' to export contig(s) instead of circular genome.")
        log_handler = simple_log(log_handler, options.output_directory, options.prefix + ".disentangle.")
        log_handler.info("\nTotal cost " + str(time.time() - time0))
        log_handler.info("Please email [email protected] if you find bugs!\n")
    logging.shutdown()
Example #11
0
def get_options(description):
    """Parse and validate the command line of get_organelle_config.py.

    Side effects visible in this function:
      * may rebind the module globals ``_GO_PATH``, ``_LBL_DB_PATH`` and
        ``_SEQ_DB_PATH`` from ``--config-dir`` and creates those directories
        when missing;
      * with ``--list`` prints the recorded database versions and exits;
      * prints interpreter, dependency and working-directory information;
      * resolves ``options.db_version`` (remote lookup for "latest",
        "customized" for ``--use-local``).

    Args:
        description: text passed to ``ArgumentParser(description=...)``.

    Returns:
        The parsed argparse namespace. ``add_organelle_type`` and
        ``rm_organelle_type`` are converted to lists when given.

    Raises:
        AssertionError: on an invalid ``--db-type`` or when more than one of
            add / rm / update / clean is requested.
        NotADirectoryError: when ``--use-local`` is not a directory.

    The process exits on missing dependencies, insufficient or illegal
    arguments, missing local fasta files, a failed remote version lookup or
    an unsupported database version.
    """
    global _GO_PATH, _LBL_DB_PATH, _SEQ_DB_PATH
    parser = ArgumentParser(
        description=description,
        usage="get_organelle_config.py -a embplant_pt,embplant_mt")
    parser.add_argument(
        "-a",
        "--add",
        dest="add_organelle_type",
        help="Add database for organelle type(s). Followed by any of all/" +
        "/".join(ORGANELLE_TYPE_LIST) +
        " or multiple types joined by comma such as "
        "embplant_pt,embplant_mt,fungus_mt.")
    parser.add_argument(
        "--use-version",
        dest="db_version",
        default="latest",
        help="The version of database to add. "
        "Find more versions at github.com/Kinggerm/GetOrganelleDB. "
        "Default: %(default)s")
    parser.add_argument(
        "-r",
        "--rm",
        dest="rm_organelle_type",
        help=
        "Remove local database(s) for organelle type(s). Followed by any of all/"
        + "/".join(ORGANELLE_TYPE_LIST) + " or multiple types joined by comma "
        "such as embplant_pt,embplant_mt.")
    parser.add_argument(
        "--update",
        dest="update",
        default=False,
        action="store_true",
        help=
        "Update local databases to the latest online version, or the local version "
        "if \"--use-local LOCAL_DB_PATH\" provided.")
    parser.add_argument(
        "--config-dir",
        dest="get_organelle_path",
        default=None,
        help="The directory where the default databases were placed. "
        "The default value also can be changed by adding 'export GETORG_PATH=your_favor' "
        "to the shell script (e.g. ~/.bash_profile or ~/.bashrc) "
        "Default: " + GO_PATH)
    parser.add_argument(
        "--use-local",
        dest="use_local",
        help=
        "Input a path. This local database path must include subdirectories "
        "LabelDatabase and SeedDatabase, under which there is the fasta file(s) named by the "
        "organelle type you want add, such as fungus_mt.fasta. ")
    parser.add_argument(
        "--clean",
        dest="clean",
        default=False,
        action="store_true",
        help="Remove all configured database files (==\"--rm all\").")
    parser.add_argument("--list",
                        dest="list_available",
                        default=False,
                        action="store_true",
                        help="List configured databases checking and exit. ")
    parser.add_argument("--check",
                        dest="check",
                        default=False,
                        action="store_true",
                        help="Check configured database files and exit. ")
    parser.add_argument(
        "--db-type",
        dest="db_type",
        default="both",
        help="The database type (seed/label/both). Default: %(default)s")
    parser.add_argument(
        "--which-blast",
        dest="which_blast",
        default="",
        help="Assign the path to BLAST binary files if not added to the path. "
        "Default: try \"" + os.path.realpath("GetOrganelleDep") + "/" +
        SYSTEM_NAME + "/ncbi-blast\" first, then $PATH")
    parser.add_argument(
        "--which-bowtie2",
        dest="which_bowtie2",
        default="",
        help="Assign the path to Bowtie2 binary files if not added to the path. "
        "Default: try \"" + os.path.realpath("GetOrganelleDep") + "/" +
        SYSTEM_NAME + "/bowtie2\" first, then $PATH")
    parser.add_argument(
        "--verbose",
        dest="verbose",
        default=False,
        action="store_true",
        help="verbose output to the screen. Default: %(default)s")
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()
    assert options.db_type in ("seed", "label", "both")
    if options.get_organelle_path:
        _GO_PATH = os.path.expanduser(options.get_organelle_path)
        # NOTE(review): the sub-directories are only re-pointed when the given
        # directory already exists; for a new directory they keep their
        # previous values although _GO_PATH itself is created below.
        # Confirm whether that is intended.
        if os.path.isdir(_GO_PATH):
            _LBL_DB_PATH = os.path.join(_GO_PATH, LBL_NAME)
            _SEQ_DB_PATH = os.path.join(_GO_PATH, SEQ_NAME)

    # check directories
    for required_dir in (_GO_PATH, _LBL_DB_PATH, _SEQ_DB_PATH):
        if not os.path.isdir(required_dir):
            os.mkdir(required_dir)

    # only print
    if options.list_available:
        listing_plan = (
            (("seed", "both"), _SEQ_DB_PATH, SEED_DB_HASH, " Seed Database:\t"),
            (("label", "both"), _LBL_DB_PATH, LABEL_DB_HASH, " Label Database:\t"),
        )
        for wanted_types, db_dir, hash_table, caption in listing_plan:
            if options.db_type not in wanted_types:
                continue
            version_file = os.path.join(db_dir, "VERSION")
            if not os.path.isfile(version_file):
                continue
            with open(version_file) as open_version:
                for line in open_version:
                    record = line.strip()
                    if not record:
                        # A blank line would break the 3-way unpacking below.
                        continue
                    db_type, db_version, db_hash = record.split("\t")
                    db_version = find_version(db_type, db_hash,
                                              hash_table, db_version)
                    sys.stdout.write(db_type + caption +
                                     db_version + "\t" + db_hash + "\n")
        sys.exit()

    # sys.stdout.write("\n" + description + "\n")
    sys.stdout.write("\nPython " + str(sys.version).replace("\n", " ") + "\n")
    options.which_bowtie2 = detect_bowtie2_path(options.which_bowtie2,
                                                GO_DEP_PATH)
    options.which_blast = detect_blast_path(options.which_blast, GO_DEP_PATH)
    bowtie2_v = detect_bowtie2_version(options.which_bowtie2)
    if bowtie2_v.endswith("N/A"):
        sys.stdout.write("ERROR: Bowtie2 is not available!\n")
        sys.exit()
    blast_v = detect_blast_version(options.which_blast)
    if blast_v.endswith("N/A"):
        sys.stdout.write("ERROR: Blast is not available!\n")
        sys.exit()
    sys.stdout.write("DEPENDENCIES: " + "; ".join([bowtie2_v, blast_v]) + "\n")
    sys.stdout.write("WORKING DIR: " + os.getcwd() + "\n")
    # Echo the command line, quoting arguments that contain spaces.
    sys.stdout.write(" ".join(
        ["\"" + arg + "\"" if " " in arg else arg
         for arg in sys.argv]) + "\n\n")
    if not (options.add_organelle_type or options.rm_organelle_type
            or options.update or options.clean):
        parser.print_help()
        sys.stdout.write("Insufficient arguments!\n")
        sys.exit()

    mutually_exclusive_options = [(options.add_organelle_type, "adding"),
                                  (options.rm_organelle_type, "removing"),
                                  (options.update, "updating"),
                                  (options.clean, "cleaning")]
    # At most one configuration mode may be requested; report the first two
    # requested modes (the message used to carry a stray " removing").
    active_modes = [config_name
                    for config_mode, config_name in mutually_exclusive_options
                    if config_mode]
    assert len(active_modes) < 2, \
        active_modes[0] + " and " + active_modes[1] + " are mutually exclusive!"

    if options.add_organelle_type:
        options.add_organelle_type = options.add_organelle_type.split(",")
        for sub_type in options.add_organelle_type:
            if sub_type == "all":
                options.add_organelle_type = list(ORGANELLE_TYPE_LIST)
                break
            elif sub_type not in ORGANELLE_TYPE_SET:
                sys.stdout.write("Illegal 'adding' type: " + sub_type + "! "
                                 "Types must be one of all/" +
                                 "/".join(ORGANELLE_TYPE_LIST) + "!\n")
                sys.exit()

    if options.rm_organelle_type:
        options.rm_organelle_type = options.rm_organelle_type.split(",")
        for sub_type in options.rm_organelle_type:
            if sub_type == "all":
                # "--rm all" is handled as "--clean".
                options.clean = True
                break
            elif sub_type not in ORGANELLE_TYPE_SET:
                sys.stdout.write("Illegal 'removing' type: " + sub_type + "! "
                                 "Types must be one of all/" +
                                 "/".join(ORGANELLE_TYPE_LIST) + "!\n")
                sys.exit()

    if options.use_local:
        if not os.path.isdir(options.use_local):
            raise NotADirectoryError(options.use_local)
        if options.add_organelle_type:
            # Every requested type needs both a seed and a label fasta.
            for sub_type in options.add_organelle_type:
                for sub_dir in (SEQ_NAME, LBL_NAME):
                    this_fas_f = os.path.join(options.use_local, sub_dir,
                                              sub_type + ".fasta")
                    if not os.path.isfile(this_fas_f):
                        sys.stdout.write("File " + this_fas_f +
                                         " not available!\n")
                        sys.exit()
        options.db_version = "customized"
        sys.stdout.write("Use local database: " + options.use_local + "\n")
    else:
        if options.update:
            options.db_version = "latest"
        if options.db_version == "latest":
            # Resolve "latest" to a concrete version string from the remote.
            remote_quest = get_static_html_context(
                VERSION_URLS[0],
                verbose=options.verbose,
                alternative_url_list=VERSION_URLS[1:])
            if remote_quest["status"]:
                options.db_version = remote_quest["content"].strip()
            else:
                sys.stderr.write("Error: " + remote_quest["info"] + "\n")
                sys.stderr.write(
                    "Please check your connection to github/gitee!\n")
                sys.stdout.write(
                    "\nYou can download the database files from www.github.com/Kinggerm/GetOrganelleDB "
                    "and install it from from local (flag --use-local)\n")
                sys.exit()
        if options.db_version not in SEED_DB_HASH or options.db_version not in LABEL_DB_HASH:
            sys.stderr.write(
                "GetOrganelle v{} does not support Database v{}\n".format(
                    get_versions(), options.db_version) +
                "Please upgrade GetOrganelle (recommended) "
                "or degrade the Database version (not recommended; --use-version)\n"
            )
            sys.exit()

    return options
Example #12
0
def main():
    """Manage the local GetOrganelle seed and label databases.

    Parses the command line through ``get_options`` and then runs, in this
    order, whichever of the following cases the options request:

    1. ``options.clean``: remove the files of every installed organelle type
       of the selected database(s) and delete the matching VERSION file.
    2. ``options.rm_organelle_type``: remove the listed organelle types.
    3. ``options.update``: refresh every organelle type in
       ``ORGANELLE_TYPE_LIST`` that is already installed.
    4. ``options.add_organelle_type``: install the listed organelle types.

    Cases 3 and 4 take the fasta files from ``options.use_local`` when it is
    set and download them otherwise. In cases 2-4 the VERSION file is
    rewritten after each processed organelle type, except that a failed
    download skips the rewrite for that type. The total elapsed time is
    written to stdout at the end.
    """
    time_start = time.time()
    description = ("get_organelle_config.py " + get_versions() +
                   " is used for setting up default GetOrganelle database.")
    options = get_options(description=description)
    existing_seed_db, existing_label_db = get_current_db_versions(
        options.db_type,
        seq_db_path=_SEQ_DB_PATH,
        lbl_db_path=_LBL_DB_PATH,
        clean_mode=options.clean,
        check_hash=options.check)
    seed_version_f = os.path.join(_SEQ_DB_PATH, "VERSION")
    label_version_f = os.path.join(_LBL_DB_PATH, "VERSION")
    # Passed as ``timeout`` to every download_file_with_progress call below.
    time_out = 100000

    # Case 1: clean everything of the selected database type(s)
    if options.clean:
        if options.db_type in ("seed", "both"):
            for rm_o_type in sorted(existing_seed_db):
                rm_files(_SEQ_DB_PATH, file_name_prefix=rm_o_type)
            if os.path.isfile(seed_version_f):
                os.remove(seed_version_f)
        if options.db_type in ("label", "both"):
            for rm_o_type in sorted(existing_label_db):
                rm_files(_LBL_DB_PATH, file_name_prefix=rm_o_type)
            if os.path.isfile(label_version_f):
                os.remove(label_version_f)

    # Case 2: remove the requested organelle types
    if options.rm_organelle_type:
        if options.db_type in ("seed", "both"):
            for rm_o_type in options.rm_organelle_type:
                if rm_o_type in existing_seed_db:
                    rm_files(_SEQ_DB_PATH, file_name_prefix=rm_o_type)
                    del existing_seed_db[rm_o_type]
                else:
                    sys.stdout.write("Warning: " + rm_o_type +
                                     " Seed Database not found!\n")
                write_version_file(version_dict=existing_seed_db,
                                   output_to_file=seed_version_f)
        if options.db_type in ("label", "both"):
            for rm_o_type in options.rm_organelle_type:
                if rm_o_type in existing_label_db:
                    rm_files(_LBL_DB_PATH, file_name_prefix=rm_o_type)
                    del existing_label_db[rm_o_type]
                else:
                    sys.stdout.write("Warning: " + rm_o_type +
                                     " Label Database not found!\n")
                write_version_file(version_dict=existing_label_db,
                                   output_to_file=label_version_f)

    # Case 3: update the organelle types that are already installed
    if options.update:
        if options.db_type in ("seed", "both"):
            for sub_o_type in ORGANELLE_TYPE_LIST:
                target_output = os.path.join(_SEQ_DB_PATH,
                                             sub_o_type + ".fasta")
                # Types that are not installed are left alone by an update.
                if sub_o_type in existing_seed_db:
                    if options.use_local:
                        update_to_fa = os.path.join(options.use_local,
                                                    SEQ_NAME,
                                                    sub_o_type + ".fasta")
                        if not os.path.exists(update_to_fa):
                            sys.stdout.write("Warning: " + update_to_fa +
                                             " not available!\n")
                        else:
                            new_hash_val = cal_f_sha256(update_to_fa)
                            if new_hash_val != existing_seed_db[sub_o_type][
                                    "sha256"]:
                                # Local file differs from the installed one:
                                # record it, copy it in and re-initialize.
                                existing_seed_db[sub_o_type] = {
                                    "version": find_version(
                                        sub_o_type, new_hash_val,
                                        SEED_DB_HASH),
                                    "sha256": new_hash_val
                                }
                                # Skip the copy when the local source already
                                # is the installation directory.
                                if os.path.realpath(
                                        os.path.split(update_to_fa)[0]
                                ) != os.path.realpath(_SEQ_DB_PATH):
                                    copy(update_to_fa, _SEQ_DB_PATH)
                                initialize_seed_database(
                                    which_bowtie2=options.which_bowtie2,
                                    fasta_f=target_output,
                                    overwrite=True,
                                    verbose=options.verbose)
                            else:
                                # Hash matches the installed file.
                                initialize_seed_database(
                                    which_bowtie2=options.which_bowtie2,
                                    fasta_f=target_output,
                                    overwrite=False,
                                    verbose=options.verbose)
                    elif existing_seed_db[sub_o_type][
                            "version"] == options.db_version:
                        # Requested version is already installed.
                        initialize_seed_database(
                            which_bowtie2=options.which_bowtie2,
                            fasta_f=target_output,
                            overwrite=False,
                            verbose=options.verbose)
                    else:
                        these_urls = [
                            sub_url.format(options.db_version, sub_o_type)
                            for sub_url in seed_url_temp
                        ]
                        check_sha256 = SEED_DB_HASH[
                            options.db_version][sub_o_type]["sha256"]
                        status = download_file_with_progress(
                            remote_url=these_urls[0],
                            output_file=target_output,
                            sha256_v=check_sha256,
                            timeout=time_out,
                            alternative_url_list=these_urls[1:],
                            verbose=options.verbose)
                        if not status["status"]:
                            sys.stdout.write(
                                "Installing %s Seed Database failed: %s\n" %
                                (sub_o_type, status["info"]))
                            continue
                        initialize_seed_database(
                            which_bowtie2=options.which_bowtie2,
                            fasta_f=target_output,
                            overwrite=True,
                            verbose=options.verbose)
                        existing_seed_db[sub_o_type] = {
                            "version": options.db_version,
                            "sha256": check_sha256
                        }
                write_version_file(version_dict=existing_seed_db,
                                   output_to_file=seed_version_f)

        if options.db_type in ("label", "both"):
            for sub_o_type in ORGANELLE_TYPE_LIST:
                target_output = os.path.join(_LBL_DB_PATH,
                                             sub_o_type + ".fasta")
                # Types that are not installed are left alone by an update.
                if sub_o_type in existing_label_db:
                    if options.use_local:
                        update_to_fa = os.path.join(options.use_local,
                                                    LBL_NAME,
                                                    sub_o_type + ".fasta")
                        if not os.path.exists(update_to_fa):
                            sys.stdout.write("Warning: " + update_to_fa +
                                             " not available!\n")
                        else:
                            new_hash_val = cal_f_sha256(update_to_fa)
                            if new_hash_val != existing_label_db[sub_o_type][
                                    "sha256"]:
                                # Local file differs from the installed one:
                                # record it, copy it in and re-initialize.
                                existing_label_db[sub_o_type] = {
                                    "version": find_version(
                                        sub_o_type, new_hash_val,
                                        LABEL_DB_HASH),
                                    "sha256": new_hash_val
                                }
                                # Skip the copy when the local source already
                                # is the installation directory.
                                if os.path.realpath(
                                        os.path.split(update_to_fa)[0]
                                ) != os.path.realpath(_LBL_DB_PATH):
                                    copy(update_to_fa, _LBL_DB_PATH)
                                initialize_notation_database(
                                    which_blast=options.which_blast,
                                    fasta_f=target_output,
                                    overwrite=True,
                                    verbose=options.verbose)
                            else:
                                # Hash matches the installed file.
                                initialize_notation_database(
                                    which_blast=options.which_blast,
                                    fasta_f=target_output,
                                    overwrite=False,
                                    verbose=options.verbose)
                    # Compare against the *label* record; the seed record may
                    # be absent or at a different version.
                    elif existing_label_db[sub_o_type][
                            "version"] == options.db_version:
                        # Requested version is already installed.
                        initialize_notation_database(
                            which_blast=options.which_blast,
                            fasta_f=target_output,
                            overwrite=False,
                            verbose=options.verbose)
                    else:
                        these_urls = [
                            sub_url.format(options.db_version, sub_o_type)
                            for sub_url in label_url_temp
                        ]
                        check_sha256 = LABEL_DB_HASH[
                            options.db_version][sub_o_type]["sha256"]
                        status = download_file_with_progress(
                            remote_url=these_urls[0],
                            output_file=target_output,
                            sha256_v=check_sha256,
                            timeout=time_out,
                            alternative_url_list=these_urls[1:],
                            verbose=options.verbose)
                        if not status["status"]:
                            sys.stdout.write(
                                "Installing %s Label Database failed: %s\n" %
                                (sub_o_type, status["info"]))
                            continue
                        initialize_notation_database(
                            which_blast=options.which_blast,
                            fasta_f=target_output,
                            overwrite=True,
                            verbose=options.verbose)
                        existing_label_db[sub_o_type] = {
                            "version": options.db_version,
                            "sha256": check_sha256
                        }
                write_version_file(version_dict=existing_label_db,
                                   output_to_file=label_version_f)

    # Case 4: install the requested organelle types
    if options.add_organelle_type:
        if options.db_type in ("seed", "both"):
            for sub_o_type in options.add_organelle_type:
                target_output = os.path.join(_SEQ_DB_PATH,
                                             sub_o_type + ".fasta")
                if options.use_local:
                    update_to_fa = os.path.join(options.use_local, SEQ_NAME,
                                                sub_o_type + ".fasta")
                    if not os.path.exists(update_to_fa):
                        sys.stdout.write("Warning: " + update_to_fa +
                                         " not available!\n")
                    else:
                        new_hash_val = cal_f_sha256(update_to_fa)
                        existing_seed_db[sub_o_type] = {
                            "version": find_version(sub_o_type, new_hash_val,
                                                    SEED_DB_HASH),
                            "sha256": new_hash_val
                        }
                        # Skip the copy when the local source already is the
                        # installation directory.
                        if os.path.realpath(
                                os.path.split(update_to_fa)[0]
                        ) != os.path.realpath(_SEQ_DB_PATH):
                            copy(update_to_fa, _SEQ_DB_PATH)
                        initialize_seed_database(
                            which_bowtie2=options.which_bowtie2,
                            fasta_f=target_output,
                            overwrite=True,
                            verbose=options.verbose)
                else:
                    these_urls = [
                        sub_url.format(options.db_version, sub_o_type)
                        for sub_url in seed_url_temp
                    ]
                    check_sha256 = SEED_DB_HASH[
                        options.db_version][sub_o_type]["sha256"]
                    status = download_file_with_progress(
                        remote_url=these_urls[0],
                        output_file=target_output,
                        sha256_v=check_sha256,
                        timeout=time_out,
                        alternative_url_list=these_urls[1:],
                        verbose=options.verbose)
                    if not status["status"]:
                        sys.stdout.write(
                            "Installing %s Seed Database failed: %s\n" %
                            (sub_o_type, status["info"]))
                        continue
                    initialize_seed_database(
                        which_bowtie2=options.which_bowtie2,
                        fasta_f=target_output,
                        overwrite=True,
                        verbose=options.verbose)
                    existing_seed_db[sub_o_type] = {
                        "version": options.db_version,
                        "sha256": check_sha256
                    }
                write_version_file(version_dict=existing_seed_db,
                                   output_to_file=seed_version_f)

        if options.db_type in ("label", "both"):
            for sub_o_type in options.add_organelle_type:
                target_output = os.path.join(_LBL_DB_PATH,
                                             sub_o_type + ".fasta")
                if options.use_local:
                    update_to_fa = os.path.join(options.use_local, LBL_NAME,
                                                sub_o_type + ".fasta")
                    if not os.path.exists(update_to_fa):
                        sys.stdout.write("Warning: " + update_to_fa +
                                         " not available!\n")
                    else:
                        new_hash_val = cal_f_sha256(update_to_fa)
                        existing_label_db[sub_o_type] = {
                            "version": find_version(sub_o_type, new_hash_val,
                                                    LABEL_DB_HASH),
                            "sha256": new_hash_val
                        }
                        # Skip the copy when the local source already is the
                        # installation directory.
                        if os.path.realpath(
                                os.path.split(update_to_fa)[0]
                        ) != os.path.realpath(_LBL_DB_PATH):
                            copy(update_to_fa, _LBL_DB_PATH)
                        initialize_notation_database(
                            which_blast=options.which_blast,
                            fasta_f=target_output,
                            overwrite=True,
                            verbose=options.verbose)
                else:
                    these_urls = [
                        sub_url.format(options.db_version, sub_o_type)
                        for sub_url in label_url_temp
                    ]
                    check_sha256 = LABEL_DB_HASH[
                        options.db_version][sub_o_type]["sha256"]
                    status = download_file_with_progress(
                        remote_url=these_urls[0],
                        output_file=target_output,
                        sha256_v=check_sha256,
                        timeout=time_out,
                        alternative_url_list=these_urls[1:],
                        verbose=options.verbose)
                    if not status["status"]:
                        sys.stdout.write(
                            "Installing %s Label Database failed: %s\n" %
                            (sub_o_type, status["info"]))
                        continue
                    initialize_notation_database(
                        which_blast=options.which_blast,
                        fasta_f=target_output,
                        overwrite=True,
                        verbose=options.verbose)
                    existing_label_db[sub_o_type] = {
                        "version": options.db_version,
                        "sha256": check_sha256
                    }
                write_version_file(version_dict=existing_label_db,
                                   output_to_file=label_version_f)

    sys.stdout.write("\nTotal cost: %.2f s\n" % (time.time() - time_start))
Example #13
0
def get_options(print_title):
    """Parse the command line of disentangle_organelle_assembly.py.

    Besides parsing, this creates the output directory if it does not exist,
    sets up logging inside it, applies the mode-specific default for
    ``--expected-max-size`` and seeds both ``random`` and ``numpy.random``
    with ``--random-seed``.

    Args:
        print_title: Text written to the log before the echoed command line.

    Returns:
        A tuple ``(options, log_handler)``: the parsed argparse namespace and
        the logger returned by ``timed_log``.

    Raises:
        SystemExit: If any of ``-g``, ``-t``, ``-o`` or ``-F`` is missing
            (after reporting the missing ones on stdout).
        AssertionError: If ``--max-multiplicity`` is outside 1..12 or
            ``--max-paths-num`` is not positive.
    """
    parser = ArgumentParser(
        "disentangle_organelle_assembly.py -F embplant_pt -g input.fastg -t input.tab -o output_dir"
    )
    parser.add_argument("-g",
                        dest="fastg_file",
                        help="input fastg format file.")
    parser.add_argument(
        "-t",
        dest="tab_file",
        help=
        "input tab format file (*.csv; the postfix 'csv' was in conformity with Bandage) "
        "produced by slim_graph.py.")
    parser.add_argument("-o",
                        dest="output_directory",
                        help="output directory.")
    parser.add_argument(
        "-F",
        dest="mode",
        help=
        "organelle type: embplant_pt/other_pt/embplant_mt/embplant_nr/animal_mt/fungus_mt/fungus_nr/anonym."
    )
    parser.add_argument(
        "--linear",
        dest="acyclic_allowed",
        default=False,
        action="store_true",
        help=
        "By default, this script would only disentangle the circular graph (the complete circular "
        "organelle genome), and would directly give up linear/broken graphs. Choose this option "
        "to try for linear/broken cases.")
    parser.add_argument(
        "--weight-f",
        dest="weight_factor",
        type=float,
        default=100.0,
        help=
        "weight factor for excluding non-target contigs. Default:%(default)s")
    parser.add_argument(
        "--depth-f",
        dest="depth_factor",
        type=float,
        default=10.,
        help=
        "Depth factor for excluding non-target contigs. Default:%(default)s")
    parser.add_argument(
        "--type-f",
        dest="type_factor",
        type=float,
        default=3.,
        help="Type factor for identifying genome type tag. Default:%(default)s"
    )
    parser.add_argument(
        "--contamination-depth",
        dest="contamination_depth",
        default=3.,
        type=float,
        help=
        "Depth factor for confirming contaminating contigs. Default:%(default)s"
    )
    parser.add_argument(
        "--contamination-similarity",
        dest="contamination_similarity",
        default=0.9,
        type=float,
        help=
        "Similarity threshold for confirming contaminating contigs. Default:%(default)s"
    )
    parser.add_argument(
        "--no-degenerate",
        dest="degenerate",
        default=True,
        action="store_false",
        help=
        "Disable making consensus from parallel contig based on nucleotide degenerate table."
    )
    parser.add_argument(
        "--degenerate-depth",
        dest="degenerate_depth",
        default=1.5,
        type=float,
        help="Depth factor for confirming parallel contigs. Default:%(default)s"
    )
    parser.add_argument(
        "--degenerate-similarity",
        dest="degenerate_similarity",
        default=0.98,
        type=float,
        help=
        "Similarity threshold for confirming parallel contigs. Default:%(default)s"
    )
    # The default is a None sentinel so that "not given" can be told apart
    # from an explicit value however the flag was spelled
    # (``--expected-max-size N``, ``--expected-max-size=N`` or an argparse
    # prefix abbreviation); the real default is filled in after parsing.
    parser.add_argument(
        "--expected-max-size",
        dest="expected_max_size",
        default=None,
        type=int,
        help=
        "Expected maximum target genome size. Default: 200000 (-F embplant_pt/fungus_mt), "
        "25000 (-F embplant_nr/animal_mt/fungus_nr), 600000 (-F embplant_mt/other_pt)"
    )
    parser.add_argument(
        "--expected-min-size",
        dest="expected_min_size",
        default=10000,
        type=int,
        help="Expected mininum target genome size. Default: %(default)s")
    parser.add_argument(
        "--reverse-lsc",
        dest="reverse_lsc",
        default=False,
        action="store_true",
        help="For '-F embplant_pt' with complete circular result, "
        "by default, the direction of the starting contig (usually "
        "the LSC contig) is determined as the direction with less ORFs. Choose this option "
        "to reverse the direction of the starting contig when result is circular. "
        "Actually, both directions are biologically equivalent to each other. The "
        "reordering of the direction is only for easier downstream analysis.")
    parser.add_argument(
        "--max-paths-num",
        dest="max_paths_num",
        default=1000,
        type=int,
        help=
        "Repeats would dramatically increase the number of potential isomers (paths). "
        "This option was used to export a certain amount of paths out of all possible paths "
        "per assembly graph. Default: %(default)s")
    parser.add_argument(
        "--keep-all-polymorphic",
        dest="only_keep_max_cov",
        default=True,
        action="store_false",
        help=
        "By default, this script would pick the contig with highest coverage among all parallel "
        "(polymorphic) contigs when degenerating was not applicable. "
        "Choose this flag to export all combinations.")
    parser.add_argument(
        "--min-sigma",
        dest="min_sigma_factor",
        type=float,
        default=0.1,
        help=
        "Minimum deviation factor for excluding non-target contigs. Default:%(default)s"
    )
    parser.add_argument(
        "--min-depth",
        dest="min_cov",
        type=float,
        default=0.,
        help=
        "Minimum coverage for a contig to be included in disentangling. Default:%(default)s"
    )
    parser.add_argument(
        "--max-depth",
        dest="max_cov",
        type=float,
        default=inf,
        help=
        "Maximum coverage for a contig to be included in disentangling. Default:%(default)s"
    )
    parser.add_argument(
        "--max-multiplicity",
        dest="max_multiplicity",
        type=int,
        default=8,
        help="Maximum multiplicity of contigs for disentangling genome paths. "
        "Should be 1~12. Default:%(default)s")
    parser.add_argument(
        "--prefix",
        dest="prefix",
        default="target",
        help=
        "Prefix of output files inside output directory. Default:%(default)s")
    parser.add_argument("--keep-temp",
                        dest="keep_temp_graph",
                        default=False,
                        action="store_true",
                        help="export intermediate graph file.")
    parser.add_argument(
        "--time-limit",
        dest="time_limit",
        default=3600,
        type=int,
        help="time limit for the disentangling process. Default:%(default)s")
    parser.add_argument(
        "--random-seed",
        dest="random_seed",
        default=12345,
        type=int,
        help=
        "Random seed (only for disentangling at this moment). Default: %(default)s"
    )
    parser.add_argument("--continue",
                        dest="resume",
                        default=False,
                        action="store_true",
                        help="continue mode.")
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    parser.add_argument("--verbose",
                        dest="verbose",
                        default=False,
                        action="store_true",
                        help="verbose logging.")
    parser.add_argument("--debug",
                        dest="debug",
                        default=False,
                        action="store_true",
                        help="for debug.")
    options = parser.parse_args()
    if (options.fastg_file is None) or (options.tab_file is None) or (options.output_directory is None) \
            or (options.mode is None):
        # Report every missing mandatory option before exiting.
        if options.fastg_file is None:
            sys.stdout.write("Missing option \"-g\"!\n")
        if options.tab_file is None:
            sys.stdout.write("Missing option \"-t\"!\n")
        if options.output_directory is None:
            sys.stdout.write("Missing option \"-o\"!\n")
        if options.mode is None:
            sys.stdout.write("Missing option \"-F\"!\n")
        sys.stdout.write("Insufficient arguments!\n")
        sys.exit()
    else:
        # NOTE(review): asserts are stripped under ``python -O``; they are
        # kept so the exception type seen by callers does not change.
        assert 12 >= options.max_multiplicity >= 1
        assert options.max_paths_num > 0
        if options.output_directory and not os.path.exists(
                options.output_directory):
            os.mkdir(options.output_directory)
        log_handler = simple_log(logging.getLogger(), options.output_directory,
                                 options.prefix + ".disentangle.")
        log_handler.info(print_title)
        # Echo the command line, quoting arguments that contain spaces.
        log_handler.info(" ".join(
            ["\"" + arg + "\"" if " " in arg else arg
             for arg in sys.argv]) + "\n")
        log_handler = timed_log(log_handler, options.output_directory,
                                options.prefix + ".disentangle.")
        if options.expected_max_size is None:
            # Not given by the user: derive the default from the mode.
            options.expected_max_size = 200000
            if options.mode in ("embplant_mt", "other_pt"):
                options.expected_max_size *= 3
            elif options.mode in ("embplant_nr", "animal_mt", "fungus_nr"):
                # Floor division keeps the value an int, matching type=int.
                options.expected_max_size //= 8
        random.seed(options.random_seed)
        np.random.seed(options.random_seed)
        return options, log_handler
Example #14
0
def get_options():
    """Parse and validate the command line of round_statistics.py.

    Besides parsing, this function creates the output folder when it does
    not exist, sets up logging into that folder via ``simple_log`` /
    ``timed_log``, and resolves the Bowtie2 location: when
    ``--which-bowtie2`` is not given, the copy under ``GO_DEP_PATH`` is used
    if it is present and executable; otherwise an empty directory prefix is
    kept (presumably resolved through $PATH by ``executable`` -- confirm
    against that helper).

    Returns:
        tuple: ``(options, log_handler)`` -- the parsed argparse namespace
        and the log handler returned by ``timed_log``.

    Raises:
        SystemExit: with a non-zero status when a required argument
            (-f, -d, -i, -o) is missing, or when ``bowtie2`` or
            ``bowtie2-build`` is reported as not executable.
    """
    # The usage text has to be passed by keyword: the first positional
    # parameter of ArgumentParser is `prog`, not `usage`.
    parser = ArgumentParser(
        usage=
        "round_statistics.py -f fasta_file -d output_per_round_folder -i Initial_mapped.fq -o output"
    )
    parser.add_argument("-f", dest="fasta", help="input fasta file.")
    parser.add_argument("-d",
                        dest="output_per_round_dir",
                        help="output per round directory.")
    parser.add_argument("-i", dest="initial_mapped", help="seed fastq.")
    parser.add_argument("-o", dest="output_base", help="output folder.")
    parser.add_argument("-R",
                        dest="round",
                        type=int,
                        help="rounds to check. default:automatic stop!")
    parser.add_argument("-t",
                        dest="threads",
                        type=int,
                        default=2,
                        help="threads.")
    parser.add_argument(
        "--which-bowtie2",
        dest="which_bowtie2",
        default="",
        help="Assign the path to Bowtie2 binary files if not added to the path. "
        "Default: try GetOrganelleDep/" + SYSTEM_NAME +
        "/bowtie2 first, then $PATH")
    parser.add_argument(
        '--random-seed',
        dest="random_seed",
        type=int,
        default=12345,
        help="seed for random generator for bowtie2. Default: %(default)s")
    parser.add_argument(
        "--threshold",
        dest="threshold",
        default="0,10",
        help=
        "sites with coverage above the threshold would be marked as covered. default: %(default)s"
    )
    parser.add_argument("--continue",
                        dest="resume",
                        default=False,
                        action="store_true")
    parser.add_argument("--keep-temp",
                        dest="keep_temp",
                        default=False,
                        action="store_true")
    parser.add_argument(
        "--draw",
        dest="draw_plot",
        default=False,
        action="store_true",
        help="Draw density plot using matplotlib, which should be installed.")
    parser.add_argument("--max-coverage-tick", dest="max_cov_tick")
    # parser.add_argument("--average", default=False, action="store_true",
    #                   help="output average coverage.")
    # `--debug` is only ever tested for truthiness, so it must work as a bare
    # flag. nargs="?" keeps the old "--debug VALUE" spelling working as well.
    parser.add_argument("--debug",
                        dest="debug",
                        nargs="?",
                        const=True,
                        default=False,
                        help="Debug mode.")
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="GetOrganelle v{version}".format(version=get_versions()))
    options = parser.parse_args()

    required_given = (options.fasta and options.initial_mapped
                      and options.output_base
                      and options.output_per_round_dir)
    if not required_given:
        sys.stderr.write("Insufficient arguments!\n")
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)

    if not os.path.isdir(options.output_base):
        # makedirs (not mkdir) so that a nested output path also works.
        os.makedirs(options.output_base)

    log_level = "DEBUG" if options.debug else "INFO"
    log_handler = simple_log(logging.getLogger(),
                             options.output_base,
                             "",
                             log_level=log_level)
    log_handler.info("")
    # Record the command line, quoting arguments that contain spaces.
    log_handler.info(" ".join(
        ["\"" + arg + "\"" if " " in arg else arg for arg in sys.argv]) + "\n")

    if not options.which_bowtie2:
        # Prefer the copy under GO_DEP_PATH when it exists and is executable.
        try_this_bin = os.path.join(GO_DEP_PATH, "bowtie2", "bowtie2")
        if os.path.isfile(try_this_bin) and executable(try_this_bin):
            options.which_bowtie2 = os.path.split(try_this_bin)[0]

    bowtie2_bin = os.path.join(options.which_bowtie2, "bowtie2")
    if not executable(bowtie2_bin):
        log_handler.error(bowtie2_bin + " not accessible!")
        sys.exit(1)
    bowtie2_build_bin = os.path.join(options.which_bowtie2, "bowtie2-build")
    # The probe string passed to executable() includes " --large-index".
    if not executable(bowtie2_build_bin + " --large-index"):
        log_handler.error(bowtie2_build_bin + " not accessible!")
        sys.exit(1)
    # if not executable(os.path.join(options.which_bowtie2, "bowtie2-build-l")):
    #     log_handler.error(os.path.join(options.which_bowtie2, "bowtie2-build-l") + " not accessible!")
    #     exit()
    log_handler = timed_log(log_handler,
                            options.output_base,
                            "",
                            log_level=log_level)
    return options, log_handler
    default="",
    help="Assign the path to Bowtie2 binary files if not added to the path. "
    "Default: try GetOrganelleDep/" + SYSTEM_NAME +
    "/bowtie2 first, then $PATH")
parser.add_argument(
    "--which-spades",
    dest="which_spades",
    default="",
    help="Assign the path to SPAdes binary files if not added to the path. "
    "Default: try GetOrganelleDep/" + SYSTEM_NAME +
    "/SPAdes first, then $PATH")
parser.add_argument(
    "-v",
    "--version",
    action="version",
    version="GetOrganelle v{version}".format(version=get_versions()))
options = parser.parse_args()
if not (options.seed_dir and options.fastq_file_1 and options.fastq_file_2
        and options.output_sh_file):
    parser.print_help()
    sys.stdout.write('\nERROR: Insufficient arguments!\n')
    exit()
if options.fastq_file_1 == options.fastq_file_2:
    raise IOError("1st fastq file should NOT be the same with 2nd fastq file!")
if not options.which_bowtie2:
    try_this_bin = os.path.join(GO_DEP_PATH, "bowtie2", "bowtie2")
    if os.path.isfile(try_this_bin) and executable(try_this_bin):
        options.which_bowtie2 = os.path.split(try_this_bin)[0]
if not options.which_spades:
    try_this_bin = os.path.join(GO_DEP_PATH, "SPAdes", "bin", "spades.py")
    if os.path.isfile(try_this_bin) and executable(try_this_bin):