def prepare_configs(src_config_dir, ds_args, log):
    """Copy the dipSPAdes configs into the output directory and locate config.info.

    The configs from ``src_config_dir`` are copied with ``copy_configs`` into
    ``<ds_args.output_dir>/dipspades_configs``. When ``config.info`` does not
    exist there, ``config.info.template`` is verified with
    ``support.check_file_existence`` and renamed to ``config.info``.

    ``log`` is accepted but not used here.

    Returns the absolute path of ``config.info``.
    """
    target_dir = os.path.join(ds_args.output_dir, "dipspades_configs")
    copy_configs(src_config_dir, target_dir)

    info_path = os.path.join(target_dir, "config.info")
    if os.path.exists(info_path):
        return os.path.abspath(info_path)

    # No config.info yet: promote the template to the real config file.
    template_path = info_path + ".template"
    support.check_file_existence(template_path)
    os.rename(template_path, info_path)
    return os.path.abspath(info_path)
Example #2
0
def prepare_configs(src_config_dir, ds_args, log):
    """Set up the dipSPAdes config directory and return the config.info path.

    Steps:
      1. copy ``src_config_dir`` to ``<ds_args.output_dir>/dipspades_configs``
         using ``copy_configs``;
      2. if ``config.info`` is missing in the copy, check that
         ``config.info.template`` exists (``support.check_file_existence``)
         and rename it to ``config.info``.

    The ``log`` argument is not used by this function.

    Returns the absolute path to ``config.info``.
    """
    configs_copy = os.path.join(ds_args.output_dir, "dipspades_configs")
    copy_configs(src_config_dir, configs_copy)

    config_path = os.path.join(configs_copy, "config.info")
    config_is_missing = not os.path.exists(config_path)
    if config_is_missing:
        template = "%s.template" % config_path
        support.check_file_existence(template)
        os.rename(template, config_path)

    absolute_config_path = os.path.abspath(config_path)
    return absolute_config_path
Example #3
0
def parse_arguments(argv, log):
    """Parse the dipSPAdes command line into a ``DS_Args`` object.

    Args:
        argv: argument list handed to ``getopt.gnu_getopt``.
        log: logger forwarded to the ``support`` helpers.

    Returns:
        A ``DS_Args`` instance with ``output_dir``, ``allow_gaps``,
        ``weak_align``, ``haplocontigs_fnames``, ``max_threads``,
        ``max_memory``, ``tmp_dir`` and ``haplocontigs`` filled in from the
        options that were given.

    On a getopt error the message and the usage text are printed and the
    process exits with status 1. A missing output directory or an empty list
    of haplocontigs files is reported through ``support.error``. ``int()``
    raises ``ValueError`` for non-numeric ``-t`` / ``-m`` values.
    """
    try:
        # Positional (non-option) arguments are not used by dipSPAdes here.
        options, _positional = getopt.gnu_getopt(argv, DS_Args_List.short_options, DS_Args_List.long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        sys.stderr.flush()
        options_storage.usage("", dipspades=True)
        sys.exit(1)

    ds_args = DS_Args()
    for opt, arg in options:
        if opt == '-o':
            ds_args.output_dir = os.path.abspath(arg)
        elif opt == '--expect-gaps':
            ds_args.allow_gaps = True
        elif opt == '--expect-rearrangements':
            ds_args.weak_align = True
        elif opt == '--hap':
            ds_args.haplocontigs_fnames.append(support.check_file_existence(arg, 'haplocontigs', log, dipspades=True))
        elif opt == '-t' or opt == "--threads":
            ds_args.max_threads = int(arg)
        elif opt == '-m' or opt == "--memory":
            ds_args.max_memory = int(arg)
        elif opt == '--tmp-dir':
            ds_args.tmp_dir = os.path.abspath(arg)

    # Validate output_dir BEFORE deriving any path from it, so that a missing
    # -o is reported with the message below instead of reaching os.path.join.
    if not ds_args.output_dir:
        support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log, dipspades=True)
    ds_args.haplocontigs = os.path.join(ds_args.output_dir, "haplocontigs")

    if not ds_args.haplocontigs_fnames:
        support.error("cannot start dipSPAdes without at least one haplocontigs file!", log, dipspades=True)
    if not ds_args.tmp_dir:
        # Default temporary directory lives inside the output directory.
        ds_args.tmp_dir = os.path.join(ds_args.output_dir, options_storage.TMP_DIR)
    return ds_args
def parse_arguments(argv, log):
    """Parse the dipSPAdes command line into a ``DS_Args`` object.

    Args:
        argv: argument list handed to ``getopt.gnu_getopt``.
        log: logger forwarded to the ``support`` helpers.

    Returns:
        A ``DS_Args`` instance with ``output_dir``, ``allow_gaps``,
        ``weak_align``, ``haplocontigs_fnames``, ``max_threads``,
        ``max_memory``, ``tmp_dir``, ``dev_mode``, ``haplotype_assembly``,
        ``k`` and ``haplocontigs`` filled in from the options that were given.

    On a getopt error the message and the usage text are printed and the
    process exits with status 1. A missing output directory or an empty list
    of haplocontigs files is reported through ``support.error``. ``int()``
    raises ``ValueError`` for non-numeric ``-t`` / ``-m`` / ``--dsK`` values.
    """
    try:
        # Positional (non-option) arguments are not used by dipSPAdes here.
        options, _positional = getopt.gnu_getopt(argv,
                                                 DS_Args_List.short_options,
                                                 DS_Args_List.long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        sys.stderr.flush()
        options_storage.usage("", dipspades=True)
        sys.exit(1)

    ds_args = DS_Args()
    for opt, arg in options:
        if opt == '-o':
            ds_args.output_dir = os.path.abspath(arg)
        elif opt == '--expect-gaps':
            ds_args.allow_gaps = True
        elif opt == '--expect-rearrangements':
            ds_args.weak_align = True
        elif opt == '--hap':
            ds_args.haplocontigs_fnames.append(
                support.check_file_existence(arg,
                                             'haplocontigs',
                                             log,
                                             dipspades=True))
        elif opt == '-t' or opt == "--threads":
            ds_args.max_threads = int(arg)
        elif opt == '-m' or opt == "--memory":
            ds_args.max_memory = int(arg)
        elif opt == '--tmp-dir':
            ds_args.tmp_dir = os.path.abspath(arg)
        elif opt == '--dsdebug':
            ds_args.dev_mode = True
        elif opt == '--hap-assembly':
            ds_args.haplotype_assembly = True
        elif opt == '--dsK':
            ds_args.k = int(arg)

    # Validate output_dir BEFORE deriving any path from it, so that a missing
    # -o is reported with the message below instead of reaching os.path.join.
    if not ds_args.output_dir:
        support.error(
            "the output_dir is not set! It is a mandatory parameter (-o output_dir).",
            log,
            dipspades=True)
    ds_args.haplocontigs = os.path.join(ds_args.output_dir, "haplocontigs")

    if not ds_args.haplocontigs_fnames:
        support.error(
            "cannot start dipSPAdes without at least one haplocontigs file!",
            log,
            dipspades=True)
    if not ds_args.tmp_dir:
        # Default temporary directory lives inside the output directory.
        ds_args.tmp_dir = os.path.join(ds_args.output_dir,
                                       options_storage.TMP_DIR)
    return ds_args
Example #5
0
def fill_cfg(options_to_parse, log):
    """Parse SPAdes command-line options and build the run configuration.

    Options are parsed with ``getopt.gnu_getopt``. Each recognised option is
    stored on the ``options_storage`` module; read-library options are
    collected into the dataset description via ``support.add_to_dataset``.

    Args:
        options_to_parse: argument list handed to ``getopt.gnu_getopt``.
        log: logger forwarded to the ``support`` helpers for error reporting.

    Returns:
        ``(cfg, dataset_data)``. ``cfg`` maps section names ("common",
        "dataset" and, depending on the options, "error_correction",
        "assembly", "mismatch_corrector") to ``empty_config()`` objects whose
        attributes hold the settings. Returns ``(None, None)`` when continue
        mode is active (``--continue`` or ``--restart-from``).

    Raises:
        ValueError: for an option that getopt accepted but this function does
            not handle, and from ``int()`` on non-numeric option values.

    The usage text is printed and ``sys.exit`` is called on a getopt error,
    on an empty option list, and for the help options. Invalid values are
    reported through ``support.error``.
    """
    try:
        options, not_options = getopt.gnu_getopt(options_to_parse, options_storage.short_options, options_storage.long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        sys.stderr.flush()
        options_storage.usage(spades_version)
        sys.exit(1)

    if not options:
        options_storage.usage(spades_version)
        sys.exit(1)

    # More than one positional argument means some path was given without an
    # option. A "-k" value ending in a comma gets its own, more specific hint.
    if len(not_options) > 1:
        for opt, arg in options:
            if opt == "-k" and arg.strip().endswith(','):
                support.error("Do not put spaces after commas in the list of k-mers sizes! Correct example: -k 21,33,55", log)
        support.error("Please specify option (e.g. -1, -2, -s, etc) for the following paths: " + ", ".join(not_options[1:]) + "\n", log)

    # all parameters are stored here
    cfg = dict()
    # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER for each type of short-reads libs
    dataset_data = [{} for i in range(options_storage.MAX_LIBS_NUMBER * len(options_storage.SHORT_READS_TYPES.keys()))]  # "[{}] * num" doesn't work here!

    # for parsing options from "previous run command"
    options_storage.continue_mode = False
    options_storage.k_mers = None

    for opt, arg in options:
        if opt == '-o':
            options_storage.output_dir = os.path.abspath(arg)
        elif opt == "--tmp-dir":
            options_storage.tmp_dir = os.path.abspath(arg)
        elif opt == "--reference":
            options_storage.reference = support.check_file_existence(arg, 'reference', log)
        elif opt == "--dataset":
            options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log)

        elif opt in options_storage.reads_options:
            support.add_to_dataset(opt, arg, dataset_data)

        elif opt == '-k':
            if arg == 'auto':
                options_storage.k_mers = arg
            else:
                options_storage.k_mers = list(map(int, arg.split(",")))
                for k in options_storage.k_mers:
                    if k < options_storage.MIN_K or k > options_storage.MAX_K:
                        support.error('wrong k value ' + str(k) + ': all k values should be between %d and %d' %
                                                                  (options_storage.MIN_K, options_storage.MAX_K), log)
                    if k % 2 == 0:
                        support.error('wrong k value ' + str(k) + ': all k values should be odd', log)

        elif opt == "--sc":
            options_storage.single_cell = True
        elif opt == "--iontorrent":
            options_storage.iontorrent = True
        elif opt == "--disable-gzip-output":
            options_storage.disable_gzip_output = True
        elif opt == "--disable-gzip-output:false":
            options_storage.disable_gzip_output = False
        elif opt == "--disable-rr":
            options_storage.disable_rr = True
        elif opt == "--disable-rr:false":
            options_storage.disable_rr = False

        elif opt == "--only-error-correction":
            if options_storage.only_assembler:
                support.error('you cannot specify --only-error-correction and --only-assembler simultaneously', log)
            options_storage.only_error_correction = True
        elif opt == "--only-assembler":
            if options_storage.only_error_correction:
                support.error('you cannot specify --only-error-correction and --only-assembler simultaneously', log)
            options_storage.only_assembler = True

        elif opt == "--read-buffer-size":
            options_storage.read_buffer_size = int(arg)
        elif opt == "--bh-heap-check":
            options_storage.bh_heap_check = arg
        elif opt == "--spades-heap-check":
            options_storage.spades_heap_check = arg

        elif opt == "--continue":
            options_storage.continue_mode = True
        elif opt == "--restart-from":
            if arg not in ['ec', 'as', 'mc'] and not arg.startswith('k'):
                support.error("wrong value for --restart-from option: " + arg + " (only 'ec', 'as', 'k<int>', 'mc' are available)", log)
            # --restart-from implies continue mode
            options_storage.continue_mode = True
            options_storage.restart_from = arg

        elif opt == '-t' or opt == "--threads":
            options_storage.threads = int(arg)
        elif opt == '-m' or opt == "--memory":
            options_storage.memory = int(arg)
        elif opt == "--phred-offset":
            if arg == 'auto':
                options_storage.qvoffset = arg
            elif arg in ['33', '64']:
                options_storage.qvoffset = int(arg)
            else:
                support.error('wrong PHRED quality offset value ' + str(arg) + ': should be either 33 or 64', log)
        elif opt == '-i' or opt == "--iterations":
            options_storage.iterations = int(arg)

        elif opt == "--debug":
            options_storage.developer_mode = True
        elif opt == "--debug:false":
            options_storage.developer_mode = False

        #corrector
        elif opt == "--mismatch-correction":
            options_storage.mismatch_corrector = True
        elif opt == "--mismatch-correction:false":
            options_storage.mismatch_corrector = False

        elif opt == "--careful":
            options_storage.mismatch_corrector = True
            options_storage.careful = True
        elif opt == "--careful:false":
            options_storage.mismatch_corrector = False
            options_storage.careful = False

        elif opt == '-h' or opt == "--help":
            options_storage.usage(spades_version)
            sys.exit(0)
        elif opt == "--help-hidden":
            options_storage.usage(spades_version, True)
            sys.exit(0)

        elif opt == "--test":
            options_storage.set_test_options()
            support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
            support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
            #break
        elif opt == "--diploid":
            options_storage.diploid_mode = True
        else:
            raise ValueError


    if not options_storage.output_dir:
        support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log)
    if not os.path.isdir(options_storage.output_dir):
        if options_storage.continue_mode:
            support.error("the output_dir should exist for --continue and for --restart-from!", log)
        os.makedirs(options_storage.output_dir)
    if options_storage.restart_from:
        if options_storage.continue_mode: # saving parameters specified with --restart-from
            if not support.dataset_is_empty(dataset_data):
                support.error("you cannot specify reads with --restart-from option!", log)
            options_storage.save_restart_options(log)
        else:  # overriding previous run parameters
            options_storage.load_restart_options()
    if options_storage.continue_mode:
        return None, None

    if options_storage.dataset_yaml_filename:
        try:
            # Close the handle explicitly. try/finally is used instead of
            # `with` to match the file's conservative syntax (it uses the
            # sys.exc_info() idiom rather than `except ... as`).
            yaml_file = open(options_storage.dataset_yaml_filename, 'r')
            try:
                # NOTE(review): pyyaml.load can construct arbitrary objects;
                # consider a safe loader if dataset files may be untrusted.
                dataset_data = pyyaml.load(yaml_file)
            finally:
                yaml_file.close()
        except pyyaml.YAMLError:
            _, exc, _ = sys.exc_info()
            support.error('exception caught while parsing YAML file (' + options_storage.dataset_yaml_filename + '):\n' + str(exc), log)
        dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename))
    else:
        # Dataset came from command-line options: normalise it and save it as
        # YAML inside the output directory.
        dataset_data = support.correct_dataset(dataset_data)
        dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        yaml_file = open(options_storage.dataset_yaml_filename, 'w')
        try:
            pyyaml.dump(dataset_data, yaml_file)
        finally:
            yaml_file.close()

    support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
    if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
        support.error('you should specify at least one unpaired, paired-end, or high-quality mate-pairs library!', log)

    options_storage.set_default_values()
    ### FILLING cfg
    cfg["common"] = empty_config()
    cfg["dataset"] = empty_config()
    if not options_storage.only_assembler:
        cfg["error_correction"] = empty_config()
    if not options_storage.only_error_correction:
        cfg["assembly"] = empty_config()

    # common
    cfg["common"].__dict__["output_dir"] = options_storage.output_dir
    cfg["common"].__dict__["tmp_dir"] = options_storage.tmp_dir
    cfg["common"].__dict__["max_threads"] = options_storage.threads
    cfg["common"].__dict__["max_memory"] = options_storage.memory
    cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode

    # dataset section
    cfg["dataset"].__dict__["single_cell"] = options_storage.single_cell
    cfg["dataset"].__dict__["iontorrent"] = options_storage.iontorrent
    cfg["dataset"].__dict__["yaml_filename"] = options_storage.dataset_yaml_filename
    if options_storage.developer_mode and options_storage.reference:
        cfg["dataset"].__dict__["reference"] = options_storage.reference

    # error correction
    if (not options_storage.only_assembler) and (options_storage.iterations > 0):
        cfg["error_correction"].__dict__["output_dir"] = os.path.join(cfg["common"].output_dir, "corrected")
        cfg["error_correction"].__dict__["max_iterations"] = options_storage.iterations
        cfg["error_correction"].__dict__["gzip_output"] = not options_storage.disable_gzip_output
        if options_storage.qvoffset:
            cfg["error_correction"].__dict__["qvoffset"] = options_storage.qvoffset
        if options_storage.bh_heap_check:
            cfg["error_correction"].__dict__["heap_check"] = options_storage.bh_heap_check
        cfg["error_correction"].__dict__["iontorrent"] = options_storage.iontorrent

    # assembly
    if not options_storage.only_error_correction:
        if options_storage.k_mers:
            cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
        else:
            cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT
        cfg["assembly"].__dict__["careful"] = options_storage.careful
        cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr
        cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode
        if options_storage.spades_heap_check:
            cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check
        if options_storage.read_buffer_size:
            cfg["assembly"].__dict__["read_buffer_size"] = options_storage.read_buffer_size

    #corrector can work only if contigs exist (not only error correction)
    if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
        cfg["mismatch_corrector"] = empty_config()
        # Keys with a dash are not valid attribute names, hence __dict__.
        cfg["mismatch_corrector"].__dict__["skip-masked"] = None
        cfg["mismatch_corrector"].__dict__["bwa"] = os.path.join(bin_home, "bwa-spades")
        cfg["mismatch_corrector"].__dict__["threads"] = options_storage.threads
        cfg["mismatch_corrector"].__dict__["output-dir"] = options_storage.output_dir

    return cfg, dataset_data
Example #6
0
    def check(self):
        '''
        Check correctness of parameters and normalise them in place.
        Otherwise report an error through support.error.

        In order, this method:
          * defaults `multinom` from the presence of a custom model file;
          * parses comma-separated `pop_labels` and `ns`;
          * validates the input file and the resume/output directories;
          * warns about unset `theta` and `gen_time`;
          * loads the spectrum with support.load_spectrum (sets `input_data`,
            `ns`, `pop_labels`, `number_of_populations`);
          * loads bootstrap data for linked SNPs when `boot_dir` is given;
          * loads `model_func` from the custom model file, if any;
          * parses `fracs` into the three genetic-algorithm fractions;
          * fills in `dadi_pts`;
          * calls put_default_structures() and final_check().
        '''
        # An unset `multinom` is switched on exactly when a custom model file
        # was supplied.
        if self.multinom is None:
            self.multinom = self.model_func_file is not None

        if self.pop_labels is not None:
            self.pop_labels = [x.strip() for x in self.pop_labels.split(',')]
        if self.ns is not None:
            self.ns = support.check_comma_sep_list(self.ns)

        # The required-parameter check must come before the path is used,
        # otherwise a missing value reaches check_file_existence first.
        if self.input_file is None:
            support.error(
                "Parameter `Input file` is required")
        self.input_file = support.check_file_existence(self.input_file)

        if self.resume_dir is not None:
            self.resume_dir = support.check_dir_existence(self.resume_dir)
        if self.resume_dir is not None and self.output_dir is None:
            # Resuming without an explicit output directory: derive one from
            # the resume directory name.
            self.output_dir = support.ensure_dir_existence(
                self.resume_dir + "_resumed", check_emptiness=True)
        elif self.output_dir is None:
            support.error("Parameter `Output directory` is required")
        else:
            self.output_dir = support.ensure_dir_existence(
                self.output_dir, check_emptiness=True)

        if self.theta is None:
            support.warning(
                "`Theta0` is not specified. It would be 1.0.")
        if self.gen_time is None:
            support.warning(
                "`Time for one generation` is not specified. Time will be in genetic units.")

        self.input_data, self.ns, self.pop_labels = support.load_spectrum(
                self.input_file, self.ns, self.pop_labels)
        self.ns = np.array(self.ns)
        self.number_of_populations = len(self.ns)

        # Linked or unlinked data
        if not self.linked_snp and self.boot_dir is not None:
            support.warning(
                    "SNP's are marked as unlinked, so the directory with bootstrap will be ignored.")
        elif self.linked_snp:
            if self.boot_dir is not None:
                self.boot_dir = support.check_dir_existence(self.boot_dir)
                self.boots = gadma.Inference.load_bootstrap_data_from_dir(self.boot_dir, self.ns, self.pop_labels)

        # Custom model
        if self.model_func_file is not None:
            self.model_func_file = support.check_file_existence(self.model_func_file)
            file_with_model_func = imp.load_source('module', self.model_func_file)
            try:
                self.model_func = file_with_model_func.model_func
            except AttributeError:
                # Only a missing attribute means "no such function"; any
                # other exception should not be masked by this message.
                support.error(
                    "File " + self.model_func_file + ' does not contain function named `model_func`.')

        # Parameter identifiers are only parsed when a custom model is used.
        if self.model_func_file is not None:
            if self.p_ids is not None:
                self.p_ids = support.check_comma_sep_list(self.p_ids, is_int=False)

        self.fracs = [float(x) for x in self.fracs.split(",")]
        if len(self.fracs) != 3:
            support.error(
                "length of `Fractions` (Parameters of genetic algorithm) must be 3")
        self.frac_of_old_models = self.fracs[0]
        self.frac_of_mutated_models = self.fracs[1]
        self.frac_of_crossed_models = self.fracs[2]

        if self.moments_scenario and self.dadi_pts is not None:
            support.warning(
                "Moments doesn't use --pts argument, so it would be ignored")
        if self.dadi_pts is None:
            # Default grid: the largest sample size, plus 10 and plus 20.
            max_n = max(self.ns)
            self.dadi_pts = [max_n, max_n + 10, max_n + 20]
        else:
            self.dadi_pts = support.check_comma_sep_list(self.dadi_pts)

        self.put_default_structures()

        self.final_check()
Example #7
0
def fill_cfg(options_to_parse, log):
    """Parse SPAdes command-line options and build the run configuration.

    Options are parsed with ``getopt.gnu_getopt``. Each recognised option is
    stored on the ``options_storage`` module; read-library options are
    collected into the dataset description via ``support.add_to_dataset``.

    Args:
        options_to_parse: argument list handed to ``getopt.gnu_getopt``.
        log: logger forwarded to the ``support`` helpers for error reporting.

    Returns:
        ``(cfg, dataset_data)``. ``cfg`` maps section names ("common",
        "dataset" and, depending on the options, "error_correction",
        "assembly", "mismatch_corrector") to ``empty_config()`` objects whose
        attributes hold the settings. Returns ``(None, None)`` when
        ``--continue`` was given.

    Raises:
        ValueError: for an option that getopt accepted but this function does
            not handle, and from ``int()`` on non-numeric values of ``-k``,
            ``-t``, ``-m`` or ``-i``.

    The usage text is printed and ``sys.exit`` is called on a getopt error,
    on an empty option list, and for the help options. Invalid values are
    reported through ``support.error``.
    """
    try:
        # Positional (non-option) arguments are not examined in this version.
        options, _positional = getopt.gnu_getopt(options_to_parse,
                                                 options_storage.short_options,
                                                 options_storage.long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        sys.stderr.flush()
        options_storage.usage(spades_version)
        sys.exit(1)

    if not options:
        options_storage.usage(spades_version)
        sys.exit(1)

    # all parameters are stored here
    cfg = dict()
    # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER paired-end libs and MAX_LIBS_NUMBER mate-pair libs
    dataset_data = [{} for i in range(options_storage.MAX_LIBS_NUMBER * 2)]

    options_storage.continue_mode = False
    for opt, arg in options:
        if opt == '-o':
            options_storage.output_dir = arg
        elif opt == "--tmp-dir":
            options_storage.tmp_dir = arg
        elif opt == "--reference":
            options_storage.reference = support.check_file_existence(
                arg, 'reference', log)
        elif opt == "--dataset":
            options_storage.dataset_yaml_filename = support.check_file_existence(
                arg, 'dataset', log)

        elif opt in options_storage.reads_options:
            support.add_to_dataset(opt, arg, dataset_data)

        elif opt == '-k':
            options_storage.k_mers = list(map(int, arg.split(",")))
            for k in options_storage.k_mers:
                if k > 127:
                    support.error(
                        'wrong k value ' + str(k) +
                        ': all k values should be less than 128', log)
                if k % 2 == 0:
                    support.error(
                        'wrong k value ' + str(k) +
                        ': all k values should be odd', log)

        elif opt == "--sc":
            options_storage.single_cell = True
        elif opt == "--disable-gzip-output":
            options_storage.disable_gzip_output = True

        elif opt == "--only-error-correction":
            if options_storage.only_assembler:
                support.error(
                    'you cannot specify --only-error-correction and --only-assembler simultaneously',
                    log)
            options_storage.only_error_correction = True
        elif opt == "--only-assembler":
            if options_storage.only_error_correction:
                support.error(
                    'you cannot specify --only-error-correction and --only-assembler simultaneously',
                    log)
            options_storage.only_assembler = True

        elif opt == "--bh-heap-check":
            options_storage.bh_heap_check = arg
        elif opt == "--spades-heap-check":
            options_storage.spades_heap_check = arg

        elif opt == "--continue":
            options_storage.continue_mode = True

        elif opt == '-t' or opt == "--threads":
            options_storage.threads = int(arg)
        elif opt == '-m' or opt == "--memory":
            options_storage.memory = int(arg)
        elif opt == "--phred-offset":
            # A non-numeric value must produce the friendly error below, not
            # a raw ValueError from int().
            try:
                qvoffset = int(arg)
            except ValueError:
                qvoffset = None
            if qvoffset in [33, 64]:
                options_storage.qvoffset = qvoffset
            else:
                support.error(
                    'wrong PHRED quality offset value ' + str(arg) +
                    ': should be either 33 or 64', log)
        elif opt == '-i' or opt == "--iterations":
            options_storage.iterations = int(arg)

        elif opt == "--debug":
            options_storage.developer_mode = True

        elif opt == "--rectangles":
            options_storage.rectangles = True

        #corrector
        elif opt == "--mismatch-correction":
            options_storage.mismatch_corrector = True

        elif opt == "--careful":
            options_storage.mismatch_corrector = True
            options_storage.careful = True

        elif opt == '-h' or opt == "--help":
            options_storage.usage(spades_version)
            sys.exit(0)
        elif opt == "--help-hidden":
            options_storage.usage(spades_version, True)
            sys.exit(0)

        elif opt == "--test":
            options_storage.set_test_options()
            support.add_to_dataset(
                '-1', os.path.join(spades_home,
                                   "test_dataset/ecoli_1K_1.fq.gz"),
                dataset_data)
            support.add_to_dataset(
                '-2', os.path.join(spades_home,
                                   "test_dataset/ecoli_1K_2.fq.gz"),
                dataset_data)
            #break
        else:
            raise ValueError

    if not options_storage.output_dir:
        support.error(
            "the output_dir is not set! It is a mandatory parameter (-o output_dir).",
            log)
    if not os.path.isdir(options_storage.output_dir):
        if options_storage.continue_mode:
            support.error("the output_dir should exist for --continue!", log)
        os.makedirs(options_storage.output_dir)
    if options_storage.continue_mode:
        return None, None

    if options_storage.dataset_yaml_filename:
        try:
            # Close the handle explicitly. try/finally is used instead of
            # `with` to match the file's conservative syntax (it uses the
            # sys.exc_info() idiom rather than `except ... as`).
            yaml_file = open(options_storage.dataset_yaml_filename, 'r')
            try:
                # NOTE(review): pyyaml.load can construct arbitrary objects;
                # consider a safe loader if dataset files may be untrusted.
                dataset_data = pyyaml.load(yaml_file)
            finally:
                yaml_file.close()
        except pyyaml.YAMLError:
            _, exc, _ = sys.exc_info()
            support.error('exception caught while parsing YAML file (' +
                          options_storage.dataset_yaml_filename + '):\n' +
                          str(exc), log)
        dataset_data = support.relative2abs_paths(
            dataset_data,
            os.path.dirname(options_storage.dataset_yaml_filename))
    else:
        # Dataset came from command-line options: normalise it and save it as
        # YAML inside the output directory.
        dataset_data = support.correct_dataset(dataset_data)
        dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
        options_storage.dataset_yaml_filename = os.path.join(
            options_storage.output_dir, "input_dataset.yaml")
        yaml_file = open(options_storage.dataset_yaml_filename, 'w')
        try:
            pyyaml.dump(dataset_data, yaml_file)
        finally:
            yaml_file.close()

    support.check_dataset_reads(dataset_data, options_storage.only_assembler,
                                log)
    if support.dataset_has_only_mate_pairs_libraries(dataset_data):
        support.error(
            'you should specify at least one paired-end or unpaired library (only mate-pairs libraries were found)!',
            log)
    if options_storage.rectangles and (len(dataset_data) > 1):
        support.error(
            'rectangle graph algorithm for repeat resolution cannot work with multiple libraries!',
            log)

    ### FILLING cfg
    cfg["common"] = empty_config()
    cfg["dataset"] = empty_config()
    if not options_storage.only_assembler:
        cfg["error_correction"] = empty_config()
    if not options_storage.only_error_correction:
        cfg["assembly"] = empty_config()

    # common
    cfg["common"].__dict__["output_dir"] = os.path.abspath(
        options_storage.output_dir)
    cfg["common"].__dict__["max_threads"] = options_storage.threads
    cfg["common"].__dict__["max_memory"] = options_storage.memory
    cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode

    # dataset section
    cfg["dataset"].__dict__["single_cell"] = options_storage.single_cell
    cfg["dataset"].__dict__["yaml_filename"] = os.path.abspath(
        options_storage.dataset_yaml_filename)
    if options_storage.developer_mode and options_storage.reference:
        cfg["dataset"].__dict__["reference"] = options_storage.reference

    # error correction
    if (not options_storage.only_assembler) and (options_storage.iterations >
                                                 0):
        cfg["error_correction"].__dict__["output_dir"] = os.path.join(
            cfg["common"].output_dir, "corrected")
        cfg["error_correction"].__dict__[
            "max_iterations"] = options_storage.iterations
        cfg["error_correction"].__dict__[
            "gzip_output"] = not options_storage.disable_gzip_output
        if options_storage.qvoffset:
            cfg["error_correction"].__dict__[
                "qvoffset"] = options_storage.qvoffset
        if options_storage.bh_heap_check:
            cfg["error_correction"].__dict__[
                "heap_check"] = options_storage.bh_heap_check
        # The temporary directory is a 'tmp' subdirectory of --tmp-dir when
        # given, otherwise of the error-correction output directory.
        if options_storage.tmp_dir:
            cfg["error_correction"].__dict__[
                "tmp_dir"] = options_storage.tmp_dir
        else:
            cfg["error_correction"].__dict__["tmp_dir"] = cfg[
                "error_correction"].output_dir
        cfg["error_correction"].tmp_dir = os.path.join(
            os.path.abspath(cfg["error_correction"].tmp_dir), 'tmp')

    # assembly
    if not options_storage.only_error_correction:
        if options_storage.k_mers:
            cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
        else:
            cfg["assembly"].__dict__[
                "iterative_K"] = options_storage.k_mers_short
        cfg["assembly"].__dict__["careful"] = options_storage.careful
        if options_storage.spades_heap_check:
            cfg["assembly"].__dict__[
                "heap_check"] = options_storage.spades_heap_check

    #corrector can work only if contigs exist (not only error correction)
    if (not options_storage.only_error_correction
        ) and options_storage.mismatch_corrector:
        cfg["mismatch_corrector"] = empty_config()
        # Keys with a dash are not valid attribute names, hence __dict__.
        cfg["mismatch_corrector"].__dict__["skip-masked"] = ""
        cfg["mismatch_corrector"].__dict__["bwa"] = os.path.join(
            bin_home, "bwa-spades")
        cfg["mismatch_corrector"].__dict__["threads"] = options_storage.threads
        cfg["mismatch_corrector"].__dict__[
            "output-dir"] = options_storage.output_dir

    return cfg, dataset_data
Example #8
0
# Parse the command line with GNU-style getopt. On a parse error the message
# and the usage text are printed and the script exits with status 1.
# NOTE: this snippet is Python 2 only (`except X, err` and print statements).
try:
    options, datasets = getopt.gnu_getopt(sys.argv[1:], short_options,
                                          long_options)
except getopt.GetoptError, err:
    print str(err)
    print ""
    usage()
    sys.exit(1)

# Apply each recognised option to the corresponding script-level variable.
for opt, arg in options:
    if opt in ('-o', "--output-dir"):
        output_dir = arg
        # An explicit output directory switches off make_latest_symlink.
        make_latest_symlink = False
    elif opt in ('-r', "--reference"):
        # The return value of the check is ignored; the raw argument is stored.
        support.check_file_existence(arg, "reference")
        reference = arg
    elif opt in ('-t', "--thread-num"):
        # Thread count is clamped to at least 1.
        thread_num = int(arg)
        if thread_num < 1:
            thread_num = 1
    # For the three numeric options below a non-positive value is silently
    # ignored, leaving the variable as it was before this loop.
    elif opt in ('-b', "--bin-size"):
        if int(arg) > 0:
            bin_size = int(arg)
    elif opt in ('-k', "--kmer-size"):
        if int(arg) > 0:
            kmer = int(arg)
    elif opt in ('-x', "--max-is"):
        if int(arg) > 0:
            max_is = int(arg)
    # NOTE(review): the body of this branch is not part of the visible snippet.
    elif opt in ('-s', "--skip-trimming"):
Example #9
0
def fill_cfg(options_to_parse, log):
    """Parse command-line options and build the pipeline configuration.

    Parsed values are stored as attributes of the ``options_storage`` module
    (a side effect that outlives this call). The output directory is created
    when it does not exist, and when no ``--dataset`` file was given the
    dataset assembled from the reads options is dumped to
    ``input_dataset.yaml`` inside the output directory.

    Args:
        options_to_parse: argument list handed to ``getopt.gnu_getopt``.
            Only non-option items after the first one are reported as stray
            paths (presumably the first is the script name -- TODO confirm
            against the caller).
        log: logger forwarded to the ``support`` helpers for error reporting.

    Returns:
        ``(cfg, dataset_data)``. ``cfg`` is a dict of config objects keyed by
        section name: ``"common"`` and ``"dataset"`` always,
        ``"error_correction"`` unless ``--only-assembler``, ``"assembly"``
        unless ``--only-error-correction``, and ``"mismatch_corrector"`` when
        mismatch correction is enabled and assembly is performed.
        ``(None, None)`` is returned when ``--continue`` or
        ``--restart-from`` was given.

    Raises:
        SystemExit: after printing usage (bad options, no options, help).
            ``support.error`` presumably terminates the process as well --
            verify in ``support``.
        ValueError: if getopt returns an option no branch here handles, or
            if a numeric option value cannot be converted with ``int``.
    """
    try:
        options, not_options = getopt.gnu_getopt(options_to_parse,
                                                 options_storage.short_options,
                                                 options_storage.long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        sys.stderr.flush()
        options_storage.usage(spades_version)
        sys.exit(1)

    if not options:
        options_storage.usage(spades_version)
        sys.exit(1)

    if len(not_options) > 1:
        # A "-k 21, 33" style list makes getopt treat the tail as stray
        # positional arguments; give a targeted hint for that case first.
        for opt, arg in options:
            if opt == "-k" and arg.strip().endswith(','):
                support.error(
                    "Do not put spaces after commas in the list of k-mers sizes! Correct example: -k 21,33,55",
                    log)
        support.error(
            "Please specify option (e.g. -1, -2, -s, etc) for the following paths: "
            + ", ".join(not_options[1:]) + "\n", log)

    # all parameters are stored here
    cfg = dict()
    # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER for each type of short-reads libs
    dataset_data = [
        {} for _ in range(options_storage.MAX_LIBS_NUMBER *
                          len(options_storage.SHORT_READS_TYPES.keys()) +
                          len(options_storage.LONG_READS_TYPES))
    ]  # "[{}]*num" doesn't work here (it would repeat one shared dict)!

    # for parsing options from "previous run command"
    options_storage.continue_mode = False
    options_storage.k_mers = None

    exclusive_stages_msg = 'you cannot specify --only-error-correction and --only-assembler simultaneously'

    for opt, arg in options:
        if opt == '-o':
            options_storage.output_dir = os.path.abspath(arg)
        elif opt == "--tmp-dir":
            options_storage.tmp_dir = os.path.abspath(arg)
        elif opt == "--configs-dir":
            options_storage.configs_dir = support.check_dir_existence(arg)
        elif opt == "--reference":
            options_storage.reference = support.check_file_existence(
                arg, 'reference', log)
        elif opt == "--dataset":
            options_storage.dataset_yaml_filename = support.check_file_existence(
                arg, 'dataset', log)

        elif opt in options_storage.reads_options:
            support.add_to_dataset(opt, arg, dataset_data)

        elif opt == '-k':
            if arg == 'auto':
                options_storage.k_mers = arg
            else:
                options_storage.k_mers = list(map(int, arg.split(",")))
                for k in options_storage.k_mers:
                    if k < options_storage.MIN_K or k > options_storage.MAX_K:
                        support.error(
                            'wrong k value ' + str(k) +
                            ': all k values should be between %d and %d' %
                            (options_storage.MIN_K, options_storage.MAX_K),
                            log)
                    if k % 2 == 0:
                        support.error(
                            'wrong k value ' + str(k) +
                            ': all k values should be odd', log)

        elif opt == "--sc":
            options_storage.single_cell = True
        elif opt == "--iontorrent":
            options_storage.iontorrent = True
        elif opt == "--disable-gzip-output":
            options_storage.disable_gzip_output = True
        elif opt == "--disable-gzip-output:false":
            options_storage.disable_gzip_output = False
        elif opt == "--disable-rr":
            options_storage.disable_rr = True
        elif opt == "--disable-rr:false":
            options_storage.disable_rr = False

        elif opt == "--only-error-correction":
            if options_storage.only_assembler:
                support.error(exclusive_stages_msg, log)
            options_storage.only_error_correction = True
        elif opt == "--only-assembler":
            if options_storage.only_error_correction:
                support.error(exclusive_stages_msg, log)
            options_storage.only_assembler = True

        elif opt == "--read-buffer-size":
            options_storage.read_buffer_size = int(arg)
        elif opt == "--bh-heap-check":
            options_storage.bh_heap_check = arg
        elif opt == "--spades-heap-check":
            options_storage.spades_heap_check = arg

        elif opt == "--continue":
            options_storage.continue_mode = True
        elif opt == "--restart-from":
            if arg not in ['ec', 'as', 'mc'] and not arg.startswith('k'):
                support.error(
                    "wrong value for --restart-from option: " + arg +
                    " (should be 'ec', 'as', 'k<int>', or 'mc')", log)
            options_storage.continue_mode = True
            options_storage.restart_from = arg

        elif opt in ('-t', "--threads"):
            options_storage.threads = int(arg)
        elif opt in ('-m', "--memory"):
            options_storage.memory = int(arg)
        elif opt == "--phred-offset":
            if arg == 'auto':
                options_storage.qvoffset = arg
            elif arg in ['33', '64']:
                options_storage.qvoffset = int(arg)
            else:
                support.error(
                    'wrong PHRED quality offset value: ' + arg +
                    ' (should be either 33, 64, or \'auto\')', log)
        elif opt == "--cov-cutoff":
            if arg == 'auto' or arg == 'off':
                options_storage.cov_cutoff = arg
            elif support.is_float(arg) and float(arg) > 0.0:
                options_storage.cov_cutoff = float(arg)
            else:
                support.error(
                    'wrong value for --cov-cutoff option: ' + arg +
                    ' (should be a positive float number, or \'auto\', or \'off\')',
                    log)
        elif opt in ('-i', "--iterations"):
            options_storage.iterations = int(arg)

        elif opt == "--debug":
            options_storage.developer_mode = True
        elif opt == "--debug:false":
            options_storage.developer_mode = False

        # corrector
        elif opt == "--mismatch-correction":
            options_storage.mismatch_corrector = True
        elif opt == "--mismatch-correction:false":
            options_storage.mismatch_corrector = False

        elif opt == "--careful":
            # --careful also switches the mismatch corrector on
            options_storage.mismatch_corrector = True
            options_storage.careful = True
        elif opt == "--careful:false":
            options_storage.mismatch_corrector = False
            options_storage.careful = False

        elif opt in ('-h', "--help"):
            options_storage.usage(spades_version)
            sys.exit(0)
        elif opt == "--help-hidden":
            options_storage.usage(spades_version, True)
            sys.exit(0)

        elif opt == "--test":
            options_storage.set_test_options()
            support.add_to_dataset(
                '-1', os.path.join(spades_home,
                                   "test_dataset/ecoli_1K_1.fq.gz"),
                dataset_data)
            support.add_to_dataset(
                '-2', os.path.join(spades_home,
                                   "test_dataset/ecoli_1K_2.fq.gz"),
                dataset_data)
        elif opt == "--diploid":
            options_storage.diploid_mode = True
        else:
            # getopt accepted the option but no branch above handles it
            raise ValueError("unhandled option: " + str(opt))

    if not options_storage.output_dir:
        support.error(
            "the output_dir is not set! It is a mandatory parameter (-o output_dir).",
            log)
    if not os.path.isdir(options_storage.output_dir):
        if options_storage.continue_mode:
            support.error(
                "the output_dir should exist for --continue and for --restart-from!",
                log)
        os.makedirs(options_storage.output_dir)
    if options_storage.restart_from:
        if options_storage.continue_mode:  # saving parameters specified with --restart-from
            if not support.dataset_is_empty(dataset_data):
                support.error(
                    "you cannot specify reads with --restart-from option!",
                    log)
            options_storage.save_restart_options(log)
        else:  # overriding previous run parameters
            options_storage.load_restart_options()
    if options_storage.continue_mode:
        return None, None

    if options_storage.dataset_yaml_filename:
        try:
            # NOTE(review): the YAML file is user supplied; if the bundled
            # pyyaml behaves like PyYAML, a safe loader would be preferable
            # here -- confirm before changing.
            with open(options_storage.dataset_yaml_filename, 'r') as yaml_in:
                dataset_data = pyyaml.load(yaml_in)
        except pyyaml.YAMLError:
            _, exc, _ = sys.exc_info()
            support.error('exception caught while parsing YAML file (' +
                          options_storage.dataset_yaml_filename + '):\n' +
                          str(exc), log)
        dataset_data = support.relative2abs_paths(
            dataset_data,
            os.path.dirname(options_storage.dataset_yaml_filename))
    else:
        dataset_data = support.correct_dataset(dataset_data)
        dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
        options_storage.dataset_yaml_filename = os.path.join(
            options_storage.output_dir, "input_dataset.yaml")
        # close the file explicitly so the dump is flushed before it is reused
        with open(options_storage.dataset_yaml_filename, 'w') as yaml_out:
            pyyaml.dump(dataset_data, yaml_out)

    support.check_dataset_reads(dataset_data, options_storage.only_assembler,
                                log)
    if not support.get_lib_ids_by_type(
            dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
        support.error(
            'you should specify at least one unpaired, paired-end, or high-quality mate-pairs library!',
            log)

    options_storage.set_default_values()
    ### FILLING cfg
    cfg["common"] = empty_config()
    cfg["dataset"] = empty_config()
    if not options_storage.only_assembler:
        cfg["error_correction"] = empty_config()
    if not options_storage.only_error_correction:
        cfg["assembly"] = empty_config()

    # common
    cfg["common"].__dict__["output_dir"] = options_storage.output_dir
    cfg["common"].__dict__["tmp_dir"] = options_storage.tmp_dir
    cfg["common"].__dict__["max_threads"] = options_storage.threads
    cfg["common"].__dict__["max_memory"] = options_storage.memory
    cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode

    # dataset section
    cfg["dataset"].__dict__["single_cell"] = options_storage.single_cell
    cfg["dataset"].__dict__["iontorrent"] = options_storage.iontorrent
    cfg["dataset"].__dict__[
        "yaml_filename"] = options_storage.dataset_yaml_filename
    if options_storage.developer_mode and options_storage.reference:
        cfg["dataset"].__dict__["reference"] = options_storage.reference

    # error correction (section stays empty when iterations is 0)
    if (not options_storage.only_assembler) and (options_storage.iterations >
                                                 0):
        cfg["error_correction"].__dict__["output_dir"] = os.path.join(
            cfg["common"].output_dir, "corrected")
        cfg["error_correction"].__dict__[
            "max_iterations"] = options_storage.iterations
        cfg["error_correction"].__dict__[
            "gzip_output"] = not options_storage.disable_gzip_output
        if options_storage.qvoffset:
            cfg["error_correction"].__dict__[
                "qvoffset"] = options_storage.qvoffset
        if options_storage.bh_heap_check:
            cfg["error_correction"].__dict__[
                "heap_check"] = options_storage.bh_heap_check
        cfg["error_correction"].__dict__[
            "iontorrent"] = options_storage.iontorrent

    # assembly
    if not options_storage.only_error_correction:
        if options_storage.k_mers:
            cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
        else:
            cfg["assembly"].__dict__[
                "iterative_K"] = options_storage.K_MERS_SHORT
        cfg["assembly"].__dict__["careful"] = options_storage.careful
        cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr
        cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode
        cfg["assembly"].__dict__["cov_cutoff"] = options_storage.cov_cutoff
        if options_storage.spades_heap_check:
            cfg["assembly"].__dict__[
                "heap_check"] = options_storage.spades_heap_check
        if options_storage.read_buffer_size:
            cfg["assembly"].__dict__[
                "read_buffer_size"] = options_storage.read_buffer_size

    # corrector can work only if contigs exist (not only error correction)
    if (not options_storage.only_error_correction
        ) and options_storage.mismatch_corrector:
        cfg["mismatch_corrector"] = empty_config()
        # keys with dashes are not valid attribute names, hence __dict__
        cfg["mismatch_corrector"].__dict__["skip-masked"] = None
        cfg["mismatch_corrector"].__dict__["bwa"] = os.path.join(
            bin_home, "bwa-spades")
        cfg["mismatch_corrector"].__dict__["threads"] = options_storage.threads
        cfg["mismatch_corrector"].__dict__[
            "output-dir"] = options_storage.output_dir

    return cfg, dataset_data
Example #10
0
    log.addHandler(console)

    check_binaries(bin_home, log)

    # all parameters are stored here
    cfg = dict()
    # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER paired-end libs and MAX_LIBS_NUMBER mate-pair libs
    dataset_data = [{} for i in range(options_storage.MAX_LIBS_NUMBER * 2)]

    for opt, arg in options:
        if opt == "-o":
            options_storage.output_dir = arg
        elif opt == "--tmp-dir":
            options_storage.tmp_dir = arg
        elif opt == "--reference":
            options_storage.reference = support.check_file_existence(arg, "reference", log)
        elif opt == "--dataset":
            options_storage.dataset_yaml_filename = support.check_file_existence(arg, "dataset", log)

        elif opt in options_storage.reads_options:
            support.add_to_dataset(opt, arg, dataset_data)

        elif opt == "-k":
            options_storage.k_mers = map(int, arg.split(","))
            for k in options_storage.k_mers:
                if k > 127:
                    support.error("wrong k value " + str(k) + ": all k values should be less than 128", log)
                if k % 2 == 0:
                    support.error("wrong k value " + str(k) + ": all k values should be odd", log)

        elif opt == "--sc":
Example #11
0
def fill_cfg(options_to_parse, log):
    """Parse command-line options and build the pipeline configuration.

    Parsed values are stored as attributes of the ``options_storage`` module
    (a side effect that outlives this call). The output directory is created
    when it does not exist, and when no ``--dataset`` file was given the
    dataset assembled from the reads options is dumped to
    ``input_dataset.yaml`` inside the output directory.

    Args:
        options_to_parse: argument list handed to ``getopt.gnu_getopt``.
        log: logger forwarded to the ``support`` helpers for error reporting.

    Returns:
        ``(cfg, dataset_data)``. ``cfg`` is a dict of config objects keyed by
        section name: ``"common"`` and ``"dataset"`` always,
        ``"error_correction"`` unless ``--only-assembler``, ``"assembly"``
        unless ``--only-error-correction``, and ``"mismatch_corrector"`` when
        mismatch correction is enabled and assembly is performed.
        ``(None, None)`` is returned when ``--continue`` was given.

    Raises:
        SystemExit: after printing usage (bad options, no options, help).
            ``support.error`` presumably terminates the process as well --
            verify in ``support``.
        ValueError: if getopt returns an option no branch here handles, or
            if ``-k``/``-t``/``-m``/``-i`` values cannot be converted with
            ``int``.
    """
    try:
        options, _ = getopt.gnu_getopt(options_to_parse, options_storage.short_options, options_storage.long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        sys.stderr.flush()
        options_storage.usage(spades_version)
        sys.exit(1)

    if not options:
        options_storage.usage(spades_version)
        sys.exit(1)

    # all parameters are stored here
    cfg = dict()
    # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER paired-end libs and MAX_LIBS_NUMBER mate-pair libs
    dataset_data = [{} for _ in range(options_storage.MAX_LIBS_NUMBER * 2)]

    exclusive_stages_msg = 'you cannot specify --only-error-correction and --only-assembler simultaneously'

    options_storage.continue_mode = False
    for opt, arg in options:
        if opt == '-o':
            options_storage.output_dir = arg
        elif opt == "--tmp-dir":
            options_storage.tmp_dir = arg
        elif opt == "--reference":
            options_storage.reference = support.check_file_existence(arg, 'reference', log)
        elif opt == "--dataset":
            options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log)

        elif opt in options_storage.reads_options:
            support.add_to_dataset(opt, arg, dataset_data)

        elif opt == '-k':
            options_storage.k_mers = list(map(int, arg.split(",")))
            for k in options_storage.k_mers:
                if k > 127:
                    support.error('wrong k value ' + str(k) + ': all k values should be less than 128', log)
                if k % 2 == 0:
                    support.error('wrong k value ' + str(k) + ': all k values should be odd', log)

        elif opt == "--sc":
            options_storage.single_cell = True
        elif opt == "--disable-gzip-output":
            options_storage.disable_gzip_output = True

        elif opt == "--only-error-correction":
            if options_storage.only_assembler:
                support.error(exclusive_stages_msg, log)
            options_storage.only_error_correction = True
        elif opt == "--only-assembler":
            if options_storage.only_error_correction:
                support.error(exclusive_stages_msg, log)
            options_storage.only_assembler = True

        elif opt == "--bh-heap-check":
            options_storage.bh_heap_check = arg
        elif opt == "--spades-heap-check":
            options_storage.spades_heap_check = arg

        elif opt == "--continue":
            options_storage.continue_mode = True

        elif opt in ('-t', "--threads"):
            options_storage.threads = int(arg)
        elif opt in ('-m', "--memory"):
            options_storage.memory = int(arg)
        elif opt == "--phred-offset":
            # A non-numeric value must reach the error message below rather
            # than escape as an uncaught ValueError from int().
            try:
                offset = int(arg)
            except ValueError:
                offset = None
            if offset in (33, 64):
                options_storage.qvoffset = offset
            else:
                support.error('wrong PHRED quality offset value ' + str(arg) + ': should be either 33 or 64', log)
        elif opt in ('-i', "--iterations"):
            options_storage.iterations = int(arg)

        elif opt == "--debug":
            options_storage.developer_mode = True

        elif opt == "--rectangles":
            options_storage.rectangles = True

        # corrector
        elif opt == "--mismatch-correction":
            options_storage.mismatch_corrector = True

        elif opt == "--careful":
            # --careful also switches the mismatch corrector on
            options_storage.mismatch_corrector = True
            options_storage.careful = True

        elif opt in ('-h', "--help"):
            options_storage.usage(spades_version)
            sys.exit(0)
        elif opt == "--help-hidden":
            options_storage.usage(spades_version, True)
            sys.exit(0)

        elif opt == "--test":
            options_storage.set_test_options()
            support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
            support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
        else:
            # getopt accepted the option but no branch above handles it
            raise ValueError("unhandled option: " + str(opt))

    if not options_storage.output_dir:
        support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log)
    if not os.path.isdir(options_storage.output_dir):
        if options_storage.continue_mode:
            support.error("the output_dir should exist for --continue!", log)
        os.makedirs(options_storage.output_dir)
    if options_storage.continue_mode:
        return None, None

    if options_storage.dataset_yaml_filename:
        try:
            # NOTE(review): the YAML file is user supplied; if the bundled
            # pyyaml behaves like PyYAML, a safe loader would be preferable
            # here -- confirm before changing.
            with open(options_storage.dataset_yaml_filename, 'r') as yaml_in:
                dataset_data = pyyaml.load(yaml_in)
        except pyyaml.YAMLError:
            _, exc, _ = sys.exc_info()
            support.error('exception caught while parsing YAML file (' + options_storage.dataset_yaml_filename + '):\n' + str(exc), log)
        dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename))
    else:
        dataset_data = support.correct_dataset(dataset_data)
        dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        # close the file explicitly so the dump is flushed before it is reused
        with open(options_storage.dataset_yaml_filename, 'w') as yaml_out:
            pyyaml.dump(dataset_data, yaml_out)

    support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
    if support.dataset_has_only_mate_pairs_libraries(dataset_data):
        support.error('you should specify at least one paired-end or unpaired library (only mate-pairs libraries were found)!', log)
    if options_storage.rectangles and (len(dataset_data) > 1):
        support.error('rectangle graph algorithm for repeat resolution cannot work with multiple libraries!', log)

    ### FILLING cfg
    cfg["common"] = empty_config()
    cfg["dataset"] = empty_config()
    if not options_storage.only_assembler:
        cfg["error_correction"] = empty_config()
    if not options_storage.only_error_correction:
        cfg["assembly"] = empty_config()

    # common
    cfg["common"].__dict__["output_dir"] = os.path.abspath(options_storage.output_dir)
    cfg["common"].__dict__["max_threads"] = options_storage.threads
    cfg["common"].__dict__["max_memory"] = options_storage.memory
    cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode

    # dataset section
    cfg["dataset"].__dict__["single_cell"] = options_storage.single_cell
    cfg["dataset"].__dict__["yaml_filename"] = os.path.abspath(options_storage.dataset_yaml_filename)
    if options_storage.developer_mode and options_storage.reference:
        cfg["dataset"].__dict__["reference"] = options_storage.reference

    # error correction (section stays empty when iterations is 0)
    if (not options_storage.only_assembler) and (options_storage.iterations > 0):
        cfg["error_correction"].__dict__["output_dir"] = os.path.join(cfg["common"].output_dir, "corrected")
        cfg["error_correction"].__dict__["max_iterations"] = options_storage.iterations
        cfg["error_correction"].__dict__["gzip_output"] = not options_storage.disable_gzip_output
        if options_storage.qvoffset:
            cfg["error_correction"].__dict__["qvoffset"] = options_storage.qvoffset
        if options_storage.bh_heap_check:
            cfg["error_correction"].__dict__["heap_check"] = options_storage.bh_heap_check
        # temporary files go to a "tmp" subdirectory of --tmp-dir, or of the
        # error-correction output directory when --tmp-dir was not given
        if options_storage.tmp_dir:
            cfg["error_correction"].__dict__["tmp_dir"] = options_storage.tmp_dir
        else:
            cfg["error_correction"].__dict__["tmp_dir"] = cfg["error_correction"].output_dir
        cfg["error_correction"].tmp_dir = os.path.join(os.path.abspath(cfg["error_correction"].tmp_dir), 'tmp')

    # assembly
    if not options_storage.only_error_correction:
        if options_storage.k_mers:
            cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
        else:
            cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers_short
        cfg["assembly"].__dict__["careful"] = options_storage.careful
        if options_storage.spades_heap_check:
            cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check

    # corrector can work only if contigs exist (not only error correction)
    if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
        cfg["mismatch_corrector"] = empty_config()
        # keys with dashes are not valid attribute names, hence __dict__
        cfg["mismatch_corrector"].__dict__["skip-masked"] = ""
        cfg["mismatch_corrector"].__dict__["bwa"] = os.path.join(bin_home, "bwa-spades")
        cfg["mismatch_corrector"].__dict__["threads"] = options_storage.threads
        cfg["mismatch_corrector"].__dict__["output-dir"] = options_storage.output_dir

    return cfg, dataset_data