def main(argv):
    """Parse command-line options, prepare the barcode dataset, and launch.

    The dataset is either resumed from a previous run, generated from raw
    input directories, or read from an explicitly given dataset file.
    """
    opts = launch_options.Options(argv, bin_home, truspades_home)
    support.ensure_dir_existence(opts.output_dir)
    logger = create_log(opts)
    dataset_path = os.path.join(opts.output_dir, "dataset.info")
    if opts.continue_launch:
        # Resuming: reuse the dataset description written by the previous run.
        dataset = barcode_extraction.ReadDataset(dataset_path, logger)
    elif opts.input_dirs is not None:
        dataset = generate_dataset(opts.input_dirs, logger)
        if dataset is None:
            logger.info("Error: could not parse dataset from input directories\n")
            sys.exit(1)
        barcode_extraction.print_dataset(dataset, dataset_path, logger)
        logger.info("Dataset generated. See result in " + dataset_path)
    else:
        dataset = barcode_extraction.ReadDataset(opts.dataset_file, logger)
        barcode_extraction.print_dataset(dataset, dataset_path, logger)
    logs_dir = os.path.join(opts.output_dir, "logs")
    support.ensure_dir_existence(logs_dir)
    # if opts.print_commands:
    #     verify_exists(opts.output_dir)
    #     print_commands(commands, opts)
    if opts.mode == "run_truspades":
        RunTruSPAdes(dataset, logs_dir, opts, logger)
    elif opts.mode == "construct_subreferences":
        # NOTE(review): log=None makes the helper create its own logger —
        # confirm this is intended rather than passing `logger` through.
        reference_construction.ConstructSubreferences(
            dataset, opts.reference, opts.output_dir,
            opts.index, opts.threads, log=None)
    logger.info("TruSPAdes launch successfully finished")
    if opts.test:
        CheckTestSuccess(opts, logger)
def ConstructSubreferences(datasets, reference_file, output_dir, index=None, threads=1, log=None):
    """Align barcode read sets to a reference and build per-barcode subreferences.

    Parameters:
        datasets: barcode datasets to align.
        reference_file: path to the reference FASTA.
        output_dir: directory for the index, alignments, and results.
        index: optional pre-built bwa index prefix; built here when None.
        threads: number of alignment threads.
        log: logger; a stderr logger is created when None.
    """
    bwa_command = "bin/spades-bwa"
    if log is None:  # fixed: was `log == None`; identity check is the correct idiom
        log = logging.getLogger('reference_construction')
        log.setLevel(logging.INFO)
        console = logging.StreamHandler(sys.stderr)
        console.setFormatter(logging.Formatter('%(message)s'))
        console.setLevel(logging.INFO)
        log.addHandler(console)
    support.ensure_dir_existence(output_dir)
    # NOTE(review): likely redundant after ensure_dir_existence — confirm that
    # helper creates the directory before removing this fallback.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if index is None:  # fixed: was `index == None`
        log.info("Constructing index\n")
        index = alignment.index_bwa(bwa_command, log, reference_file,
                                    os.path.join(output_dir, "bwa_index"), "bwtsw")
    sam_dir = os.path.join(output_dir, "alignments")
    log.info("Aligning barcodes\n")
    sam_files = AlignToReference(datasets, sam_dir, bwa_command, log, index, threads)
    subreference_dir = os.path.join(output_dir, "subreferences")
    filtered_dir = os.path.join(output_dir, "filtered")
    support.recreate_dir(subreference_dir)
    support.recreate_dir(filtered_dir)
    log.info("Constructing subreferences")
    # Each entry becomes (barcode_id, (filtered, subreference)).
    subreferences_list = [(barcode_id, ConstructSubreferenceFromSam(barcode_sam))
                          for barcode_id, barcode_sam in sam_files]
    log.info("Reading reference")
    reference = ReadReference(reference_file)
    log.info("Printing output")
    PrintAll([(barcode, filtered) for barcode, (filtered, subreference) in subreferences_list],
             reference, filtered_dir)
    PrintAll([(barcode, subreference) for barcode, (filtered, subreference) in subreferences_list],
             reference, subreference_dir)
    log.info("Subreference construction finished. See results in " + output_dir)
def RunTruQuast(input_dir, reference_dir, output_dir, threads):
    """Run QUAST per barcode and write an aggregated results.tsv.

    Barcodes come either from input_dir/dataset.info (assembly layout) or
    from loose .fasta/.fa files directly in input_dir.
    """
    support.ensure_dir_existence(output_dir)
    dataset_info = os.path.join(input_dir, "dataset.info")
    if os.path.exists(dataset_info):
        ids = [barcode.id for barcode in barcode_extraction.ReadDataset(dataset_info)]
        files = [os.path.join(input_dir, "barcodes", bid, "truseq_long_reads.fasta")
                 for bid in ids]
    else:
        # BUG FIX: the ids comprehension filtered on the name `file` (leaked
        # from the previous comprehension / undefined on Python 3) instead of
        # its own loop variable. List the directory once and derive both.
        fasta_names = [f for f in os.listdir(input_dir)
                       if f.endswith((".fasta", ".fa"))]
        files = [os.path.join(input_dir, f) for f in fasta_names]
        ids = [f[:f.rfind(".")] for f in fasta_names]
    barcode_quast_dir = os.path.join(output_dir, "barcode_quast")
    RunBarcodeQuast(zip(ids, files), barcode_quast_dir, reference_dir, threads)
    names, reports = ParseResults(barcode_quast_dir, ids)
    names.append("#partially unaligned")
    values = CollectResults(names, reports)
    # `with` guarantees the report file is closed (was left open before).
    with open(os.path.join(output_dir, "results.tsv"), "w") as results:
        for name in names:
            results.write(name + "\t" + str(int(values[name])) + "\t" +
                          str(values[name] / len(ids)) + "\n")
def main(argv):
    """Entry point: set up options and dataset, then run the requested mode."""
    opts = launch_options.Options(argv, spades_home, truspades_home, spades_version)
    support.ensure_dir_existence(opts.output_dir)
    if opts.test and not opts.continue_launch:
        # A fresh test run starts from an empty output directory.
        support.recreate_dir(opts.output_dir)
    logger = create_log(opts)
    dataset_path = os.path.join(opts.output_dir, "dataset.info")
    if opts.continue_launch:
        dataset = barcode_extraction.ReadDataset(dataset_path, logger)
    elif opts.input_dirs is not None:
        dataset = generate_dataset(opts.input_dirs, logger)
        if dataset is None:
            logger.info("Error: could not parse dataset from input directories\n")
            sys.exit(1)
        barcode_extraction.print_dataset(dataset, dataset_path, logger)
        logger.info("Dataset generated. See result in " + dataset_path)
    else:
        dataset = barcode_extraction.ReadDataset(opts.dataset_file, logger)
        barcode_extraction.print_dataset(dataset, dataset_path, logger)
    logs_dir = os.path.join(opts.output_dir, "logs")
    support.ensure_dir_existence(logs_dir)
    # if opts.print_commands:
    #     verify_exists(opts.output_dir)
    #     print_commands(commands, opts)
    if opts.mode == "run_truspades":
        RunTruSPAdes(dataset, logs_dir, opts, logger)
    elif opts.mode == "construct_subreferences":
        # NOTE(review): log=None makes the helper build its own logger —
        # confirm this is intended rather than forwarding `logger`.
        reference_construction.ConstructSubreferences(
            dataset, opts.reference, opts.output_dir,
            opts.index, opts.threads, log=None)
    logger.info("TruSPAdes launch successfully finished")
    if opts.test:
        CheckTestSuccess(opts, logger)
def RunTruSPAdes(dataset, log_dir, options, log):
    """Assemble every barcode in parallel, collect contigs, optionally clean up."""
    log.info("Launching truSPAdes assembly in " + str(options.threads) + " threads")
    log.info("You can find logs for separate barcodes in " + log_dir)
    barcodes_dir = os.path.join(options.output_dir, "barcodes")
    support.ensure_dir_existence(barcodes_dir)
    # One (barcode_id, command) pair per barcode.
    per_barcode_commands = []
    for barcode in dataset:
        cmd = command_line(barcode, barcodes_dir,
                           options.spades_options, options.continue_launch)
        per_barcode_commands.append((barcode.id, cmd))
    launcher_task = parallel_launcher.ExternalCallTask(
        os.path.join(log_dir, "{0}.log"), "", log.name)
    failed = parallel_launcher.run_in_parallel(
        launcher_task, per_barcode_commands, options.threads)
    if failed != 0:
        log.info(str(failed) + " barcodes failed to assemble")
    check_results(dataset, barcodes_dir, log)
    output_base = os.path.join(options.output_dir, "TSLR")
    for fmt in ("fasta", "fastq"):
        collect_contigs(dataset, barcodes_dir, output_base, fmt)
    log.info("Assembled virtual long TruSeq reads can be found in " +
             os.path.join(options.output_dir, "TSLR.fasta"))
    if options.clean:
        # Keep compressed per-barcode contigs before removing work directories.
        for fmt in ("fasta", "fastq"):
            SaveContigs(barcodes_dir, dataset, fmt)
        for barcode in dataset:
            shutil.rmtree(os.path.join(barcodes_dir, barcode.id))
def ConstructSubreferences(datasets, reference_file, output_dir, index=None, threads=1, log=None):
    """Align barcode read sets to a reference and build per-barcode subreferences.

    Parameters:
        datasets: barcode datasets to align.
        reference_file: path to the reference FASTA.
        output_dir: directory for the index, alignments, and results.
        index: optional pre-built bwa index prefix; built here when None.
        threads: number of alignment threads.
        log: logger; a stderr logger is created when None.
    """
    bwa_command = "bin/bwa-spades"
    if log is None:  # fixed: was `log == None`; identity check is the correct idiom
        log = logging.getLogger('reference_construction')
        log.setLevel(logging.INFO)
        console = logging.StreamHandler(sys.stderr)
        console.setFormatter(logging.Formatter('%(message)s'))
        console.setLevel(logging.INFO)
        log.addHandler(console)
    support.ensure_dir_existence(output_dir)
    # NOTE(review): likely redundant after ensure_dir_existence — confirm that
    # helper creates the directory before removing this fallback.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if index is None:  # fixed: was `index == None`
        log.info("Constructing index\n")
        index = alignment.index_bwa(bwa_command, log, reference_file,
                                    os.path.join(output_dir, "bwa_index"), "bwtsw")
    sam_dir = os.path.join(output_dir, "alignments")
    log.info("Aligning barcodes\n")
    sam_files = AlignToReference(datasets, sam_dir, bwa_command, log, index, threads)
    subreference_dir = os.path.join(output_dir, "subreferences")
    filtered_dir = os.path.join(output_dir, "filtered")
    support.recreate_dir(subreference_dir)
    support.recreate_dir(filtered_dir)
    log.info("Constructing subreferences")
    # Each entry becomes (barcode_id, (filtered, subreference)).
    subreferences_list = [(barcode_id, ConstructSubreferenceFromSam(barcode_sam))
                          for barcode_id, barcode_sam in sam_files]
    log.info("Reading reference")
    reference = ReadReference(reference_file)
    log.info("Printing output")
    PrintAll([(barcode, filtered) for barcode, (filtered, subreference) in subreferences_list],
             reference, filtered_dir)
    PrintAll([(barcode, subreference) for barcode, (filtered, subreference) in subreferences_list],
             reference, subreference_dir)
    log.info("Subreference construction finished. See results in " + output_dir)
def SaveContigs(barcodes_dir, dataset, format):
    """Gzip each barcode's contig file of the given format into barcodes_dir/<format>/.

    Barcodes without a contig file of that format are skipped silently.
    """
    contig_dir = os.path.join(barcodes_dir, format)
    support.ensure_dir_existence(contig_dir)
    for barcode in dataset:
        src = os.path.join(barcodes_dir, barcode.id, "truseq_long_reads." + format)
        if os.path.isfile(src):
            dst = os.path.join(contig_dir, barcode.id + "." + format + ".gz")
            # FIX: use context managers — the original leaked both file handles
            # (neither the source nor the gzip stream was ever closed, risking
            # truncated .gz output until interpreter exit).
            with open(src, "rb") as fin, gzip.open(dst, "wb") as fout:
                shutil.copyfileobj(fin, fout)
def RunTruSPAdes(dataset, log_dir, options, log):
    """Assemble every barcode in parallel and collect the resulting contigs."""
    log.info("Launching truSPAdes assembly in " + str(options.threads) + " threads")
    log.info("You can find logs for separate barcodes in " + log_dir)
    barcodes_dir = os.path.join(options.output_dir, "barcodes")
    support.ensure_dir_existence(barcodes_dir)
    # One (barcode_id, command) pair per barcode.
    per_barcode_commands = []
    for barcode in dataset:
        cmd = command_line(barcode, barcodes_dir,
                           options.spades_options, options.continue_launch)
        per_barcode_commands.append((barcode.id, cmd))
    launcher_task = parallel_launcher.ExternalCallTask(
        os.path.join(log_dir, "{0}.log"), "", log.name)
    failed = parallel_launcher.run_in_parallel(
        launcher_task, per_barcode_commands, options.threads)
    if failed != 0:
        log.info(str(failed) + " barcodes failed to assemble")
    check_results(dataset, barcodes_dir, log)
    output_base = os.path.join(options.output_dir, "TSLR")
    for fmt in ("fasta", "fastq"):
        collect_contigs(dataset, barcodes_dir, output_base, fmt)
    log.info("Assembled virtual long TruSeq reads can be found in " +
             os.path.join(options.output_dir, "TSLR.fasta"))
def test_args():
    """Populate options_storage with fixed defaults for the test case.

    Returns the configured options_storage after running its final_check().
    """
    global options_storage
    import tempfile
    options_storage.output_dir = tempfile.mkdtemp("gadma_test_dir")
    options_storage.output_dir = support.ensure_dir_existence(
        options_storage.output_dir)
    options_storage.input_file = os.path.join(
        support.get_home_dir(), "..", "fs_examples", "test.fs")
    (options_storage.input_data,
     options_storage.ns,
     options_storage.pop_labels) = support.load_spectrum(
        options_storage.input_file, None, None)
    options_storage.ns = np.array(options_storage.ns)
    # Fixed test configuration values.
    for attr, value in (
            ("number_of_populations", 1),
            ("linked_snp", False),
            ("theta", 0.37976),
            ("gen_time", 25),
            ("initial_structure", np.array([1])),
            ("final_structure", np.array([2])),
            ("size_of_generation", 5)):
        setattr(options_storage, attr, value)
    options_storage.fracs = [
        float(x) for x in options_storage.fracs.split(",")
    ]
    options_storage.frac_of_old_models = options_storage.fracs[0]
    options_storage.frac_of_mutated_models = options_storage.fracs[1]
    options_storage.frac_of_crossed_models = options_storage.fracs[2]
    for attr, value in (
            ("optimize_name", 'hill_climbing'),
            ("moments_scenario", True),
            ("relative_params", False),
            ("dadi_pts", [20, 30, 40]),
            ("repeats", 2),
            ("processes", 2),
            ("epsilon", 1),
            ("test", True),
            ("multinom", True)):
        setattr(options_storage, attr, value)
    options_storage.final_check()
    return options_storage
def check(self):
    """Check correctness of parameters; reports fatal problems via support.error.

    Normalizes comma-separated options, verifies files/directories, loads the
    input spectrum, the optional custom model function and bootstrap data, and
    finally delegates to put_default_structures() and final_check().
    """
    if self.multinom is None:
        # Default: multinomial inference only when no custom model is given.
        if self.model_func_file is None:
            self.multinom = False
        else:
            self.multinom = True
    if self.pop_labels is not None:
        self.pop_labels = [x.strip() for x in self.pop_labels.split(',')]
    if self.ns is not None:
        self.ns = support.check_comma_sep_list(self.ns)
    self.input_file = support.check_file_existence(self.input_file)
    if self.resume_dir is not None:
        self.resume_dir = support.check_dir_existence(self.resume_dir)
    if self.resume_dir is not None and self.output_dir is None:
        self.output_dir = support.ensure_dir_existence(
            self.resume_dir + "_resumed", check_emptiness=True)
    elif self.output_dir is None:
        support.error("Parameter `Output directory` is required")
    else:
        self.output_dir = support.ensure_dir_existence(
            self.output_dir, check_emptiness=True)
    if self.input_file is None:
        support.error(
            "Parameter `Input file` is required")
    if self.theta is None:
        support.warning(
            "`Theta0` is not specified. It would be 1.0.")
    if self.gen_time is None:
        support.warning(
            "`Time for one generation` is not specified. \n"
            "Time will be in genetic units.")
    self.input_data, self.ns, self.pop_labels = support.load_spectrum(
        self.input_file, self.ns, self.pop_labels)
    self.ns = np.array(self.ns)
    self.number_of_populations = len(self.ns)

    # Linked or unlinked data
    if not self.linked_snp and self.boot_dir is not None:
        support.warning(
            "SNP's are marked as unlinked, so the directory with bootstrap will be ignored.")
    elif self.linked_snp:
        if self.boot_dir is not None:
            self.boot_dir = support.check_dir_existence(self.boot_dir)
            self.boots = gadma.Inference.load_bootstrap_data_from_dir(
                self.boot_dir, self.ns, self.pop_labels)

    # Custom model
    if self.model_func_file is not None:
        self.model_func_file = support.check_file_existence(self.model_func_file)
        # NOTE(review): imp.load_source is deprecated — consider
        # importlib.util when the supported Python range allows it.
        file_with_model_func = imp.load_source('module', self.model_func_file)
        try:
            self.model_func = file_with_model_func.model_func
        except AttributeError:  # FIX: was a bare `except:` that hid real errors
            support.error(
                "File " + self.model_func_file + ' does not contain function named `model_func`.')
    if self.model_func_file is not None:
        if self.p_ids is not None:
            self.p_ids = support.check_comma_sep_list(self.p_ids, is_int=False)

    self.fracs = [float(x) for x in self.fracs.split(",")]
    if len(self.fracs) != 3:
        support.error(
            "length of `Fractions` (Parameters of genetic algorithm) must be 3")
    self.frac_of_old_models = self.fracs[0]
    self.frac_of_mutated_models = self.fracs[1]
    self.frac_of_crossed_models = self.fracs[2]

    if self.moments_scenario and self.dadi_pts is not None:
        support.warning(
            "Moments doesn't use --pts argument, so it would be ignored")
    if self.dadi_pts is None:
        max_n = max(self.ns)
        self.dadi_pts = [max_n, max_n + 10, max_n + 20]
    else:
        self.dadi_pts = support.check_comma_sep_list(self.dadi_pts)

    self.put_default_structures()
    self.final_check()
def SaveContigs(barcodes_dir, dataset, format):
    """Gzip each barcode's contig file of the given format into barcodes_dir/<format>/.

    Barcodes lacking a contig file of that format are skipped silently.
    """
    contig_dir = os.path.join(barcodes_dir, format)
    support.ensure_dir_existence(contig_dir)
    for barcode in dataset:
        source_path = os.path.join(barcodes_dir, barcode.id,
                                   "truseq_long_reads." + format)
        if os.path.isfile(source_path):
            target_path = os.path.join(contig_dir,
                                       barcode.id + "." + format + ".gz")
            # FIX: the original never closed either handle; `with` guarantees
            # the gzip stream is flushed and both files are closed.
            with open(source_path, "rb") as src, gzip.open(target_path, "wb") as dst:
                shutil.copyfileobj(src, dst)