def _index_vcf(cls, vcffile):
    """Compress vcffile with bgzip, then index the compressed file with tabix.

    The bgzip output is redirected to vcffile + ".gz", and tabix is run on
    that file with the "vcf" preset.
    """
    bgzip = dependencies.find_binary("bgzip")
    compressed = vcffile + ".gz"
    utils.syscall(" ".join((bgzip, "-c", vcffile, ">", compressed)))

    tabix = dependencies.find_binary("tabix")
    utils.syscall(" ".join((tabix, "-p", "vcf", compressed)))
def get_version_of_program(program, binary=None, allow_fail=False):
    """Return the version string reported by an external program.

    Args:
        program: Name of the program: "bcftools", "gramtools", "bwa",
            "dnadiff" or "nextflow".
        binary: Command used to run the program. If None, it is looked up
            with find_binary(program, allow_fail=allow_fail).
        allow_fail: Passed on to find_binary only.

    Returns:
        The version string, or None if no version could be found in the
        program's output.

    Raises:
        Exception: if program is not one of the recognised names.
    """

    def rest_of_line(line):
        # Text after the first whitespace-separated field; None if absent.
        fields = line.rstrip().split(maxsplit=1)
        return fields[1] if len(fields) > 1 else None

    if binary is None:
        binary = find_binary(program, allow_fail=allow_fail)

    if program == "bcftools":
        # Run with no arguments, tolerating failure; version is read from stderr.
        bcftools_process = utils.syscall(binary, allow_fail=True)
        for line in bcftools_process.stderr.split("\n"):
            # example version line:
            # Version: 1.3.1 (using htslib 1.3.1)
            if line.startswith("Version:"):
                return rest_of_line(line)
        return None
    elif program == "gramtools":
        gramtools_process = utils.syscall(binary + " --version")
        gramtools_json = json.loads(gramtools_process.stdout)
        return gramtools_json.get("version_number", None)
    elif program == "bwa":
        # To get version of BWA, need to run it with no options.
        # This returns an error code of 1, which we need to ignore
        bwa_process = utils.syscall(binary, allow_fail=True)
        for line in bwa_process.stderr.split("\n"):
            if line.strip().startswith("Version:"):
                # The prefix check guarantees at least one field
                return line.split()[-1]
        return None
    elif program == "dnadiff":
        dnadiff_process = utils.syscall(binary + " --version", allow_fail=True)
        for line in dnadiff_process.stderr.split("\n"):
            if line.strip().startswith("DNAdiff version"):
                return line.split()[-1]
        return None
    elif program == "nextflow":
        nextflow_process = utils.syscall(binary + " -version", allow_fail=True)
        # example line that we want to capture;
        #  version 0.27.6 build 4775
        for line in nextflow_process.stdout.split("\n"):
            if line.strip().startswith("version"):
                return rest_of_line(line)
        return None
    else:
        raise Exception('Program name "' + program + '" not recognised. Cannot continue')
def _map_seqs_to_ref(cls, seqs_file, ref_file, outfile):
    """Run "bwa mem" with seqs_file as the reads and ref_file as the reference.

    The command's standard output is redirected to outfile.
    """
    bwa = dependencies.find_binary("bwa")
    # -a: report all mappings
    # -Y: use soft clipping for supplementary alignments
    mem_args = (bwa, "mem", "-a", "-Y", ref_file, seqs_file, ">", outfile)
    utils.syscall(" ".join(mem_args))
def _map_seqs_to_ref(cls, seqs_file, ref_file, outfile):
    '''Map the sequences in seqs_file to ref_file with "bwa mem".

    The output of the command is redirected to outfile.
    '''
    mem_command = [dependencies.find_binary('bwa'), 'mem']
    mem_command.append('-a')  # report all mappings
    mem_command.append('-Y')  # use soft clipping for supplementary alignments
    mem_command += [ref_file, seqs_file, '>', outfile]
    utils.syscall(' '.join(mem_command))
def test_regenotype_pipeline():
    """Run the regenotype nextflow pipeline end to end and check its outputs."""
    outdir = "tmp.nextflow_regeno_test.out"
    utils.rm_rf(outdir)
    os.mkdir(outdir)

    # The manifest lives inside outdir; nextflow is run with cwd=outdir, so
    # the command refers to it by its bare name.
    manifest = "tmp.nextflow_regeno_test.tsv"
    _write_manifest(os.path.join(outdir, manifest))

    nextflow_dir = os.path.join(minos_dir, "nextflow")
    regeno_nf = os.path.join(nextflow_dir, "regenotype.nf")
    regeno_config = os.path.join(nextflow_dir, "regenotype.config")
    dag = "tmp.nextflow_regeno_test.dag.pdf"
    ref_fasta = os.path.join(data_dir, "data.ref.fa")
    mask_bed = os.path.join(data_dir, "mask.bed")
    command = " ".join([
        "nextflow run",
        f"-c {regeno_config}",
        "-profile tiny",
        f"-with-dag {dag}",
        regeno_nf,
        "--make_distance_matrix",
        f"--mask_bed_file {mask_bed}",
        "--max_variants_per_sample 10",
        f"--ref_fasta {ref_fasta}",
        f"--manifest {manifest}",
        "--outdir OUT",
    ])
    utils.syscall(command, cwd=outdir)

    results_dir = os.path.join(outdir, "OUT")
    for produced, expected in (
        ("failed_samples.txt", "expect.failed_samples.txt"),
        ("distance_matrix.txt", "expect.distance_matrix.txt"),
    ):
        got_file = os.path.join(results_dir, produced)
        expect_file = os.path.join(data_dir, expected)
        assert filecmp.cmp(got_file, expect_file, shallow=False)

    # Don't know order of lines in the manifest tsv, or the filename that will
    # be given to each sample. We'll load in each VCF and check it matches the
    # sample name from the manifest. Also check info in json and tsv files
    # match
    manifest_json = os.path.join(results_dir, "manifest.json")
    assert os.path.exists(manifest_json)
    manifest_tsv = os.path.join(results_dir, "manifest.tsv")
    with open(manifest_json) as json_handle:
        manifest_data = json.load(json_handle)

    with open(manifest_tsv) as tsv_handle:
        for row in csv.DictReader(tsv_handle, delimiter="\t"):
            sample = row["sample"]
            vcf = os.path.join(results_dir, row["vcf_file"])
            assert sample == vcf_file_read.get_sample_name_from_vcf_file(vcf)
            assert os.path.exists(os.path.join(results_dir, row["log_file"]))
            assert manifest_data[sample]["log_file"] == row["log_file"]
            assert manifest_data[sample]["vcf_file"] == row["vcf_file"]

    utils.rm_rf(outdir)
def run_gramtools_build(outdir, vcf_file, ref_file, max_read_length, kmer_size=10):
    '''Run "gramtools build" and make both perl-generated VCF names available.

    outdir is passed to gramtools as --gram-directory. After a successful
    build, whichever of perl_generated_vcf / perl_generated.vcf is missing
    from outdir is created as a symlink to the one that exists.

    Raises Error if the build exits non-zero, if _build_json_file_is_good
    rejects the build report, or if neither perl-generated VCF is found.

    The process working directory is never changed, so it cannot be left
    pointing at outdir when one of the error paths is taken.
    '''
    gramtools_exe = dependencies.find_binary('gramtools')
    build_command = ' '.join([
        gramtools_exe,
        'build',
        '--gram-directory', outdir,
        '--vcf', vcf_file,
        '--reference', ref_file,
        '--max-read-length', str(max_read_length),
        '--all-kmers',
        '--kmer-size', str(kmer_size),
    ])
    logging.info('Running gramtools build: ' + build_command)
    completed_process = utils.syscall(build_command, allow_fail=True)
    logging.info('Finished running gramtools build. Return code: ' + str(completed_process.returncode))
    build_report = os.path.join(outdir, 'build_report.json')
    ran_ok = _build_json_file_is_good(build_report) and completed_process.returncode == 0
    if not ran_ok:
        logging.info('Error running gramtools build. See build report file ' + build_report)
        raise Error('Error running gramtools build: ' + build_command)

    # Older gramtools called the perl generated VCF file perl_generated_vcf.
    # New gramtools calls it perl_generated.vcf.
    # Whichever one doesn't exist, symlink it to the one that does
    vcf1 = 'perl_generated_vcf'
    vcf2 = 'perl_generated.vcf'
    vcf1_path = os.path.join(outdir, vcf1)
    vcf2_path = os.path.join(outdir, vcf2)
    # The link target is the bare file name: a relative symlink target is
    # resolved relative to the directory containing the link, i.e. outdir.
    if os.path.exists(vcf1_path):
        assert not os.path.exists(vcf2_path)
        os.symlink(vcf1, vcf2_path)
    elif os.path.exists(vcf2_path):
        assert not os.path.exists(vcf1_path)
        os.symlink(vcf2, vcf1_path)
    else:
        message = f'Could not find perl generated VCF file in directory {outdir}. Looked for {vcf1}, {vcf2}. Cannot continue'
        logging.error(message)
        raise Error(message)

    logging.info('Build report file looks good from gramtools build: ' + build_report)
def run(self):
    """Run dnadiff on each ref/query sequence pair and load the variants.

    Per-pair .snps and .qdiff outputs are appended to outprefix.snps and
    outprefix.qdiff. The combined snps file is converted to
    self.unmerged_vcf, clustered into self.merged_vcf, and the results are
    stored in self.variants, self.big_variant_intervals and
    self.all_variant_intervals.
    """
    prefix = self.outprefix
    snps_file = prefix + ".snps"
    qdiff_file = prefix + ".qdiff"
    self.unmerged_vcf = prefix + ".raw.vcf"
    self.merged_vcf = prefix + ".merged.vcf"

    # Results are appended below, so remove anything left from an earlier run
    for stale in (snps_file, qdiff_file):
        if os.path.exists(stale):
            os.unlink(stale)

    tmp_prefix = prefix + ".tmp"
    tmp_ref = tmp_prefix + ".ref.fa"
    tmp_query = tmp_prefix + ".query.fa"

    for ref_name, query_name in zip(self.ref_seq_names, self.query_seq_names):
        with open(tmp_ref, "w") as handle:
            print(self.ref_seqs[ref_name], file=handle)
        with open(tmp_query, "w") as handle:
            print(self.query_seqs[query_name], file=handle)

        Dnadiff._run_dnadiff(tmp_ref, tmp_query, tmp_prefix)
        for suffix, combined in ((".snps", snps_file), (".qdiff", qdiff_file)):
            utils.syscall("cat " + tmp_prefix + suffix + " >> " + combined)
        Dnadiff.clean_dnadiff_files(tmp_prefix)
        os.unlink(tmp_ref)
        os.unlink(tmp_query)

    Dnadiff._snps_file_file_to_unmerged_vcf(snps_file, self.query_seqs, self.unmerged_vcf)

    clusterer = vcf_clusterer.VcfClusterer(
        [self.unmerged_vcf],
        self.query_fasta,
        self.merged_vcf,
        merge_method="simple",
        cluster_boundary_size=31,
    )
    clusterer.run()

    _, self.variants = vcf_file_read.vcf_file_to_dict(
        self.merged_vcf, remove_useless_start_nucleotides=True
    )
    self.big_variant_intervals = Dnadiff._load_qdiff_file(qdiff_file)
    self.all_variant_intervals = Dnadiff._make_all_variants_intervals(
        self.variants, self.big_variant_intervals
    )
def _index_vcf(cls, vcffile):
    '''bgzip vcffile into vcffile + ".gz", then tabix-index the .gz file'''
    bgzip_command = [dependencies.find_binary('bgzip'), '-c', vcffile]
    bgzip_command += ['>', vcffile + ".gz"]
    utils.syscall(' '.join(bgzip_command))

    tabix_command = [dependencies.find_binary('tabix'), '-p', 'vcf']
    tabix_command.append(vcffile + ".gz")
    utils.syscall(' '.join(tabix_command))
def _map_seqs_to_seqs(cls, seqs_file_ref, seqs_file_query, outfile):
    '''Map seqs_file_query to seqs_file_ref using bwa index, aln and samse.

    Runs "bwa index" on seqs_file_ref, then "bwa aln" with its output
    redirected to outfile + ".tmp", then "bwa samse" with its output
    redirected to outfile. The intermediate .tmp file is not removed here.
    '''
    bwa_binary = dependencies.find_binary('bwa')

    def run_bwa(*arguments):
        # Each step is the bwa binary followed by space-separated arguments
        utils.syscall(' '.join((bwa_binary,) + arguments))

    run_bwa('index', seqs_file_ref)
    run_bwa('aln', seqs_file_ref, seqs_file_query, '>', outfile + ".tmp")
    run_bwa('samse', seqs_file_ref, outfile + ".tmp", seqs_file_query, '>', outfile)
def run(self):
    '''Run dnadiff on each ref/query sequence pair and load the variants.

    The .snps and .qdiff output of every pair is appended to
    outprefix.snps / outprefix.qdiff. The snps file is then converted to
    self.unmerged_vcf and clustered into self.merged_vcf, from which
    self.variants is loaded. self.big_variant_intervals comes from the
    qdiff file, and self.all_variant_intervals combines the two.
    '''
    snps_file = self.outprefix + '.snps'
    qdiff_file = self.outprefix + '.qdiff'
    self.unmerged_vcf = self.outprefix + '.raw.vcf'
    self.merged_vcf = self.outprefix + '.merged.vcf'

    # Output is accumulated with ">>", so begin without any old files
    if os.path.exists(snps_file):
        os.unlink(snps_file)
    if os.path.exists(qdiff_file):
        os.unlink(qdiff_file)

    tmp_prefix = self.outprefix + '.tmp'
    one_ref_fa = tmp_prefix + '.ref.fa'
    one_query_fa = tmp_prefix + '.query.fa'
    name_pairs = zip(self.ref_seq_names, self.query_seq_names)

    for ref_name, query_name in name_pairs:
        with open(one_ref_fa, 'w') as out:
            print(self.ref_seqs[ref_name], file=out)
        with open(one_query_fa, 'w') as out:
            print(self.query_seqs[query_name], file=out)

        Dnadiff._run_dnadiff(one_ref_fa, one_query_fa, tmp_prefix)
        utils.syscall(f'cat {tmp_prefix}.snps >> {snps_file}')
        utils.syscall(f'cat {tmp_prefix}.qdiff >> {qdiff_file}')
        Dnadiff.clean_dnadiff_files(tmp_prefix)
        os.unlink(one_ref_fa)
        os.unlink(one_query_fa)

    Dnadiff._snps_file_file_to_unmerged_vcf(snps_file, self.query_seqs, self.unmerged_vcf)
    vcf_clusterer.VcfClusterer(
        [self.unmerged_vcf],
        self.query_fasta,
        self.merged_vcf,
        merge_method='simple',
    ).run()

    loaded = vcf_file_read.vcf_file_to_dict(
        self.merged_vcf, remove_useless_start_nucleotides=True)
    _header, self.variants = loaded
    self.big_variant_intervals = Dnadiff._load_qdiff_file(qdiff_file)
    self.all_variant_intervals = Dnadiff._make_all_variants_intervals(
        self.variants, self.big_variant_intervals)
def get_version_of_program(program, binary=None, allow_fail=False):
    """Return the version string reported by an external program.

    Args:
        program: Name of the program: "gramtools", "nextflow", "vt", or one
            of "vcfbreakmulti", "vcfallelicprimitives", "vcfuniq", "vcflib".
        binary: Command used to run the program. If None, it is looked up
            with find_binary(program, allow_fail=allow_fail).
        allow_fail: Passed on to find_binary only.

    Returns:
        The version string, None if no version could be found in the
        program's output, or the literal "Unknown" for the vcflib family
        of programs, which are not run at all.

    Raises:
        Exception: if program is not one of the recognised names.
    """

    def rest_of_line(line):
        # Text after the first whitespace-separated field; None if absent.
        fields = line.rstrip().split(maxsplit=1)
        return fields[1] if len(fields) > 1 else None

    if binary is None:
        binary = find_binary(program, allow_fail=allow_fail)

    if program == "gramtools":
        gramtools_process = utils.syscall(binary + " --version")
        gramtools_json = json.loads(gramtools_process.stdout)
        return gramtools_json.get("version_number", None)
    elif program == "nextflow":
        nextflow_process = utils.syscall(binary + " -version", allow_fail=True)
        # example line that we want to capture;
        #  version 0.27.6 build 4775
        for line in nextflow_process.stdout.split("\n"):
            if line.strip().startswith("version"):
                return rest_of_line(line)
        return None
    elif program == "vt":
        vt_process = utils.syscall(binary + " -v", allow_fail=True)
        # Example line:
        # vt v0.57721
        for line in vt_process.stderr.split("\n"):
            if line.rstrip().startswith("vt "):
                return rest_of_line(line)
        return None
    elif program in ["vcfbreakmulti", "vcfallelicprimitives", "vcfuniq", "vcflib"]:
        return "Unknown"
    else:
        raise Exception('Program name "' + program + '" not recognised. Cannot continue')
def run_gramtools_build(outdir, vcf_file, ref_file, max_read_length, kmer_size=10):
    """Create outdir and run "gramtools build" with it as the gram directory.

    Raises FileExistsError if outdir already exists. Raises Exception, with
    the command and its stdout/stderr in the message, if the build exits
    non-zero or _build_json_file_is_good rejects the build report.
    """
    if os.path.exists(outdir):
        raise FileExistsError(
            f"Gramtools build output directory '{outdir}' already exists. Cannot continue"
        )
    os.mkdir(outdir)

    command_parts = [dependencies.find_binary("gramtools"), "build"]
    for flag, value in (
        ("--gram-directory", outdir),
        ("--vcf", vcf_file),
        ("--reference", ref_file),
        ("--max-read-length", str(max_read_length)),
        ("--kmer-size", str(kmer_size)),
    ):
        command_parts += [flag, value]
    build_command = " ".join(command_parts)

    logging.info("Running gramtools build: " + build_command)
    completed_process = utils.syscall(build_command, allow_fail=True)
    logging.info("Finished running gramtools build. Return code: " + str(completed_process.returncode))

    build_report = os.path.join(outdir, "build_report.json")
    # Both the report contents and the exit status must be good
    if not _build_json_file_is_good(build_report) or completed_process.returncode != 0:
        logging.info("Error running gramtools build. See build report file " + build_report)
        raise Exception(
            f"Error running gramtools build: {build_command}\nstdout:{completed_process.stdout}\nstderr:\n{completed_process.stderr}"
        )

    logging.info("Build report file looks good from gramtools build: " + build_report)
def _map_seqs_to_seqs(cls, seqs_file_ref, seqs_file_query, outfile):
    """Map seqs_file_query to seqs_file_ref using bwa index, aln and samse.

    "bwa index" is run on seqs_file_ref; "bwa aln" output is redirected to
    outfile + ".tmp"; "bwa samse" output is redirected to outfile. The
    intermediate .tmp file is left in place.
    """
    bwa = dependencies.find_binary("bwa")

    index_command = " ".join((bwa, "index", seqs_file_ref))
    utils.syscall(index_command)

    aln_command = " ".join(
        (bwa, "aln", seqs_file_ref, seqs_file_query, ">", outfile + ".tmp")
    )
    utils.syscall(aln_command)

    samse_command = " ".join(
        (bwa, "samse", seqs_file_ref, outfile + ".tmp", seqs_file_query, ">", outfile)
    )
    utils.syscall(samse_command)
def compress_file(filenames):
    """Compress one file, given as an (infile, outfile) pair.

    Files whose name ends with ".vcf" are compressed with bgzip, all others
    with "gzip -9". The compressed output is redirected to outfile.
    """
    infile, outfile = filenames
    if infile.endswith(".vcf"):
        zipper = "bgzip"
    else:
        zipper = "gzip -9"
    command = f"{zipper} -c {infile} > {outfile}"
    utils.syscall(command)
def run(self):
    '''Prepare the nextflow input files and run the multi-sample pipeline.

    A log file handler for self.log_file is added to the root logger, the
    nextflow dependency is checked, and the nextflow script is written
    inside self.output_dir, which is the working directory while the
    pipeline runs.

    If self.no_run is set, the nextflow command is printed and not run.
    Otherwise nextflow is run and its stdout/stderr are written to
    nextflow.stdout / nextflow.stderr. When self.clean is set, the
    nextflow work directory and .nextflow are deleted. Finally
    .nextflow.log is renamed to nextflow.log.

    The original working directory is always restored on exit, including
    on the no_run early return and when any step raises.
    '''
    self._make_output_dir()
    fh = logging.FileHandler(self.log_file, mode='w')
    log = logging.getLogger()
    formatter = logging.Formatter(
        '[minos %(asctime)s %(levelname)s] %(message)s',
        datefmt='%d-%m-%Y %H:%M:%S')
    fh.setFormatter(formatter)
    log.addHandler(fh)
    dependencies.check_and_report_dependencies(programs=['nextflow'])

    self._prepare_nextflow_input_files()
    original_dir = os.getcwd()
    os.chdir(self.output_dir)
    try:
        nextflow_script = 'nextflow.run.nf'
        MultiSamplePipeline._write_nextflow_script(nextflow_script)
        logging.info('Prepared nextflow files. cd ' + self.output_dir)

        nextflow = dependencies.find_binary('nextflow')
        nextflow_command = [
            nextflow, 'run',
            '-work-dir', self.nextflow_work_dir,
            '-with-dag', 'nextflow.out.dag.pdf',
            '-with-trace', 'nextflow.out.trace.txt',
        ]

        if self.nextflow_config_file is not None:
            nextflow_command.extend(['-c', self.nextflow_config_file])

        nextflow_command += [
            nextflow_script,
            '--ref_fasta', self.ref_fasta,
            '--data_in_tsv', self.nextflow_input_tsv,
            '--max_alleles_per_cluster', str(self.max_alleles_per_cluster),
            '--min_large_ref_length', str(self.min_large_ref_length),
            '--final_outdir', self.output_dir,
            '--gramtools_max_read_length', str(self.gramtools_max_read_length),
            '--cluster_small_vars_ram', str(self.nf_ram_cluster_small_vars),
            '--gramtools_build_small_vars_ram', str(self.nf_ram_gramtools_build_small),
            '--gramtools_kmer_size', str(self.gramtools_kmer_size),
            '--gramtools_build_threads', str(self.gramtools_build_threads),
            '--minos_small_vars_ram', str(self.nf_ram_minos_small_vars),
            '--merge_small_vars_ram', str(self.nf_ram_merge_small_vars),
        ]

        if self.testing:
            nextflow_command.append('--testing')

        if self.use_unmapped_reads:
            nextflow_command.append('--use_unmapped_reads')

        if self.variants_per_split is not None:
            nextflow_command.append('--variants_per_split ' + str(self.variants_per_split))
        # alleles_per_split takes precedence over total_splits
        if self.alleles_per_split is not None:
            nextflow_command.append('--alleles_per_split ' + str(self.alleles_per_split))
        elif self.total_splits is not None:
            nextflow_command.append('--total_splits ' + str(self.total_splits))

        nextflow_command = ' '.join(nextflow_command)

        if self.no_run:
            print('Prepared nextflow pipeline. --no_run used, so not running. The nextflow command to run is:')
            print(nextflow_command)
            return

        logging.info('Start running nextflow: ' + nextflow_command)
        syscall_process = utils.syscall(nextflow_command)
        logging.info('Finish running nextflow. Writing nextflow stdout/stderr to files')
        with open('nextflow.stdout', 'w') as f:
            print(syscall_process.stdout.rstrip(), file=f)
        with open('nextflow.stderr', 'w') as f:
            print(syscall_process.stderr.rstrip(), file=f)

        logging.info('cd ' + original_dir)

        # Clean-up uses paths relative to the output directory, so it must
        # happen before the working directory is restored.
        if self.clean:
            logging.info('Delete nextflow work directory ' + self.nextflow_work_dir)
            shutil.rmtree(self.nextflow_work_dir)
            logging.info('Delete .nextflow directory')
            shutil.rmtree('.nextflow')

        logging.info('Rename .nextflow.log -> nextflow.log')
        os.rename('.nextflow.log', 'nextflow.log')
    finally:
        os.chdir(original_dir)
def run_gramtools(
    build_dir,
    quasimap_dir,
    vcf_file,
    ref_file,
    reads,
    max_read_length,
    kmer_size=10,
    seed=42,
):
    """Run gramtools quasimap, first running gramtools build if needed.

    If build_dir does not exist, run_gramtools_build is called to make it;
    otherwise only quasimap is run. quasimap is given build_dir as
    --gram-dir and quasimap_dir as --run-dir.

    "reads" can be one filename, or a list of filenames; each one is passed
    to quasimap with its own --reads option.

    Returns a tuple (build report, quasimap report), each loaded from its
    JSON file.

    Raises Exception if the build report, the quasimap report, or either of
    the two JSON coverage files is not found after quasimap has run.
    """
    gramtools_exe = dependencies.find_binary("gramtools")
    if not os.path.exists(build_dir):
        run_gramtools_build(build_dir, vcf_file, ref_file, max_read_length, kmer_size=kmer_size)

    # A single filename is treated as a one-element list
    if isinstance(reads, str):
        reads = [reads]

    quasimap_command = " ".join([
        gramtools_exe,
        "quasimap",
        f"--seed {seed}",
        "--gram-dir", build_dir,
        "--run-dir", quasimap_dir,
        " ".join(["--reads " + x for x in reads]),
    ])
    logging.info("Running gramtools quasimap: " + quasimap_command)
    utils.syscall(quasimap_command)
    logging.info("Finished running gramtools quasimap")

    build_report = os.path.join(build_dir, "build_report.json")
    quasimap_outputs = os.path.join(quasimap_dir, "quasimap_outputs")
    quasimap_report = os.path.join(quasimap_outputs, "quasimap_report.json")
    allele_base_counts_file = os.path.join(quasimap_outputs, "allele_base_coverage.json")
    grouped_allele_counts_file = os.path.join(quasimap_outputs, "grouped_allele_counts_coverage.json")

    # Report every missing file before failing, not just the first one
    files_ok = True
    for filename in (
        build_report,
        quasimap_report,
        allele_base_counts_file,
        grouped_allele_counts_file,
    ):
        if not os.path.exists(filename):
            files_ok = False
            logging.error("gramtools file not found: " + filename)

    if not files_ok:
        error_message = "Looks like something went wrong during gramtools run. At least one output file not present. Cannot continue."
        logging.error(error_message)
        raise Exception(error_message)

    with open(build_report) as f:
        json_build_report = json.load(f)
    with open(quasimap_report) as f:
        json_quasimap_report = json.load(f)

    return json_build_report, json_quasimap_report
def run_gramtools(build_dir, quasimap_dir, vcf_file, ref_file, reads, max_read_length, kmer_size=10, seed=42):
    '''Run gramtools quasimap, first running gramtools build if needed.

    If build_dir does not exist, run_gramtools_build is called to make it;
    otherwise only quasimap is run. quasimap is given build_dir as
    --gram-directory and quasimap_dir as --output-directory.

    "reads" can be one filename, or a list of filenames; each one is passed
    to quasimap with its own --reads option.

    Returns a tuple (build report, quasimap report), each loaded from its
    JSON file.

    Raises Error if the build report, the quasimap report, or either of the
    two JSON coverage files is not found after quasimap has run.
    '''
    gramtools_exe = dependencies.find_binary('gramtools')
    if not os.path.exists(build_dir):
        run_gramtools_build(build_dir, vcf_file, ref_file, max_read_length, kmer_size=kmer_size)

    # A single filename is treated as a one-element list
    if isinstance(reads, str):
        reads = [reads]

    quasimap_command = ' '.join([
        gramtools_exe,
        'quasimap',
        f'--seed {seed}',
        '--gram-directory', build_dir,
        '--output-directory', quasimap_dir,
        ' '.join(['--reads ' + x for x in reads]),
    ])
    logging.info('Running gramtools quasimap: ' + quasimap_command)
    utils.syscall(quasimap_command)
    logging.info('Finished running gramtools quasimap')

    build_report = os.path.join(build_dir, 'build_report.json')
    quasimap_report = os.path.join(quasimap_dir, 'report.json')
    allele_base_counts_file = os.path.join(quasimap_dir, 'allele_base_coverage.json')
    grouped_allele_counts_file = os.path.join(quasimap_dir, 'grouped_allele_counts_coverage.json')

    # Report every missing file before failing, not just the first one
    files_ok = True
    for filename in build_report, quasimap_report, allele_base_counts_file, grouped_allele_counts_file:
        if not os.path.exists(filename):
            files_ok = False
            logging.error('gramtools file not found: ' + filename)

    if not files_ok:
        error_message = 'Looks like something went wrong during gramtools run. At least one output file not present. Cannot continue.'
        logging.error(error_message)
        raise Error(error_message)

    with open(build_report) as f:
        json_build_report = json.load(f)
    with open(quasimap_report) as f:
        json_quasimap_report = json.load(f)

    return json_build_report, json_quasimap_report
def run(self):
    """Prepare the nextflow input files and run the multi-sample pipeline.

    A log file handler for self.log_file is added to the root logger, the
    nextflow dependency is checked, and the nextflow script is written
    inside self.output_dir, which is the working directory while the
    pipeline runs.

    If self.no_run is set, the nextflow command is printed and not run.
    Otherwise nextflow is run and its stdout/stderr are written to
    nextflow.stdout / nextflow.stderr. When self.clean is set, the
    nextflow work directory and .nextflow are deleted. Finally
    .nextflow.log is renamed to nextflow.log.

    The original working directory is always restored on exit, including
    on the no_run early return and when any step raises.
    """
    self._make_output_dir()
    fh = logging.FileHandler(self.log_file, mode="w")
    log = logging.getLogger()
    formatter = logging.Formatter(
        "[minos %(asctime)s %(levelname)s] %(message)s",
        datefmt="%d-%m-%Y %H:%M:%S",
    )
    fh.setFormatter(formatter)
    log.addHandler(fh)
    dependencies.check_and_report_dependencies(programs=["nextflow"])

    self._prepare_nextflow_input_files()
    original_dir = os.getcwd()
    os.chdir(self.output_dir)
    try:
        nextflow_script = "nextflow.run.nf"
        MultiSamplePipeline._write_nextflow_script(nextflow_script)
        logging.info("Prepared nextflow files. cd " + self.output_dir)

        nextflow = dependencies.find_binary("nextflow")
        nextflow_command = [
            nextflow,
            "run",
            "-work-dir",
            self.nextflow_work_dir,
            "-with-dag",
            "nextflow.out.dag.pdf",
            "-with-trace",
            "nextflow.out.trace.txt",
        ]

        if self.nextflow_config_file is not None:
            nextflow_command.extend(["-c", self.nextflow_config_file])

        nextflow_command += [
            nextflow_script,
            "--ref_fasta",
            self.ref_fasta,
            "--data_in_tsv",
            self.nextflow_input_tsv,
            "--max_alleles_per_cluster",
            str(self.max_alleles_per_cluster),
            "--min_large_ref_length",
            str(self.min_large_ref_length),
            "--final_outdir",
            self.output_dir,
            "--gramtools_max_read_length",
            str(self.gramtools_max_read_length),
            "--cluster_small_vars_ram",
            str(self.nf_ram_cluster_small_vars),
            "--gramtools_build_small_vars_ram",
            str(self.nf_ram_gramtools_build_small),
            "--gramtools_kmer_size",
            str(self.gramtools_kmer_size),
            "--gramtools_build_threads",
            str(self.gramtools_build_threads),
            "--minos_small_vars_ram",
            str(self.nf_ram_minos_small_vars),
            "--merge_small_vars_ram",
            str(self.nf_ram_merge_small_vars),
        ]

        if self.testing:
            nextflow_command.append("--testing")

        if self.use_unmapped_reads:
            nextflow_command.append("--use_unmapped_reads")

        if self.variants_per_split is not None:
            nextflow_command.append("--variants_per_split " + str(self.variants_per_split))
        # alleles_per_split takes precedence over total_splits
        if self.alleles_per_split is not None:
            nextflow_command.append("--alleles_per_split " + str(self.alleles_per_split))
        elif self.total_splits is not None:
            nextflow_command.append("--total_splits " + str(self.total_splits))

        nextflow_command = " ".join(nextflow_command)

        if self.no_run:
            print("Prepared nextflow pipeline. --no_run used, so not running. The nextflow command to run is:")
            print(nextflow_command)
            return

        logging.info("Start running nextflow: " + nextflow_command)
        syscall_process = utils.syscall(nextflow_command)
        logging.info("Finish running nextflow. Writing nextflow stdout/stderr to files")
        with open("nextflow.stdout", "w") as f:
            print(syscall_process.stdout.rstrip(), file=f)
        with open("nextflow.stderr", "w") as f:
            print(syscall_process.stderr.rstrip(), file=f)

        logging.info("cd " + original_dir)

        # Clean-up uses paths relative to the output directory, so it must
        # happen before the working directory is restored.
        if self.clean:
            logging.info("Delete nextflow work directory " + self.nextflow_work_dir)
            shutil.rmtree(self.nextflow_work_dir)
            logging.info("Delete .nextflow directory")
            shutil.rmtree(".nextflow")

        logging.info("Rename .nextflow.log -> nextflow.log")
        os.rename(".nextflow.log", "nextflow.log")
    finally:
        os.chdir(original_dir)
def _run_dnadiff(cls, ref_fasta, query_fasta, outprefix):
    '''Run dnadiff on ref_fasta and query_fasta, passing outprefix with -p'''
    dnadiff_args = ('dnadiff -p', outprefix, ref_fasta, query_fasta)
    utils.syscall(' '.join(dnadiff_args))