def test_from_existing_bam(self):
    """Check an AlignedSample built on a FileInvariant for the ex2.bam sample file.

    Covers the basic attributes, rejection of a duplicate sample name, the
    paired flag, BAM access, the reported BAM/index paths, the read counts,
    and that every job returned by get_qc_jobs() is pruned.
    """
    sample_bam = get_sample_data(Path("mbf_align/ex2.bam"))
    invariant = ppg.FileInvariant(sample_bam)
    placeholder_genome = object()
    single_end = mbf_align.AlignedSample(
        "test_lane", invariant, placeholder_genome, False, "AA123"
    )

    # Constructor arguments are exposed unchanged.
    assert single_end.name == "test_lane"
    assert single_end.load()[0] is invariant
    assert isinstance(single_end.load()[1], ppg.FileInvariant)
    assert single_end.genome is placeholder_genome
    assert not single_end.is_paired
    assert single_end.vid == "AA123"

    # Re-using the name "test_lane" must be refused.
    with pytest.raises(ValueError):
        mbf_align.AlignedSample(
            "test_lane", invariant, placeholder_genome, False, "AA123"
        )

    paired_end = mbf_align.AlignedSample(
        "test_lane2", invariant, placeholder_genome, True, "AA123"
    )
    assert paired_end.is_paired

    # Both accessors hand back pysam.Samfile objects.
    bam_handle = single_end.get_bam()
    assert isinstance(bam_handle, pysam.Samfile)
    bam_handle = single_end.get_unique_aligned_bam()
    assert isinstance(bam_handle, pysam.Samfile)

    assert single_end.get_bam_names()[0] == sample_bam
    assert single_end.get_bam_names()[1] == sample_bam + ".bai"

    assert single_end.mapped_reads() == 8
    assert single_end.unmapped_reads() == 0

    for qc_job in get_qc_jobs():
        assert qc_job._pruned
def test_to_fastq(self):
    """Check that AlignedSample.to_fastq writes the reads of ex2.bam as FASTQ.

    The job is registered with to_fastq(), the pipegraph is run, and the
    resulting file is compared against the expected records.  A paired
    sample must be rejected with ValueError.
    """
    bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
    bam_job = ppg.FileInvariant(bam_path)
    genome = object()
    lane = mbf_align.AlignedSample("test_lane", bam_job, genome, False, "AA123")
    fastq_path = "out.fastq"
    lane.to_fastq(fastq_path)
    ppg.run_pipegraph()
    assert Path(fastq_path).exists()

    # The sample file only contains two distinct sequence/quality pairs.
    seq_a = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
    qual_a = "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"
    seq_b = "TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT"
    qual_b = "<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<"
    records = [
        ("read_28833_29006_6945", seq_a, qual_a),
        ("read_28701_28881_323b", seq_b, qual_b),
        ("read_28701_28881_323c", seq_b, qual_b),
        ("read_28701_28881_324a", seq_b, qual_b),
        ("read_28701_28881_324b", seq_b, qual_b),
        ("read_28701_28881_324c", seq_b, qual_b),
        ("test_clipped1", seq_a, qual_a),
        ("test_clipped1", seq_a, qual_a),
    ]
    # FASTQ is line oriented: "@name", sequence, "+", quality -- each field
    # terminated by a newline.  Spelling the separators out as "\n" keeps
    # the expectation independent of how this file is indented or wrapped.
    expected = "".join("@%s\n%s\n+\n%s\n" % record for record in records)
    assert Path(fastq_path).read_text() == expected

    lane2 = mbf_align.AlignedSample(
        "test_lane2", bam_job, genome, is_paired=True, vid="AA123"
    )
    with pytest.raises(ValueError):
        lane2.to_fastq("nope.fastq")  # no support for paired end data at this point
def job_reheader_and_rename_chromosomes(input_bam_path, output_bam_path, replacements):
    """Create the ppg job that runs reheader_and_rename_chromosomes.

    Parameters:
        input_bam_path: BAM to read; passed through unchanged to
            reheader_and_rename_chromosomes and tracked via a FileInvariant.
        output_bam_path: BAM to write (converted to a Path).
        replacements: forwarded to reheader_and_rename_chromosomes.

    Returns:
        A ppg.MultiFileGeneratingJob declaring output_bam_path and
        output_bam_path.with_suffix(".bam.bai") as outputs.  It depends on a
        FileInvariant for the input and a FunctionInvariant for
        reheader_and_rename_chromosomes.

    Side effect: the output's parent directory is created right away, when
    the job is defined, not when it runs.
    """
    output_bam_path = Path(output_bam_path)

    # `replacements` is bound as a default argument so the job function
    # carries its own reference to it.
    def do_replace(replacements=replacements):
        reheader_and_rename_chromosomes(input_bam_path, output_bam_path, replacements)

    output_bam_path.parent.mkdir(exist_ok=True, parents=True)
    return ppg.MultiFileGeneratingJob(
        [output_bam_path, output_bam_path.with_suffix(".bam.bai")],
        do_replace,
    ).depends_on(
        ppg.FileInvariant(input_bam_path),
        ppg.FunctionInvariant(
            "mbf_bam.reheader_and_rename_chromosomes",
            reheader_and_rename_chromosomes,
        ),
    )
def __init__(
    self,
    name,
    genome_fasta_filename,
    cdna_fasta_filename,
    protein_fasta_filename,
    gtf_filename,
    cache_dir,
):
    """
    Genome for interactive sessions, backed by the files a
    FileBasedGenome wrote during an earlier ppg run.

    The four input filenames are stored as given; cache_dir is converted
    to a Path, and the two msgpack lookups are expected below
    cache_dir / "lookup".
    """
    super().__init__()
    self.name = name
    self.cache_dir = Path(cache_dir)
    self.genome_fasta_filename = genome_fasta_filename
    self.cdna_fasta_filename = cdna_fasta_filename
    self.protein_fasta_filename = protein_fasta_filename
    self.gtf_filename = gtf_filename

    lookup_dir = self.cache_dir / "lookup"
    self._filename_lookups = {
        "genome.fasta": genome_fasta_filename,
        "cdna.fasta": cdna_fasta_filename,
        "protein.fasta": protein_fasta_filename,
        "genes.gtf": gtf_filename,
        "df_genes.msgpack": lookup_dir / "df_genes.msgpack",
        "df_transcripts.msgpack": lookup_dir / "df_transcripts.msgpack",
    }

    # A FileInvariant on the GTF is only created inside a pipegraph;
    # otherwise the dependency list stays empty.
    self.gene_gtf_dependencies = (
        ppg.FileInvariant(self.gtf_filename) if ppg.util.inside_ppg() else []
    )
def PseudoNotebookRun(notebook_python_file, target_object, chdir=False):
    """Create a job that executes a notebook-style python file and caches its data.

    The file is exec'd with marburg_biobank.create.write_dfs replaced by a
    collector, so every value the notebook would have written ends up in a
    dict instead.

    Parameters:
        notebook_python_file: path of the python file to execute.
        target_object: object that receives the collected dict as its
            ``data`` attribute.
        chdir: if true, run the file with its own directory as the working
            directory.

    Returns:
        A ppg.CachedAttributeLoadingJob named ``<file>.result`` that depends
        on a FileInvariant for the file.

    NOTE(review): the file's content is passed to exec() -- only use this
    on trusted, project-local files.
    """
    notebook_python_file = str(notebook_python_file)
    inv = ppg.FileInvariant(notebook_python_file)

    def run():
        import marburg_biobank.create

        notebook_path = Path(notebook_python_file)
        source = notebook_path.read_text()
        collector = {}

        def write_dfs(d):
            # Stand-in for marburg_biobank.create.write_dfs: record the
            # values instead of writing them.
            for k, v in d.items():
                # tuple entries: throw away description, keep first element
                collector[k] = v[0] if isinstance(v, tuple) else v
            return {}

        def get_dummy_ipython():
            class DummyIpython:
                def run_line_magic(self, *args, **kwargs):
                    pass

            return DummyIpython()

        # NOTE: the patch and the pipegraph reset below are deliberately
        # left in place after the run, as before.
        marburg_biobank.create.write_dfs = write_dfs
        g = globals().copy()
        g["get_ipython"] = get_dummy_ipython
        g["here"] = notebook_path.parent.absolute()
        ppg.util.global_pipegraph = None

        # Remember where we started so the working directory is restored
        # even if the notebook raises or changes directory itself.
        previous_cwd = os.getcwd()
        try:
            if chdir:
                os.chdir(notebook_path.parent)
            exec(source, g)
        finally:
            os.chdir(previous_cwd)
        return collector

    return ppg.CachedAttributeLoadingJob(
        notebook_python_file + ".result", target_object, "data", run
    ).depends_on(inv)
def test_chromosome_mapping(self): bam_path = get_sample_data(Path("mbf_align/ex2.bam")) bam_job = ppg.FileInvariant(bam_path) genome = DummyGenome() lane = mbf_align.AlignedSample("test_lane", bam_job, genome, False, "AA123") assert lane.name == "test_lane" assert lane.load()[0] is bam_job assert isinstance(lane.load()[1], ppg.FileInvariant) assert lane.genome is genome assert not lane.is_paired assert lane.vid == "AA123" with pytest.raises(ValueError): mbf_align.AlignedSample("test_lane", bam_job, genome, False, "AA123") lane2 = mbf_align.AlignedSample("test_lane2", bam_job, genome, True, "AA123") assert lane2.is_paired b = lane.get_bam() assert isinstance(b, pysam.Samfile) b
def _parse_alignment_job_input(self, alignment_job):
    """Resolve *alignment_job* into a BAM job, an index job and both paths.

    A str or Path is first wrapped in a ppg.FileInvariant.  The job's
    ``filenames`` must contain exactly one ``.bam`` entry and at most one
    ``.bai`` entry.

    How the index job is chosen:
      * MultiFileGeneratingJob that already lists a ``.bai``: the job itself.
      * MultiFileGeneratingJob without ``.bai``, or a FileGeneratingJob:
        a new FileGeneratingJob for ``<bam>.bai`` depending on the
        alignment job.
      * FileInvariant: a FileInvariant on ``<bam>.bai`` if that file
        exists; otherwise a new FileGeneratingJob writing
        ``<cache_folder>/bam_indices/<self.name>_<bam file name>.bai``.

    New index jobs use the value of ``self._index(bam_name, index_fn)`` as
    their job function.

    Returns:
        (alignment_job, index_job, Path(bam_name), Path(index_fn))

    Raises:
        ValueError: wrong job type, no ``.bam``, or more than one
            ``.bam`` / ``.bai`` filename.
    """
    if isinstance(alignment_job, (str, Path)):
        alignment_job = ppg.FileInvariant(alignment_job)
    if not isinstance(alignment_job, (ppg.FileInvariant, ppg.FileGeneratingJob)):
        raise ValueError(
            "alignment_job must be a ppg.FileGeneratingJob or FileChecksumInvariant, "
            "was %s" % (type(alignment_job),)
        )

    bam_name = None
    bai_name = None
    for fn in alignment_job.filenames:
        fn = str(fn)
        if fn.endswith(".bam"):
            if bam_name is not None:
                raise ValueError(
                    "Job passed to AlignedSample had multiple .bam filenames"
                )
            bam_name = fn
        elif fn.endswith(".bai"):
            if bai_name is not None:
                raise ValueError(
                    "Job passed to AlignedSample had multiple .bai filenames"
                )
            bai_name = fn
    if bam_name is None:
        raise ValueError("Job passed to AlignedSample had no .bam filenames")

    def make_index_job(target):
        # Job that builds the index at `target` once the BAM job is done.
        job = ppg.FileGeneratingJob(target, self._index(bam_name, target))
        job.depends_on(alignment_job)
        return job

    # MultiFileGeneratingJob must be tested before FileGeneratingJob, as in
    # the original branch order.
    if isinstance(alignment_job, ppg.MultiFileGeneratingJob):
        if bai_name is None:
            index_fn = bam_name + ".bai"
            index_job = make_index_job(index_fn)
        else:
            # The alignment job produces the index as well.
            index_fn = bai_name
            index_job = alignment_job
    elif isinstance(alignment_job, ppg.FileGeneratingJob):
        index_fn = bam_name + ".bai"
        index_job = make_index_job(index_fn)
    elif isinstance(alignment_job, ppg.FileInvariant):
        index_fn = bam_name + ".bai"
        if Path(index_fn).exists():
            index_job = ppg.FileInvariant(index_fn)
        else:
            # No index next to the BAM: build one in the pipegraph's cache
            # folder instead.
            cache_dir = Path(ppg.util.global_pipegraph.cache_folder) / "bam_indices"
            cache_dir.mkdir(exist_ok=True, parents=True)
            index_fn = cache_dir / (self.name + "_" + Path(bam_name).name + ".bai")
            index_job = make_index_job(index_fn)
    else:
        raise NotImplementedError("Should not happe / covered by earlier if")
    return alignment_job, index_job, Path(bam_name), Path(index_fn)
def deps(self, ddf):
    """Return the ppg job for this object: a FileInvariant on self.tablepath.

    ddf is accepted but not used here.
    """
    table_invariant = ppg.FileInvariant(self.tablepath)
    return table_invariant