def read_or_make(cls, *, path_to_genome, path_to_index=None):
    """
    Obtain the index for a genome, using the file system as a cache.

    If the file `path_to_index` exists, the index is loaded from it
    with `cls.read`. Otherwise the index is constructed from the
    sequence found in the FASTA file `path_to_genome`, and the result
    of calling `write(path_to_index)` on it is returned.

    When `path_to_index` is not given, it is the genome path with
    the suffix ".index" appended.

    Fails an assertion if `path_to_genome` is not an existing file.

    RA, 2020-10-23
    """

    from pathlib import Path

    index_suffix = ".index"

    genome_path = Path(path_to_genome)

    if path_to_index:
        index_path = Path(path_to_index)
    else:
        index_path = Path(str(genome_path) + index_suffix)

    assert genome_path.is_file()

    # An index file is already there: read it instead of recomputing
    if index_path.is_file():
        return cls.read(index_path)

    # Imported only when the index actually has to be built
    from humdum.io import from_fasta
    from humdum.utils import unlist1

    records = list(from_fasta(genome_path))
    # NOTE(review): unlist1 presumably insists on exactly one FASTA record -- confirm
    new_index = cls(unlist1(records).seq)

    return new_index.write(index_path)
def test_sw_on_data_small(self, verbose=0):
    """
    Align the first few reads of the small data set against the
    reference with SmithWaterman and require that every alignment
    has the same CIGAR as the read's record in the SAM file.

    With a truthy `verbose`, a comparison of the alignment and
    the SAM record is printed before the check.
    """

    tests_dir = Path(__file__).parent

    fa = tests_dir / "data_for_tests/data_small/genome.chr22.5K.fa"
    fasta_records = list(from_fasta(fa))
    reference = str(unlist1(fasta_records).seq)

    sam_candidates = list((tests_dir / "data_for_tests/data_small/").glob("*.sam"))
    in_file = sam_candidates.pop()

    # Only this many reads from the SAM file are examined
    max_reads = 2

    for (read, __) in zip(from_sam(in_file), range(max_reads)):
        read: Read

        query_seq = read.seq
        sw = SmithWaterman()

        for alignment in sw(ref=reference, query=query_seq):
            if verbose:
                print(alignment.cigar, ' vs ', read.cigar)
                print(read.mapq, ' vs ', alignment.score)

                top, middle, bottom = alignment.visualize(ref=reference, query=query_seq)
                for row in (top, middle, bottom):
                    print(row)

                print(alignment.matching_subsegments(), ' vs ', read.cigar)

            self.assertEqual(
                alignment.cigar,
                read.cigar,
                msg=f'{alignment.cigar} is not equal to cigar from sam file {read.cigar}',
            )
def test_on_data_small(self):
    """
    Map the paired reads of the small data set and walk through
    the result side by side with the records of the SAM file
    found in the same folder.

    The minus-strand and secondary-alignment flags must agree
    (asserted). Differences in CIGAR, position and template
    length are only reported on stdout.
    """

    (read_file1, read_file2) = sorted(source_path.glob("*.fq"))
    genome_file = unlist1(source_path.glob("genome*.fa"))

    sam = AllTheKingsHorses.from_files(fa=genome_file, fq1=read_file1, fq2=read_file2)

    mine: AlignedSegment
    theirs: AlignedSegment

    my_alignments = sam.alignments
    their_alignments = from_sam(unlist1(source_path.glob("*.sam")))

    for (mine, theirs) in zip(my_alignments, their_alignments):
        # See io/sam.py for the explanations
        # (the SAM flag bits 16 and 256 are compared with the flag properties)
        self.assertEqual(mine.flag.is_minus_strand, bool(theirs.flag.value & 16))
        self.assertEqual(mine.flag.is_secondary_alignment, bool(theirs.flag.value & 256))

        same_cigar = (mine.cigar == theirs.cigar)
        same_pos = (mine.pos == theirs.pos)
        same_tlen = (mine.tlen == theirs.tlen)

        if not (same_cigar and same_pos):
            print(F"Read {mine.qname} does not match.")
            print(F"Mine: ", mine.cigar, "at", mine.pos)
            print(F"Theirs:", theirs.cigar, "at", theirs.pos)
            print(F"Read: ", mine.seq)
            # print(F"Neighborhood: ", aligned_segments.ref_genome[(mine.pos - 10):(mine.pos + 10 + len(mine.seq))])
        else:
            print(F"Read {mine.qname} looks good.")

        if not same_tlen:
            print(
                F"tlen mismatch: {mine.tlen} (mine) vs {theirs.tlen} (theirs)"
            )
def from_files(cls, *, fa, fq1, fq2):
    """
    Set up a mapper from files and start mapping the paired reads.

    `fa` is the reference genome file (FASTA).
    `fq1` and `fq2` are the FASTQ files passed on to map_paired(...).

    Returns a small holder class with two attributes:
        headers     -- the result of headers() of the new instance
        alignments  -- the result of its map_paired(fq1, fq2)
    """

    # NOTE(review): unlist1 presumably expects a single record in the FASTA file -- confirm
    ref_genome = unlist1(from_fasta(fa))

    index = GenomeIndex.read_or_make(path_to_genome=fa)
    aligner = SequenceAligner()

    # Instantiate through `cls` rather than a hard-coded class name,
    # so that calling from_files on a subclass creates that subclass
    atkh = cls(genome_index=index, sequence_aligner=aligner, ref_genome=ref_genome)

    # Both attributes are evaluated here, while the class body executes
    class _:
        headers = atkh.headers()
        alignments = atkh.map_paired(fq1, fq2)

    return _
# RA, 2020-10-13

from pathlib import Path
from unittest import TestCase
from itertools import count

from humdum.utils import unlist1, at_most_n

# Location of the test data relative to this file
data_root = Path(__file__).parent / "data_for_tests"
source_path = data_root / "data_small"

# The SAM file from the small data set
# NOTE(review): unlist1 presumably requires exactly one match -- confirm
file = unlist1(sorted(source_path.glob("*.sam")))


class TestIoSam(TestCase):
    """
    Tests for reading SAM files with humdum.io.from_sam.
    """

    def test_import(self):
        """
        The reader from_sam is importable from humdum.io.
        """
        from humdum.io import from_sam

    def test_no_import(self):
        """
        Test whether from_sam_pysam is gone.
        """
        with self.assertRaises(ImportError):
            from humdum.io import from_sam_pysam

    def test_from_sam(self):
        """
        Read the whole SAM file and take its last record.
        """
        from humdum.io import from_sam

        records = list(from_sam(file))
        # pop() raises IndexError if no record was read
        records.pop()
# RA, 2020-10-23

from unittest import TestCase
from pathlib import Path

from humdum.main import AllTheKingsHorses
from humdum.utils import relpath, unlist1, at_most_n
from humdum.io import AlignedSegment
from humdum.io import from_sam

from itertools import count

# Location of the test data relative to this file
data_root = Path(__file__).parent / "data_for_tests"
source_path = data_root / "data"

# The gzipped reference genome
# NOTE(review): unlist1 presumably requires exactly one match -- confirm
genome_file = unlist1(source_path.glob("genome*.fa.gz"))


class TestATKH(TestCase):
    """
    Runs AllTheKingsHorses on the large data set.
    """

    def test_on_data_large_5xCov(self):
        """
        Map the "5xCov" paired reads and print alignments
        as limited by at_most_n(..., 50). Nothing is asserted.
        """
        fq_files = sorted(source_path.glob("*5xCov*.fq*"))
        # Exactly two FASTQ files are expected; otherwise unpacking fails
        (read_file1, read_file2) = fq_files

        sam = AllTheKingsHorses.from_files(fa=genome_file, fq1=read_file1, fq2=read_file2)

        limited = at_most_n(sam.alignments, 50)
        for alignment in limited:
            print(alignment)
import time from unittest import TestCase from pathlib import Path from humdum.io import open_maybe_gz from humdum.utils import unlist1 from humdum.index import FmIndex as GenomeIndex data_root = Path(__file__).parent / "data_for_tests/data" genome_file = unlist1(data_root.glob("*.fa.gz")) class TestFm(TestCase): def test_open_and_read(self): with open_maybe_gz(genome_file) as fd: fd.readline() fd.readline() fd.readline() def test_init_write(self): genome = "" with open_maybe_gz(genome_file) as fd: # skip first line line = fd.readline() line = fd.readline().rstrip() while True: genome += line
def read_or_make(cls, *, path_to_genome, ignored=None):
    """
    Construct the index from the FASTA file `path_to_genome`.

    Nothing is read from or written to an index file here;
    the parameter `ignored` is accepted but not used.
    """
    records = list(from_fasta(path_to_genome))
    # NOTE(review): unlist1 presumably insists on exactly one FASTA record -- confirm
    record = unlist1(records)
    return cls(record.seq)