Ejemplo n.º 1
0
    def read_or_make(cls, *, path_to_genome, path_to_index=None):
        """
        Return the genome index, reading it from file when available.

        If `path_to_index` already exists as a file, the result of
        `cls.read(path_to_index)` is returned. Otherwise an index is
        constructed from the sequence found in the FASTA file
        `path_to_genome`, and the result of calling
        `.write(path_to_index)` on that new index is returned.

        When `path_to_index` is omitted (or falsy), it defaults to
        `path_to_genome` with the suffix ".index" appended.

        Fails with an AssertionError if `path_to_genome` is not a file.

        RA, 2020-10-23
        """

        from pathlib import Path

        index_suffix = ".index"

        genome_file = Path(path_to_genome)
        if path_to_index:
            index_file = Path(path_to_index)
        else:
            # Append (not replace) the suffix: "genome.fa" -> "genome.fa.index"
            index_file = Path(str(genome_file) + index_suffix)

        assert genome_file.is_file()

        # Fast path: an index file is already there, just load it
        if index_file.is_file():
            return cls.read(index_file)

        # Slow path: imported lazily, only needed when building the index
        from humdum.io import from_fasta
        from humdum.utils import unlist1

        records = list(from_fasta(genome_file))
        # NOTE(review): unlist1 presumably unwraps a one-element list -- confirm
        genome = unlist1(records)
        return cls(genome.seq).write(index_file)
    def test_sw_on_data_small(self, verbose=0):
        # Align the first few reads of a small SAM file against the small
        # reference using SmithWaterman, and require that every alignment
        # it reports has the same CIGAR string as the SAM record.
        # A truthy `verbose` additionally prints diagnostics per alignment.

        here = Path(__file__).parent

        genome_file = here / "data_for_tests/data_small/genome.chr22.5K.fa"
        genome_record = unlist1(list(from_fasta(genome_file)))
        reference = str(genome_record.seq)

        # Whichever *.sam file the glob happens to list last is used
        sam_files = list((here / "data_for_tests/data_small/").glob("*.sam"))
        in_file = sam_files.pop()

        max_reads = 2
        # Zipping with a range caps the number of reads that are processed
        for (read, __) in zip(from_sam(in_file), range(max_reads)):
            query = read.seq
            aligner = SmithWaterman()

            for alignment in aligner(ref=reference, query=query):
                if verbose:
                    print(alignment.cigar, ' vs ', read.cigar)
                    print(read.mapq, ' vs ', alignment.score)
                    top, middle, bottom = alignment.visualize(
                        ref=reference, query=query)
                    print(top, middle, bottom, sep='\n')
                    print(alignment.matching_subsegments(), ' vs ', read.cigar)

                self.assertEqual(
                    alignment.cigar,
                    read.cigar,
                    f'{alignment.cigar} is not equal to cigar from sam file {read.cigar}',
                )
Ejemplo n.º 3
0
    def test_on_data_big(self):
        # Every record of every gzipped FASTA file under data_root/"data"
        # must have the expected sequence length and, once the flanking
        # "N" characters are stripped, end with the expected bases.
        fasta_files = list((data_root / "data").glob("*.fa.gz"))
        assert fasta_files

        # Lazily walk all records of all files, one file after another
        genomes = (
            record
            for fasta_file in fasta_files
            for record in from_fasta(fasta_file)
        )

        for genome in genomes:
            self.assertEqual(len(genome.seq), 51304566)
            self.assertTrue(genome.seq.strip("N").endswith("CGGATT"))
Ejemplo n.º 4
0
 def test_reads_well(self):
     # Write a one-record FASTA file whose sequence spans two lines and
     # check that from_fasta reports the description without the leading
     # ">" and the sequence with both lines joined.
     desc, seq1, seq2 = "Hello", "ABC", "DEF"
     fasta_lines = [">" + desc, seq1, seq2]

     with NamedTemporaryFile(mode='w') as fn:
         fn.write("\n".join(fasta_lines) + "\n")
         # Flush so the content is on disk before the file is re-read by name
         fn.flush()

         record = first(from_fasta(fn.name))

         self.assertEqual(record.desc, desc)
         self.assertEqual(record.seq, seq1 + seq2)
Ejemplo n.º 5
0
    def test_data_small_vs_biopython(self):
        # Cross-check from_fasta against Biopython's FASTA parser on every
        # *.fa file under data_root/"data_small": same number of records,
        # and pairwise the same sequence and description.
        fasta_files = list((data_root / "data_small").glob("*.fa"))
        assert fasta_files

        from Bio import SeqIO

        for fasta_file in fasta_files:
            expected_records = list(SeqIO.parse(fasta_file, format='fasta'))
            actual_records = list(from_fasta(fasta_file))

            self.assertEqual(len(expected_records), len(actual_records))

            for (theirs, ours) in zip(expected_records, actual_records):
                # Guard against either parser returning an unexpected type
                self.assertIsInstance(theirs, SeqIO.SeqRecord)
                self.assertIsInstance(ours, Sequence)

                self.assertEqual(str(theirs.seq), ours.seq)
                self.assertEqual(theirs.description, ours.desc)
Ejemplo n.º 6
0
    def from_files(cls, *, fa, fq1, fq2):
        """
        Set up the mapping of paired reads from files.

        `fa` is the reference genome file,
        `fq1` and `fq2` are the FASTQ files.

        Creates an instance of AllTheKingsHorses and returns
        a holder class with two attributes:
            headers -- the result of its headers() member function,
            alignments -- the result of its map_paired(fq1, fq2).
        """

        genome = unlist1(from_fasta(fa))

        # The index file is looked up / created next to the genome file
        genome_index = GenomeIndex.read_or_make(path_to_genome=fa)

        horses = AllTheKingsHorses(
            genome_index=genome_index,
            sequence_aligner=SequenceAligner(),
            ref_genome=genome,
        )

        # Both attributes are evaluated right here, headers first
        class _:
            headers = horses.headers()
            alignments = horses.map_paired(fq1, fq2)

        return _
Ejemplo n.º 7
0
 def test_fails_when_many(self):
     # A FASTA file holding two records (">A" and ">B") must make
     # from_fasta fail with an AssertionError once it is consumed.
     fasta_lines = [">A", "N", ">B", "N"]

     with NamedTemporaryFile(mode='w') as fn:
         fn.write("\n".join(fasta_lines) + "\n")
         # Flush so the content is on disk before the file is re-read by name
         fn.flush()

         with self.assertRaises(AssertionError):
             # Exhaust the reader; the records themselves are not needed
             for __ in from_fasta(fn.name):
                 pass
Ejemplo n.º 8
0
 def read_or_make(cls, *, path_to_genome, ignored=None):
     """
     Construct an instance from the sequence in the FASTA file
     `path_to_genome`. No index file path is involved here:
     the second keyword argument is accepted but not used.
     """
     records = list(from_fasta(path_to_genome))
     # NOTE(review): unlist1 presumably unwraps a one-element list -- confirm
     genome = unlist1(records)
     return cls(genome.seq)