Example #1
def _map_reads(self,
               fwd_reads,
               rev_reads,
               out_prefix,
               required_flag=None,
               exclude_flag=None,
               sort_reads=False,
               mate_ref=None,
               no_map_contigs=None):
    if no_map_contigs is None:
        no_map_contigs = set()
    if self.verbose:
        print('    map reads', fwd_reads, rev_reads, sep='\t')
    reference = out_prefix + '.ref.fa'
    self.write_contigs_to_file(reference, do_not_write=no_map_contigs)
    mapping.map_reads(fwd_reads,
                      rev_reads,
                      reference,
                      out_prefix,
                      index_k=self.map_index_k,
                      index_s=self.map_index_s,
                      threads=self.threads,
                      max_insert=self.max_insert,
                      minid=self.map_minid,
                      verbose=self.verbose,
                      required_flag=required_flag,
                      sort=sort_reads,
                      exclude_flag=exclude_flag)
    if self.clean:
        os.unlink(reference)
        os.unlink(reference + '.fai')
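The required_flag and exclude_flag arguments are forwarded to mapping.map_reads, which presumably applies them as SAM flag filters when the BAM is written. A minimal sketch of that kind of filtering with samtools view (filter_bam and its arguments are illustrative, not IVA's actual API):

import subprocess

def filter_bam(in_bam, out_bam, required_flag=None, exclude_flag=None):
    '''Hypothetical sketch of SAM flag filtering via samtools view'''
    cmd = ['samtools', 'view', '-b']
    if required_flag is not None:
        cmd += ['-f', str(required_flag)]  # keep reads with ALL of these flag bits set
    if exclude_flag is not None:
        cmd += ['-F', str(exclude_flag)]   # drop reads with ANY of these flag bits set
    subprocess.check_call(cmd + ['-o', out_bam, in_bam])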
Example #2
def test_map_reads_with_flag(self):
    '''Test map_reads with required flag'''
    ref = os.path.join(data_dir, 'mapping_test.ref.trimmed.fa')
    reads_prefix = os.path.join(data_dir, 'mapping_test.reads')
    out_prefix = 'tmp.out'
    mapping.map_reads(reads_prefix + '_1.fastq', reads_prefix + '_2.fastq', ref, out_prefix, required_flag=12, verbose=3)
    expected = get_sam_columns(os.path.join(data_dir, 'mapping_test.smalt.out.flag12.bam'))
    got = get_sam_columns(out_prefix + '.bam')
    self.assertListEqual(expected, got)
    os.unlink(out_prefix + '.bam')
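required_flag=12 combines two standard SAM flag bits, so this test keeps only pairs where neither mate mapped:

READ_UNMAPPED = 0x4  # segment unmapped
MATE_UNMAPPED = 0x8  # next segment in the template unmapped
assert READ_UNMAPPED | MATE_UNMAPPED == 12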
Example #3
def test_map_reads(self):
    '''Test mapping reads'''
    ref = os.path.join(data_dir, 'mapping_test.ref.trimmed.fa')
    reads_prefix = os.path.join(data_dir, 'mapping_test.reads')
    out_prefix = 'tmp.out'
    mapping.map_reads(reads_prefix + '_1.fastq', reads_prefix + '_2.fastq', ref, out_prefix)
    expected = get_sam_columns(os.path.join(data_dir, 'mapping_test.smalt.out.bam'))
    got = get_sam_columns(out_prefix + '.bam')
    self.assertListEqual(expected, got)
    os.unlink(out_prefix + '.bam')
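get_sam_columns is a test helper that does not appear on this page. A minimal reconstruction, assuming it extracts the core alignment columns of every record for comparison (the exact columns compared are an assumption):

import pysam

def get_sam_columns(bamfile):
    '''Hypothetical helper: collect comparable columns from each BAM record'''
    columns = []
    with pysam.AlignmentFile(bamfile, 'rb') as f:
        for read in f.fetch(until_eof=True):
            columns.append((read.query_name, read.flag, read.reference_id,
                            read.reference_start, read.cigarstring))
    return columns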
Example #4
def test_map_reads_and_sort(self):
    '''Test mapping reads and sort BAM'''
    ref = os.path.join(data_dir, 'mapping_test.ref.trimmed.fa')
    reads_prefix = os.path.join(data_dir, 'mapping_test.reads')
    out_prefix = 'tmp.out'
    mapping.map_reads(reads_prefix + '_1.fastq', reads_prefix + '_2.fastq', ref, out_prefix, sort=True, verbose=3)
    expected = get_sam_columns(os.path.join(data_dir, 'mapping_test.smalt.out.sorted.bam'))
    got = get_sam_columns(out_prefix + '.bam')
    self.assertListEqual(expected, got)
    os.unlink(out_prefix + '.bam')
    os.unlink(out_prefix + '.bam.bai')
    os.unlink(out_prefix + '.unsorted.bam')
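The cleanup lines document the contract of sort=True: the wrapper evidently keeps the unsorted BAM alongside the coordinate-sorted, indexed one. The equivalent post-processing, expressed with pysam's samtools wrappers (a sketch, not necessarily how IVA does it internally):

import pysam

pysam.sort('-o', 'tmp.out.bam', 'tmp.out.unsorted.bam')  # coordinate-sort
pysam.index('tmp.out.bam')                               # writes tmp.out.bam.bai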
Example #5
File: kcount.py Project: satta/iva
def _kmc_to_kmer_counts(infile, number, kmers_to_ignore=None, contigs_to_check=None, verbose=0, threads=1):
    '''Makes a dict of the most common kmers from the kmer counts output file of kmc'''
    if kmers_to_ignore is None:
        kmers_to_ignore = set()
    counts = {}
    if os.path.getsize(infile) == 0:
        return counts
    tmpdir = tempfile.mkdtemp(prefix='tmp.common_kmers.', dir=os.getcwd())
    ref_seqs_file = os.path.join(tmpdir, 'ref.fa')
    counts_fasta_file = os.path.join(tmpdir, 'counts.fa')
    using_refs = _write_ref_seqs_to_be_checked(ref_seqs_file, kmers_to_ignore=kmers_to_ignore, contigs_to_check=contigs_to_check)

    if not using_refs:
        if verbose > 2:
            print('No existing kmers or contigs to check against. Using most common kmer for seed', flush=True)
        f = pyfastaq.utils.open_file_read(infile)
        for line in f:
            if len(counts) >= number:
                break
            try:
                kmer, count = line.rstrip().split()
                count = int(count)
            except Exception:
                raise Error('Error getting kmer info from this line:\n' + line)

            counts[kmer] = count
        pyfastaq.utils.close(f)
    else:
        if verbose > 2:
            print('Existing kmers or contigs to check against. Running mapping', flush=True)
        mapping_prefix = os.path.join(tmpdir, 'map')
        bam = mapping_prefix + '.bam'
        _counts_file_to_fasta(infile, counts_fasta_file)
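        # required_flag='0x4' keeps only unmapped kmers, i.e. kmers that did
        # not match any of the sequences being checked against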
        mapping.map_reads(counts_fasta_file, None, ref_seqs_file, mapping_prefix, minid=0.9, index_k=9, index_s=1, sort=False, verbose=verbose, required_flag='0x4', threads=threads)

        sam_reader = pysam.Samfile(bam, "rb")
        for sam in sam_reader.fetch(until_eof=True):
            if len(counts) >= number:
                break
            try:
                count = int(sam.qname.split('_')[1])
            except Exception:
                raise Error('Error getting count from sequence name in bam:\n' + sam.qname)

            nucleotides = common.decode(sam.seq)
            if nucleotides not in kmers_to_ignore:
                counts[nucleotides] = count
            elif verbose >= 4:
                print('Skipping seed already found:', nucleotides)
        sam_reader.close()

    shutil.rmtree(tmpdir)
    return counts
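_counts_file_to_fasta and _write_ref_seqs_to_be_checked are module helpers not shown on this page. Judging by the sam.qname.split('_')[1] lookup above, the FASTA read names must carry the kmer count; a minimal sketch of what _counts_file_to_fasta might look like under that assumption:

import pyfastaq.utils

def _counts_file_to_fasta(counts_file, outfile):
    '''Hypothetical sketch: write each 'kmer<TAB>count' line as >index_count'''
    f_in = pyfastaq.utils.open_file_read(counts_file)
    f_out = pyfastaq.utils.open_file_write(outfile)
    for i, line in enumerate(f_in):
        kmer, count = line.rstrip().split()
        print('>' + str(i) + '_' + count, kmer, sep='\n', file=f_out)
    pyfastaq.utils.close(f_in)
    pyfastaq.utils.close(f_out)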
Example #6
def _trim_ends(fasta_in,
               fasta_out,
               to_trim,
               min_length=100,
               min_dist_to_end=25,
               window_length=10,
               min_pc=90):
    '''Trim sequences off contig ends.'''
    tmpdir = tempfile.mkdtemp(prefix='tmp.adapter_trim.', dir=os.getcwd())
    tmp_prefix = os.path.join(tmpdir, 'out')
    sorted_bam = tmp_prefix + '.bam'
    mapping.map_reads(to_trim,
                      None,
                      fasta_in,
                      tmp_prefix,
                      index_k=9,
                      index_s=1,
                      threads=1,
                      minid=0.75,
                      sort=True,
                      extra_smalt_map_ops='-d -1 -m 10')

    f_out = pyfastaq.utils.open_file_write(fasta_out)
    seq_reader = pyfastaq.sequences.file_reader(fasta_in)
    for seq in seq_reader:
        coverage = mapping.get_bam_region_coverage(sorted_bam,
                                                   seq.id,
                                                   len(seq),
                                                   both_strands=True)
        good_coords = _coverage_to_trimmed_coords(
            coverage,
            min_dist_to_end=min_dist_to_end,
            window_length=window_length,
            min_pc=min_pc)
        if good_coords is None:
            continue

        seq.seq = seq.seq[good_coords[0]:good_coords[1] + 1]
        if len(seq) >= min_length:
            print(seq, file=f_out)

    pyfastaq.utils.close(f_out)
    shutil.rmtree(tmpdir)
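mapping.get_bam_region_coverage is also not shown here; the trimming logic only needs per-base depth for one contig. A rough pysam equivalent, assuming it returns a list of depths (the both_strands handling is omitted in this sketch):

import pysam

def get_bam_region_coverage(bam, seqname, seq_length):
    '''Hypothetical sketch: per-base read depth across one reference sequence'''
    coverage = [0] * seq_length
    with pysam.AlignmentFile(bam, 'rb') as f:
        for column in f.pileup(seqname, 0, seq_length, truncate=True):
            coverage[column.reference_pos] = column.nsegments
    return coverage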
Example #7
    def process(self):
        self.tmpdir = tempfile.mkdtemp(prefix='tmp.process_seeds.',
                                       dir=os.getcwd())
        tmp_prefix = os.path.join(self.tmpdir, 'out')
        mapping.map_reads(self.reads1,
                          self.reads2,
                          self.seeds_fasta,
                          tmp_prefix,
                          index_k=self.index_k,
                          index_s=self.index_s,
                          threads=self.threads,
                          max_insert=self.max_insert,
                          minid=self.minid,
                          sort=True)
        self.bam_file = tmp_prefix + '.bam'
        threads = min(8, self.threads)  # cap workers to stop peak memory going too high

        if self.verbose:
            print('Processing seeds with', threads, 'threads:',
                  list(self.original_seeds.keys()))

        pool = multiprocessing.Pool(threads)
        pool.map(self._make_new_seed, list(self.original_seeds.keys()))
        pool.close()
        pool.join()
        if self.verbose:
            print('... finished processing seeds')

        new_seeds = {}
        for seed_name in self.original_seeds:
            fname = tmp_prefix + '.' + seed_name + '.fa'
            if os.path.exists(fname):
                pyfastaq.tasks.file_to_dict(fname, new_seeds)

        if len(new_seeds) == 0:
            raise Error('Error! did not make any new seeds. Cannot continue')
        f = pyfastaq.utils.open_file_write(self.outfile)
        for seq in new_seeds.values():
            print(seq, file=f)
        pyfastaq.utils.close(f)
        shutil.rmtree(self.tmpdir)
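Note that per-seed results are gathered from '<prefix>.<seed>.fa' files rather than from pool.map return values, which keeps large sequence objects out of the worker-to-parent pickling path. A hypothetical call site for this method (class name and constructor arguments are assumptions, not IVA's documented API):

processor = seed_processor.SeedProcessor('seeds.fa', 'reads_1.fq', 'reads_2.fq',
                                         'new_seeds.fa', index_k=13, index_s=3,
                                         threads=4, max_insert=800, minid=0.5)
processor.process()  # writes the extended seeds to new_seeds.fa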