def _extend_contigs_with_bam(self, bam_in, out_prefix=None, output_all_useful_reads=False):
    '''Extends every contig in self.contigs using the reads in a BAM file.

    Records are consumed two at a time and classified as a pair with
    mapping.get_pair_type (assumes the two mates of a pair are adjacent
    in the BAM - TODO confirm against the mapping step). Reads classified
    as CAN_EXTEND_LEFT / CAN_EXTEND_RIGHT contribute their overhanging
    bases as left/right kmers of the contig they map to.

    bam_in: name of the BAM file to read.
    out_prefix: if not None, reads whose type is in the "keep" set are
        written to out_prefix_1.fa (first of pair) and out_prefix_2.fa
        (everything else).
    output_all_useful_reads: if True, pairs with both reads unmapped
        are also written to the FASTA files.

    Returns the total number of bases added over all contigs. Also appends
    [new length, bases added left, bases added right] to
    self.contig_lengths for each contig.'''
    fa_out1 = None
    fa_out2 = None
    sam_reader = None

    # Everything that holds an open handle lives inside the try so that the
    # BAM reader and the FASTA outputs are released even if a read fails.
    try:
        if out_prefix is not None:
            fa_out1 = pyfastaq.utils.open_file_write(out_prefix + '_1.fa')
            fa_out2 = pyfastaq.utils.open_file_write(out_prefix + '_2.fa')

        keep_read_types = {mapping.CAN_EXTEND_LEFT, mapping.CAN_EXTEND_RIGHT, mapping.KEEP}
        if output_all_useful_reads:
            keep_read_types.add(mapping.BOTH_UNMAPPED)

        sam_reader = pysam.Samfile(bam_in, "rb")
        previous_sam = None

        for current_sam in sam_reader.fetch(until_eof=True):
            # Hold on to the first record of each pair until its partner arrives
            if previous_sam is None:
                previous_sam = current_sam
                continue

            ref_lengths = self._get_ref_length_sam_pair(sam_reader, previous_sam, current_sam)
            previous_type, current_type = mapping.get_pair_type(previous_sam, current_sam, ref_lengths, self.max_insert, min_clip=self.min_clip)

            for sam, sam_type in ((previous_sam, previous_type), (current_sam, current_type)):
                if sam_type == mapping.CAN_EXTEND_LEFT:
                    name = mapping.get_ref_name(sam, sam_reader)
                    clipped = mapping.soft_clipped(sam)[0]
                    # bases before the aligned part hang off the left end
                    self.contigs[name].add_left_kmer(common.decode(sam.seq[:clipped]))
                elif sam_type == mapping.CAN_EXTEND_RIGHT:
                    name = mapping.get_ref_name(sam, sam_reader)
                    # bases after the aligned part hang off the right end
                    self.contigs[name].add_right_kmer(common.decode(sam.seq[sam.qend:]))

                if out_prefix is not None and sam_type in keep_read_types:
                    if sam.is_read1:
                        print(mapping.sam_to_fasta(sam), file=fa_out1)
                    else:
                        print(mapping.sam_to_fasta(sam), file=fa_out2)

            # pair done: the next record starts a new pair
            previous_sam = None
    finally:
        if sam_reader is not None:
            sam_reader.close()
        for handle in (fa_out1, fa_out2):
            if handle is not None:
                pyfastaq.utils.close(handle)

    total_bases_added = 0

    for ctg in self.contigs:
        left_length, right_length = self.contigs[ctg].extend(self.ext_min_cov, self.ext_min_ratio, self.ext_bases)
        if self.verbose:
            print(' extend contig ' + ctg, 'new_length:' + str(len(self.contigs[ctg])), 'added_left:' + str(left_length), 'added_right:' + str(right_length), sep='\t')
        self.contig_lengths[ctg].append([len(self.contigs[ctg]), left_length, right_length])
        total_bases_added += left_length + right_length

    return total_bases_added
def find_incorrect_ref_bases(bam, ref_fasta):
    '''Finds reference positions where the read pileup disagrees with the reference.

    Runs "samtools mpileup" on the BAM file and, for each reported position,
    compares the consensus of the pileup (as decided by
    consensus_base_both_strands with ratio=0.5) with the base in ref_fasta.

    bam: name of the BAM file. Must exist.
    ref_fasta: name of the FASTA file of reference sequences. Must exist.

    Returns a dict: reference name -> list of tuples
    (0-based position, reference base, consensus base). Only references
    with at least one disagreement are present.'''
    # Function-scope import so this block does not depend on the file header
    import shlex

    assert os.path.exists(bam)
    assert os.path.exists(ref_fasta)

    forward_keys = set(['A', 'C', 'G', 'T', 'N'])
    reverse_keys = set(['a', 'c', 'g', 't', 'n'])
    ref_seqs = {}
    bad_bases = {}
    pyfastaq.tasks.file_to_dict(ref_fasta, ref_seqs)

    # The command goes through the shell (for the pipe into cut), so the
    # file name must be quoted or names with spaces etc. break the command.
    mpileup_cmd = 'samtools mpileup ' + shlex.quote(bam) + ' | cut -f 1,2,5'
    mpileup_out = common.decode(subprocess.Popen(mpileup_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).communicate()[0]).split('\n')[:-1]

    for line in mpileup_out:
        # sometimes mpileup has an empty bases column, so skip those.
        # Unpacking the wrong number of fields raises ValueError.
        try:
            refname, position, pileup = line.rstrip().split()
        except ValueError:
            continue

        assert refname in ref_seqs
        position = int(position) - 1  # mpileup positions are 1-based
        pileup = strip_mpileup_coverage_string(pileup)
        counts = collections.Counter(pileup)
        consensus = consensus_base_both_strands(counts, forward_keys, reverse_keys, ratio=0.5)
        ref_base = ref_seqs[refname][position]

        if consensus not in [None, ref_base]:
            bad_bases.setdefault(refname, []).append((position, ref_base, consensus))

    return bad_bases
def get_version(prog, must_be_in_path=True):
    '''Returns the version of an external program, as a string.

    prog: name of the program. Must be a key of prog_to_version_cmd, which
        supplies the command to run and a compiled regular expression whose
        first group captures the version.
    must_be_in_path: if True, raise Error when the program is not found in
        the path. If False, return 'UNKNOWN - not in path' instead.

    Returns the first regex match found in the lines of stdout followed by
    the lines of stderr. If no line matches, returns a string starting with
    'UNKNOWN' that describes the command and the regular expression tried.'''
    assert prog in prog_to_version_cmd

    if not is_in_path(prog):
        if must_be_in_path:
            raise Error('Error getting version of ' + prog + ' - not found in path.')
        return 'UNKNOWN - not in path'

    cmd, regex = prog_to_version_cmd[prog]
    stdout, stderr = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()

    # splitlines() keeps the final line even when the program does not end
    # its output with a newline. Dropping the last element of split('\n')
    # would lose the version line in that case.
    output_lines = common.decode(stdout).splitlines() + common.decode(stderr).splitlines()

    for line in output_lines:
        hits = regex.search(line)
        if hits:
            return hits.group(1)

    return 'UNKNOWN ...\n I tried running this to get the version: "' + cmd + '"\n and the output didn\'t match this regular expression: "' + regex.pattern + '"'
def get_version(prog, must_be_in_path=True):
    '''Returns the version of an external program, as a string.

    prog: name of the program. Must be a key of prog_to_version_cmd, which
        supplies the command to run and a compiled regular expression whose
        first group captures the version.
    must_be_in_path: if True, raise Error when the program is not found in
        the path. If False, return 'UNKNOWN - not in path' instead.

    Returns the first regex match found in the lines of stdout followed by
    the lines of stderr. If no line matches, returns a string starting with
    'UNKNOWN' that describes the command and the regular expression tried.'''
    assert prog in prog_to_version_cmd

    if not is_in_path(prog):
        if must_be_in_path:
            raise Error('Error getting version of ' + prog + ' - not found in path.')
        return 'UNKNOWN - not in path'

    cmd, regex = prog_to_version_cmd[prog]
    stdout, stderr = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()

    # splitlines() keeps the final line even when the program does not end
    # its output with a newline. Dropping the last element of split('\n')
    # would lose the version line in that case.
    output_lines = common.decode(stdout).splitlines() + common.decode(stderr).splitlines()

    for line in output_lines:
        hits = regex.search(line)
        if hits:
            return hits.group(1)

    return 'UNKNOWN ...\n I tried running this to get the version: "' + cmd + '"\n and the output didn\'t match this regular expression: "' + regex.pattern + '"'
def _kmc_to_kmer_counts(infile, number, kmers_to_ignore=None, contigs_to_check=None, verbose=0, threads=1): '''Makes a dict of the most common kmers from the kmer counts output file of kmc''' counts = {} if os.path.getsize(infile) == 0: return counts tmpdir = tempfile.mkdtemp(prefix='tmp.common_kmers.', dir=os.getcwd()) ref_seqs_file = os.path.join(tmpdir, 'ref.fa') counts_fasta_file = os.path.join(tmpdir, 'counts.fa') using_refs = _write_ref_seqs_to_be_checked(ref_seqs_file, kmers_to_ignore=kmers_to_ignore, contigs_to_check=contigs_to_check) if not using_refs: if verbose > 2: print('No existing kmers or contigs to check against. Using most common kmer for seed', flush=True) f = pyfastaq.utils.open_file_read(infile) for line in f: if len(counts) >= number: break try: kmer, count = line.rstrip().split() count = int(count) except: raise Error('Error getting kmer info from this line:\n' + line) counts[kmer] = count pyfastaq.utils.close(f) else: if verbose > 2: print('Existing kmers or contigs to check against. Running mapping', flush=True) mapping_prefix = os.path.join(tmpdir, 'map') bam = mapping_prefix + '.bam' _counts_file_to_fasta(infile, counts_fasta_file) mapping.map_reads(counts_fasta_file, None, ref_seqs_file, mapping_prefix, minid=0.9, index_k=9, index_s=1, sort=False, verbose=verbose, required_flag='0x4', threads=threads) sam_reader = pysam.Samfile(bam, "rb") for sam in sam_reader.fetch(until_eof=True): if len(counts) >= number: break try: count = sam.qname.split('_')[1] except: raise Error('Error getting count from sequence name in bam:\n' + sam.qname) nucleotides = common.decode(sam.seq) if nucleotides not in kmers_to_ignore: counts[nucleotides] = count elif verbose >= 4: print('Skipping seed already found:', nucleotides) sam_reader.close() shutil.rmtree(tmpdir) return counts
def sam_to_fasta(s):
    '''Converts a SAM record into a pyfastaq Fasta object.

    s: the SAM record. Its name, sequence, read1/read2 flags and
        reverse-strand flag are used.

    The sequence name is the read name with '/1' or '/2' appended, depending
    on which of the pair the read is. If the record is flagged as reverse
    strand, the sequence is reverse complemented.

    Returns the Fasta object.
    Raises Error if the record is flagged as neither first nor second of a
    pair.'''
    name = s.qname

    if s.is_read1:
        name += '/1'
    elif s.is_read2:
        name += '/2'
    else:
        # Build one message string: passing several arguments to the
        # exception makes the message render as a tuple.
        raise Error('Read ' + name + ' must be first or second of pair according to flag. Cannot continue')

    seq = pyfastaq.sequences.Fasta(name, common.decode(s.seq))

    if s.is_reverse:
        seq.revcomp()

    return seq
def get_bam_region_coverage(bam, seqname, seq_length, rev=False, verbose=0, both_strands=False):
    '''Gets the read depth at every position of one sequence in a BAM file.

    bam: name of the BAM file. It and its index (bam + '.bai') must exist.
    seqname: name of the sequence, passed to "samtools mpileup -r".
    seq_length: length of the sequence; the returned list has this length.
    rev: if True, "--rf 0x10" is passed to mpileup, otherwise "--ff 0x10"
        (0x10 is the SAM reverse-strand flag). Ignored when both_strands
        is True.
    verbose: the mpileup command is printed when this is >= 2.
    both_strands: if True, no strand flag is passed to mpileup.

    Returns a list of depths, where element i is the depth at 1-based
    position i + 1.'''
    # Function-scope import so this block does not depend on the file header
    import shlex

    assert os.path.exists(bam)
    assert os.path.exists(bam + '.bai')

    # mpileup only reports positions of non-zero coverage, so can't just
    # take its output. Need to add in the zero coverage bases
    cov = [0] * seq_length

    if both_strands:
        flags = ''
    elif rev:
        flags = '--rf 0x10'
    else:
        flags = '--ff 0x10'

    # The command is run by the shell (for the pipe into cut), so quote the
    # sequence and file names: names containing e.g. '|' or spaces would
    # otherwise be interpreted by the shell.
    mpileup_cmd = 'samtools mpileup -r ' + shlex.quote(seqname) + ' ' + flags + ' ' + shlex.quote(bam) + ' | cut -f 2,4'

    if verbose >= 2:
        print(' get_bam_region_coverage:', mpileup_cmd)

    mpileup_out = common.decode(subprocess.Popen(mpileup_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).communicate()[0]).split('\n')[:-1]

    for line in mpileup_out:
        # columns kept by cut: 1-based position, depth
        pos, depth = [int(field) for field in line.rstrip().split()]
        cov[pos - 1] = depth

    return cov