import glob
import logging
import os
import shutil
import tempfile

import numpy as np
import pandas as pd
import pysam
from tqdm import tqdm

import bioinf_utils as butil  # project-local helpers (read_fasta, reverse_complement)


def reads_train_test_split(ref_root, test_size, ref_path):
    reference_len = len(butil.read_fasta(ref_path))
    train_size = 1 - test_size

    files = glob.glob(os.path.join(ref_root, '*.ref'))
    train_path = os.path.join(ref_root, 'train.txt')
    test_path = os.path.join(ref_root, 'test.txt')

    with open(train_path, 'w') as trainf, open(test_path, 'w') as testf:
        for file_path in tqdm(files):
            basename = os.path.basename(file_path)
            name, ext = os.path.splitext(basename)

            with open(file_path, 'r') as fin:
                next(fin)  # skip header
                # line 2 format: abs_start_pos\tstart_position\tlength
                line = next(fin)
                start, rel_start, length = [int(x) for x in line.split()]
                end = start + length

                if end < reference_len * train_size:
                    # train data
                    trainf.write("%s\t%d\n" % (name, length))
                elif start > reference_len * train_size and end <= reference_len:
                    # test data: starts after the split point and does not
                    # wrap around the reference end in case of circular alignment
                    testf.write("%s\t%d\n" % (name, length))
                else:
                    logging.info('Skipping ref, overlaps train and test')
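# Minimal usage sketch for reads_train_test_split; the paths below are
# hypothetical, not taken from the original source. With test_size=0.2,
# reads whose alignment ends inside the first 80% of the reference are
# listed in train.txt, and reads starting after that point in test.txt:
#
#   reads_train_test_split(ref_root='data/refs', test_size=0.2,
#                          ref_path='data/reference.fasta')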
def extend_cigars_in_sam(sam_in, ref_path, fastx_path, sam_out=None):
    tmp_dir = None
    tmp_sam_out = sam_out
    inplace = sam_out is None
    if inplace:
        # in-place change via a temporary file
        tmp_dir = tempfile.mkdtemp()
        tmp_sam_out = os.path.join(tmp_dir, 'tmp.sam')

    ref = butil.read_fasta(ref_path)

    reads = {}
    with pysam.FastxFile(fastx_path) as fh:
        for r in fh:
            reads[r.name] = r

    with pysam.AlignmentFile(sam_in, "r") as in_sam, \
            pysam.AlignmentFile(tmp_sam_out, "w", template=in_sam) as out_sam:
        for x in tqdm(in_sam.fetch(), unit='reads'):
            if x.query_name not in reads:
                logging.warning("read %s in sam not found in .fastx",
                                x.query_name)
                continue
            if x.is_unmapped:
                logging.warning("read %s is unmapped, copied to out sam as is",
                                x.query_name)
                out_sam.write(x)
                continue

            read_seq = reads[x.query_name].sequence
            ref_seq = ref[x.reference_start:x.reference_end]
            cigar_pairs = x.cigartuples

            if x.is_reverse:
                read_seq = butil.reverse_complement(read_seq)

            x.cigarstring = extend_cigar(read_seq, ref_seq, cigar_pairs)
            out_sam.write(x)

    if inplace:
        # replace the input file and clear tmp files
        shutil.move(tmp_sam_out, sam_in)
        shutil.rmtree(tmp_dir)
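# Hedged usage sketch for extend_cigars_in_sam; the file names below are
# assumptions for illustration. With sam_out omitted the SAM is rewritten
# in place:
#
#   extend_cigars_in_sam('alignments.sam', 'reference.fasta', 'reads.fastq')
#
#   # or write to a separate file instead of editing in place:
#   extend_cigars_in_sam('alignments.sam', 'reference.fasta', 'reads.fastq',
#                        sam_out='alignments.extended.sam')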
def process_mpileup(name, alignments_path, reference_path, mpileup_path,
                    coverage_threshold, output_prefix):

    def _nlines(path):
        with open(path, 'r') as f:
            n_lines = sum(1 for _ in f)
        return n_lines

    n_lines = _nlines(mpileup_path)

    with open(mpileup_path, 'r') as fp:
        # counts holds, in order:
        # snp_count,
        # insertion_count,
        # deletion_count,
        # num_undercovered_bases,
        # num_called_bases,
        # num_correct_bases,
        # coverage_sum
        counts = np.zeros((7, ))

        fp_variant = None
        fp_vcf = None
        if output_prefix:
            os.makedirs(output_prefix, exist_ok=True)

            variant_file = os.path.join(
                output_prefix, 'cov_%d.variant.csv' % coverage_threshold)
            fp_variant = open(variant_file, 'w')

            vcf_file = os.path.join(
                output_prefix, 'cov_%d.variant.vcf' % coverage_threshold)
            fp_vcf = open(vcf_file, 'w')
            fp_vcf.write('##fileformat=VCFv4.0\n')
            fp_vcf.write('##fileDate=20150409\n')
            fp_vcf.write('##source=none\n')
            fp_vcf.write('##reference=%s\n' % reference_path)
            fp_vcf.write('##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth">\n')
            fp_vcf.write('##INFO=<ID=TYPE,Number=A,Type=String,Description="Type of each allele (snp, ins, del, mnp, complex)">\n')
            fp_vcf.write('##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">\n')
            fp_vcf.write('##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position">\n')
            fp_vcf.write('##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">\n')
            fp_vcf.write('##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">\n')
            fp_vcf.write('##INFO=<ID=CONSVAR,Number=0,Type=Flag,Description="Indicates that the variant is a consensus variant (as opposed to a low frequency variant).">\n')
            fp_vcf.write('##INFO=<ID=HRUN,Number=1,Type=Integer,Description="Homopolymer length to the right of report indel position">\n')
            fp_vcf.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n')
            fp_vcf.flush()

        i = 0
        j = 0
        num_bases_to_skip = 0
        for line in tqdm(fp, total=n_lines, desc="processing_mpileup"):
            if num_bases_to_skip > 0:
                num_bases_to_skip -= 1
                continue

            num_bases_to_skip, new_counts = process_mpileup_line(
                line, coverage_threshold, fp_variant, fp_vcf)
            counts += new_counts
            i += num_bases_to_skip
            i += 1
            j += 1

    if fp_variant:
        fp_variant.close()
    if fp_vcf:
        fp_vcf.close()

    # transform coverage sum into average coverage
    counts[-1] /= (i + 1)

    fields = [
        'alignments_file', 'mpileup_file', 'coverage_threshold', 'snp_count',
        'insertion_count', 'deletion_count', 'num_undercovered_bases',
        'num_called_bases', 'num_correct_bases', 'average_coverage'
    ]
    values = [alignments_path, mpileup_path, coverage_threshold] + counts.tolist()
    report = pd.DataFrame([values], columns=fields, index=[name])
    report['num_called_bases'] = (report.num_correct_bases + report.snp_count +
                                  report.insertion_count)

    reference_len = len(butil.read_fasta(reference_path))

    for col in filter(lambda c: c.endswith('_count'), report.columns):
        new_col = col.replace('count', 'rate')
        report[new_col] = 100 * report[col] / report.num_called_bases

    report['correct_rate'] = (
        100 * report.num_correct_bases / report.num_called_bases)
    report['identity_percentage'] = (
        100 * report.num_correct_bases / reference_len)

    if output_prefix:
        # the summary is written with to_csv, so use a .csv extension
        summary_file = os.path.join(
            output_prefix, 'cov_%d.sum.csv' % coverage_threshold)
        report.to_csv(summary_file, sep=';', index=False)
    return report
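# Hedged usage sketch for process_mpileup; the paths and the samtools command
# below are assumptions, not taken from the original source. The pileup would
# typically be produced first, e.g.:
#
#   samtools mpileup -f reference.fasta alignments.bam > alignments.mpileup
#
# and then summarized per coverage threshold:
#
#   report = process_mpileup('sample1', 'alignments.bam', 'reference.fasta',
#                            'alignments.mpileup', coverage_threshold=10,
#                            output_prefix='out/sample1')
#   print(report[['snp_rate', 'correct_rate', 'identity_percentage']])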