def frequencies(freq_dstr, num_haplotypes, ratio=0.75, infile=None):
    """Compute the expected haplotype frequencies.

    Parameters
    ----------
    freq_dstr : str
        Frequency distribution: ``"unif"`` (all haplotypes equally
        abundant), ``"geom"`` (geometric series with common ratio `ratio`,
        normalised to sum to one) or ``"dirichlet"`` (frequencies are read
        from `infile`).
    num_haplotypes : int
        Number of haplotypes.
    ratio : float, optional
        Common ratio of the geometric series; only used for ``"geom"``.
    infile : str, optional
        File from which the frequencies are read with ``read_fasta``; only
        used (and then required) for ``"dirichlet"``.

    Returns
    -------
    numpy.ndarray
        Array with the haplotype frequencies.

    Raises
    ------
    IOError
        If `freq_dstr` is ``"dirichlet"`` and `infile` is None.
    ValueError
        If `freq_dstr` is not one of the supported distributions.
    """
    if freq_dstr == "unif":
        haplotype_freqs = np.repeat(1 / num_haplotypes, num_haplotypes)
    elif freq_dstr == "geom":
        haplotype_freqs = np.asarray(
            [ratio ** (i + 1) for i in range(num_haplotypes)])
        # Normalise so that the frequencies add up to one
        haplotype_freqs = haplotype_freqs / np.sum(haplotype_freqs)
    elif freq_dstr == "dirichlet":
        # Read haplotype frequencies from file
        if infile is None:
            raise IOError(
                "Input file containing haplotype frequencies is expected")
        _ids, haplotype_freqs = read_fasta(infile)
        haplotype_freqs = np.asarray(haplotype_freqs, dtype=float)
    else:
        # Previously an unknown distribution fell through every branch and
        # failed with an UnboundLocalError at the return statement.
        raise ValueError(
            f"Unsupported frequency distribution: {freq_dstr!r} "
            "(expected 'unif', 'geom' or 'dirichlet')")

    return haplotype_freqs
def _mutate_master(master_seq, num_haplotypes, args, seed):
    """Generate `num_haplotypes` sequences by mutating `master_seq`."""
    if args.verbose:
        print_params(args.mutation_rate, args.deletion_rate,
                     args.insertion_rate)
    # Each haplotype is mutated with its own seed (seed + i)
    return [
        mutate(master_seq, args.mutation_rate, args.deletion_rate,
               args.insertion_rate, noFR=args.no_FR,
               del_len=args.deletion_length, seed=seed + i,
               verbose=args.verbose)
        for i in range(num_haplotypes)
    ]


def _sample_tree_haplotypes(root_seq, num_haplotypes, args, seed):
    """Sample `num_haplotypes` sequences from the leaves of a perfect tree
    rooted at `root_seq`."""
    # Max depth of the perfect tree
    max_depth = np.ceil(np.log2(num_haplotypes))
    root = Tree(root_seq, max_depth)
    # Populate tree
    leaf(root, args.mutation_rate, args.deletion_rate, args.insertion_rate,
         args.no_FR, args.deletion_length, seed, verbose=args.verbose)
    sequences = root.get_leaves()
    np.random.seed(seed)
    # The upper bound of np.random.randint is exclusive: use len(sequences)
    # so that the last leaf can be drawn and a single-leaf tree does not
    # raise "low >= high".
    idxs = np.random.randint(0, len(sequences), size=num_haplotypes)
    return [sequences[idx] for idx in idxs]


def main():
    """Simulate a master sequence, haplotypes and/or sequencing reads.

    Depending on ``args.output``:

    * ``"master"``: only a master sequence is simulated.
    * ``"haplotypes"`` / ``"all"``: haplotype sequences are generated
      (from scratch or from ``args.haplotype_seqs``) and written to
      ``outdir_haps``.
    * ``"reads"`` / ``"all"``: reads are simulated from the haplotype
      files found in ``outdir_haps`` and written to ``outdir_reads`` as
      ``simreads_R1.fastq`` (and ``simreads_R2.fastq`` if paired).
    """
    args = parse_args()

    outdir_haps = (args.outdir_haps
                   if args.outdir_haps is not None else os.getcwd())
    outdir_reads = (args.outdir_reads
                    if args.outdir_reads is not None else os.getcwd())
    seed = (args.seed if args.seed is not None else np.random.randint(
        1, 1000, size=1)[0])
    num_haplotypes = args.num_haplotypes

    if args.output == "master":
        # Simulate master sequence
        haplotype_seq = sim_master(length=args.genome_length,
                                   outdir_haps=outdir_haps, seed=seed)

    elif args.output == "haplotypes" or args.output == "all":
        # Simulate true underlying haplotypes
        if args.haplotype_seqs is None:
            # Strategy 1 - Simulate haplotypes from scratch
            # Simulate master sequence and save record to FASTA file
            haplotype_seq = sim_master(length=args.genome_length,
                                       outdir_haps=outdir_haps, seed=seed)

            if args.use_master:
                # Strategy 1.a - Generate haplotype sequences from a master
                # haplotype
                haplotype_seqs = _mutate_master(haplotype_seq,
                                                num_haplotypes, args, seed)
            elif args.tree_like:
                # Strategy 1.b - Sample genotypes from the leaves of a
                # perfect tree
                haplotype_seqs = _sample_tree_haplotypes(
                    haplotype_seq, num_haplotypes, args, seed)
            else:
                # Strategy 1.c - Generate num_haplotypes sequences randomly,
                # each of length args.genome_length
                # FIXME: Replace hard coded weights
                weights = (('A', 0.245), ('C', 0.245), ('G', 0.245),
                           ('T', 0.245), ('-', 0.02))
                # NOTE(review): the same seed is passed for every haplotype;
                # confirm sim_haplotypes does not re-seed the generator with
                # it, otherwise all sequences would be identical.
                haplotype_seqs = [
                    sim_haplotypes(length=args.genome_length,
                                   weights=weights, s=seed)
                    for _x in range(num_haplotypes)
                ]

            # Save each haplotype sequence to a separate file
            write_fasta(haplotype_seqs, outdir_haps)
        else:
            # Strategy 2 - Simulate haplotypes from a file providing the
            # underlying haplotype(s)
            if args.use_master:
                # Strategy 2.a - From the first sequence provided generate
                # num_haplotypes sequences by mutating the original sequence.
                _header, haplotype_seq = read_fasta(args.haplotype_seqs)
                haplotype_seqs = _mutate_master(haplotype_seq[0],
                                                num_haplotypes, args, seed)
                write_fasta(haplotype_seqs, outdir_haps)
            elif args.tree_like:
                # Strategy 2.b - Sample genotypes from the leaves of a
                # perfect tree rooted at the first sequence provided
                _header, haplotype_seq = read_fasta(args.haplotype_seqs)
                haplotype_seqs = _sample_tree_haplotypes(
                    haplotype_seq[0], num_haplotypes, args, seed)
                write_fasta(haplotype_seqs, outdir_haps)
            else:
                # Strategy 2.c - Split haplotypes into different files
                # (implicitly, it is assumed the number of haplotypes
                # sequences is larger than 1)
                idx = 0
                aux = []
                output_file = ''
                haplotype_id = ''
                with open(args.haplotype_seqs, 'r') as infile:
                    # The following is needed because the number of
                    # characters per line in a FASTA file often do not
                    # exceed a certain number
                    for line in infile:
                        record = line.rstrip()
                        if record and record[0] == '>':
                            if idx > 0:
                                # A new header closes the previous record:
                                # flush it to its own file
                                with open(output_file, 'w') as outfile:
                                    outfile.write(haplotype_id + '\n')
                                    outfile.write(''.join(aux))
                                aux = []
                            haplotype_id = record
                            output_file = os.path.join(
                                outdir_haps, f"haplotype{idx}.fasta")
                            idx += 1
                        else:
                            aux.append(record)

                # Flush the last record
                output_file = os.path.join(outdir_haps,
                                           f"haplotype{idx - 1}.fasta")
                with open(output_file, 'w') as outfile:
                    outfile.writelines([haplotype_id, '\n', ''.join(aux)])

                # The number of haplotypes is given by the input file
                num_haplotypes = idx
                haplotype_file = os.path.join(outdir_haps,
                                              "haplotypes.fasta")
                shutil.copyfile(args.haplotype_seqs, haplotype_file)

    if args.output == "reads" or args.output == "all":
        fragment_mean, fragment_sd = [
            int(x) for x in args.fragment_size.split(",")
        ]
        coverage = [int(x) for x in args.coverage.split(",")]
        if len(coverage) > 1:
            assert len(coverage) == num_haplotypes, (
                "More than one value for coverage specified, but it does not "
                "coincide with the number of haplotypes")

        if args.freq_dstr == 'unif':
            if len(coverage) > 1:
                print("More than one value for coverage specified, only first "
                      "value is used")
            haplotype_file = os.path.join(outdir_haps, "haplotypes.fasta")
            print(
                f"Reading file containing sequences of underlying haplotypes "
                f"from {haplotype_file}")
            tmp_file = os.path.join(outdir_haps, "tmp.fasta")
            shutil.copyfile(haplotype_file, tmp_file)
            # remove true deletions (if present) before generating reads
            sed('s/-//g', tmp_file, verbose=args.verbose)
            outprefix = os.path.join(outdir_reads, "simreads_R")
            sim_reads(args.art, haplotype_seq=tmp_file, coverage=coverage[0],
                      read_len=args.read_length, fragment_mean=fragment_mean,
                      fragment_sd=fragment_sd, outprefix=outprefix,
                      paired=args.paired, highQ=args.quality,
                      num_reads=args.num_reads, seed=seed,
                      verbose=args.verbose)
            os.remove(tmp_file)

            if args.paired:
                # Make headers compatible with output from Illumina platforms
                # (expected by ngshmmalign)
                sed(r'/^@.*-[0-9]*\/1$/ s/\/1$/ 1:N:0:5/',
                    outprefix + '1.fq', verbose=args.verbose)
                sed(r'/^@.*-[0-9]*\/2$/ s/\/2$/ 2:N:0:5/',
                    outprefix + '2.fq', verbose=args.verbose)
                # Rename output files
                os.rename(outprefix + '1.fq', outprefix + '1.fastq')
                os.rename(outprefix + '2.fq', outprefix + '2.fastq')
            else:
                # Rename output file
                os.rename(outprefix + '.fq', outprefix + '1.fastq')

        elif args.freq_dstr in ('geom', 'dirichlet', 'cust'):
            if args.freq_dstr == 'geom':
                if len(coverage) > 1:
                    print(
                        "More than one value for coverage specified, only first "
                        "value is used")
                freqs = [
                    args.geom_ratio**(i + 1) for i in range(num_haplotypes)
                ]
                freqs = np.asarray(freqs)
                freqs = freqs / np.sum(freqs)
                np.set_printoptions(precision=4)
                if args.verbose:
                    print("Relative abundances: ", freqs)
                # Per-haplotype coverage, truncated to integers
                coverage = freqs * coverage[0]
                coverage = coverage.astype(int)
            elif args.freq_dstr == 'dirichlet':
                if len(coverage) > 1:
                    print(
                        "More than one value for coverage specified, only first "
                        "value is used")
                if args.dirichlet_alpha is None:
                    alpha = np.ones(num_haplotypes)
                else:
                    alpha = [
                        float(x) for x in args.dirichlet_alpha.split(",")
                    ]
                    if len(alpha) == 1:
                        alpha = np.repeat(alpha, num_haplotypes)
                    assert len(alpha) == num_haplotypes, (
                        "The number of Dirichlet parameters and number of "
                        "haplotypes does not coincide")
                np.random.seed(seed)
                freqs = np.random.dirichlet(alpha)
                np.set_printoptions(precision=4)
                if args.verbose:
                    print("Relative abundances: ", freqs)
                # Per-haplotype coverage, truncated to integers
                coverage = freqs * coverage[0]
                coverage = coverage.astype(int)

                # write to output
                fasta_record = collections.namedtuple("fasta_record",
                                                      "id seq")
                output_file = os.path.join(outdir_haps,
                                           "haplotype_frequencies.fasta")
                with open(output_file, 'w') as outfile:
                    for i in range(num_haplotypes):
                        haplotype_id = f"haplotype{i}"
                        line = fasta_record(id=haplotype_id, seq=freqs[i])
                        outfile.write(">{}\n{}\n".format(line.id, line.seq))
            elif args.freq_dstr == 'cust':
                print("Not implemented yet!")
                sys.exit()

            if args.paired:
                outfiles_R1 = []
                outfiles_R2 = []
            else:
                outfiles = []

            # Simulate reads for each haplotype separately, using its own
            # coverage
            for idx in range(num_haplotypes):
                haplotype_file = os.path.join(outdir_haps,
                                              f"haplotype{idx}.fasta")
                print(f"Reading file containing sequence of haplotype {idx} "
                      f"from {haplotype_file}")
                # remove true deletions (if present) before generating reads
                sed('s/-//g', haplotype_file, verbose=args.verbose)
                outprefix = os.path.join(outdir_reads,
                                         f"reads_hap{idx}_R")
                if args.paired:
                    outfiles_R1.append(outprefix + "1.fq")
                    outfiles_R2.append(outprefix + "2.fq")
                else:
                    outfiles.append(outprefix + ".fq")
                sim_reads(args.art, haplotype_seq=haplotype_file,
                          coverage=coverage[idx], read_len=args.read_length,
                          fragment_mean=fragment_mean,
                          fragment_sd=fragment_sd, outprefix=outprefix,
                          paired=args.paired, highQ=args.quality,
                          num_reads=args.num_reads, seed=seed,
                          verbose=args.verbose)

            # Concatenate the per-haplotype read files and delete them
            if args.paired:
                sh.cat(outfiles_R1,
                       _out=os.path.join(outdir_reads, "simreads_R1.fastq"))
                sh.cat(outfiles_R2,
                       _out=os.path.join(outdir_reads, "simreads_R2.fastq"))
                for file_R1, file_R2 in zip(outfiles_R1, outfiles_R2):
                    os.remove(file_R1)
                    os.remove(file_R2)
            else:
                sh.cat(outfiles,
                       _out=os.path.join(outdir_reads, "simreads_R1.fastq"))
                for f in outfiles:
                    os.remove(f)

            # Make headers compatible with output from Illumina platforms
            # (expected by ngshmmalign)
            if args.paired:
                sed(r'/^@.*-[0-9]*\/1$/ s/\/1$/ 1:N:0:5/',
                    os.path.join(outdir_reads, "simreads_R1.fastq"),
                    verbose=args.verbose)
                sed(r'/^@.*-[0-9]*\/2$/ s/\/2$/ 2:N:0:5/',
                    os.path.join(outdir_reads, "simreads_R2.fastq"),
                    verbose=args.verbose)
def main():
    """Compare called SNVs against the SNVs expected from the true
    haplotypes.

    Reports sensitivity, precision and specificity on standard output and
    writes the TP/FP/FN/TN counts to ``args.outfile``. In addition,
    ``FN_per_haplotype.tsv``, ``TP_frequencies.tsv``, ``FP_frequencies.tsv``
    and ``FN_frequencies.tsv`` (and ``true_snvs.tsv`` if
    ``args.output_true``) are written to the output directory.

    Raises
    ------
    ValueError
        If ``args.coverage_intervals`` is given but contains no entry for
        ``args.sampleID``.
    IOError
        If neither target regions nor a coverage file are available when
        one is required.
    """
    args = parse_args()

    alphabet = ['-', 'A', 'C', 'G', 'T']
    alphabet = np.array(alphabet, dtype='c')

    # Compute average frequency for SNVs called using ShoRAH
    loci_inferred, ref_inferred, snvs_inferred, freq_inferred = inferred_snvs(
        args.snvs)
    if not loci_inferred:
        # Nothing to evaluate: write only the header of the output table
        print("No called SNVs")
        with open(args.outfile, 'w') as outfile:
            outfile.write('ID\tTP\tFP\tFN\tTN\n')
        return

    outdir = args.outdir if args.outdir is not None else os.getcwd()
    if args.haplotype_master is not None:
        # Parse file containing reference/consensus sequence (sequence w.r.t
        # which SNVs were called)
        header, haplotype_master = read_fasta(args.haplotype_master)
        header = header[0]
        haplotype_master = haplotype_master[0].upper()
        haplotype_master_array = np.array(list(haplotype_master))
        reference_len = haplotype_master_array.size

        if args.msa:
            # Expected if cohort consensus has gaps
            if args.reference:
                tmp, reference = read_fasta(args.reference)
                reference = reference[0].upper()
                reference = np.array(list(reference))
                assert reference.size == haplotype_master_array.size, (
                    "Reference and cohort consensus have different lengths")
                # Temporarily fill gaps of the consensus with the reference
                # bases; the gaps are restored after the alignment
                idxs_gaps = haplotype_master_array == '-'
                haplotype_master_array[idxs_gaps] = reference[idxs_gaps]
                args.haplotype_master = os.path.join(outdir,
                                                     'cohort_consensus.fasta')
                cohort_consensus = SeqRecord(Seq(
                    ''.join(haplotype_master_array)),
                                             id=header,
                                             description="")
                with open(args.haplotype_master, 'w') as outfile:
                    SeqIO.write(cohort_consensus, outfile, "fasta")

            haplotype_master_array = haplotype_master_array.astype('c')
            # construct msa: haplotypes + reference/consensus sequence
            infile = os.path.join(outdir, "tmp.fasta")
            sh.cat([args.haplotype_seqs, args.haplotype_master], _out=infile)
            msa_file = os.path.join(outdir, 'haplotypes_re-msa.fasta')
            mafft(infile, msa_file, mafft=args.mafft)
            os.remove(infile)
            # Parse fasta file containing msa
            haplotype_ids, haplotype_seqs = read_fasta(msa_file)
            # The last record is the reference/consensus sequence
            num_haplotypes = len(haplotype_ids) - 1

            haplotype_ref = haplotype_seqs[-1]
            haplotype_ref = haplotype_ref.upper()
            haplotype_ref = np.array(haplotype_ref, dtype='c')
            if haplotype_ref.size != reference_len:
                assert haplotype_ref.size > reference_len, (
                    "Length of the consensus/reference sequence after the "
                    "MSA is smaller")
                # Deletions '-' were placed on the consensus/reference
                # sequence after the msa
                idx_master = 0
                idx_ref = 0
                idxs_ref = np.arange(haplotype_ref.size)
                del_idxs = np.zeros(haplotype_ref.size, dtype=bool)
                # Each iteration locates the next mismatch between the
                # aligned and the original sequence and marks it as a gap
                for i in range(haplotype_ref.size - reference_len):
                    left = min(reference_len + i - idx_ref,
                               haplotype_master_array[idx_master:].size)
                    idxs = haplotype_ref[idx_ref:(
                        idx_ref + left)] == haplotype_master_array[idx_master:]
                    aux = idxs_ref[idx_ref:(idx_ref + left)][~idxs]
                    if aux.size == 0:
                        # gaps '-' were placed until the end of haplotype_ref
                        del_idxs[(idx_ref + left):] = True
                        break
                    else:
                        idx_master = aux[0] - i
                        idx_ref = aux[0] + 1
                        del_idxs[aux[0]] = True

                assert np.all(
                    haplotype_ref[~del_idxs] == haplotype_master_array
                ), "After substracting gaps sequences do not agree"
                assert np.all(
                    haplotype_ref[del_idxs] == b'-'
                ), "All substracted loci do not correspond to '-'"

            # Parse sequences of the true haplotype
            haplotype_ids = haplotype_ids[0:num_haplotypes]
            haplotype_seqs = haplotype_seqs[0:num_haplotypes]
            haplotype_seqs_array = np.array(haplotype_seqs, dtype='c')
            # Remove insertions with respect to consensus/reference sequence
            if haplotype_ref.size != reference_len:
                haplotype_seqs_array = haplotype_seqs_array[:, ~del_idxs]
            # Restore gaps into the master sequence
            if args.reference:
                haplotype_master_array[idxs_gaps] = b'-'
        else:
            # Sequences of true haplotypes are already reported using the same
            # indexing as reference/consensus
            # Parse file containing true haplotype sequences
            haplotype_ids, haplotype_seqs = read_fasta(args.haplotype_seqs)
            num_haplotypes = len(haplotype_ids)
            haplotype_seqs_array = np.array(haplotype_seqs, dtype='c')
            haplotype_master_array = haplotype_master_array.astype('c')
    else:
        # if master sequence is not provided, report with respect to the
        # consensus. Note that SNVs are called with respect to the cohort
        # consensus.
        from scipy.stats import mode
        outfile = os.path.join(outdir, 'true_haplotype_msa.fasta')
        mafft(args.haplotype_seqs, outfile, mafft=args.mafft)
        haplotype_ids, haplotype_seqs = read_fasta(outfile)
        num_haplotypes = len(haplotype_ids)
        haplotype_seqs_array = np.array(haplotype_seqs, dtype='c')
        # reference_len was previously left undefined in this branch although
        # it is used unconditionally below; use the alignment length (number
        # of columns of the haplotype matrix).
        reference_len = haplotype_seqs_array.shape[1]
        if args.freq_dstr != 'unif':
            haplotype_freqs = frequencies(args.freq_dstr, num_haplotypes,
                                          args.ratio, args.dirichlet_freqs)
            # Weight each haplotype by its (rounded) percentage before
            # taking the per-column mode
            aux = np.repeat(haplotype_seqs_array,
                            np.round(haplotype_freqs * 100).astype(int),
                            axis=0)
            consensus = mode(aux, nan_policy='omit')
        else:
            consensus = mode(haplotype_seqs_array, nan_policy='omit')
        # NOTE(review): mode counts are normally >= 1, so this check may
        # never trigger - confirm the intended threshold.
        if np.any(consensus[1] < 1):
            print("At some loci the consensus base is ambiguous")
        haplotype_master_array = consensus[0][0]

    haplotype_freqs = frequencies(args.freq_dstr, num_haplotypes, args.ratio,
                                  args.dirichlet_freqs)

    # True haplotypes - expected SNVs
    loci_true, ref_true, snvs_true, freq_true, haps_true = true_snvs(
        haplotype_master_array, haplotype_seqs_array, num_haplotypes,
        haplotype_freqs, alphabet)

    if args.output_true:
        output_file = os.path.join(outdir, 'true_snvs.tsv')
        with open(output_file, 'w') as outfile:
            outfile.write('Locus\tRef\tVar\tFreq\tHaplotypes\n')
            for idx in range(len(loci_true)):
                # Loci are written using 1-based indexing
                outfile.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    loci_true[idx] + 1, ref_true[idx],
                    snvs_true[idx].decode('utf-8'), freq_true[idx],
                    haps_true[idx]))

    missed = np.zeros(num_haplotypes)

    # TP: loci that are truly polymorphic
    TP = 0
    # FP: technical error reported as SNVs
    FP = 0
    # TN: loci that are not polymorphic
    TN = 0
    # FN: SNVs that are missed
    FN = 0

    # SNV frequencies
    TP_freq = []
    FP_freq = []
    FN_freq = []

    loci = np.arange(reference_len)
    # Running indices handed to (and returned by) get_performance so that
    # consecutive regions continue where the previous one stopped
    i = 0
    j = 0
    if args.coverage_intervals is not None:
        with open(args.coverage_intervals, 'r') as infile:
            for line in infile:
                record = line.rstrip().split('\t')
                if record[0] == args.sampleID:
                    if len(record) == 1:
                        print("Empty target region")
                        with open(args.outfile, 'w') as outfile:
                            outfile.write('ID\tTP\tFP\tFN\tTN\n')
                        return
                    regions = record[1]
                    break
            else:
                # Previously `regions` stayed unbound and the next statement
                # failed with an uninformative NameError.
                raise ValueError(
                    f"Sample {args.sampleID} not found in "
                    f"{args.coverage_intervals}")
        regions = regions.split(',')
        idxs = np.zeros(reference_len, dtype=bool)
        print("Reporting using 1-based indexing (and closed intervals)")
        for r in regions:
            # Each region is formatted as <reference name>:<start>-<end>
            aux = r.split(':')
            ref_name = aux[0]
            if args.haplotype_master is not None:
                assert header == ref_name, (
                    f"Name of the reference, {ref_name}, does not agree with "
                    f"fasta file, {header}")
            aux = aux[1].split('-')
            start = int(aux[0])
            end = int(aux[1])
            if args.snv_caller:
                # Region is interpreted as a closed interval and using 1-based
                # indexing
                start -= 1
                start = max(0, start)
            else:
                # ShoRAH was used for SNV calling
                # Assuming 3 windows were used for SNV calling, identify
                # region that is covered by at least 2 windows (below, using
                # 0-based indexing and closed intervals)
                start_ = max(0, start - args.window_len - 1)
                end_ = min(reference_len, end + args.window_len)
                num_windows = np.floor(
                    (end_ - (start_ + args.window_len - 1)) /
                    (args.window_len // args.window_shift)) + 1
                offset = ((args.window_shift - 1) * args.window_len /
                          args.window_shift)

                start = max(0, start - offset - 1)
                # In order to identify the region which is covered by at least
                # two windows, add to the end of the first window the
                # increment multiply by the number of windows - 2 (i.e.,
                # discarding last window). In this case assuming half-open
                # interval [start, end)
                end = min(
                    reference_len, start_ + args.window_len +
                    (num_windows - 2) * (args.window_len // args.window_shift))
            idxs[range(int(start), int(end))] = True
            loci_region = loci[int(start):int(end)]

            if DBG:
                print(f"DBG loci_true[i]: {loci_true[i]}")
                print(f"DBG loci_region[0]: {loci_region[0]}")
            # Here, loci are reported using 1-based indexing and a closed
            # interval
            print("Region with enough support: {:d}-{:d}".format(
                trunc(start) + 1, trunc(end)))

            TP, FP, TN, FN, TP_freq, FP_freq, FN_freq, missed, i, j = get_performance(
                loci_true, loci_inferred, snvs_true, snvs_inferred, freq_true,
                freq_inferred, haps_true, num_haplotypes, loci_region, i, j,
                TP, FP, TN, FN, TP_freq, FP_freq, FN_freq, missed)

        loci = loci[idxs]
        if loci_inferred[0] < loci[0] or loci_inferred[-1] > loci[-1]:
            print("Warning: some reported SNVs are outside the target region."
                  " It can happen when target region is smaller than region"
                  " where SNVs were called.")
    else:
        if not args.snv_caller:
            idxs = np.zeros(reference_len, dtype=bool)
            offset = (args.window_len // args.window_shift)
            # Parse coverage intervals from ShoRAH output
            with open(args.coverage, 'r') as infile:
                # Look for regions at least covered by two windows
                start_w = 1
                end_w = 1
                for count, line in enumerate(infile):
                    record = line.rstrip().split("\t")
                    if count == 0:
                        start_w = int(record[2])
                        end_w = int(record[3])
                    else:
                        if int(record[2]) == start_w + offset:
                            start_w = int(record[2])
                            idxs[(start_w - 1):end_w] = True
                        else:
                            start_w = int(record[2])
                            end_w = int(record[3])

            loci_region = np.extract(idxs, loci)

        else:
            if args.coverage is not None:
                with open(args.coverage, 'r') as infile:
                    header = infile.readline().rstrip().split("\t")
                # Columns whose name contains the sample ID; the first match
                # is the one loaded
                sampleID_idx = [
                    idx for idx, name in enumerate(header)
                    if args.sampleID in name
                ]
                coverage = np.loadtxt(args.coverage, dtype=int,
                                      delimiter='\t', skiprows=1,
                                      usecols=(sampleID_idx[0], ))
                assert coverage.size == reference_len, (
                    "Coverage file and reference file do not have the same "
                    "number of loci")
                # Do not account for position with zero coverage for reporting
                # TP, FP, FN, and specially TN
                mask = coverage <= 0
                loci_region = loci[~mask]
            else:
                raise IOError(
                    "Expected coverage file as input when target region is not specified"
                )

        regions = consecutive(loci_region)
        TP, FP, TN, FN, TP_freq, FP_freq, FN_freq, missed, i, j = get_performance(
            loci_true, loci_inferred, snvs_true, snvs_inferred, freq_true,
            freq_inferred, haps_true, num_haplotypes, loci_region, i, j, TP,
            FP, TN, FN, TP_freq, FP_freq, FN_freq, missed, coverage_file=True,
            regions=regions)

    # Each rate is only printed when its denominator is non-zero
    # Sensitivity
    if TP or FN:
        print("Sensitivity: {:.6f}".format(TP / (TP + FN)))

    # Precision
    if TP or FP:
        print("Precision: {:.6f}".format(TP / (TP + FP)))

    # Specificity
    if TN or FP:
        print("Specificity: {:.6f}".format(TN / (TN + FP)))

    print("TP: ", TP)
    print("FP: ", FP)
    print("FN: ", FN)
    print("TN: ", TN)
    print("Number of FN per haplotype: ", missed)

    # Write to output file
    with open(args.outfile, 'w') as outfile:
        outfile.write('ID\tTP\tFP\tFN\tTN\n')
        outfile.write(f'{args.sampleID}\t{TP}\t{FP}\t{FN}\t{TN}\n')

    output_file = os.path.join(outdir, 'FN_per_haplotype.tsv')
    with open(output_file, 'w') as outfile:
        for idx, name in enumerate(haplotype_ids):
            # Keep only the first word of the FASTA header
            aux = name.split(' ')[0]
            outfile.write(f'{aux}\t{missed[idx]}\n')

    output_file = os.path.join(outdir, 'TP_frequencies.tsv')
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['Loci', 'Variant', 'Freq', 'Inferred freq'])
        writer.writerows(TP_freq)

    output_file = os.path.join(outdir, 'FP_frequencies.tsv')
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['Loci', 'Variant', 'Inferred freq'])
        writer.writerows(FP_freq)

    output_file = os.path.join(outdir, 'FN_frequencies.tsv')
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['Loci', 'Variant', 'Freq'])
        writer.writerows(FN_freq)
def main(): args = parse_args() alphabet = ["-", "A", "C", "G", "T"] alphabet = np.array(alphabet, dtype="c") # Compute average frequency for SNVs called using ShoRAH df_snvs = parse_vcf(args.snvs, args.snv_caller) if df_snvs.empty: print("No called SNVs") with open(args.outfile, "w") as outfile: outfile.write("ID\tTP\tFP\tFN\tTN\n") return # Drop insertions ins_mask = df_snvs["ALT"].str.len() > 1 df_snvs = df_snvs[~ins_mask] if args.only_deletions: # Only look at deletions # NOTE: temporary work-around while ShoRAH (v1.99.2) is modified to # report indels complying to VCF format if args.snv_caller == "shorah": is_deletion = df_snvs["ALT"] == "-" elif args.snv_caller == "lofreq": is_deletion = df_snvs["REF"].str.len() > 1 df_snvs = df_snvs[is_deletion] # NOTE: once ShoRAH (v1.99.2) is upgraded to report indels complying to # VCF format, --long-dels can also be executed and raising an error # won't be needed if args.long_deletions and args.snv_caller == "shorah": raise ValueError("No curent support for --long-dels and ShoRAH") if df_snvs.empty: print("No called SNVs") with open(args.outfile, "w") as outfile: outfile.write("ID\tTP\tFP\tFN\tTN\n") return if not args.long_deletions: # Unroll deletions into one-base deletions del_mask = df_snvs["REF"].str.len() > 1 assert ( df_snvs.loc[del_mask, "ALT"] == df_snvs.loc[del_mask, "REF"].str[0] ).all(), "Reference base preceding deletion does not match" del_len = df_snvs.loc[del_mask, "REF"].str.len() - 1 df_del = pd.DataFrame( np.repeat(df_snvs[del_mask].values, del_len.to_list(), axis=0) ) df_del.columns = df_snvs.columns df_del["ALT"] = "-" aux_idx = 0 aux_pos = df_del.columns.get_loc("POS") aux_ref = df_del.columns.get_loc("REF") for idx, row in df_snvs[del_mask].iterrows(): # ignore first base as it corresponds to the reference at the # preceding locus ref = list(row["REF"][1:]) pos = [row["POS"] + x + 1 for x in range(len(ref))] df_del.iloc[aux_idx : (aux_idx + del_len[idx]), aux_pos] = pos df_del.iloc[aux_idx : 
(aux_idx + del_len[idx]), aux_ref] = ref aux_idx += del_len[idx] # Handle special case: reference sequence might contain a gap character # and a long deletion could include it. When unrolling long deletions # the REF and ALT fields will contain both gaps symbols is_gap = (df_del["REF"] == "-") & (df_del["ALT"] == "-") df_del = df_del[~is_gap] # Remove previous rows corresponding to deletions and add the one-base # deletions df_snvs = df_snvs[~del_mask] df_snvs = pd.concat([df_snvs, df_del], ignore_index=True) df_snvs = df_snvs.set_index(["POS", "ALT", "REF"]) df_snvs = df_snvs.sort_index() # Merge on POS and ALT grpby = df_snvs.set_index("CHROM", append=True)[["INFO", "FREQ"]].groupby( ["POS", "ALT", "REF", "CHROM"] ) df_snvs = pd.concat( [grpby["INFO"].apply(lambda s: ";".join(s)), grpby["FREQ"].sum()], axis=1 ) # grpby["REF"].first() # If not part of the index outdir = args.outdir if args.outdir is not None else os.getcwd() if args.haplotype_master is not None: # Parse file containing reference/consensus sequence (sequence w.r.t # which SNVs were called) header, haplotype_master = read_fasta(args.haplotype_master) header = header[0] haplotype_master = haplotype_master[0].upper() haplotype_master_array = np.array(list(haplotype_master)) reference_len = haplotype_master_array.size if args.msa: # Expected if cohort consensus has gaps if args.reference: tmp, reference = read_fasta(args.reference) reference = reference[0].upper() reference = np.array(list(reference)) assert ( reference.size == haplotype_master_array.size ), "Reference and cohort consensus have different lengths" idxs_gaps = haplotype_master_array == "-" haplotype_master_array[idxs_gaps] = reference[idxs_gaps] args.haplotype_master = os.path.join(outdir, "cohort_consensus.fasta") cohort_consensus = SeqRecord( Seq("".join(haplotype_master_array)), id=header, description="" ) with open(args.haplotype_master, "w") as outfile: SeqIO.write(cohort_consensus, outfile, "fasta") haplotype_master_array = 
haplotype_master_array.astype("c") # construct msa: haplotypes + reference/consensus sequence infile = os.path.join(outdir, "tmp.fasta") sh.cat([args.haplotype_seqs, args.haplotype_master], _out=infile) msa_file = os.path.join(outdir, "haplotypes_re-msa.fasta") mafft(infile, msa_file, mafft=args.mafft) os.remove(infile) # Parse fasta file containing msa haplotype_ids, haplotype_seqs = read_fasta(msa_file) num_haplotypes = len(haplotype_ids) - 1 haplotype_ref = haplotype_seqs[-1] haplotype_ref = haplotype_ref.upper() haplotype_ref = np.array(haplotype_ref, dtype="c") if haplotype_ref.size != reference_len: assert haplotype_ref.size > reference_len, ( "Length of the consensus/reference sequence after the " "MSA is smaller" ) # Deletions '-' were placed on the consensus/reference # sequence after the msa idx_master = 0 idx_ref = 0 idxs_ref = np.arange(haplotype_ref.size) del_idxs = np.zeros(haplotype_ref.size, dtype=bool) for i in range(haplotype_ref.size - reference_len): left = min( reference_len + i - idx_ref, haplotype_master_array[idx_master:].size, ) idxs = ( haplotype_ref[idx_ref : (idx_ref + left)] == haplotype_master_array[idx_master:] ) aux = idxs_ref[idx_ref : (idx_ref + left)][~idxs] if aux.size == 0: # gaps '-' were placed until the end of haplotype_ref del_idxs[(idx_ref + left) :] = True break else: idx_master = aux[0] - i idx_ref = aux[0] + 1 del_idxs[aux[0]] = True assert np.all( haplotype_ref[~del_idxs] == haplotype_master_array ), "After substracting gaps sequences do not agree" assert np.all( haplotype_ref[del_idxs] == b"-" ), "All substracted loci do not correspond to '-'" # Parse sequences of the true haplotype haplotype_ids = haplotype_ids[0:num_haplotypes] haplotype_seqs = haplotype_seqs[0:num_haplotypes] haplotype_seqs_array = np.array(haplotype_seqs, dtype="c") # Remove insertions with respect to consensus/reference sequence if haplotype_ref.size != reference_len: haplotype_seqs_array = haplotype_seqs_array[:, ~del_idxs] # Restore gaps into 
the master sequence if args.reference: haplotype_master_array[idxs_gaps] = b"-" else: # Sequences of true haplotypes are already reported using the same # indexing as reference/consensus # Parse file containing true haplotype sequences haplotype_ids, haplotype_seqs = read_fasta(args.haplotype_seqs) num_haplotypes = len(haplotype_ids) haplotype_seqs_array = np.array(haplotype_seqs, dtype="c") haplotype_master_array = haplotype_master_array.astype("c") else: # if master sequence is not provided, report with respect to the # consensus. Note that SNVs are called with respect to the cohort # consensus. from scipy.stats import mode outfile = os.path.join(outdir, "true_haplotype_msa.fasta") mafft(args.haplotype_seqs, outfile, mafft=args.mafft) haplotype_ids, haplotype_seqs = read_fasta(outfile) num_haplotypes = len(haplotype_ids) haplotype_seqs_array = np.array(haplotype_seqs, dtype="c") if args.freq_dstr != "unif": haplotype_freqs = frequencies( args.freq_dstr, num_haplotypes, args.ratio, args.dirichlet_freqs ) aux = np.repeat( haplotype_seqs_array, np.round(haplotype_freqs * 100).astype(int), axis=0, ) consensus = mode(aux, nan_policy="omit") else: consensus = mode(haplotype_seqs_array, nan_policy="omit") if np.any(consensus[1] < 1): print("At some loci the consensus base is ambiguous") haplotype_master_array = consensus[0][0] haplotype_freqs = frequencies( args.freq_dstr, num_haplotypes, args.ratio, args.dirichlet_freqs ) # missed = np.zeros(num_haplotypes) df_snvs_expected = true_snvs( haplotype_master_array, haplotype_master, haplotype_seqs_array, num_haplotypes, haplotype_freqs, args.long_deletions, alphabet, ) if args.only_deletions: # Only look at deletions: drop other entries in expected SNVs dataframe if args.long_deletions: is_deletion = df_snvs_expected["REF"].str.len() > 1 else: is_deletion = df_snvs_expected["ALT"].str.startswith("-") df_snvs_expected = df_snvs_expected[is_deletion] # Keep track of SNVs that fall within targeted regions 
df_snvs["IS_CONTAINED"] = False df_snvs_expected["IS_CONTAINED"] = False if args.long_deletions: deletion_length = df_snvs["REF"].str.len() - 1 is_deletion = deletion_length > 0 # Using 0-based indexing start_locus = df_snvs["POS"] - 1 start_locus[is_deletion] += 1 end_locus = start_locus + deletion_length - 1 # Similarly for expected SNVs (Already uses 0-based indexing) deletion_length_exp = df_snvs_expected["REF"].str.len() - 1 is_deletion_exp = deletion_length_exp > 0 start_locus_exp = df_snvs_expected["POS"].copy() start_locus_exp[is_deletion_exp] += 1 end_locus_exp = start_locus_exp + deletion_length_exp - 1 else: # Handle SNVs and single-nucleotide deletions # Using 0-based indexing start_locus = df_snvs.index.get_level_values("POS") - 1 end_locus = None # Similarly for expected SNVs (Already uses 0-based indexing) start_locus_exp = df_snvs_expected["POS"] end_locus_exp = None if args.coverage_intervals is not None: with open(args.coverage_intervals, "r") as infile: for line in infile: record = line.rstrip().split("\t") if record[0] == args.sampleID: if len(record) == 1: print("Empty target region") with open(args.outfile, "w") as outfile: outfile.write("ID\tTP\tFP\tFN\tTN\n") return regions = record[1] break regions = regions.split(",") idxs = np.zeros(reference_len, dtype=bool) print("Reporting using 1-based indexing (and closed intervals)") num_loci = 0 for r in regions: aux = r.split(":") ref_name = aux[0] if args.haplotype_master is not None: assert header == ref_name, ( f"Name of the reference, {ref_name}, does not agree with " f"fasta file, {header}" ) aux = aux[1].split("-") start = int(aux[0]) end = int(aux[1]) if args.snv_caller == "shorah" and not args.no_expansion: # ShoRAH was used for SNV calling # Assuming 3 windows were used for SNV calling, identify # region that is covered by at least 2 windows (below, using # 0-based indexing and closed intervals) start_ = max(0, start - args.window_len - 1) end_ = min(reference_len, end + args.window_len) 
num_windows = ( np.floor( (end_ - (start_ + args.window_len - 1)) / (args.window_len // args.window_shift) ) + 1 ) offset = (args.window_shift - 1) * args.window_len / args.window_shift start = max(0, start - offset - 1) # In order to identify the region which is covered by at least # two windows, add to the end of the first window the # increment multiply by the number of windows - 2 (i.e., # discarding last window). In this case assuming half-open # interval [start, end) end = min( reference_len, start_ + args.window_len + (num_windows - 2) * (args.window_len // args.window_shift), ) # idxs[range(int(start), int(end))] = True # loci_region = loci[int(start):int(end)] # if DBG: # print(f"DBG loci_true[i]: {loci_true[i]}") # print(f"DBG loci_region[0]: {loci_region[0]}") # Here, loci are reported using 1-based indexing and a closed # interval num_loci += end - start start = int(start) end = int(end) print(f"Region with enough support: {start + 1}-{end}") # Mark reported and expected SNVs within the region is_contained = target_snvs( start, end, start_locus, args.long_deletions, end_locus ) df_snvs["IS_CONTAINED"] = df_snvs["IS_CONTAINED"] | is_contained is_contained = target_snvs( start, end, start_locus_exp, args.long_deletions, end_locus_exp ) df_snvs_expected["IS_CONTAINED"] = ( df_snvs_expected["IS_CONTAINED"] | is_contained ) else: loci = np.arange(reference_len) if args.snv_caller == "shorah": idxs = np.zeros(reference_len, dtype=bool) offset = args.window_len // args.window_shift # Parse coverage intervals from ShoRAH output with open(args.coverage, "r") as infile: # Look for regions at least covered by two windows start_w = 1 end_w = 1 for count, line in enumerate(infile): record = line.rstrip().split("\t") if count == 0: start_w = int(record[2]) end_w = int(record[3]) else: if int(record[2]) == start_w + offset: start_w = int(record[2]) idxs[(start_w - 1) : end_w] = True else: start_w = int(record[2]) end_w = int(record[3]) loci_region = np.extract(idxs, 
loci) else: if args.coverage is not None: with open(args.coverage, "r") as infile: header = infile.readline().rstrip().split("\t") sampleID_idx = [ idx for idx, name in enumerate(header) if args.sampleID in name ] coverage = np.loadtxt( args.coverage, dtype=int, delimiter="\t", skiprows=1, usecols=(sampleID_idx[0],), ) assert coverage.size == reference_len, ( "Coverage file and reference file do not have the same " "number of loci" ) # Do not account for position with zero coverage for reporting # TP, FP, FN, and specially TN mask = coverage <= 0 loci_region = loci[~mask] else: raise IOError( "Expected coverage file as input when target region is not specified" ) num_loci = loci_region.size regions = consecutive(loci_region) start = [el[0] for el in regions] end = [el[-1] + 1 for el in regions] for si, ei in zip(start, end): # Mark reported and expected SNVs within the region is_contained = target_snvs( si, ei, start_locus, args.long_deletions, end_locus ) df_snvs["IS_CONTAINED"] = df_snvs["IS_CONTAINED"] | is_contained is_contained = target_snvs( si, ei, start_locus_exp, args.long_deletions, end_locus_exp ) df_snvs_expected["IS_CONTAINED"] = ( df_snvs_expected["IS_CONTAINED"] | is_contained ) # Drop SNVs that fall outside of the targeted regions. Otherwise, these # rows will be counted toward false positives/negatives. 
df_snvs = df_snvs[df_snvs["IS_CONTAINED"]] df_snvs_expected = df_snvs_expected[df_snvs_expected["IS_CONTAINED"]] if args.output_true: output_file = os.path.join(outdir, "true_snvs.tsv") # Report using 1-based indexing df_snvs_expected["POS"] += 1 df_snvs_expected.to_csv( output_file, sep="\t", columns=["POS", "REF", "ALT", "FREQ", "HAPLOTYPES"], header=["Loci", "Reference", "Variant", "Frequency", "Haplotypes"], index=False, compression=None, ) # join on POS and ALT df_pairs = df_snvs_expected.merge( df_snvs, how="outer", on=["POS", "ALT", "REF"], suffixes=["_exp", "_rep"] ) FN_mask = df_pairs["INFO"].isnull() FN = sum(FN_mask) FP_mask = df_pairs["HAPLOTYPES"].isnull() FP = sum(FP_mask) TP_mask = ~FN_mask & ~FP_mask TP = sum(TP_mask) TN = num_loci - len(df_pairs["POS"].value_counts()) # Sensitivity if TP or FN: print("Sensitivity: {:.6f}".format(TP / (TP + FN))) # Precision if TP or FP: print("Precision: {:.6f}".format(TP / (TP + FP))) # Specificity if TN or FP: print("Specificity: {:.6f}".format(TN / (TN + FP))) print("TP: ", TP) print("FP: ", FP) print("FN: ", FN) print("TN: ", int(TN)) # print("Number of FN per haplotype: ", missed) # Write to output file with open(args.outfile, "w") as outfile: outfile.write("ID\tTP\tFP\tFN\tTN\n") outfile.write(f"{args.sampleID}\t{TP}\t{FP}\t{FN}\t{int(TN)}\n") # output_file = os.path.join(outdir, 'FN_per_haplotype.tsv') # with open(output_file, 'w') as outfile: # for idx, name in enumerate(haplotype_ids): # aux = name.split(' ')[0] # outfile.write(f'{aux}\t{missed[idx]}\n') output_file = os.path.join(outdir, "TP_frequencies.tsv") df_pairs[TP_mask].to_csv( output_file, sep="\t", columns=["POS", "REF", "ALT", "FREQ_exp", "FREQ_rep", "INFO"], header=[ "Loci", "Reference", "Variant", "Freq (expected)", "Freq (reported)", "Info", ], index=False, compression=None, ) output_file = os.path.join(outdir, "FP_frequencies.tsv") df_pairs[FP_mask].to_csv( output_file, sep="\t", columns=["POS", "REF", "ALT", "FREQ_rep", "INFO"], 
header=["Loci", "Reference", "Variant", "Freq", "Info"], index=False, compression=None, ) output_file = os.path.join(outdir, "FN_frequencies.tsv") df_pairs[FN_mask].to_csv( output_file, sep="\t", columns=["POS", "REF", "ALT", "FREQ_exp", "HAPLOTYPES"], header=["Loci", "Reference", "Variant", "Freq", "Haplotypes"], index=False, compression=None, )