Example #1
def frequencies(freq_dstr, num_haplotypes, ratio=0.75, infile=None):
    "Compute the expected haplotype frequencies"
    if freq_dstr == "unif":
        haplotype_freqs = np.repeat(1 / num_haplotypes, num_haplotypes)
    elif freq_dstr == "geom":
        haplotype_freqs = [ratio ** (i + 1) for i in range(num_haplotypes)]
        haplotype_freqs = np.asarray(haplotype_freqs)
        haplotype_freqs = haplotype_freqs / np.sum(haplotype_freqs)
    elif freq_dstr == "dirichlet":
        # Read haplotype frequencies from the input file
        if infile is None:
            raise IOError("Input file containing haplotype frequencies is expected")
        ids, haplotype_freqs = read_fasta(infile)
        haplotype_freqs = np.asarray(haplotype_freqs, dtype=float)
    else:
        raise ValueError(f"Unknown frequency distribution: {freq_dstr}")

    return haplotype_freqs
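# A minimal standalone sketch of the "geom" branch of frequencies() above
# (ratio=0.75 and num_haplotypes=5 are chosen purely for illustration):
import numpy as np

ratio, num_haplotypes = 0.75, 5
freqs = np.asarray([ratio ** (i + 1) for i in range(num_haplotypes)])
freqs = freqs / freqs.sum()
print(freqs)  # monotonically decreasing frequencies that sum to 1.0
assert np.isclose(freqs.sum(), 1.0)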
Example #2
def main():
    args = parse_args()

    outdir_haps = args.outdir_haps if args.outdir_haps is not None else os.getcwd()
    outdir_reads = args.outdir_reads if args.outdir_reads is not None else os.getcwd()
    seed = args.seed if args.seed is not None else np.random.randint(1, 1000)
    num_haplotypes = args.num_haplotypes

    if args.output == "master":
        # Simulate master sequence
        haplotype_seq = sim_master(length=args.genome_length,
                                   outdir_haps=outdir_haps,
                                   seed=seed)
    elif args.output == "haplotypes" or args.output == "all":
        # Simulate true underlying haplotypes
        if args.haplotype_seqs is None:
            # Strategy 1 - Simulate haplotypes from scratch
            # Simulate master sequence and save record to FASTA file
            haplotype_seq = sim_master(length=args.genome_length,
                                       outdir_haps=outdir_haps,
                                       seed=seed)
            if args.use_master:
                # Strategy 1.a - Generate haplotype sequences from a master
                # haplotype
                if args.verbose:
                    print_params(args.mutation_rate, args.deletion_rate,
                                 args.insertion_rate)
                haplotype_seqs = [
                    mutate(haplotype_seq,
                           args.mutation_rate,
                           args.deletion_rate,
                           args.insertion_rate,
                           noFR=args.no_FR,
                           del_len=args.deletion_length,
                           seed=seed + i,
                           verbose=args.verbose) for i in range(num_haplotypes)
                ]
            elif args.tree_like:
                # Strategy 1.b - Sample genotypes from the leaves of a perfect
                # tree
                # Max depth of the perfect tree
                max_depth = np.ceil(np.log2(num_haplotypes))
                root = Tree(haplotype_seq, max_depth)
                # Populate tree
                leaf(root,
                     args.mutation_rate,
                     args.deletion_rate,
                     args.insertion_rate,
                     args.no_FR,
                     args.deletion_length,
                     seed,
                     verbose=args.verbose)
                sequences = root.get_leaves()

                np.random.seed(seed)
                # Upper bound of np.random.randint is exclusive, so sample
                # from the full range of leaves
                idxs = np.random.randint(0,
                                         len(sequences),
                                         size=num_haplotypes)
                haplotype_seqs = [sequences[idx] for idx in idxs]
            else:
                # Strategy 1.c - Generate num_haplotypes sequences randomly,
                # each of length args.genome_length
                # FIXME: Replace hard-coded weights
                weights = (('A', 0.245), ('C', 0.245), ('G', 0.245),
                           ('T', 0.245), ('-', 0.02))
                # Vary the seed per haplotype so the sequences differ
                haplotype_seqs = [
                    sim_haplotypes(length=args.genome_length,
                                   weights=weights,
                                   s=seed + i) for i in range(num_haplotypes)
                ]

            # Save each haplotype sequence to a separate file
            write_fasta(haplotype_seqs, outdir_haps)
        else:
            # Strategy 2 - Simulate haplotypes from a file providing the
            # underlying haplotype(s)
            if args.use_master:
                # Strategy 2.a - From the first sequence provided generate
                # num_haplotypes sequences by mutating the original sequence.
                header, haplotype_seq = read_fasta(args.haplotype_seqs)

                if args.verbose:
                    print_params(args.mutation_rate, args.deletion_rate,
                                 args.insertion_rate)
                haplotype_seqs = [
                    mutate(haplotype_seq[0],
                           args.mutation_rate,
                           args.deletion_rate,
                           args.insertion_rate,
                           noFR=args.no_FR,
                           del_len=args.deletion_length,
                           seed=seed + i,
                           verbose=args.verbose) for i in range(num_haplotypes)
                ]
                write_fasta(haplotype_seqs, outdir_haps)
            elif args.tree_like:
                # Strategy 2.b - Sample genotypes from the leaves of a perfect
                # tree
                # Max depth of the perfect tree
                header, haplotype_seq = read_fasta(args.haplotype_seqs)

                max_depth = np.ceil(np.log2(num_haplotypes))
                root = Tree(haplotype_seq[0], max_depth)
                # Populate tree
                leaf(root,
                     args.mutation_rate,
                     args.deletion_rate,
                     args.insertion_rate,
                     args.no_FR,
                     args.deletion_length,
                     seed,
                     verbose=args.verbose)
                sequences = root.get_leaves()

                np.random.seed(seed)
                # Upper bound of np.random.randint is exclusive, so sample
                # from the full range of leaves
                idxs = np.random.randint(0,
                                         len(sequences),
                                         size=num_haplotypes)
                haplotype_seqs = [sequences[idx] for idx in idxs]
                write_fasta(haplotype_seqs, outdir_haps)
            else:
                # Strategy 2.c - Split haplotypes into different files
                # (implicitly, the number of haplotype sequences is assumed
                # to be larger than 1)
                idx = 0
                aux = []
                output_file = ''
                haplotype_id = ''
                with open(args.haplotype_seqs, 'r') as infile:
                    # The following is needed because FASTA files typically
                    # wrap sequences over multiple fixed-width lines
                    for line in infile:
                        record = line.rstrip()
                        if record and record[0] == '>':
                            if idx > 0:
                                with open(output_file, 'w') as outfile:
                                    outfile.write(haplotype_id + '\n')
                                    outfile.write(''.join(aux))
                                aux = []
                            haplotype_id = record
                            output_file = os.path.join(
                                outdir_haps, ''.join(
                                    ("haplotype", str(idx), ".fasta")))
                            idx += 1
                        else:
                            aux.append(record)

                output_file = os.path.join(
                    outdir_haps, ''.join(
                        ("haplotype", str(idx - 1), ".fasta")))
                with open(output_file, 'w') as outfile:
                    outfile.writelines([haplotype_id, '\n', ''.join(aux)])
                num_haplotypes = idx
                haplotype_file = os.path.join(outdir_haps, "haplotypes.fasta")
                shutil.copyfile(args.haplotype_seqs, haplotype_file)

    if args.output == "reads" or args.output == "all":
        fragment_mean, fragment_sd = [
            int(x) for x in args.fragment_size.split(",")
        ]

        coverage = [int(x) for x in args.coverage.split(",")]
        if len(coverage) > 1:
            assert len(coverage) == num_haplotypes, (
                "More than one value for coverage specified, but it does not "
                "coincide with the number of haplotypes")

        if args.freq_dstr == 'unif':
            if len(coverage) > 1:
                print("More than one value for coverage specified, only first "
                      "value is used")

            haplotype_file = os.path.join(outdir_haps, "haplotypes.fasta")
            print(
                f"Reading file containing sequences of underlying haplotypes "
                f"from {haplotype_file}")
            tmp_file = os.path.join(outdir_haps, "tmp.fasta")
            shutil.copyfile(haplotype_file, tmp_file)
            # remove true deletions (if present) before generating reads
            sed('s/-//g', tmp_file, verbose=args.verbose)
            outprefix = os.path.join(outdir_reads, "simreads_R")
            sim_reads(args.art,
                      haplotype_seq=tmp_file,
                      coverage=coverage[0],
                      read_len=args.read_length,
                      fragment_mean=fragment_mean,
                      fragment_sd=fragment_sd,
                      outprefix=outprefix,
                      paired=args.paired,
                      highQ=args.quality,
                      num_reads=args.num_reads,
                      seed=seed,
                      verbose=args.verbose)
            os.remove(tmp_file)

            # Make headers compatible with output from Illumina platforms
            # (expected by ngshmmalign)
            if args.paired:
                sed(r'/^@.*-[0-9]*\/1$/ s/\/1$/ 1:N:0:5/',
                    ''.join((outprefix, '1.fq')),
                    verbose=args.verbose)
                sed(r'/^@.*-[0-9]*\/2$/ s/\/2$/ 2:N:0:5/',
                    ''.join((outprefix, '2.fq')),
                    verbose=args.verbose)

            # Rename output file
            if args.paired:
                os.rename(''.join((outprefix, '1.fq')), ''.join(
                    (outprefix, '1.fastq')))
                os.rename(''.join((outprefix, '2.fq')), ''.join(
                    (outprefix, '2.fastq')))
            else:
                os.rename(''.join((outprefix, '.fq')), ''.join(
                    (outprefix, '1.fastq')))

        elif args.freq_dstr in ('geom', 'dirichlet', 'cust'):

            if args.freq_dstr == 'geom':
                if len(coverage) > 1:
                    print(
                        "More than one value for coverage specified, only the "
                        "first value is used")
                freqs = [
                    args.geom_ratio**(i + 1) for i in range(num_haplotypes)
                ]
                freqs = np.asarray(freqs)
                freqs = freqs / np.sum(freqs)
                np.set_printoptions(precision=4)
                if args.verbose:
                    print("Relative abundances: ", freqs)
                coverage = freqs * coverage[0]
                coverage = coverage.astype(int)
            elif args.freq_dstr == 'dirichlet':
                if len(coverage) > 1:
                    print(
                        "More than one value for coverage specified, only the "
                        "first value is used")
                if args.dirichlet_alpha is None:
                    alpha = np.ones(num_haplotypes)
                else:
                    alpha = [float(x) for x in args.dirichlet_alpha.split(",")]
                    if len(alpha) == 1:
                        alpha = np.repeat(alpha, num_haplotypes)
                    assert len(alpha) == num_haplotypes, (
                        "The number of Dirichlet parameters and the number of "
                        "haplotypes do not coincide")
                np.random.seed(seed)
                freqs = np.random.dirichlet(alpha)
                np.set_printoptions(precision=4)
                if args.verbose:
                    print("Relative abundances: ", freqs)
                coverage = freqs * coverage[0]
                coverage = coverage.astype(int)
                # write to output
                fasta_record = collections.namedtuple("fasta_record", "id seq")
                output_file = os.path.join(outdir_haps,
                                           "haplotype_frequencies.fasta")
                with open(output_file, 'w') as outfile:
                    for i in range(num_haplotypes):
                        haplotype_id = ''.join(("haplotype", str(i)))
                        line = fasta_record(id=haplotype_id, seq=freqs[i])
                        outfile.write(">{}\n{}\n".format(line.id, line.seq))
            elif args.freq_dstr == 'cust':
                print("Not implemented yet!")
                sys.exit()

            if args.paired:
                outfiles_R1 = []
                outfiles_R2 = []
            else:
                outfiles = []
            for idx in range(num_haplotypes):
                haplotype_file = os.path.join(
                    outdir_haps, ''.join(("haplotype", str(idx), ".fasta")))
                print(f"Reading file containing sequence of haplotype {idx} "
                      f"from {haplotype_file}")
                # remove true deletions (if present) before generating reads
                sed('s/-//g', haplotype_file, verbose=args.verbose)
                outprefix = os.path.join(
                    outdir_reads, ''.join(("reads_hap", str(idx), "_R")))
                if args.paired:
                    outfiles_R1.append(''.join((outprefix, "1.fq")))
                    outfiles_R2.append(''.join((outprefix, "2.fq")))
                else:
                    outfiles.append(''.join((outprefix, ".fq")))

                sim_reads(args.art,
                          haplotype_seq=haplotype_file,
                          coverage=coverage[idx],
                          read_len=args.read_length,
                          fragment_mean=fragment_mean,
                          fragment_sd=fragment_sd,
                          outprefix=outprefix,
                          paired=args.paired,
                          highQ=args.quality,
                          num_reads=args.num_reads,
                          seed=seed,
                          verbose=args.verbose)

            if args.paired:
                sh.cat(outfiles_R1,
                       _out=os.path.join(outdir_reads, "simreads_R1.fastq"))
                sh.cat(outfiles_R2,
                       _out=os.path.join(outdir_reads, "simreads_R2.fastq"))
                for idx in range(len(outfiles_R1)):
                    os.remove(outfiles_R1[idx])
                    os.remove(outfiles_R2[idx])
            else:
                sh.cat(outfiles,
                       _out=os.path.join(outdir_reads, "simreads_R1.fastq"))
                for f in outfiles:
                    os.remove(f)

            # Make headers compatible with output from Illumina platforms
            # (expected by ngshmmalign)
            if args.paired:
                sed(r'/^@.*-[0-9]*\/1$/ s/\/1$/ 1:N:0:5/',
                    os.path.join(outdir_reads, "simreads_R1.fastq"),
                    verbose=args.verbose)
                sed(r'/^@.*-[0-9]*\/2$/ s/\/2$/ 2:N:0:5/',
                    os.path.join(outdir_reads, "simreads_R2.fastq"),
                    verbose=args.verbose)
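# A pure-Python sketch of the header rewrite performed by the sed calls above
# (the read name "@haplotype0-1234/1" is made up for illustration; the rewrite
# produces the Illumina-style headers expected by ngshmmalign):
import re

header = "@haplotype0-1234/1"
if re.match(r"^@.*-[0-9]*/1$", header):
    header = re.sub(r"/1$", " 1:N:0:5", header)
print(header)  # @haplotype0-1234 1:N:0:5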
Example #3
def main():

    args = parse_args()

    alphabet = ['-', 'A', 'C', 'G', 'T']
    alphabet = np.array(alphabet, dtype='c')

    # Compute average frequency for SNVs called using ShoRAH
    loci_inferred, ref_inferred, snvs_inferred, freq_inferred = inferred_snvs(
        args.snvs)
    if not loci_inferred:
        print("No called SNVs")
        with open(args.outfile, 'w') as outfile:
            outfile.write('ID\tTP\tFP\tFN\tTN\n')
        return

    outdir = args.outdir if args.outdir is not None else os.getcwd()
    if args.haplotype_master is not None:
        # Parse file containing the reference/consensus sequence (the
        # sequence w.r.t. which SNVs were called)
        header, haplotype_master = read_fasta(args.haplotype_master)
        header = header[0]
        haplotype_master = haplotype_master[0].upper()
        haplotype_master_array = np.array(list(haplotype_master))
        reference_len = haplotype_master_array.size

        if args.msa:
            # Expected if cohort consensus has gaps
            if args.reference:
                tmp, reference = read_fasta(args.reference)
                reference = reference[0].upper()
                reference = np.array(list(reference))
                assert reference.size == haplotype_master_array.size, (
                    "Reference and cohort consensus have different lengths")
                idxs_gaps = haplotype_master_array == '-'
                haplotype_master_array[idxs_gaps] = reference[idxs_gaps]
                args.haplotype_master = os.path.join(outdir,
                                                     'cohort_consensus.fasta')
                cohort_consensus = SeqRecord(Seq(
                    ''.join(haplotype_master_array)),
                                             id=header,
                                             description="")
                with open(args.haplotype_master, 'w') as outfile:
                    SeqIO.write(cohort_consensus, outfile, "fasta")

            haplotype_master_array = haplotype_master_array.astype('c')
            # construct msa: haplotypes + reference/consensus sequence
            infile = os.path.join(outdir, "tmp.fasta")
            sh.cat([args.haplotype_seqs, args.haplotype_master], _out=infile)
            msa_file = os.path.join(outdir, 'haplotypes_re-msa.fasta')
            mafft(infile, msa_file, mafft=args.mafft)
            os.remove(infile)
            # Parse fasta file containing msa
            haplotype_ids, haplotype_seqs = read_fasta(msa_file)
            num_haplotypes = len(haplotype_ids) - 1

            haplotype_ref = haplotype_seqs[-1]
            haplotype_ref = haplotype_ref.upper()
            haplotype_ref = np.array(haplotype_ref, dtype='c')
            if haplotype_ref.size != reference_len:
                assert haplotype_ref.size > reference_len, (
                    "Length of the consensus/reference sequence after the "
                    "MSA is smaller than before")
                # Deletions '-' were placed on the consensus/reference
                # sequence after the msa
                idx_master = 0
                idx_ref = 0
                idxs_ref = np.arange(haplotype_ref.size)
                del_idxs = np.zeros(haplotype_ref.size, dtype=bool)
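                # Each pass locates the next position where the re-aligned
                # reference diverges from the original master (i.e., a gap
                # inserted by the MSA), flags it in del_idxs and resumes the
                # comparison immediately after it; if no mismatch remains,
                # the tail of haplotype_ref consists solely of gaps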
                for i in range(haplotype_ref.size - reference_len):
                    left = min(reference_len + i - idx_ref,
                               haplotype_master_array[idx_master:].size)
                    idxs = haplotype_ref[idx_ref:(
                        idx_ref + left)] == haplotype_master_array[idx_master:]
                    aux = idxs_ref[idx_ref:(idx_ref + left)][~idxs]
                    if aux.size == 0:
                        # gaps '-' were placed until the end of haplotype_ref
                        del_idxs[(idx_ref + left):] = True
                        break
                    else:
                        idx_master = aux[0] - i
                        idx_ref = aux[0] + 1
                        del_idxs[aux[0]] = True

                assert np.all(
                    haplotype_ref[~del_idxs] == haplotype_master_array
                ), "After subtracting gaps, sequences do not agree"
                assert np.all(
                    haplotype_ref[del_idxs] ==
                    b'-'), "Not all subtracted loci correspond to '-'"

            # Parse sequences of the true haplotypes
            haplotype_ids = haplotype_ids[0:num_haplotypes]
            haplotype_seqs = haplotype_seqs[0:num_haplotypes]
            haplotype_seqs_array = np.array(haplotype_seqs, dtype='c')
            # Remove insertions with respect to consensus/reference sequence
            if haplotype_ref.size != reference_len:
                haplotype_seqs_array = haplotype_seqs_array[:, ~del_idxs]
            # Restore gaps into the master sequence
            if args.reference:
                haplotype_master_array[idxs_gaps] = b'-'
        else:
            # Sequences of true haplotypes are already reported using the same
            # indexing as reference/consensus
            # Parse file containing true haplotype sequences
            haplotype_ids, haplotype_seqs = read_fasta(args.haplotype_seqs)
            num_haplotypes = len(haplotype_ids)
            haplotype_seqs_array = np.array(haplotype_seqs, dtype='c')
            haplotype_master_array = haplotype_master_array.astype('c')
    else:
        # if master sequence is not provided, report with respect to the
        # consensus. Note that SNVs are called with respect to the cohort
        # consensus.
        from scipy.stats import mode
        outfile = os.path.join(outdir, 'true_haplotype_msa.fasta')
        mafft(args.haplotype_seqs, outfile, mafft=args.mafft)
        haplotype_ids, haplotype_seqs = read_fasta(outfile)
        num_haplotypes = len(haplotype_ids)
        haplotype_seqs_array = np.array(haplotype_seqs, dtype='c')
        if args.freq_dstr != 'unif':
            haplotype_freqs = frequencies(args.freq_dstr, num_haplotypes,
                                          args.ratio, args.dirichlet_freqs)
            aux = np.repeat(haplotype_seqs_array,
                            np.round(haplotype_freqs * 100).astype(int),
                            axis=0)
            consensus = mode(aux, nan_policy='omit')
        else:
            consensus = mode(haplotype_seqs_array, nan_policy='omit')
        if np.any(consensus[1] < 1):
            print("At some loci the consensus base is ambiguous")
        haplotype_master_array = consensus[0][0]
        reference_len = haplotype_master_array.size

    haplotype_freqs = frequencies(args.freq_dstr, num_haplotypes, args.ratio,
                                  args.dirichlet_freqs)
    # True haplotypes - expected SNVs
    loci_true, ref_true, snvs_true, freq_true, haps_true = true_snvs(
        haplotype_master_array, haplotype_seqs_array, num_haplotypes,
        haplotype_freqs, alphabet)

    if args.output_true:
        output_file = os.path.join(outdir, 'true_snvs.tsv')
        with open(output_file, 'w') as outfile:
            outfile.write('Locus\tRef\tVar\tFreq\tHaplotypes\n')
            for idx in range(len(loci_true)):
                outfile.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    loci_true[idx] + 1, ref_true[idx],
                    snvs_true[idx].decode('utf-8'), freq_true[idx],
                    haps_true[idx]))

    missed = np.zeros(num_haplotypes)
    # TP: loci that are truly polymorphic
    TP = 0
    # FP: technical error reported as SNVs
    FP = 0
    # TN: loci that are not polymorphic
    TN = 0
    # FN: SNVs that are missed
    FN = 0
    # SNV frequencies
    TP_freq = []
    FP_freq = []
    FN_freq = []

    loci = np.arange(reference_len)
    i = 0
    j = 0

    if args.coverage_intervals is not None:
        with open(args.coverage_intervals, 'r') as infile:
            for line in infile:
                record = line.rstrip().split('\t')
                if record[0] == args.sampleID:
                    if len(record) == 1:
                        print("Empty target region")
                        with open(args.outfile, 'w') as outfile:
                            outfile.write('ID\tTP\tFP\tFN\tTN\n')
                        return
                    regions = record[1]
                    break
        regions = regions.split(',')
        idxs = np.zeros(reference_len, dtype=bool)
        print("Reporting using 1-based indexing (and closed intervals)")
        for r in regions:
            aux = r.split(':')
            ref_name = aux[0]
            if args.haplotype_master is not None:
                assert header == ref_name, (
                    f"Name of the reference, {ref_name}, does not agree with "
                    f"fasta file, {header}")
            aux = aux[1].split('-')
            start = int(aux[0])
            end = int(aux[1])
            if args.snv_caller:
                # Region is interpreted as a closed interval using 1-based
                # indexing
                start -= 1
                start = max(0, start)
            else:
                # ShoRAH was used for SNV calling
                # Assuming 3 windows were used for SNV calling, identify
                # region that is covered by at least 2 windows (below, using
                # 0-based indexing and closed intervals)
                start_ = max(0, start - args.window_len - 1)
                end_ = min(reference_len, end + args.window_len)
                num_windows = np.floor(
                    (end_ - (start_ + args.window_len - 1)) /
                    (args.window_len // args.window_shift)) + 1
                offset = ((args.window_shift - 1) * args.window_len /
                          args.window_shift)

                start = max(0, start - offset - 1)
                # In order to identify the region covered by at least two
                # windows, add to the end of the first window the increment
                # multiplied by the number of windows minus 2 (i.e.,
                # discarding the last window). Here, a half-open interval
                # [start, end) is assumed
                end = min(
                    reference_len, start_ + args.window_len +
                    (num_windows - 2) * (args.window_len // args.window_shift))
            idxs[range(int(start), int(end))] = True
            loci_region = loci[int(start):int(end)]

            if DBG:
                print(f"DBG loci_true[i]: {loci_true[i]}")
                print(f"DBG loci_region[0]: {loci_region[0]}")
            # Here, loci are reported using 1-based indexing and a closed
            # interval
            print("Region with enough support: {:d}-{:d}".format(
                trunc(start) + 1, trunc(end)))

            TP, FP, TN, FN, TP_freq, FP_freq, FN_freq, missed, i, j = get_performance(
                loci_true, loci_inferred, snvs_true, snvs_inferred, freq_true,
                freq_inferred, haps_true, num_haplotypes, loci_region, i, j,
                TP, FP, TN, FN, TP_freq, FP_freq, FN_freq, missed)

        loci = loci[idxs]
        if loci_inferred[0] < loci[0] or loci_inferred[-1] > loci[-1]:
            print("Warning: some reported SNVs are outside the target region."
                  " It can happen when target region is smaller than region"
                  " where SNVs were called.")
    else:
        if not args.snv_caller:
            idxs = np.zeros(reference_len, dtype=bool)
            offset = (args.window_len // args.window_shift)
            # Parse coverage intervals from ShoRAH output
            with open(args.coverage, 'r') as infile:
                # Look for regions covered by at least two windows
                start_w = 1
                end_w = 1
                for count, line in enumerate(infile):
                    record = line.rstrip().split("\t")
                    if count == 0:
                        start_w = int(record[2])
                        end_w = int(record[3])
                    else:
                        if int(record[2]) == start_w + offset:
                            start_w = int(record[2])
                            idxs[(start_w - 1):end_w] = True
                        else:
                            start_w = int(record[2])
                        end_w = int(record[3])

            loci_region = np.extract(idxs, loci)

        else:
            if args.coverage is not None:
                with open(args.coverage, 'r') as infile:
                    header = infile.readline().rstrip().split("\t")
                sampleID_idx = [
                    idx for idx, name in enumerate(header)
                    if args.sampleID in name
                ]
                coverage = np.loadtxt(args.coverage,
                                      dtype=int,
                                      delimiter='\t',
                                      skiprows=1,
                                      usecols=(sampleID_idx[0], ))
                assert coverage.size == reference_len, (
                    "Coverage file and reference file do not have the same "
                    "number of loci")
                # Do not account for positions with zero coverage when
                # reporting TP, FP, FN, and especially TN
                mask = coverage <= 0
                loci_region = loci[~mask]
            else:
                raise IOError(
                    "Expected coverage file as input when target region is not specified"
                )

        regions = consecutive(loci_region)

        TP, FP, TN, FN, TP_freq, FP_freq, FN_freq, missed, i, j = get_performance(
            loci_true,
            loci_inferred,
            snvs_true,
            snvs_inferred,
            freq_true,
            freq_inferred,
            haps_true,
            num_haplotypes,
            loci_region,
            i,
            j,
            TP,
            FP,
            TN,
            FN,
            TP_freq,
            FP_freq,
            FN_freq,
            missed,
            coverage_file=True,
            regions=regions)

    # Sensitivity
    if TP or FN:
        print("Sensitivity: {:.6f}".format(TP / (TP + FN)))

    # Precision
    if TP or FP:
        print("Precision: {:.6f}".format(TP / (TP + FP)))

    # Specificity
    if TN or FP:
        print("Specificity: {:.6f}".format(TN / (TN + FP)))

    print("TP: ", TP)
    print("FP: ", FP)
    print("FN: ", FN)
    print("TN: ", TN)
    print("Number of FN per haplotype: ", missed)

    # Write to output file
    with open(args.outfile, 'w') as outfile:
        outfile.write('ID\tTP\tFP\tFN\tTN\n')
        outfile.write(f'{args.sampleID}\t{TP}\t{FP}\t{FN}\t{TN}\n')

    output_file = os.path.join(outdir, 'FN_per_haplotype.tsv')
    with open(output_file, 'w') as outfile:
        for idx, name in enumerate(haplotype_ids):
            aux = name.split(' ')[0]
            outfile.write(f'{aux}\t{missed[idx]}\n')

    output_file = os.path.join(outdir, 'TP_frequencies.tsv')
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['Loci', 'Variant', 'Freq', 'Inferred freq'])
        writer.writerows(TP_freq)

    output_file = os.path.join(outdir, 'FP_frequencies.tsv')
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['Loci', 'Variant', 'Inferred freq'])
        writer.writerows(FP_freq)

    output_file = os.path.join(outdir, 'FN_frequencies.tsv')
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['Loci', 'Variant', 'Freq'])
        writer.writerows(FN_freq)
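# A compact sketch of the confusion-matrix metrics printed above (the counts
# are made up for illustration):
def metrics(TP, FP, FN, TN):
    return {
        "sensitivity": TP / (TP + FN) if (TP + FN) else float("nan"),
        "precision": TP / (TP + FP) if (TP + FP) else float("nan"),
        "specificity": TN / (TN + FP) if (TN + FP) else float("nan"),
    }

print(metrics(TP=90, FP=10, FN=5, TN=895))
# sensitivity ~0.9474, precision 0.9000, specificity ~0.9890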
Example #4
def main():

    args = parse_args()

    alphabet = ["-", "A", "C", "G", "T"]
    alphabet = np.array(alphabet, dtype="c")

    # Parse SNVs reported by the variant caller (ShoRAH or LoFreq)
    df_snvs = parse_vcf(args.snvs, args.snv_caller)

    if df_snvs.empty:
        print("No called SNVs")
        with open(args.outfile, "w") as outfile:
            outfile.write("ID\tTP\tFP\tFN\tTN\n")
        return

    # Drop insertions
    ins_mask = df_snvs["ALT"].str.len() > 1
    df_snvs = df_snvs[~ins_mask]

    if args.only_deletions:
        # Only look at deletions
        # NOTE: temporary work-around while ShoRAH (v1.99.2) is modified to
        #       report indels that comply with the VCF format
        if args.snv_caller == "shorah":
            is_deletion = df_snvs["ALT"] == "-"
        elif args.snv_caller == "lofreq":
            is_deletion = df_snvs["REF"].str.len() > 1
        df_snvs = df_snvs[is_deletion]

    # NOTE: once ShoRAH (v1.99.2) is upgraded to report indels that comply
    #       with the VCF format, --long-dels can also be executed and raising
    #       an error won't be needed
    if args.long_deletions and args.snv_caller == "shorah":
        raise ValueError("No current support for --long-dels and ShoRAH")

    if df_snvs.empty:
        print("No called SNVs")
        with open(args.outfile, "w") as outfile:
            outfile.write("ID\tTP\tFP\tFN\tTN\n")
        return

    if not args.long_deletions:
        # Unroll deletions into one-base deletions
        del_mask = df_snvs["REF"].str.len() > 1
        assert (
            df_snvs.loc[del_mask, "ALT"] == df_snvs.loc[del_mask, "REF"].str[0]
        ).all(), "Reference base preceding deletion does not match"

        del_len = df_snvs.loc[del_mask, "REF"].str.len() - 1
        df_del = pd.DataFrame(
            np.repeat(df_snvs[del_mask].values, del_len.to_list(), axis=0)
        )
        df_del.columns = df_snvs.columns
        df_del["ALT"] = "-"
        aux_idx = 0
        aux_pos = df_del.columns.get_loc("POS")
        aux_ref = df_del.columns.get_loc("REF")
        for idx, row in df_snvs[del_mask].iterrows():
            # ignore first base as it corresponds to the reference at the
            # preceding locus
            ref = list(row["REF"][1:])
            pos = [row["POS"] + x + 1 for x in range(len(ref))]
            df_del.iloc[aux_idx : (aux_idx + del_len[idx]), aux_pos] = pos
            df_del.iloc[aux_idx : (aux_idx + del_len[idx]), aux_ref] = ref
            aux_idx += del_len[idx]

        # Handle special case: the reference sequence might contain a gap
        # character that falls within a long deletion. When unrolling long
        # deletions, the REF and ALT fields will then both contain gap symbols
        is_gap = (df_del["REF"] == "-") & (df_del["ALT"] == "-")
        df_del = df_del[~is_gap]

        # Remove previous rows corresponding to deletions and add the one-base
        # deletions
        df_snvs = df_snvs[~del_mask]
        df_snvs = pd.concat([df_snvs, df_del], ignore_index=True)
        df_snvs = df_snvs.set_index(["POS", "ALT", "REF"])
        df_snvs = df_snvs.sort_index()

        # Aggregate duplicated entries on POS, ALT, REF and CHROM
        grpby = df_snvs.set_index("CHROM", append=True)[["INFO", "FREQ"]].groupby(
            ["POS", "ALT", "REF", "CHROM"]
        )
        df_snvs = pd.concat(
            [grpby["INFO"].apply(lambda s: ";".join(s)), grpby["FREQ"].sum()], axis=1
        )
        # grpby["REF"].first() # If not part of the index

    outdir = args.outdir if args.outdir is not None else os.getcwd()
    if args.haplotype_master is not None:
        # Parse file containing the reference/consensus sequence (the
        # sequence w.r.t. which SNVs were called)
        header, haplotype_master = read_fasta(args.haplotype_master)
        header = header[0]
        haplotype_master = haplotype_master[0].upper()
        haplotype_master_array = np.array(list(haplotype_master))
        reference_len = haplotype_master_array.size

        if args.msa:
            # Expected if cohort consensus has gaps
            if args.reference:
                tmp, reference = read_fasta(args.reference)
                reference = reference[0].upper()
                reference = np.array(list(reference))
                assert (
                    reference.size == haplotype_master_array.size
                ), "Reference and cohort consensus have different lengths"
                idxs_gaps = haplotype_master_array == "-"
                haplotype_master_array[idxs_gaps] = reference[idxs_gaps]
                args.haplotype_master = os.path.join(outdir, "cohort_consensus.fasta")
                cohort_consensus = SeqRecord(
                    Seq("".join(haplotype_master_array)), id=header, description=""
                )
                with open(args.haplotype_master, "w") as outfile:
                    SeqIO.write(cohort_consensus, outfile, "fasta")

            haplotype_master_array = haplotype_master_array.astype("c")
            # construct msa: haplotypes + reference/consensus sequence
            infile = os.path.join(outdir, "tmp.fasta")
            sh.cat([args.haplotype_seqs, args.haplotype_master], _out=infile)
            msa_file = os.path.join(outdir, "haplotypes_re-msa.fasta")
            mafft(infile, msa_file, mafft=args.mafft)
            os.remove(infile)
            # Parse fasta file containing msa
            haplotype_ids, haplotype_seqs = read_fasta(msa_file)
            num_haplotypes = len(haplotype_ids) - 1

            haplotype_ref = haplotype_seqs[-1]
            haplotype_ref = haplotype_ref.upper()
            haplotype_ref = np.array(haplotype_ref, dtype="c")
            if haplotype_ref.size != reference_len:
                assert haplotype_ref.size > reference_len, (
                    "Length of the consensus/reference sequence after the "
                    "MSA is smaller than before"
                )
                # Deletions '-' were placed on the consensus/reference
                # sequence after the msa
                idx_master = 0
                idx_ref = 0
                idxs_ref = np.arange(haplotype_ref.size)
                del_idxs = np.zeros(haplotype_ref.size, dtype=bool)
                for i in range(haplotype_ref.size - reference_len):
                    left = min(
                        reference_len + i - idx_ref,
                        haplotype_master_array[idx_master:].size,
                    )
                    idxs = (
                        haplotype_ref[idx_ref : (idx_ref + left)]
                        == haplotype_master_array[idx_master:]
                    )
                    aux = idxs_ref[idx_ref : (idx_ref + left)][~idxs]
                    if aux.size == 0:
                        # gaps '-' were placed until the end of haplotype_ref
                        del_idxs[(idx_ref + left) :] = True
                        break
                    else:
                        idx_master = aux[0] - i
                        idx_ref = aux[0] + 1
                        del_idxs[aux[0]] = True

                assert np.all(
                    haplotype_ref[~del_idxs] == haplotype_master_array
                ), "After subtracting gaps, sequences do not agree"
                assert np.all(
                    haplotype_ref[del_idxs] == b"-"
                ), "Not all subtracted loci correspond to '-'"

            # Parse sequences of the true haplotypes
            haplotype_ids = haplotype_ids[0:num_haplotypes]
            haplotype_seqs = haplotype_seqs[0:num_haplotypes]
            haplotype_seqs_array = np.array(haplotype_seqs, dtype="c")
            # Remove insertions with respect to consensus/reference sequence
            if haplotype_ref.size != reference_len:
                haplotype_seqs_array = haplotype_seqs_array[:, ~del_idxs]
            # Restore gaps into the master sequence
            if args.reference:
                haplotype_master_array[idxs_gaps] = b"-"
        else:
            # Sequences of true haplotypes are already reported using the same
            # indexing as reference/consensus
            # Parse file containing true haplotype sequences
            haplotype_ids, haplotype_seqs = read_fasta(args.haplotype_seqs)
            num_haplotypes = len(haplotype_ids)
            haplotype_seqs_array = np.array(haplotype_seqs, dtype="c")
            haplotype_master_array = haplotype_master_array.astype("c")
    else:
        # if master sequence is not provided, report with respect to the
        # consensus. Note that SNVs are called with respect to the cohort
        # consensus.
        from scipy.stats import mode

        outfile = os.path.join(outdir, "true_haplotype_msa.fasta")
        mafft(args.haplotype_seqs, outfile, mafft=args.mafft)
        haplotype_ids, haplotype_seqs = read_fasta(outfile)
        num_haplotypes = len(haplotype_ids)
        haplotype_seqs_array = np.array(haplotype_seqs, dtype="c")
        if args.freq_dstr != "unif":
            haplotype_freqs = frequencies(
                args.freq_dstr, num_haplotypes, args.ratio, args.dirichlet_freqs
            )
            aux = np.repeat(
                haplotype_seqs_array,
                np.round(haplotype_freqs * 100).astype(int),
                axis=0,
            )
            consensus = mode(aux, nan_policy="omit")
        else:
            consensus = mode(haplotype_seqs_array, nan_policy="omit")
        if np.any(consensus[1] < 1):
            print("At some loci the consensus base is ambiguous")
        haplotype_master_array = consensus[0][0]
        reference_len = haplotype_master_array.size

    haplotype_freqs = frequencies(
        args.freq_dstr, num_haplotypes, args.ratio, args.dirichlet_freqs
    )

    # missed = np.zeros(num_haplotypes)

    df_snvs_expected = true_snvs(
        haplotype_master_array,
        haplotype_master,
        haplotype_seqs_array,
        num_haplotypes,
        haplotype_freqs,
        args.long_deletions,
        alphabet,
    )

    if args.only_deletions:
        # Only look at deletions: drop other entries in expected SNVs dataframe
        if args.long_deletions:
            is_deletion = df_snvs_expected["REF"].str.len() > 1
        else:
            is_deletion = df_snvs_expected["ALT"].str.startswith("-")
        df_snvs_expected = df_snvs_expected[is_deletion]

    # Keep track of SNVs that fall within targeted regions
    df_snvs["IS_CONTAINED"] = False
    df_snvs_expected["IS_CONTAINED"] = False
    if args.long_deletions:
        deletion_length = df_snvs["REF"].str.len() - 1
        is_deletion = deletion_length > 0
        # Using 0-based indexing
        start_locus = df_snvs["POS"] - 1
        start_locus[is_deletion] += 1
        end_locus = start_locus + deletion_length - 1
        # Similarly for expected SNVs (Already uses 0-based indexing)
        deletion_length_exp = df_snvs_expected["REF"].str.len() - 1
        is_deletion_exp = deletion_length_exp > 0
        start_locus_exp = df_snvs_expected["POS"].copy()
        start_locus_exp[is_deletion_exp] += 1
        end_locus_exp = start_locus_exp + deletion_length_exp - 1
    else:
        # Handle SNVs and single-nucleotide deletions
        # Using 0-based indexing
        start_locus = df_snvs.index.get_level_values("POS") - 1
        end_locus = None
        # Similarly for expected SNVs (Already uses 0-based indexing)
        start_locus_exp = df_snvs_expected["POS"]
        end_locus_exp = None

    if args.coverage_intervals is not None:
        with open(args.coverage_intervals, "r") as infile:
            for line in infile:
                record = line.rstrip().split("\t")
                if record[0] == args.sampleID:
                    if len(record) == 1:
                        print("Empty target region")
                        with open(args.outfile, "w") as outfile:
                            outfile.write("ID\tTP\tFP\tFN\tTN\n")
                        return
                    regions = record[1]
                    break
        regions = regions.split(",")
        idxs = np.zeros(reference_len, dtype=bool)
        print("Reporting using 1-based indexing (and closed intervals)")
        num_loci = 0
        for r in regions:
            aux = r.split(":")
            ref_name = aux[0]
            if args.haplotype_master is not None:
                assert header == ref_name, (
                    f"Name of the reference, {ref_name}, does not agree with "
                    f"fasta file, {header}"
                )
            aux = aux[1].split("-")
            start = int(aux[0])
            end = int(aux[1])
            if args.snv_caller == "shorah" and not args.no_expansion:
                # ShoRAH was used for SNV calling
                # Assuming 3 windows were used for SNV calling, identify
                # region that is covered by at least 2 windows (below, using
                # 0-based indexing and closed intervals)
                start_ = max(0, start - args.window_len - 1)
                end_ = min(reference_len, end + args.window_len)
                num_windows = (
                    np.floor(
                        (end_ - (start_ + args.window_len - 1))
                        / (args.window_len // args.window_shift)
                    )
                    + 1
                )
                offset = (args.window_shift - 1) * args.window_len / args.window_shift

                start = max(0, start - offset - 1)
                # In order to identify the region covered by at least two
                # windows, add to the end of the first window the increment
                # multiplied by the number of windows minus 2 (i.e.,
                # discarding the last window). Here, a half-open interval
                # [start, end) is assumed
                end = min(
                    reference_len,
                    start_
                    + args.window_len
                    + (num_windows - 2) * (args.window_len // args.window_shift),
                )
            # idxs[range(int(start), int(end))] = True
            # loci_region = loci[int(start):int(end)]

            # if DBG:
            #     print(f"DBG loci_true[i]: {loci_true[i]}")
            #     print(f"DBG loci_region[0]: {loci_region[0]}")
            # Here, loci are reported using 1-based indexing and a closed
            # interval
            num_loci += end - start
            start = int(start)
            end = int(end)
            print(f"Region with enough support: {start + 1}-{end}")

            # Mark reported and expected SNVs within the region
            is_contained = target_snvs(
                start, end, start_locus, args.long_deletions, end_locus
            )
            df_snvs["IS_CONTAINED"] = df_snvs["IS_CONTAINED"] | is_contained
            is_contained = target_snvs(
                start, end, start_locus_exp, args.long_deletions, end_locus_exp
            )
            df_snvs_expected["IS_CONTAINED"] = (
                df_snvs_expected["IS_CONTAINED"] | is_contained
            )

    else:
        loci = np.arange(reference_len)
        if args.snv_caller == "shorah":
            idxs = np.zeros(reference_len, dtype=bool)
            offset = args.window_len // args.window_shift
            # Parse coverage intervals from ShoRAH output
            with open(args.coverage, "r") as infile:
                # Look for regions covered by at least two windows
                start_w = 1
                end_w = 1
                for count, line in enumerate(infile):
                    record = line.rstrip().split("\t")
                    if count == 0:
                        start_w = int(record[2])
                        end_w = int(record[3])
                    else:
                        if int(record[2]) == start_w + offset:
                            start_w = int(record[2])
                            idxs[(start_w - 1) : end_w] = True
                        else:
                            start_w = int(record[2])
                        end_w = int(record[3])

            loci_region = np.extract(idxs, loci)

        else:
            if args.coverage is not None:
                with open(args.coverage, "r") as infile:
                    header = infile.readline().rstrip().split("\t")
                sampleID_idx = [
                    idx for idx, name in enumerate(header) if args.sampleID in name
                ]
                coverage = np.loadtxt(
                    args.coverage,
                    dtype=int,
                    delimiter="\t",
                    skiprows=1,
                    usecols=(sampleID_idx[0],),
                )
                assert coverage.size == reference_len, (
                    "Coverage file and reference file do not have the same "
                    "number of loci"
                )
                # Do not account for positions with zero coverage when
                # reporting TP, FP, FN, and especially TN
                mask = coverage <= 0
                loci_region = loci[~mask]
            else:
                raise IOError(
                    "Expected coverage file as input when target region is not specified"
                )

        num_loci = loci_region.size
        regions = consecutive(loci_region)
        start = [el[0] for el in regions]
        end = [el[-1] + 1 for el in regions]
        for si, ei in zip(start, end):
            # Mark reported and expected SNVs within the region
            is_contained = target_snvs(
                si, ei, start_locus, args.long_deletions, end_locus
            )
            df_snvs["IS_CONTAINED"] = df_snvs["IS_CONTAINED"] | is_contained
            is_contained = target_snvs(
                si, ei, start_locus_exp, args.long_deletions, end_locus_exp
            )
            df_snvs_expected["IS_CONTAINED"] = (
                df_snvs_expected["IS_CONTAINED"] | is_contained
            )

    # Drop SNVs that fall outside of the targeted regions. Otherwise, these
    # rows will be counted toward false positives/negatives.
    df_snvs = df_snvs[df_snvs["IS_CONTAINED"]]
    df_snvs_expected = df_snvs_expected[df_snvs_expected["IS_CONTAINED"]]

    # Convert expected SNVs to 1-based indexing so that positions are
    # comparable with the reported SNVs (VCF positions are 1-based)
    df_snvs_expected["POS"] += 1

    if args.output_true:
        output_file = os.path.join(outdir, "true_snvs.tsv")
        # Report using 1-based indexing
        df_snvs_expected.to_csv(
            output_file,
            sep="\t",
            columns=["POS", "REF", "ALT", "FREQ", "HAPLOTYPES"],
            header=["Loci", "Reference", "Variant", "Frequency", "Haplotypes"],
            index=False,
            compression=None,
        )

    # Join expected and reported SNVs on POS, ALT and REF
    df_pairs = df_snvs_expected.merge(
        df_snvs, how="outer", on=["POS", "ALT", "REF"], suffixes=["_exp", "_rep"]
    )

    FN_mask = df_pairs["INFO"].isnull()
    FN = sum(FN_mask)

    FP_mask = df_pairs["HAPLOTYPES"].isnull()
    FP = sum(FP_mask)

    TP_mask = ~FN_mask & ~FP_mask
    TP = sum(TP_mask)

    TN = num_loci - len(df_pairs["POS"].value_counts())
    # Sensitivity
    if TP or FN:
        print("Sensitivity: {:.6f}".format(TP / (TP + FN)))

    # Precision
    if TP or FP:
        print("Precision: {:.6f}".format(TP / (TP + FP)))

    # Specificity
    if TN or FP:
        print("Specificity: {:.6f}".format(TN / (TN + FP)))

    print("TP: ", TP)
    print("FP: ", FP)
    print("FN: ", FN)
    print("TN: ", int(TN))
    # print("Number of FN per haplotype: ", missed)

    # Write to output file
    with open(args.outfile, "w") as outfile:
        outfile.write("ID\tTP\tFP\tFN\tTN\n")
        outfile.write(f"{args.sampleID}\t{TP}\t{FP}\t{FN}\t{int(TN)}\n")

    # output_file = os.path.join(outdir, 'FN_per_haplotype.tsv')
    # with open(output_file, 'w') as outfile:
    #     for idx, name in enumerate(haplotype_ids):
    #         aux = name.split(' ')[0]
    #         outfile.write(f'{aux}\t{missed[idx]}\n')

    output_file = os.path.join(outdir, "TP_frequencies.tsv")
    df_pairs[TP_mask].to_csv(
        output_file,
        sep="\t",
        columns=["POS", "REF", "ALT", "FREQ_exp", "FREQ_rep", "INFO"],
        header=[
            "Loci",
            "Reference",
            "Variant",
            "Freq (expected)",
            "Freq (reported)",
            "Info",
        ],
        index=False,
        compression=None,
    )

    output_file = os.path.join(outdir, "FP_frequencies.tsv")
    df_pairs[FP_mask].to_csv(
        output_file,
        sep="\t",
        columns=["POS", "REF", "ALT", "FREQ_rep", "INFO"],
        header=["Loci", "Reference", "Variant", "Freq", "Info"],
        index=False,
        compression=None,
    )

    output_file = os.path.join(outdir, "FN_frequencies.tsv")
    df_pairs[FN_mask].to_csv(
        output_file,
        sep="\t",
        columns=["POS", "REF", "ALT", "FREQ_exp", "HAPLOTYPES"],
        header=["Loci", "Reference", "Variant", "Freq", "Haplotypes"],
        index=False,
        compression=None,
    )
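# A toy illustration of the deletion-unrolling step above for a single VCF
# row (POS=10, REF="ACGT", ALT="A"; the values are made up):
import pandas as pd

row = {"POS": 10, "REF": "ACGT", "ALT": "A"}
# Skip the first REF base: it is the reference base at the preceding locus
ref = list(row["REF"][1:])
pos = [row["POS"] + i + 1 for i in range(len(ref))]
df_del = pd.DataFrame({"POS": pos, "REF": ref, "ALT": "-"})
print(df_del)
#    POS REF ALT
# 0   11   C   -
# 1   12   G   -
# 2   13   T   -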