Esempio n. 1
0
def knownRegions(refdict, chromosomes):
    """Build one whole-chromosome region for every requested chromosome.

    The file at ``refdict`` is scanned for lines containing ``@SQ``; the
    token following ``SN:`` is taken as the sequence name and the token
    following ``LN:`` as its length, which becomes the region end.

    A sequence name is matched against ``chromosomes`` exactly as written
    first. Only when that fails and the name does not already start with
    ``"chr"`` is the ``"chr"``-prefixed form tried, so dictionaries and
    chromosome lists may use either naming style.

    Args:
        refdict: Path of the reference dictionary file to read.
        chromosomes: Collection of chromosome names to look up. It is
            iterated more than once, so it must not be a one-shot iterator.

    Returns:
        A dict mapping each chromosome to ``[(0, end)]``. ``end`` is ``None``
        for chromosomes that were not found in the file; a WARN message
        listing them is logged in that case.

    Raises:
        AssertionError: If ``refdict`` is not an existing file, or an ``@SQ``
            line lacks an ``SN:`` or ``LN:`` field.
    """
    ends = {c: None for c in chromosomes}
    assert os.path.isfile(refdict)

    with open(refdict, 'r') as handle:
        for line in handle:
            if "@SQ" not in line:
                continue
            assert "SN:" in line and "LN:" in line
            name = line.split('SN:')[1].split()[0]
            # Prefer the name as written; add the "chr" prefix only as a
            # fallback, and never to a name that already carries it.
            if name not in ends and not name.startswith("chr"):
                name = "chr{}".format(name)
            if name in ends:
                ends[name] = int(line.split('LN:')[1].split()[0])

    missing = [c for c in ends if ends[c] is None]
    if missing:
        log(msg=
            "The following chromosomes have not been found in the dictionary of the reference genome: \n\t{}"
            .format(','.join(missing)),
            level="WARN")

    return {c: [(0, ends[c])] for c in chromosomes}
Esempio n. 2
0
def main():
    """Parse the BAF arguments, log them, and run the matching mode.

    ``defaultMode`` is used when the parsed arguments carry a truthy
    ``"reference"`` entry; otherwise ``naiveMode`` is used.
    """
    banner = ("# Parsing the input arguments, checking the consistency of given files, "
              "and extracting required information\n")
    log(msg=banner, level="STEP")

    args = ap.parse_baf_arguments()
    logArgs(args, 80)

    # Pick the handler first, then invoke it with the parsed arguments.
    run_mode = defaultMode if args["reference"] else naiveMode
    run_mode(args)
Esempio n. 3
0
def defaultMode(args):
    """Run the reference-based pipeline driven by the ``args`` dict.

    Steps, in order:
      1. ``SNPCalling.call`` on the normal sample.
      2. ``selectHetSNPs`` on the result of step 1.
      3. Write two fields of every selected SNP to ``args["outputSnps"]``.
      4. Write the five-field records of the normal sample to
         ``args["outputNormal"]``, or to standard output when it is ``None``.
      5. ``AlleleCounting.count`` on the tumor samples, restricted to the
         SNP list written in step 3.
      6. Write the five-field tumor records to ``args["outputTumors"]``, or
         to standard output when it is ``None``.

    ``sp.close`` is called with a message whenever a step yields an empty
    result (presumably terminating the program -- confirm in ``sp``).
    """
    record_fmt = "{}\t{}\t{}\t{}\t{}\n"

    def emit(write, table, samples):
        # Emit one tab-separated line per record, walking samples and then
        # chromosomes, and skipping (sample, chromosome) pairs absent
        # from the table.
        for sample in samples:
            for chro in args["chromosomes"]:
                key = (sample[1], chro)
                if key in table:
                    for rec in table[key]:
                        write(record_fmt.format(rec[0], rec[1], rec[2],
                                                rec[3], rec[4]))

    log(msg="# Inferring SNPs from the normal sample\n", level="STEP")

    snps = SNPCalling.call(
        samtools=args["samtools"],
        bcftools=args["bcftools"],
        reference=args["reference"],
        samples=[args["normal"]],
        chromosomes=args["chromosomes"],
        num_workers=args["j"],
        snplist=args["snps"],
        q=args["q"],
        Q=args["Q"],
        qual=args["qual"],
        mincov=args["mincov"],
        dp=args["maxcov"],
        E=args["E"],
        regions=args["regions"],
        verbose=args["verbose"],
    )
    if not snps:
        sp.close("No SNPs found in the normal!\n")

    log(msg="# Selecting heterozygous SNPs\n", level="STEP")

    hetSNPs = selectHetSNPs(
        counts=snps,
        gamma=args["gamma"],
        maxshift=args["maxshift"],
        verbose=args["verbose"],
    )
    if not hetSNPs:
        sp.close(
            "No heterozygous SNPs found in the selected regions of the normal!\n"
        )

    log(msg=
        "# Writing the list of selected SNPs, covered and heterozygous in the normal sample\n",
        level="STEP")

    with open(args["outputSnps"], 'w') as out:
        for chro in args["chromosomes"]:
            key = (args["normal"][1], chro)
            if key in hetSNPs:
                for snp in hetSNPs[key]:
                    out.write("{}\t{}\n".format(snp[1], snp[2]))

    log(msg=
        "# Writing the allele counts of the normal sample for selected SNPs\n",
        level="STEP")

    if args["outputNormal"] is None:
        emit(sys.stdout.write, hetSNPs, [args["normal"]])
    else:
        with open(args["outputNormal"], 'w') as out:
            emit(out.write, hetSNPs, [args["normal"]])

    log(msg="# Counting the alleles of tumor samples for selected SNPs\n",
        level="STEP")

    counts = AlleleCounting.count(
        samtools=args["samtools"],
        bcftools=args["bcftools"],
        reference=args["reference"],
        samples=args["samples"],
        chromosomes=args["chromosomes"],
        num_workers=args["j"],
        snplist=args["outputSnps"],
        q=args["q"],
        Q=args["Q"],
        mincov=args["mincov"],
        dp=args["maxcov"],
        E=args["E"],
        verbose=args["verbose"],
    )
    if not counts:
        sp.close("The selected SNPs are not covered in the tumors!\n")

    log(msg="# Writing the allele counts of tumor samples for selected SNPs\n",
        level="STEP")

    if args["outputTumors"] is None:
        emit(sys.stdout.write, counts, args["samples"])
    else:
        with open(args["outputTumors"], 'w') as out:
            emit(out.write, counts, args["samples"])
Esempio n. 4
0
def logArgs(args, width):
    """Log every entry of ``args`` as a tab-indented ``key: value`` line.

    The lines are sent as a single INFO message that starts with a newline.
    ``width`` is accepted but not used by this function.
    """
    entries = ["\t{}: {}\n".format(name, args[name]) for name in args]
    log(msg="\n" + "".join(entries), level="INFO")
Esempio n. 5
0
def main():
    """Bin and count reads for the normal and tumor samples.

    Steps, in order:
      1. Parse and log the arguments (``ap.parse_bin_arguments``).
      2. Determine the regions per chromosome: from ``knownRegions`` when
         ``args["regions"]`` is ``None``, otherwise from ``ap.parseRegions``.
      3. Run ``bb.bin`` on the normal sample and write its five-field records
         to ``args["outputNormal"]`` (standard output when ``None``).
      4. Run ``bb.bin`` on the tumor samples and write their records to
         ``args["outputTumors"]`` (standard output when ``None``).
      5. Run ``tc.tcount`` on all samples, sum the counts over the
         chromosomes per sample, and write ``name<TAB>total`` lines to
         ``args["outputTotal"]``, normal sample first.

    Raises:
        SystemExit: If binning returns nothing for the normal sample or for
            the tumor samples.
        KeyError: If a (sample, chromosome) pair is missing from the binning
            results or from the total counts.
    """
    record_fmt = "{}\t{}\t{}\t{}\t{}\n"

    def write_bins(write, bins, samples):
        # Emit one tab-separated line per bin record, walking samples and
        # then chromosomes. A missing (sample, chromosome) key raises
        # KeyError.
        for sample in samples:
            for c in args["chromosomes"]:
                for count in bins[sample[1], c]:
                    write(record_fmt.format(count[0], count[1], count[2],
                                            count[3], count[4]))

    log(msg="# Parsing and checking input arguments\n", level="STEP")
    args = ap.parse_bin_arguments()
    logArgs(args, 80)

    if args["regions"] is None:
        log(msg=
            "# Retrieving genomic regions to consider from maximum chromosome length\n",
            level="STEP")
        regions = knownRegions(args["refdict"], args["chromosomes"])
    else:
        log(msg="# Checking the consistency of the given regions\n",
            level="STEP")
        regions = ap.parseRegions(args["regions"], args["chromosomes"])

    if args["verbose"]:
        listing = "".join(" {}: {}".format(c, regions[c])
                          for c in args["chromosomes"])
        log(msg="regions: " + listing + "\n", level="INFO")

    log(msg="# Binning and counting the normal sample\n", level="STEP")

    normal_bins = bb.bin(samtools=args["samtools"],
                         samples=[args["normal"]],
                         chromosomes=args["chromosomes"],
                         num_workers=args["j"],
                         q=args["q"],
                         size=args["size"],
                         regions=regions,
                         verbose=args["verbose"])

    if not normal_bins:
        raise SystemExit("No bins in the normal sample!\n")

    log(msg="# Writing the read counts for bins of normal sample\n",
        level="STEP")
    if args["outputNormal"] is not None:
        with open(args["outputNormal"], 'w') as f:
            write_bins(f.write, normal_bins, [args["normal"]])
    else:
        write_bins(sys.stdout.write, normal_bins, [args["normal"]])

    log(msg="# Binning and counting the tumor samples\n", level="STEP")
    tumor_bins = bb.bin(samtools=args["samtools"],
                        samples=args["samples"],
                        chromosomes=args["chromosomes"],
                        num_workers=args["j"],
                        q=args["q"],
                        size=args["size"],
                        regions=regions,
                        verbose=args["verbose"])

    if not tumor_bins:
        # `log` is a plain logging callable and has no `close` attribute;
        # stop the same way as the normal-sample check above.
        raise SystemExit("No bins in the tumor samples!\n")

    log(msg="# Writing the read counts for bins of tumor samples\n",
        level="STEP")
    if args["outputTumors"] is not None:
        with open(args["outputTumors"], 'w') as f:
            write_bins(f.write, tumor_bins, args["samples"])
    else:
        write_bins(sys.stdout.write, tumor_bins, args["samples"])

    log(msg="# Counting total number of reads for normal and tumor samples\n",
        level="STEP")
    # The set union requires args["samples"] to be a set and args["normal"]
    # to be hashable.
    total_counts = tc.tcount(samtools=args["samtools"],
                             samples=({args["normal"]} | args["samples"]),
                             chromosomes=args["chromosomes"],
                             num_workers=args["j"],
                             q=args["q"],
                             verbose=args["verbose"])

    try:
        total = {
            sample[1]: sum(total_counts[sample[1], chromosome]
                           for chromosome in args["chromosomes"])
            for sample in args["samples"]
        }
        total[args["normal"][1]] = sum(total_counts[args["normal"][1],
                                                    chromosome]
                                       for chromosome in args["chromosomes"])
    except KeyError:
        raise KeyError(
            "Either a chromosome or a sample has not been considered in the total counting!"
        )

    log(msg="# Writing the total read counts for all samples in {}\n".format(
        args["outputTotal"]),
        level="STEP")
    with open(args["outputTotal"], 'w') as f:
        f.write("{}\t{}\n".format(args["normal"][1], total[args["normal"][1]]))
        for sample in args["samples"]:
            f.write("{}\t{}\n".format(sample[1], total[sample[1]]))