Exemple #1
0
def checkReferenceAllels(refFile, siteFile):
    """Check the reference alleles of a site file against a reference genome.

    Every non-skipped line of ``siteFile`` must provide at least five
    whitespace-separated fields: chromosome, position, rsid, reference
    allele and alternative allele.  The reference allele is compared
    case-insensitively with the base returned by
    ``GenomeSequence.getBase1(chrom, pos)``; each disagreement is logged
    as a warning.

    Args:
        refFile: path of the reference sequence, opened through
            ``GenomeSequence.open``.
        siteFile: path of the site list, read through ``myopen``.

    Returns:
        0 when no mismatched reference allele was found.

    Raises:
        SystemExit: with status 1 (via ``sys.exit``) when at least one
            mismatch was detected.
        ValueError: when a data line has fewer than five fields or a
            non-integer position.
    """
    from XiaoweiLib import GenomeSequence

    gs = GenomeSequence()
    chroms = gs.open(refFile)
    logger.info("Load reference file: " + refFile +
                " (%d chromosomes)" % chroms)

    # The file names are supplied as format arguments instead of being
    # concatenated into the template, so a path that contains '%' cannot
    # corrupt the format string.
    msg = ("Error: Found mismatched reference alleles at %s:%d. "
           "[%s in %s vs. %s in %s]")
    mismatchedRefAllele = 0
    for ln in myopen(siteFile):
        fd = ln.strip().split()
        if not fd:
            # Blank line: nothing to check (previously an IndexError).
            continue
        # NOTE(review): this skips every line whose first field starts with
        # 'chr' (case-insensitive) -- header lines such as "CHROM", but also
        # data lines using chr-prefixed chromosome names.  Confirm that the
        # latter is intended.
        if fd[0].lower().startswith('chr'):
            continue
        chrom, pos, _rsid, ref, _alt = fd[:5]
        pos = int(pos)
        trueRef = gs.getBase1(chrom, pos)

        if trueRef.lower() != ref.lower():
            mismatchedRefAllele += 1
            logger.warn(msg % (chrom, pos, ref, siteFile, trueRef, refFile))

    if mismatchedRefAllele == 0:
        logger.info("No mismatched reference alleles detected.")
    else:
        logger.info(
            "Detected %d mismatched reference alleles. Please fix and rerun this script."
            % mismatchedRefAllele)
        sys.exit(1)
    return 0
Exemple #2
0
def getVariant(fn):
    """Return the variant count reported in the file ``fn``.

    The file is scanned line by line for the sentence
    "Total [ N ] variants are used to calculate autosomal kinship matrix."
    and the N of the last matching line is returned as an int.  When no
    line matches, -1 is returned.
    """
    pattern = re.compile(r'Total \[ (\d+) \] variants are used to calculate autosomal kinship matrix.')
    count = -1
    for line in myopen(fn):
        hit = pattern.search(line)
        if not hit:
            continue
        # Later matches overwrite earlier ones: the last report wins.
        count = hit.group(1)
    return int(count)
Exemple #3
0
def loadKinship(fn):
    """Load a kinship matrix from a whitespace-delimited text file.

    The first line is treated as a header; only its number of columns is
    used.  From every following line the first two fields are kept as an
    ID entry and the remaining fields are converted to float to form one
    matrix row.

    The expected number of samples is the header column count minus two.
    If the number of data lines or the length of any row differs from it,
    a message is written to stderr, but the data are still returned.

    Args:
        fn: path of the kinship file, read through ``myopen``.

    Returns:
        A tuple ``(ids, kin)``: ``ids`` is a list of ``fd[:2]`` slices (one
        per data line) and ``kin`` is a list of lists of floats.

    Raises:
        ValueError: when a matrix field cannot be converted to float.
    """
    ids = []
    kin = []
    ncol = -1
    for lineNo, ln in enumerate(myopen(fn)):
        fd = ln.strip().split()
        if lineNo == 0:
            ncol = len(fd)
            continue
        ids.append(fd[:2])
        # A distinct name keeps the comprehension variable from clobbering
        # the line index, which Python 2 list comprehensions would do.
        kin.append([float(value) for value in fd[2:]])
    nsample = ncol - 2
    # sys.stderr.write works on both Python 2 and 3, unlike the
    # "print >> sys.stderr" statement; the emitted text is unchanged
    # (including the double space the print statement produced).
    if len(ids) != nsample or any(len(row) != nsample for row in kin):
        sys.stderr.write("Dimension not match in  %s\n" % (fn,))
    sys.stderr.write("Kinship file %s with %d samples loaded\n" % (fn, nsample))
    return ids, kin
Exemple #4
0
                continue
            assert (len(val) == 2), i
            pileupId[i[0]] = val
        ## print pileupId
        logger.info("%d sample id loaded" % len(pileupId))

    # covFile = open(outPrefix + '.coverage', 'w')
    # refCountFile = open(outPrefix + '.refCount', 'w')
    # printHeader(colDict, mapRef, covFile)
    # printHeader(colDict, mapRef, refCountFile)
    #logFile = open(arg_outPrefix + '.log', 'w')
    from collections import Counter
    seqFile = open(arg_outPrefix + '.seq', 'w')
    for fn in fns:
        res = {}
        for ln in myopen(fn):  # loop each pileup file
            fd = ln.strip().split()
            if len(fd) == 4:
                ## after samtools 1.0.18, truncated pileup lines will be output
                ## and we need to ignore them
                chrom, pos, ref, depth = fd
                refCount, altCount, qual = 0, 0, 0
            elif len(fd) == 6:
                chrom, pos, ref, depth, reads, quals = ln.strip().split()

                try:
                    refCount, altCount = count(ref, reads)
                    qual = calculateMeanQuality(quals)
                except:
                    logger.warn(
                        "Cannot parse pileup data, entering debug mode ...")