def checkReferenceAllels(refFile, siteFile):
    """Check the reference alleles listed in a site file against a reference genome.

    Each non-blank line of ``siteFile`` is split on whitespace. Lines whose
    first field starts with ``chr`` (case-insensitive) are skipped; every
    other line must provide at least five fields, read as chromosome,
    position, id, reference allele and alternative allele. The reference
    allele is compared case-insensitively with the base returned by
    ``GenomeSequence.getBase1(chrom, pos)``; each disagreement is logged as
    a warning.

    Args:
        refFile: path of the reference genome, opened through
            ``XiaoweiLib.GenomeSequence``.
        siteFile: path of the site file, opened through ``myopen``.

    Returns:
        0 when no mismatch is found. When at least one mismatch is found the
        function logs a summary and terminates the process via
        ``sys.exit(1)``.

    Raises:
        ValueError: if a data line has fewer than five fields or a
            non-integer position.
    """
    # Function-scope import: XiaoweiLib is only needed when this check runs.
    from XiaoweiLib import GenomeSequence

    gs = GenomeSequence()
    chroms = gs.open(refFile)
    logger.info("Load reference file: %s (%d chromosomes)" % (refFile, chroms))

    # The file names are supplied as format arguments rather than being
    # concatenated into the template, so a '%' inside a path cannot be
    # misread as a conversion specifier.
    msg = "Error: Found mismatched reference alleles at %s:%d. [%s in %s vs. %s in %s]"

    mismatchedRefAllele = 0
    for ln in myopen(siteFile):
        fd = ln.strip().split()
        if not fd:
            # Blank line: nothing to check (previously an IndexError on fd[0]).
            continue
        # NOTE(review): meant to skip a header line, but it also skips any
        # record whose chromosome name is "chr"-prefixed -- confirm that site
        # files never use that naming.
        if fd[0].lower().startswith('chr'):
            continue
        chrom, pos, _rsid, ref, _alt = fd[:5]
        pos = int(pos)
        trueRef = gs.getBase1(chrom, pos)
        if trueRef.lower() != ref.lower():
            mismatchedRefAllele += 1
            logger.warn(msg % (chrom, pos, ref, siteFile, trueRef, refFile))

    if mismatchedRefAllele == 0:
        logger.info("No mismatched reference alleles detected.")
    else:
        logger.info(
            "Detected %d mismatched reference alleles. Please fix and rerun this script."
            % mismatchedRefAllele)
        sys.exit(1)
    return 0
def getVariant(fn):
    """Return the number of variants reported in the file ``fn``.

    Every line of ``fn`` (opened through ``myopen``) is searched for the
    message ``Total [ N ] variants are used to calculate autosomal kinship
    matrix.``. The whole file is scanned, so when several lines match, the
    count from the last one is returned.

    Returns:
        The reported count as an int, or -1 when no line matches.
    """
    pattern = re.compile(r'Total \[ (\d+) \] variants are used to calculate autosomal kinship matrix.')
    count = -1
    for line in myopen(fn):
        hit = pattern.search(line)
        if hit is None:
            continue
        # Keep scanning: a later matching line overrides an earlier one.
        count = hit.group(1)
    return int(count)
def loadKinship(fn):
    """Load a kinship matrix from the whitespace-delimited file ``fn``.

    The first line is treated as a header; only its number of fields is
    used. Every following line contributes its first two fields as a sample
    identifier pair and its remaining fields, converted to float, as one
    matrix row.

    The expected number of samples is the header's field count minus two.
    If the number of rows, or the length of any row, differs from that
    number, a message is written to stderr; loading is not aborted.

    Returns:
        A tuple ``(ids, kin)``: the list of two-field identifier lists and
        the list of float rows, both in file order.
    """
    sampleIds = []
    matrix = []
    headerWidth = -1
    for lineNo, line in enumerate(myopen(fn)):
        fields = line.strip().split()
        if lineNo == 0:
            # Header line: remember its width only.
            headerWidth = len(fields)
        else:
            sampleIds.append(fields[:2])
            matrix.append([float(value) for value in fields[2:]])

    nsample = headerWidth - 2
    dimensionsAgree = (len(sampleIds) == nsample
                       and all(len(row) == nsample for row in matrix))
    if not dimensionsAgree:
        print >> sys.stderr, "Dimension not match in ", fn
    print >> sys.stderr, "Kinship file %s with %d samples loaded" % (fn, nsample)
    return sampleIds, matrix
continue assert (len(val) == 2), i pileupId[i[0]] = val ## print pileupId logger.info("%d sample id loaded" % len(pileupId)) # covFile = open(outPrefix + '.coverage', 'w') # refCountFile = open(outPrefix + '.refCount', 'w') # printHeader(colDict, mapRef, covFile) # printHeader(colDict, mapRef, refCountFile) #logFile = open(arg_outPrefix + '.log', 'w') from collections import Counter seqFile = open(arg_outPrefix + '.seq', 'w') for fn in fns: res = {} for ln in myopen(fn): # loop each pileup file fd = ln.strip().split() if len(fd) == 4: ## after samtools 1.0.18, trancated pileup lines will be outputted ## and we will need to ignore them chrom, pos, ref, depth = fd refCount, altCount, qual = 0, 0, 0 elif len(fd) == 6: chrom, pos, ref, depth, reads, quals = ln.strip().split() try: refCount, altCount = count(ref, reads) qual = calculateMeanQuality(quals) except: logger.warn( "Cannot parse pileup data, entering debug mode ...")