def FetchPatternCov(curPattern, seqLis, kmer2seqIdInt):
    """
    Fetch the IDs of the sequences in seqLis that curPattern covers.

    ***
    To speed things up, kmer2seqIdInt is used as a cache and is updated
    in place:
    1. if curPattern is already a key, its stored value is reused;
    2. otherwise the pattern is expanded into k-mers with
       FetchAllKmerFromPattern; each k-mer's covered-sequence value is
       taken from the cache or computed by a substring test against
       every sequence and then cached; the per-k-mer values are OR-ed
       together and cached under curPattern.
    ***

    Covered IDs are held as the integer returned by Comm.bitarray2int
    for the output of IdxLis2bin (presumably a bit mask with one bit per
    sequence -- confirm against those helpers).

    :param curPattern: pattern to evaluate; also used as a cache key
    :param seqLis: sequences to search; the position in this list is the
        sequence ID
    :param kmer2seqIdInt: dict mapping k-mer / pattern -> covered-ID
        integer; mutated by this call
    :return: tuple (cov, patternCovSeqIdxInt, kmer2seqIdInt), where cov is
        covIdxInt2covCnt(patternCovSeqIdxInt)
    """
    seqCnt = len(seqLis)
    if curPattern in kmer2seqIdInt:
        patternCovSeqIdxInt = kmer2seqIdInt[curPattern]
    else:
        # Accumulate with a running OR that starts at 0 instead of calling
        # reduce() without an initial value: a pattern that expands to no
        # k-mers then yields 0 rather than raising TypeError, and the code
        # no longer depends on the Python-2-only builtin reduce.
        patternCovSeqIdxInt = 0
        for kmer in FetchAllKmerFromPattern(curPattern):
            if kmer in kmer2seqIdInt:
                kmerCovIdxInt = kmer2seqIdInt[kmer]
            else:
                kmerCovIdxSet = {
                    seqId for seqId, seq in enumerate(seqLis) if kmer in seq
                }
                kmerCovIdxBitarray = IdxLis2bin(kmerCovIdxSet, seqCnt)
                kmerCovIdxInt = Comm.bitarray2int(kmerCovIdxBitarray)
                kmer2seqIdInt[kmer] = kmerCovIdxInt
            patternCovSeqIdxInt |= kmerCovIdxInt
        kmer2seqIdInt[curPattern] = patternCovSeqIdxInt
    cov = covIdxInt2covCnt(patternCovSeqIdxInt)
    return cov, patternCovSeqIdxInt, kmer2seqIdInt
def formatCovId(kmer2seqIdSet, totalSeqCnt):
    """
    Format each k-mer's covered sequence-ID set into a binary number.

    Every ID collection is passed through IdxLis2bin(ids, totalSeqCnt) and
    the result through Comm.bitarray2int (both defined elsewhere;
    presumably a bitarray of length totalSeqCnt and its integer value --
    confirm against those helpers).

    :param kmer2seqIdSet: dict mapping k-mer -> collection of covered
        sequence IDs
    :param totalSeqCnt: value forwarded to IdxLis2bin as its second argument
    :return: new dict mapping each k-mer to the converted integer; the
        input dict is not modified
    """
    userKmer2seqIdInt = {}
    # .items() instead of .iteritems(): the latter exists only on Python 2
    # dicts, while .items() works on both Python 2 and Python 3.
    for kmer, seqIdxSet in kmer2seqIdSet.items():
        covIdxBitArray = IdxLis2bin(seqIdxSet, totalSeqCnt)
        covIdxBitInt = Comm.bitarray2int(covIdxBitArray)
        userKmer2seqIdInt[kmer] = covIdxBitInt
    return userKmer2seqIdInt
def FetchCovSeqDetail(seqMatrix, kmer, samplingSize):
    """
    Find, for each sequence list in seqMatrix, which sequences contain kmer.

    For every row the positions of the sequences that contain kmer are
    collected, converted with BioinfoComm.IdxLis2bin(ids, samplingSize),
    and recorded twice: as the integer given by Comm.bitarray2int and as
    the count reported by bitarray.bitarray.count (presumably the number
    of set bits, i.e. covered sequences -- confirm against IdxLis2bin).

    :param seqMatrix: iterable of sequence lists
    :param kmer: value tested with ``kmer in seq`` against every sequence
    :param samplingSize: value forwarded to IdxLis2bin as its second argument
    :return: tuple (covIdxMatrix, covLis): a numpy array with one converted
        integer per row, and a list with one count per row
    """
    idxIntPerRow = []
    covLis = []
    for rowSeqs in seqMatrix:
        hitIds = set()
        for pos, seq in enumerate(rowSeqs):
            if kmer in seq:
                hitIds.add(pos)
        hitBits = BioinfoComm.IdxLis2bin(hitIds, samplingSize)
        idxIntPerRow.append(Comm.bitarray2int(hitBits))
        covLis.append(int(bitarray.bitarray.count(hitBits)))
    # to save memory
    covIdxMatrix = np.array(idxIntPerRow)
    return covIdxMatrix, covLis