def parse(inFq, inDomtblout, inProtFna=None):
    """
        Build read-records from a FASTQ file and its HMM annotation (DOMTBLOUT), optionally
        attaching the translated protein sequences from a PROT FASTA file.

        Every FASTQ entry is looked up by its name (without the leading '@') in the annotation;
        an entry without an annotation raises a KeyError. The HMM coordinates stored in a record
        are widened from the alignment coordinates to the envelope coordinates of the hit.

        @param inFq: FASTQ file containing joined pair-end reads
        @param inDomtblout: HMM annotation file
        @param inProtFna: corresponding prot sequence (can be None)

        @type inFq: str
        @type inDomtblout: str
        @type inProtFna: str | None

        @rtype: list[read_rec.ReadRec]
        @return: a list of read-records
    """
    # prot sequences are optional
    protSeqByName = {} if inProtFna is None else fas.fastaFileToDictWholeNames(inProtFna)

    # annotation: read-name -> (hit, frame-tag)
    domByName = hmm.readDomblout(inDomtblout)
    if inProtFna is not None:
        assert len(protSeqByName) == len(domByName)

    records = []
    for rawName, dna, comment, qs in fq.ReadFqGen(inFq):
        name = rawName[1:]  # drop the leading '@'
        protSeq = protSeqByName.get(name)
        hit, frameTag = domByName[name]

        # score and acc are returned as well but are not needed here
        annotStart, annotLen, strain, _score, _acc = hmm.dnaHitInfo(hit, dna, protSeq)

        # strain 1 goes with the frame tags 1..3, any other strain with 4..6
        assert (1 <= frameTag <= 3) if strain == 1 else (4 <= frameTag <= 6)

        # envelope coordinates on the prot sequence (0-based start, length)
        envStart = int(hit[19]) - 1
        envLen = int(hit[20]) - envStart
        assert annotLen == 3 * envLen

        # alignment coordinates on the prot sequence
        aliStart = int(hit[17]) - 1
        aliLen = int(hit[18]) - aliStart

        # coordinates on the HMM
        hmmStart = int(hit[15]) - 1
        hmmLen = int(hit[16]) - hmmStart

        # the envelope usually starts before and ends after the alignment;
        # shift and stretch the HMM coordinates by the same margins
        marginLeft = aliStart - envStart
        marginRight = envLen - marginLeft - aliLen
        assert marginLeft >= 0
        assert marginRight >= 0
        hmmStart = hmmStart - marginLeft
        hmmLen = hmmLen + marginLeft + marginRight

        fields = comment.split('\t')
        if len(fields) != 5:
            # a plain dna sequence (i.e. not a joined read)
            qsArray = qs_man.QSArray(dna=dna, qsArrayFq=qs)
        else:
            # the comment carries both read-ends and their quality-scores
            _, dna1, qs1, dna2, qs2 = fields
            endOne = qs_man.QSArray(dna=dna1, qsArrayFq=qs1)
            endTwo = qs_man.QSArray(dna=dna2, qsArrayFq=qs2)
            endTwo.revCompl()  # the second read-end is used as its reverse complement
            # consensus QS array representing the joined read
            qsArray = qs_man.QSArray(qsA1=endOne, qsA2=endTwo, pos1Within2=len(dna2) - len(dna))

        record = read_rec.ReadRec(name, qsArray, frameTag, annotStart, annotLen, hmmStart, hmmLen,
                                  protSeq, envStart, envLen)
        records.append(record)

    return records
def _loadSamByReadName(samPath, strainAcc, joinedReads):
    """
        Read a gzipped SAM file into a map: read-name -> SAM entry tagged with the strain accession.

        Lines starting with '#' or '@' (comments, SAM header) are skipped. For joined reads each
        line is one entry; a line with exactly 11 tab separated columns is padded with '*' to 12.
        For pair-end reads two consecutive lines form one entry (joined by a newline), both must
        start with the same read-name and each of them is padded with '*'.

        @param samPath: gzipped SAM file
        @param strainAcc: strain accession appended as the last column of every line
        @param joinedReads: True for the joined reads layout, False for the pair-end layout

        @rtype: dict[str, str]
    """
    readNameToSam = {}
    pairLines = []  # lines of the pair collected so far (pair-end layout only)
    # the context manager guarantees that the file handle is closed, also on errors
    with gzip.open(samPath) as samFile:
        for line in samFile:
            line = line.strip()
            if line.startswith(('#', '@')):
                continue
            if joinedReads:
                columns = line.split('\t')
                # lines with only 11 entries will be padded with * to 12
                if len(columns) == 11:
                    line += '\t*'
                readNameToSam[columns[0]] = line + '\t' + strainAcc
            else:
                pairLines.append(line)
                if len(pairLines) == 2:
                    readName = pairLines[0].split('\t', 1)[0]
                    assert readName == pairLines[1].split('\t', 1)[0]
                    readNameToSam[readName] = pairLines[0] + '\t*\t' + strainAcc + '\n' \
                        + pairLines[1] + '\t*\t' + strainAcc
                    pairLines = []
    return readNameToSam


def partitionReads(sampleDir, scoreThreshold, accuracyThreshold, shuffleRandSeed, pfamPartitionedDir, joinedReads=True,
                   considerSam=True):
    """
        Partitioning reads into the individual gene-domains.

        Every sub-directory of the sample directory (except for the output directory) is searched
        for files "<number>_join_prot.domtblout.gz" (joined reads) or
        "<number>_pair1_prot.domtblout.gz" (pair-end reads). For each read, only the first hit of
        its hit list is considered; the read is kept if the hit's score (column 13) and accuracy
        (column 21) reach the thresholds, and it is assigned to the family named in column 3.
        For every family, the FASTQ, PROT FASTA, DOMTBLOUT (and optionally SAM) entries are
        written to the output directory, shuffled via rand.shuffleLines into the "r_" prefixed
        files, and the ordered "o_" prefixed files are removed.

        @param sampleDir: directory of one sample, contains one sub-directory per strain
        @param scoreThreshold: hits with a lower score are ignored
        @param accuracyThreshold: hits with a lower accuracy are ignored
        @param shuffleRandSeed: seed passed to rand.shuffleLines
        @param pfamPartitionedDir: name of the output directory created within the sample directory
        @param joinedReads: process joined pair-end reads (True) or the individual read-ends (False)
        @param considerSam: also partition the SAM files (they must exist then)

        @raise Exception: any error is reported on the standard output and re-raised unchanged
    """
    try:
        # snapshot of the directory content taken before the output directory is created
        strainDirList = [os.path.join(sampleDir, x) for x in os.listdir(sampleDir)]

        samplePartDir = os.path.join(sampleDir, pfamPartitionedDir)
        if not os.path.isdir(samplePartDir):
            os.mkdir(samplePartDir)
        samplePartDirNorm = os.path.normpath(samplePartDir)

        if joinedReads:
            ident = 'join'
            domSuffix = 'join_prot.domtblout.gz'
        else:
            ident = 'pair'
            domSuffix = 'pair1_prot.domtblout.gz'

        # for each gene-dom
        fqOutDict = {}
        fqProtOutDict = {}
        fqSamOutDict = {}
        fqDomOutDict = {}

        for strainDir in strainDirList:
            strainAcc = os.path.basename(strainDir)
            # skip the directory containing the partitioned data, under its default name
            # as well as under the name that was actually requested
            if strainAcc == 'sample_partitioned' or os.path.normpath(strainDir) == samplePartDirNorm:
                continue
            if not os.path.isdir(strainDir):
                continue

            # for each dom file
            for f in os.listdir(strainDir):
                if not f.endswith(domSuffix):
                    continue
                i = f.split('_', 1)[0]
                if not i.isdigit():
                    continue

                # a domtblout file found, derive the names of the accompanying files
                domPath = os.path.join(strainDir, f)
                fqPair1Path = os.path.join(strainDir, '%s_pair1.fq.gz' % (i,))
                fqPair2Path = os.path.join(strainDir, '%s_pair2.fq.gz' % (i,))
                if joinedReads:
                    domPath2 = None
                    fqPath = os.path.join(strainDir, '%s_join.fq.gz' % (i,))
                    fqProtPath = os.path.join(strainDir, '%s_join_prot.fna.gz' % (i,))
                    fqProtPath2 = None
                    samPath = os.path.join(strainDir, '%s_join_gmap.sam.gz' % (i,))
                    assert os.path.isfile(fqPath)
                else:
                    domPath2 = os.path.join(strainDir, '%s_pair2_prot.domtblout.gz' % (i,))
                    fqPath = None
                    fqProtPath = os.path.join(strainDir, '%s_pair1_prot.fna.gz' % (i,))
                    fqProtPath2 = os.path.join(strainDir, '%s_pair2_prot.fna.gz' % (i,))
                    samPath = os.path.join(strainDir, '%s_pair.sam.gz' % (i,))
                    assert os.path.isfile(domPath2) and os.path.isfile(fqProtPath2)

                assert os.path.isfile(domPath) and os.path.isfile(fqPair1Path) \
                    and os.path.isfile(fqPair2Path) and os.path.isfile(fqProtPath)
                if considerSam:
                    assert os.path.isfile(samPath)

                # map: read-name -> list of hits (lists sorted according to the scores)
                nameToHitList = getReadNameToHitList(domPath)
                if not joinedReads:
                    nameToHitList2 = getReadNameToHitList(domPath2)
                    len1 = len(nameToHitList)
                    len2 = len(nameToHitList2)
                    nameToHitList.update(nameToHitList2)
                    # the read-names of both read-ends must be disjoint
                    assert len(nameToHitList) == len1 + len2

                # map: read-name-prot -> seq-prot
                protNameToSeq = fas.fastaFileToDictWholeNames(fqProtPath)
                if not joinedReads:
                    protNameToSeq.update(fas.fastaFileToDictWholeNames(fqProtPath2))

                # map: read-name -> sam-line-entry
                readNameToSam = None
                if considerSam:
                    readNameToSam = _loadSamByReadName(samPath, strainAcc, joinedReads)

                if joinedReads:
                    # map read-name -> "pair1-dna tab pair1-qs tab pair2-dna tab pair2-qs"
                    readNameToPairReads = fq.getFqToDict(fqPair1Path, fqPair2Path)
                    readGenList = [fq.ReadFqGen(fqPath)]
                else:
                    readNameToPairReads = None
                    readGenList = [fq.ReadFqGen(fqPair1Path), fq.ReadFqGen(fqPair2Path)]

                # go over all reads, one reader after another (the reads are not held in memory)
                for readGen in readGenList:
                    for readName, dna, _comment, qs in readGen:
                        readName = readName[1:]  # strip starting '@'

                        # take the hit with the highest score
                        hitList = nameToHitList.get(readName)
                        if hitList is None:
                            continue
                        topHit = hitList[0]

                        # is the hit significant, filter according to the score and accuracy
                        if float(topHit[13]) < scoreThreshold or float(topHit[21]) < accuracyThreshold:
                            continue

                        famName = topHit[3]
                        if famName not in fqOutDict:
                            fqOutDict[famName] = []
                            fqProtOutDict[famName] = []
                            fqSamOutDict[famName] = []
                            fqDomOutDict[famName] = []

                        if joinedReads:
                            comment = readNameToPairReads[readName]
                        else:
                            comment = ''
                        fqOutDict[famName].append((readName, dna, qs, comment))

                        protSeq = protNameToSeq[topHit[0]]
                        fqProtOutDict[famName].append((readName, protSeq))

                        if considerSam:
                            if joinedReads:
                                samKey = readName
                            else:
                                # the SAM read-name lacks the last two characters of the read-end
                                # name (presumably the '/1', '/2' suffix - TODO confirm)
                                samKey = readName[:-2]
                            fqSamOutDict[famName].append(readNameToSam[samKey])

                        # top hit coordinates within the read
                        startOnRead, overlapLen, strain = hmm.dnaHitInfo(topHit, dna, protSeq)[:3]
                        fqDomOutDict[famName].append(
                            '\t'.join(topHit) + '\t%s\t%s\t%s' % (startOnRead, overlapLen, strain))

        # for each gene dom, store reads into a file
        for famName, fqContentList in fqOutDict.items():
            # get the tagged fam-dom-name that can be used in file names
            pf = comh.getGeneNameToFileName(famName)[:-4]

            # define output files ('o_' ~ ordered entries, 'r_' ~ random shuffled entries)
            fqOutO = os.path.join(samplePartDir, 'o_%s_%s.fq.gz' % (pf, ident))
            fqOutR = os.path.join(samplePartDir, 'r_%s_%s.fq.gz' % (pf, ident))
            fqProtOutO = os.path.join(samplePartDir, 'o_%s_%s_prot.fna.gz' % (pf, ident))
            fqProtOutR = os.path.join(samplePartDir, 'r_%s_%s_prot.fna.gz' % (pf, ident))
            fqSamOutO = os.path.join(samplePartDir, 'o_%s_%s_gmap.sam.gz' % (pf, ident))
            fqSamOutR = os.path.join(samplePartDir, 'r_%s_%s_gmap.sam.gz' % (pf, ident))
            fqDomOutO = os.path.join(samplePartDir, 'o_%s_%s_prot.domtblout.gz' % (pf, ident))
            fqDomOutR = os.path.join(samplePartDir, 'r_%s_%s_prot.domtblout.gz' % (pf, ident))

            # write FASTQ
            fqOut = fq.WriteFq(fqOutO)
            for name, dna, qs, comment in fqContentList:
                fqOut.writeFqEntry('@' + name, dna, qs, comment)
            fqOut.close()

            # write PROT
            fqProtOut = fq.WriteFq(fqProtOutO)
            fqProtOut.write('\n'.join(['>%s\n%s' % (name, seq) for name, seq in fqProtOutDict[famName]]) + '\n')
            fqProtOut.close()

            # write SAM
            if considerSam:
                fqSamOut = fq.WriteFq(fqSamOutO)
                fqSamOut.write('\n'.join(fqSamOutDict[famName]) + '\n')
                fqSamOut.close()

            # write DOM
            fqDomOut = fq.WriteFq(fqDomOutO)
            fqDomOut.write('\n'.join(fqDomOutDict[famName]) + '\n')
            fqDomOut.close()

            # shuffle file entries (to remove any bias imposed by the ordering), the same seed is
            # used for all files; the number is the count of lines that form one entry
            rand.shuffleLines(fqOutO, fqOutR, 4, shuffleRandSeed)
            rand.shuffleLines(fqProtOutO, fqProtOutR, 2, shuffleRandSeed)
            rand.shuffleLines(fqDomOutO, fqDomOutR, 1, shuffleRandSeed)
            if considerSam:
                if joinedReads:
                    rand.shuffleLines(fqSamOutO, fqSamOutR, 1, shuffleRandSeed)
                else:
                    rand.shuffleLines(fqSamOutO, fqSamOutR, 2, shuffleRandSeed)

            # delete ordered files (keep only the shuffled ones)
            os.remove(fqOutO)
            os.remove(fqProtOutO)
            if considerSam:
                os.remove(fqSamOutO)
            os.remove(fqDomOutO)

    except Exception as e:
        # report the call context, then re-raise with the original traceback intact
        print('Exception in partitionReads:')
        print('%s %s %s %s %s %s' % (sampleDir, scoreThreshold, accuracyThreshold, shuffleRandSeed,
                                     pfamPartitionedDir, joinedReads))
        print(getattr(e, 'message', e))
        print(type(e))
        print(e.args)
        raise