Python dnaHitInfo Examples

Programming Language: Python

Namespace/Package Name: algbioi.haplo.hmm

Method/Function: dnaHitInfo

Examples at hotexamples.com: 2

Python dnaHitInfo - 2 examples found. These are the top rated real world Python examples of algbioi.haplo.hmm.dnaHitInfo extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: hio.py Project: algbioi/snowball

def parse(inFq, inDomtblout, inProtFna=None):
    """
        Read in joined pair-end reads and its HMM annotation from the FASTQ, DOMTBLOUT, (and optionally PROT FASTA file)

        @param inFq: FASTQ file containing joined pair-end reads
        @param inDomtblout: HMM annotation file
        @param inProtFna: corresponding prot sequence (can be None)

        @type inFq: str
        @type inDomtblout: str
        @type inProtFna: str | None
        @rtype: list[read_rec.ReadRec]

        @return: a list of read-records
    """
    recList = []

    # read in prot sequences
    if inProtFna is None:
        nameToProtSeq = {}
    else:
        nameToProtSeq = fas.fastaFileToDictWholeNames(inProtFna)

    # read in dom file
    nameToDom = hmm.readDomblout(inDomtblout)

    assert inProtFna is None or len(nameToProtSeq) == len(nameToDom)

    # read in pair-end reads, create ReadRec
    for readName, dna, comment, qs in fq.ReadFqGen(inFq):

        readName = readName[1:]  # strip starting @

        protSeq = nameToProtSeq.get(readName, None)

        hit, frameTag = nameToDom[readName]

        annotStart, annotLen, strain, score, acc = hmm.dnaHitInfo(hit, dna, protSeq)  # strain, score, acc not used

        if strain == 1:
            assert 1 <= frameTag <= 3
        else:
            assert 4 <= frameTag <= 6

        # alignment env coord
        protStart = int(hit[19]) - 1
        protLen = int(hit[20]) - protStart

        assert annotLen == 3 * protLen

        # alignment coord
        protStartAli = int(hit[17]) - 1
        protLenAli = int(hit[18]) - protStartAli

        # hmm coordinates
        hmmCoordStart = int(hit[15]) - 1
        hmmCoordLen = int(hit[16]) - hmmCoordStart

        # the env coordinates start (and end) often before the alignment coordinates and end after, get the offsets
        offsetEnv = protStartAli - protStart
        offsetEnvE = protLen - offsetEnv - protLenAli

        assert offsetEnv >= 0
        assert offsetEnvE >= 0

        hmmCoordStart -= offsetEnv
        hmmCoordLen += offsetEnv + offsetEnvE

        tokens = comment.split('\t')

        if len(tokens) == 5:
            # get the ends of the pair-end read and corresponding quality-scores
            p, dna1, qs1, dna2, qs2 = tokens

            # get the QSArray representation of the first read-end
            qsA1 = qs_man.QSArray(dna=dna1, qsArrayFq=qs1)

            # get the reverse complement of the second read-end
            qsA2 = qs_man.QSArray(dna=dna2, qsArrayFq=qs2)
            qsA2.revCompl()

            # get the consensus QS Array representing of the joined read
            qsA = qs_man.QSArray(qsA1=qsA1, qsA2=qsA2, pos1Within2=len(dna2) - len(dna))
        else:
            # there is just a simple dna sequence (i.e. not joined reads)
            qsA = qs_man.QSArray(dna=dna, qsArrayFq=qs)

        recList.append(read_rec.ReadRec(readName, qsA, frameTag, annotStart, annotLen, hmmCoordStart, hmmCoordLen,
                                        protSeq, protStart, protLen))

    return recList

Example #2

Show file

File: hio.py Project: algbioi/snowball

def partitionReads(sampleDir, scoreThreshold, accuracyThreshold, shuffleRandSeed, pfamPartitionedDir, joinedReads=True,
                   considerSam=True):
    """
        Partitioning reads into the individual gene-domains.
    """
    try:
        strainDirList = map(lambda x: os.path.join(sampleDir, x), os.listdir(sampleDir))

        samplePartDir = os.path.join(sampleDir, pfamPartitionedDir)
        if not os.path.isdir(samplePartDir):
            os.mkdir(samplePartDir)

        # for each gene-dom
        fqOutDict = {}
        fqProtOutDict = {}
        fqSamOutDict = {}
        fqDomOutDict = {}

        for strainDir in strainDirList:
            strainAcc = os.path.basename(strainDir)
            if strainAcc == 'sample_partitioned':  # skip the directory containing the partitioned data
                continue
            if os.path.isdir(strainDir):
                # for each dom file
                for f in os.listdir(strainDir):

                    if (joinedReads and f.endswith('join_prot.domtblout.gz')) \
                            or (not joinedReads and f.endswith('pair1_prot.domtblout.gz')):  # a domtblout file found
                        i = f.split('_', 1)[0]
                        if i.isdigit():
                            if joinedReads:
                                domPath = os.path.join(strainDir, f)
                                domPath2 = None
                                fqPath = os.path.join(strainDir, '%s_join.fq.gz' % (i,))
                                fqPair1Path = os.path.join(strainDir, '%s_pair1.fq.gz' % (i,))
                                fqPair2Path = os.path.join(strainDir, '%s_pair2.fq.gz' % (i,))
                                fqProtPath = os.path.join(strainDir, '%s_join_prot.fna.gz' % (i,))
                                fqProtPath2 = None
                                samPath = os.path.join(strainDir, '%s_join_gmap.sam.gz' % (i,))
                                assert os.path.isfile(fqPath)
                            else:
                                domPath = os.path.join(strainDir, f)
                                domPath2 = os.path.join(strainDir, '%s_pair2_prot.domtblout.gz' % (i,))
                                fqPath = None
                                fqPair1Path = os.path.join(strainDir, '%s_pair1.fq.gz' % (i,))
                                fqPair2Path = os.path.join(strainDir, '%s_pair2.fq.gz' % (i,))
                                fqProtPath = os.path.join(strainDir, '%s_pair1_prot.fna.gz' % (i,))
                                fqProtPath2 = os.path.join(strainDir, '%s_pair2_prot.fna.gz' % (i,))
                                samPath = os.path.join(strainDir, '%s_pair.sam.gz' % (i,))
                                assert os.path.isfile(domPath2) and os.path.isfile(fqProtPath2)
                            assert os.path.isfile(domPath) and os.path.isfile(fqPair1Path) \
                                   and os.path.isfile(fqPair2Path) and os.path.isfile(fqProtPath)

                            if considerSam:
                                assert os.path.isfile(samPath)

                            # map: read-name -> list of hits (lists sorted according to the scores)
                            nameToHitList = getReadNameToHitList(domPath)
                            if not joinedReads:
                                nameToHitList2 = getReadNameToHitList(domPath2)
                                len1 = len(nameToHitList)
                                len2 = len(nameToHitList2)
                                nameToHitList.update(nameToHitList2)
                                assert len(nameToHitList) == len1 + len2

                            # map: read-name-prot -> seq-prot
                            protNameToSeq = fas.fastaFileToDictWholeNames(fqProtPath)
                            if not joinedReads:
                                protNameToSeq.update(fas.fastaFileToDictWholeNames(fqProtPath2))

                            # map: read-name -> sam-line-entry
                            if considerSam:
                                readNameToSam = {}
                                if joinedReads:
                                    for line in gzip.open(samPath):
                                        line = line.strip()
                                        if line.startswith('#'):
                                            continue
                                        readName = line.split('\t', 1)[0]
                                        # lines with only 11 entries will be padded with * to 12
                                        if len(line.split('\t')) == 11:
                                            line += '\t*'
                                        readNameToSam[readName] = line + '\t' + strainAcc
                                else:
                                    entry = []
                                    for line in gzip.open(samPath):
                                        line = line.strip()
                                        if line.startswith('#') or line.startswith('@'):
                                            continue
                                        if len(entry) < 2:
                                            entry.append(line)
                                        if len(entry) == 2:
                                            readName = entry[0].split('\t', 1)[0]
                                            assert readName == entry[1].split('\t', 1)[0]
                                            readNameToSam[readName] = entry[0] + '\t*\t' + strainAcc + '\n' \
                                                                      + entry[1] + '\t*\t' + strainAcc
                                            entry = []

                            # map read-name -> "pair1-dna tab pair1-qs tab pair2-dna tab pair2-qs"
                            if joinedReads:
                                readNameToPairReads = fq.getFqToDict(fqPair1Path, fqPair2Path)
                            else:
                                readNameToPairReads = None

                            if joinedReads:
                                g1 = fq.ReadFqGen(fqPath)
                                g2 = []
                            else:
                                g1 = fq.ReadFqGen(fqPair1Path)
                                g2 = fq.ReadFqGen(fqPair2Path)

                            # go over all reads
                            for readName, dna, p, qs in list(g1) + list(g2):
                                readName = readName[1:]  # strip starting '@'

                                # take the hit with the highest score
                                topHit = None
                                if readName in nameToHitList:
                                    topHit = nameToHitList[readName][0]

                                # is the hit significant, filter according to the score and accuracy
                                if topHit is None or float(topHit[13]) < scoreThreshold or float(topHit[21]) < accuracyThreshold:
                                    continue
                                else:
                                    famName = topHit[3]
                                    if famName not in fqOutDict:
                                        fqOutDict[famName] = []
                                        fqProtOutDict[famName] = []
                                        fqSamOutDict[famName] = []
                                        fqDomOutDict[famName] = []

                                    if joinedReads:
                                        comment = readNameToPairReads[readName]
                                    else:
                                        comment = ''
                                    fqOutDict[famName].append((readName, dna, qs, comment))

                                    protSeqName = topHit[0]
                                    protSeq = protNameToSeq[protSeqName]
                                    fqProtOutDict[famName].append((readName, protSeq))

                                    if considerSam:
                                        if joinedReads:
                                            fqSamOutDict[famName].append(readNameToSam[readName])
                                        else:
                                            fqSamOutDict[famName].append(readNameToSam[readName[:-2]])

                                    # top hit coordinates within the read
                                    startOnRead, overlapLen, strain = hmm.dnaHitInfo(topHit, dna, protSeq)[:3]

                                    fqDomOutDict[famName].append('\t'.join(topHit) + '\t%s\t%s\t%s' % (startOnRead, overlapLen, strain))

        if joinedReads:
            ident = 'join'
        else:
            ident = 'pair'

        # for each gene dom, store reads into a file
        for famName, fqContentList in fqOutDict.iteritems():

            # get the tagged fam-dom-name that can be used in file names
            pf = comh.getGeneNameToFileName(famName)[:-4]

            # define output files
            fqOutO = os.path.join(samplePartDir, 'o_%s_%s.fq.gz' % (pf, ident))  # 'o_' ~ ordered entries
            fqOutR = os.path.join(samplePartDir, 'r_%s_%s.fq.gz' % (pf, ident))  # 'r_' ~ random shuffled entries

            fqProtOutO = os.path.join(samplePartDir, 'o_%s_%s_prot.fna.gz' % (pf, ident))
            fqProtOutR = os.path.join(samplePartDir, 'r_%s_%s_prot.fna.gz' % (pf, ident))

            fqSamOutO = os.path.join(samplePartDir, 'o_%s_%s_gmap.sam.gz' % (pf, ident))
            fqSamOutR = os.path.join(samplePartDir, 'r_%s_%s_gmap.sam.gz' % (pf, ident))

            fqDomOutO = os.path.join(samplePartDir, 'o_%s_%s_prot.domtblout.gz' % (pf, ident))
            fqDomOutR = os.path.join(samplePartDir, 'r_%s_%s_prot.domtblout.gz' % (pf, ident))

            # write FASTQ
            fqOut = fq.WriteFq(fqOutO)
            for e in fqContentList:
                fqOut.writeFqEntry('@' + e[0], e[1], e[2], e[3])
            fqOut.close()

            # write PROT
            fqProtOut = fq.WriteFq(fqProtOutO)
            fqProtOut.write('\n'.join(map(lambda x: '>%s\n%s' % (x[0], x[1]), fqProtOutDict[famName])) + '\n')
            fqProtOut.close()

            # write SAM
            if considerSam:
                fqSamOut = fq.WriteFq(fqSamOutO)
                fqSamOut.write('\n'.join(fqSamOutDict[famName]) + '\n')
                fqSamOut.close()

            # write DOM
            fqDomOut = fq.WriteFq(fqDomOutO)
            fqDomOut.write('\n'.join(fqDomOutDict[famName]) + '\n')
            fqDomOut.close()

            # shuffle file entries (to remove any bias imposed by the ordering)
            rand.shuffleLines(fqOutO, fqOutR, 4, shuffleRandSeed)
            rand.shuffleLines(fqProtOutO, fqProtOutR, 2, shuffleRandSeed)
            rand.shuffleLines(fqDomOutO, fqDomOutR, 1, shuffleRandSeed)

            if considerSam:
                if joinedReads:
                    rand.shuffleLines(fqSamOutO, fqSamOutR, 1, shuffleRandSeed)
                else:
                    rand.shuffleLines(fqSamOutO, fqSamOutR, 2, shuffleRandSeed)

            # delete ordered files (keep only the shuffled ones)
            os.remove(fqOutO)
            os.remove(fqProtOutO)
            if considerSam:
                os.remove(fqSamOutO)
            os.remove(fqDomOutO)
    except Exception as e:
        print('Exception in partitionReads:')
        print sampleDir, scoreThreshold, accuracyThreshold, shuffleRandSeed, pfamPartitionedDir, joinedReads
        print e.message
        print type(e)
        print e.args
        raise e