Esempio n. 1
0
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir):
    """
        Reads all sequences. For each taxonId creates a file that contain all sequences
        mapped to this taxonId. If a seqId appears more than one it is ignored since
        acession numbers are unique.

        @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId
        @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq
    """
    taxonIdToOutBuffer = {}
    seqIdSet = set()

    totalSeqCount = 0
    totalStoredSeqCount = 0
    totalIdenticalSeqCount = 0

    for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList):
        print 'processing', mapFilePath, fastaFilePath
        seqCount = 0
        storedSeqCount = 0

        seqIdToSeq = fasta.fastaFileToDict(fastaFilePath)
        seqIdToNcbidList = csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#')

        for seqId, seq in seqIdToSeq.iteritems():
            seqCount += 1
            if seqId in seqIdSet:
                totalIdenticalSeqCount += 1
                continue
            else:
                seqIdSet.add(seqId)

            taxonId = seqIdToNcbidList[seqId][0]

            if taxonId not in taxonIdToOutBuffer:
                outBuffer = csv.OutFileBuffer(os.path.join(outputDir, str(str(taxonId) + '.fna')))
                taxonIdToOutBuffer[taxonId] = outBuffer

            taxonIdToOutBuffer[taxonId].writeText(str('>' + seqId + '\n' + seq + '\n'))
            taxonIdToOutBuffer[taxonId].close()
            storedSeqCount += 1

            if len(string.replace(common.noNewLine(seq),'N','')) == 0:
                print 'zeros', seqId, fastaFilePath, len(common.noNewLine(seq))

        # for buff in taxonIdToOutBuffer.values():
        #     buff.close()

        print 'totalSeq, storedSeq', seqCount, storedSeqCount
        totalSeqCount += seqCount
        totalStoredSeqCount += storedSeqCount


    print 'totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount', totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount

    print 'sequences merged'
Esempio n. 2
0
 def __init__(self, id, name, seq):
     """
         Creates a sequence record; the sequence itself is kept only in a zlib-compressed form.

         @param id: identifier of the sequence (stored as it is)
         @param name: name of the sequence (stored after being passed through noNewLine)
         @param seq: the sequence string (passed through noNewLine before it is measured, hashed and compressed)
     """
     self.id = id
     self.name = noNewLine(name)

     cleanSeq = noNewLine(seq)
     self.seqBp = len(removeNonDna(cleanSeq))  # length after removeNonDna was applied
     self._seqCompressed = zlib.compress(cleanSeq)
     self._hash = hash(cleanSeq.upper())  # hash of the upper-cased sequence
     self._removeNonDna = False

     # placement data, nothing is assigned yet
     self.scaffold = None
     self._taxPathDict = None
     self._placementWeight = None

     # candidate placements are kept in four separate lists
     self._candidateTaxPathDictList = []
     self._candidateTaxPathDictWeightsList = []
     self._candidateTaxPathDictSourceList = []  # where does this prediction come from
     self._candidateTaxPathDictTagList = []
Esempio n. 3
0
def scafToContigOutputPPFormat(scafContigFile, scafPPSOutFile, contigPPSOutFile):
    """
        Takes scaffold-contigs mapping and scaffold placement (PP.out), outputs contigs placement (PP.out)

        @param scafContigFile: tab sepparated scaffold-contigs mapping (scaffold \t contig)
        @param scafPPSOutFile: scaffold predictions (PPS output file)
        @param contigPPSOutFile: contigs predictions (as if it was a PPS output file)
    """
    scafToContigs = dict([])
    try:
        f = open(os.path.normpath(scafContigFile),'r')
    except Exception:
        print "Cannot open file:", scafContigFile
        raise
    else:
        for line in f:
            line = common.noNewLine(line)
            scaffold = re.sub(r'^[ ]*([^ \t]+)\t[^ \t]*',r'\1', line)
            contig = re.sub(r'^[ ]*[^ \t]+\t([^ \t]*)',r'\1', line)
            if scaffold in scafToContigs:
                scafToContigs[scaffold].append(contig)
            else:
                temp = []
                temp.append(contig)
                scafToContigs[scaffold] = temp

    try:
        fr = open(os.path.normpath(scafPPSOutFile),'r')
        fw = open(os.path.normpath(contigPPSOutFile),'w')
    except Exception:
        print "Cannot open one of the files:", scafPPSOutFile, contigPPSOutFile
        raise
    else:
        for line in fr:
            line = common.noNewLine(line)
            if len(line) == 0 or re.match('#', line):
                fw.write(line + '\n')
            else:
                scaffold = re.sub(r'^[ ]*([^ \t]+)[ \t]*.*$',r'\1', line)
                assignment = re.sub(r'^[ ]*[^ \t]+[ \t]*(.*$)',r'\1', line)
                if scaffold in scafToContigs:
                    contigsList = scafToContigs[scaffold]
                    for contig in contigsList:
                        lineW = str(contig + '\t' + assignment + '\n')
                        fw.write(lineW)
                    #print str(lineW),
                else:
                    print 'there is not scaffold-contigs mapping for scaffold:', scaffold
Esempio n. 4
0
    def _setCandidatePlacement(self, sequences, taxonomy, predFileName, source):
        assignedIdList = []
        try:
            f = open(os.path.normpath(predFileName),'r')
        except Exception:
            print "Cannot open file:", predFileName
            raise
        else:
            for line in f:
                line = common.noNewLine(line)
                if re.match(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+$', line):
                    scaffoldId = int(re.sub(r'^([0-9]+)_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+$',r'\1' ,line))
                    contigId = int(re.sub(r'^[0-9]+_([0-9]+)\t[0-9]+\t[0-9\.]+\t[^\t]+$',r'\1' ,line))
                    ncbid = int(re.sub(r'^[0-9]+_[0-9]+\t([0-9]+)\t[0-9\.]+\t[^\t]+$',r'\1' ,line))
                    weight = float(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t([0-9\.]+)\t[^\t]+$',r'\1' ,line))
                    tag = str(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t([^\t]+)$',r'\1' ,line))
                    if ncbid != 1:
                        taxPathDict = taxonomy.getPathToRoot(ncbid)
                        if taxPathDict is not None and taxPathDict.keys() >= 1:
                            sequences.setCandidateTaxonomyPath(contigId, scaffoldId, taxPathDict, weight, source, tag)
                            assignedIdList.append(contigId)
                        else:
                            sys.stderr.write(str('No taxonomic path found for ncbid: ' + str(ncbid)))
        finally:
            f.close()

        return set(assignedIdList)
Esempio n. 5
0
def fastaFileToDictWholeNames(filePath):
    """
        Reads a fasta file and returns mapping: seqName -> sequence the whole sequence name is used
        as seqName!!! (even if it contains space)
    """
    seqIdToSeq = {}
    f = None
    try:
        if filePath.endswith('.gz'):
            f = gzip.open(os.path.normpath(filePath), mode='r')
        else:
            f = open(os.path.normpath(filePath), 'r')
    except Exception:
        print "Cannot open file:", filePath
        raise
    else:
        name = ''
        seq = ''
        for line in f:
            line = noNewLine(line)
            if re.match('>', line):
                if seq != '':
                    assert name != ''
                    seqIdToSeq[name] = seq
                    seq = ''
                name = line.replace('>', '')
            else:
                seq += line
        if seq != '':
            assert name != ''
            seqIdToSeq[name] = seq
    finally:
        if f is not None:
            f.close()
    return seqIdToSeq
    def _setCandidatePlacement(self, sequences, taxonomy, predFileName, source):
        assignedIdList = []
        try:
            f = open(os.path.normpath(predFileName),'r')
        except Exception:
            print "Cannot open file:", predFileName
            raise
        else:
            for line in f:
                line = common.noNewLine(line)
                if re.match(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+$', line):
                    scaffoldId = int(re.sub(r'^([0-9]+)_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+$',r'\1' ,line))
                    contigId = int(re.sub(r'^[0-9]+_([0-9]+)\t[0-9]+\t[0-9\.]+\t[^\t]+$',r'\1' ,line))
                    ncbid = int(re.sub(r'^[0-9]+_[0-9]+\t([0-9]+)\t[0-9\.]+\t[^\t]+$',r'\1' ,line))
                    weight = float(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t([0-9\.]+)\t[^\t]+$',r'\1' ,line))
                    tag = str(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t([^\t]+)$',r'\1' ,line))
                    if ncbid != 1:
                        taxPathDict = taxonomy.getPathToRoot(ncbid)
                        if taxPathDict is not None and taxPathDict.keys() >= 1:
                            sequences.setCandidateTaxonomyPath(contigId, scaffoldId, taxPathDict, weight, source, tag)
                            assignedIdList.append(contigId)
                        else:
                            sys.stderr.write(str('No taxonomic path found for ncbid: ' + str(ncbid)))
        finally:
            f.close()

        return set(assignedIdList)
Esempio n. 7
0
def forEachLine(filePath, parser):
    """
        For each line of the file call the parser, at the end call the finalize method of the parser if it`s defined.

        An exception raised by parser.finalize does not abort the call (as before), but it is now reported
        to the standard error output instead of being silently discarded.

        @param filePath: path of the text file to be read
        @param parser: object with a parse(line) method, it is called with each line passed through noNewLine;
            it may define a finalize() method

        @return: the parser that was passed in

        @raise Exception: errors that occur while the file is being opened or read (including errors raised by
            parser.parse) are reported to the standard error output and re-raised
    """
    try:
        f = open(os.path.normpath(filePath), 'r')
    except Exception:
        sys.stderr.write('Cannot open a file for reading: ' + filePath + '\n')
        raise
    try:
        for line in f:
            parser.parse(noNewLine(line))
    except Exception:
        sys.stderr.write('Cannot read from file: ' + filePath + '\n')
        raise
    finally:
        f.close()

    # finalize is optional, a parser without this method is fine
    finalize = getattr(parser, 'finalize', None)
    if isinstance(finalize, types.MethodType):
        try:
            finalize()
        except Exception:
            # best effort as before, but the problem is not hidden anymore
            sys.stderr.write('Cannot finalize the parser after reading file: ' + filePath +
                             ' (' + str(sys.exc_info()[1]) + ')\n')

    return parser
Esempio n. 8
0
def toScafContigMap(scafContigFile):
    """
        Reads scaffold contig mapping.

        @param scafContigFile: scaffold-contig mapping (tab separated)

        @return: map: scaffold -> list of contigs
    """
    scafToContigs = dict()
    try:
        f = open(os.path.normpath(scafContigFile), 'r')
    except Exception:
        print "Cannot open file:", scafContigFile
        raise
    else:
        for line in f:
            line = noNewLine(line)
            scaffold = re.sub(r'^[ ]*([^\t]+)\t[^\t]*', r'\1',
                              line)  # gap deleted !!!
            contig = re.sub(r'^[ ]*[^\t]+\t([^\t]*)', r'\1', line)
            if scaffold in scafToContigs:
                scafToContigs[scaffold].append(contig)
            else:
                temp = []
                temp.append(contig)
                scafToContigs[scaffold] = temp

    return scafToContigs
Esempio n. 9
0
def fastaFileToDictWholeNames(filePath):
    """
        Reads a fasta file and returns mapping: seqName -> sequence the whole sequence name is used
        as seqName!!! (even if it contains space)
    """
    seqIdToSeq = {}
    f = None
    try:
        if filePath.endswith('.gz'):
            f = gzip.open(os.path.normpath(filePath), mode='r')
        else:
            f = open(os.path.normpath(filePath), 'r')
    except Exception:
        print "Cannot open file:", filePath
        raise
    else:
        name = ''
        seq = ''
        for line in f:
            line = noNewLine(line)
            if re.match('>', line):
                if seq != '':
                    assert name != ''
                    seqIdToSeq[name] = seq
                    seq = ''
                name = line.replace('>', '')
            else:
                seq += line
        if seq != '':
            assert name != ''
            seqIdToSeq[name] = seq
    finally:
        if f is not None:
            f.close()
    return seqIdToSeq
Esempio n. 10
0
 def _readContigsScaffolds(self, filePath, readContigs=True):
     """
         Read contigs or scaffolds from a file.
     """
     try:
         f = open(os.path.normpath(filePath), 'r')
     except Exception:
         print "Cannot open file:", filePath
         raise
     else:
         name = ''
         seq = ''
         for line in f:
             line = noNewLine(line)
             if re.match('>', line):
                 if seq != '':
                     assert name != ''
                     if readContigs:
                         self._addSeq(name, seq)  # store seq
                     else:
                         self._addScaff(name, None, seq)
                     seq = ''
                 name = line.replace('>', '')
             else:
                 seq += line
         if seq != '':
             assert name != ''
             if readContigs:
                 self._addSeq(name, seq)  #store seq
             else:
                 self._addScaff(name, None, seq)
     finally:
         f.close()
Esempio n. 11
0
def toScafContigMap(scafContigFile):
    """
        Reads scaffold contig mapping.

        @param scafContigFile: scaffold-contig mapping (tab separated)

        @return: map: scaffold -> list of contigs
    """
    scafToContigs = dict()
    try:
        f = open(os.path.normpath(scafContigFile),'r')
    except Exception:
        print "Cannot open file:", scafContigFile
        raise
    else:
        for line in f:
            line = noNewLine(line)
            scaffold = re.sub(r'^[ ]*([^\t]+)\t[^\t]*',r'\1', line)# gap deleted !!!
            contig = re.sub(r'^[ ]*[^\t]+\t([^\t]*)',r'\1', line)
            if scaffold in scafToContigs:
                scafToContigs[scaffold].append(contig)
            else:
                temp = []
                temp.append(contig)
                scafToContigs[scaffold] = temp

    return scafToContigs
Esempio n. 12
0
    def writePlacementsOut(self, outFile, taxaRanks, outputFileContigSubPattern):

        try:
            f = open(os.path.normpath(outFile), 'w')
            f.write('# SEQUENCEID	TAXID')
            # k = 0

            for seq in self.sequences:

                taxPathDict = seq.getTaxonomyPath()
                ncbid = 1
                for rank in taxaRanks:
                    if ((taxPathDict is not None) and (rank in taxPathDict)):
                        ncbid = taxPathDict[rank].ncbid
                    else:
                        break

                if ncbid == 1:
                    continue

                entry = (noNewLine(re.sub(outputFileContigSubPattern, r'\1' , seq.name)) + '\t' + str(ncbid))

                # if k == 0:
                #     f.write(entry)
                #     k += 1
                # else:
                f.write('\n' + entry)

        except Exception:
            print "Cannot create a file or write to it:", outFile
            raise
        finally:
            f.close()
Esempio n. 13
0
 def _readContigsScaffolds(self, filePath, readContigs = True):
     """
         Read contigs or scaffolds from a file.
     """
     try:
         f = open(os.path.normpath(filePath),'r')
     except Exception:
         print "Cannot open file:", filePath
         raise
     else:
         name = ''
         seq = ''
         for line in f:
             line = noNewLine(line)
             if re.match('>', line):
                 if seq != '':
                     assert name != ''
                     if readContigs:
                         self._addSeq(name, seq)  # store seq
                     else:
                         self._addScaff(name, None, seq)
                     seq = ''
                 name = line.replace('>','')
             else:
                 seq += line
         if seq != '':
             assert name != ''
             if readContigs:
                 self._addSeq(name, seq) #store seq
             else:
                 self._addScaff(name, None, seq)
     finally:
         f.close()
Esempio n. 14
0
def forEachLine(filePath, parser):
    """
        For each line of the file call the parser, at the end call the finalize method of the parser if it`s defined.

        If parser.finalize raises an exception the call still returns (as before), but the exception is
        reported to the standard error output instead of being silently discarded.

        @param filePath: path of the text file to be read
        @param parser: object with a parse(line) method, it is called with each line passed through noNewLine;
            it may define a finalize() method

        @return: the parser that was passed in

        @raise Exception: errors that occur while the file is being opened or read (including errors raised by
            parser.parse) are reported to the standard error output and re-raised
    """
    try:
        f = open(os.path.normpath(filePath), 'r')
    except Exception:
        sys.stderr.write('Cannot open a file for reading: ' + filePath + '\n')
        raise
    try:
        for line in f:
            parser.parse(noNewLine(line))
    except Exception:
        sys.stderr.write('Cannot read from file: ' + filePath + '\n')
        raise
    finally:
        f.close()

    # a parser does not have to define finalize
    finalize = getattr(parser, 'finalize', None)
    if isinstance(finalize, types.MethodType):
        try:
            finalize()
        except Exception:
            # still best effort, but the failure is made visible
            sys.stderr.write('Cannot finalize the parser after reading file: ' + filePath +
                             ' (' + str(sys.exc_info()[1]) + ')\n')

    return parser
Esempio n. 15
0
def loadDictFromAFile(filePath):
    """
        Returns a dictionary that is stored in a file.

        @param filePath: a file in which a dictionary is stored in format: (key tab item)

        @return: dict that represents mapping: (key -> list of items)
    """
    try:
        dictOfLists = dict([])
        f = open(os.path.normpath(filePath), 'r')

        for line in f:
            pair = re.findall('[^\t]+', common.noNewLine(line))
            assert len(pair) == 2, str(
                'There are not two values separated by \t at line: ' + line)
            key = int(pair[0])
            val = int(pair[1])
            if key in dictOfLists:
                dictOfLists[key].append(val)
            else:
                list = []
                list.append(val)
                dictOfLists[key] = list

        return dictOfLists
    except Exception:
        print "Cannot create a file or write to it:", filePath
        raise
    finally:
        f.close()
Esempio n. 16
0
 def mothurPredToTabSepPred(self, mothurPredFileName, outPredFileName):
     """
         Transforms the mothur output prediction file (*.taxonomy) to the tab separated prediction file
         seqName tab ncbid tab weight tab tag.

         Only lines that start with "<number>_<number>_<number>_<number>" are considered. The prediction is
         taken from the last but one ';' separated item of the placement (after all 'unclassified;' items
         were removed); a line is skipped if there is no such item or if it does not start with a number.
         The entries are separated by newlines, the output does not end with a newline.
         Both files are closed also when an error occurs.

         @param mothurPredFileName: path of the mothur prediction file
         @param outPredFileName: path of the output file

         @raise Exception: errors are reported to the standard error output and re-raised
     """
     try:
         fr = open(os.path.normpath(mothurPredFileName), 'r')
     except Exception:
         sys.stderr.write("Cannot open file:" + mothurPredFileName + '\n')
         raise
     try:
         fw = None  # stays None if the output file cannot be opened
         try:
             fw = open(os.path.normpath(outPredFileName), 'w')
             lineNumber = 0  # number of the input line, used in error messages
             writtenCount = 0  # number of entries written so far
             for line in fr:
                 lineNumber += 1
                 line = common.noNewLine(line)
                 try:
                     if not re.match(r'^[0-9]+_[0-9]+_[0-9]+_[0-9]+.*', line):
                         continue
                     name = re.sub(
                         r'([0-9]+_[0-9]+)_[0-9]+_[0-9]+_[\+\-\t ]+.*',
                         r'\1', line)
                     tag = re.sub(
                         r'[0-9]+_[0-9]+_([0-9]+_[0-9]+_[\+\-]+)[\t ]+.*',
                         r'\1', line)
                     placementList = re.sub(
                         r'[0-9]+_[0-9]+_[0-9]+_[0-9]+_[\+\-\t ]+(.*)',
                         r'\1', line.replace('unclassified;',
                                             '')).rsplit(';')
                     if len(placementList) < 2:
                         continue
                     placement = placementList[-2]
                     try:
                         clade = int(
                             re.sub(r'([0-9]+)\(.*', r'\1', placement))
                     except ValueError:
                         continue
                     weight = float(
                         re.sub(r'[0-9]+\(([0-9\.]+)\)', r'\1',
                                placement))
                     # entries are separated (not terminated) by a newline
                     if writtenCount > 0:
                         fw.write('\n')
                     fw.write(name + '\t' + str(clade) + '\t' +
                              str(weight) + '\t' + str(tag))
                     writtenCount += 1
                 except Exception:
                     sys.stderr.write('Cannot parse line: ' +
                                      str(lineNumber) + ' in file: ' +
                                      mothurPredFileName + '\n')
                     raise
         except Exception:
             sys.stderr.write("Cannot write to file:" + outPredFileName +
                              '\n')
             raise
         finally:
             if fw is not None:
                 fw.close()
     finally:
         fr.close()
Esempio n. 17
0
def readPPSOutput(sequences,
                  taxonomy,
                  inputFastaIdsPPSFile,
                  overwriteAllPlacements=False):
    """
        Reads the output file of PPS and for each sequence decides:
        if overwriteAllPlacements=True is, then the sequence is placed according to the PPS file regardless of its
        previous placement
        if overwriteAllPlacements=False then if a sequence is placed to a less specific rank, than PPS suggests then
        the sequence is placed according to the PPS file
    """

    infile = str(inputFastaIdsPPSFile + '.out')
    try:
        f = open(os.path.normpath(infile), 'r')
    except Exception:
        print "Cannot open file:", infile
        raise
    else:
        #i = 0
        for line in f:
            line = common.noNewLine(line)
            if re.match(r'^[0-9]+_[0-9]+.*[^0-9]+[0-9]+[^0-9]*$', line):
                scaffoldId = int(
                    re.sub(r'^([0-9]+)_[0-9]+.*[^0-9]+[0-9]+[^0-9]*$', r'\1',
                           line))
                contigId = int(
                    re.sub(r'^[0-9]+_([0-9]+).*[^0-9]+[0-9]+[^0-9]*$', r'\1',
                           line))
                ncbid = int(
                    re.sub(r'^[0-9]+_[0-9]+.*[^0-9]+([0-9]+)[^0-9]*$', r'\1',
                           line))
                weight = None  # the weight is not yet defined !!!
                if ncbid != 1:
                    #print line, ":", scaffoldId, contigId, ncbid
                    taxPathDictPPS = taxonomy.getPathToRoot(ncbid)
                    if taxPathDictPPS.keys() >= 1:
                        taxPathDictCurrent = sequences.getSequence(
                            contigId).getTaxonomyPath()
                        if taxPathDictCurrent == None:
                            sequences.setTaxonomyPath(
                                contigId, scaffoldId, taxPathDictPPS,
                                weight)  #weight = None !!!
                            #i += 1
                        else:
                            if ((overwriteAllPlacements)
                                    or (taxPathDictPPS.keys() >
                                        taxPathDictCurrent.keys())):
                                sequences.setTaxonomyPathOverride(
                                    contigId, scaffoldId, taxPathDictPPS,
                                    weight)  #weight = None !!!
                                #i += 1
        #print "placed seq by PPS:", i

    finally:
        f.close()
Esempio n. 18
0
def ppsOut2Placements(ppsOutFile, scafContigFile=None):
    """
        Transforms a PPS assignments to a list of pairs <contigName, assigned_ncbid>

        @param ppsOutFile: PPS output file where the first column is the contig/scaffold name and the last column is ncbid
        @param scafContigFile: scaffold contig mapping (tab separated) if None then all sequences are considered as contigs

        @return: list of pairs <contigName, assigned_ncbid>
    """

    #print 'ppsOut2Placements ppsOutFile:', ppsOutFile
    #print 'ppsOut2Placements scafContigFile:', scafContigFile

    if scafContigFile != None:
        scafToContigs = toScafContigMap(scafContigFile)
    else:
        scafToContigs = dict([])

    outList = []
    try:
        f = open(os.path.normpath(ppsOutFile),'r')
    except Exception:
        print "Cannot open file:", ppsOutFile
        raise
    else:
        lineCounter = 0
        for line in f:
            lineCounter += 1
            line = common.noNewLine(line)
            name = re.sub(r'^([^ \t]+)[ \t]+.*[0-9]+[ \t]*$',r'\1' ,line)
            try:
                ncbid = int(re.sub(r'^[^ \t]+.*[ \t]+([0-9]+)[ \t]*$',r'\1' ,line))
            except Exception:
                try:
                    ncbid = abs(int(re.sub(r'^[^ \t]+.*[ \t]+(-1)[ \t]*$',r'\1' ,line)))
                except Exception:
                    print 'ppsOut2Placements: cannot parse placement for line nr:', lineCounter, 'line:', line
                    raise

            if name in scafToContigs:
                contigsList = scafToContigs[name]
                for contig in contigsList:
                    outList.append([contig, ncbid])
                    #print ':',contig,ncbid
            else:
                outList.append([name, ncbid])
                #print '',name,ncbid

    return outList
Esempio n. 19
0
def printStatDbk():
    """
        Print statistics of a DBK file.
    """
    seqIdSet = set()
    taxonSet = set()
    cumulativeLen = 0
    recordCount = 0
    zeros = 0
    #
    for record in SeqIO.parse(sys.stdin, "genbank"):
        recordCount += 1
        seqId = record.id

        if seqId in seqIdSet:
            print seqId, 'already in set', seqId
        else:
            seqIdSet.add(seqId)

        seq = str(record.seq)
        cumulativeLen += len(seq)

        if len(string.replace(common.noNewLine(seq), 'N', '')) == 0:
            zeros += 1

        taxonId = None

        for feature in record.features:
            if feature.type == "source":
                for xrefentry in feature.qualifiers["db_xref"]:
                    (key, val) = xrefentry.split(":")
                    if key == "taxon":
                        taxonId = int(val)
                        break
            if taxonId is not None:
                break

        if taxonId is None:
            print 'could not find taxonId for', seqId
        else:
            taxonSet.add(taxonId)

    print 'record count', recordCount
    print 'seq count', len(seqIdSet)
    print 'taxon id count', len(taxonSet)
    if len(seqIdSet) > 0:
        print 'avg. seq. len', cumulativeLen / len(seqIdSet)
    print 'zeros', zeros
Esempio n. 20
0
def printStatDbk():
    """
        Print statistics of a DBK file.
    """
    seqIdSet = set()
    taxonSet = set()
    cumulativeLen = 0
    recordCount = 0
    zeros = 0
    #
    for record in SeqIO.parse(sys.stdin, "genbank"):
        recordCount += 1
        seqId = record.id

        if seqId in seqIdSet:
            print seqId, 'already in set', seqId
        else:
            seqIdSet.add(seqId)

        seq = str(record.seq)
        cumulativeLen += len(seq)

        if len(string.replace(common.noNewLine(seq), 'N', '')) == 0:
            zeros += 1

        taxonId = None

        for feature in record.features:
            if feature.type == "source":
                for xrefentry in feature.qualifiers["db_xref"]:
                    (key, val) = xrefentry.split(":")
                    if key == "taxon":
                        taxonId = int(val)
                        break
            if taxonId is not None:
                break

        if taxonId is None:
            print 'could not find taxonId for', seqId
        else:
            taxonSet.add(taxonId)

    print 'record count', recordCount
    print 'seq count', len(seqIdSet)
    print 'taxon id count', len(taxonSet)
    if len(seqIdSet) > 0:
        print 'avg. seq. len', cumulativeLen / len(seqIdSet)
    print 'zeros', zeros
Esempio n. 21
0
 def __init__(self, id, name, contig, scaffoldSeq):
     """
         Creates a scaffold that optionally contains a first contig and its own sequence.

         @param id: identifier of the scaffold (stored as it is)
         @param name: name of the scaffold (stored as it is)
         @param contig: the first entry of the list of contigs, or None to start with an empty list
         @param scaffoldSeq: sequence of the scaffold or None; a given sequence is passed through noNewLine
             and kept only zlib-compressed
     """
     self.id = id
     self.name = name
     self._taxPathDict = None
     self._removeNonDna = False
     self.contigs = [contig] if (contig != None) else []

     if (scaffoldSeq != None):
         cleanSeq = noNewLine(scaffoldSeq)
         self.seqBp = len(removeNonDna(cleanSeq))  # length after removeNonDna was applied
         self._scaffCompressed = zlib.compress(cleanSeq)
         self._hash = hash(cleanSeq.upper())  # hash of the upper-cased sequence
         self._scaffDef = True
         return

     # no scaffold sequence is defined
     self._scaffDef = False
     self._hash = None
     self.seqBp = 0
Esempio n. 22
0
 def __init__(self, id, name, contig, scaffoldSeq):
     """
         Initializes a scaffold, optionally with one contig and with the sequence of the scaffold.

         @param id: identifier of the scaffold (stored as it is)
         @param name: name of the scaffold (stored as it is)
         @param contig: if not None it becomes the only entry of the list of contigs
         @param scaffoldSeq: if not None it is passed through noNewLine and stored zlib-compressed together
             with its length (after removeNonDna) and the hash of its upper-cased form
     """
     hasSeq = True if (scaffoldSeq != None) else False

     self.id = id
     self.name = name
     self._taxPathDict = None
     self.contigs = []
     self._removeNonDna = False
     if (contig != None):
         self.contigs.append(contig)

     if not hasSeq:
         # the scaffold sequence is not known
         self._scaffDef = False
         self._hash = None
         self.seqBp = 0
     else:
         strippedSeq = noNewLine(scaffoldSeq)
         self.seqBp = len(removeNonDna(strippedSeq))
         self._scaffCompressed = zlib.compress(strippedSeq)
         self._hash = hash(strippedSeq.upper())
         self._scaffDef = True
Esempio n. 23
0
def ssd2Placements(ssdDir, scafContigFile=None):
    """
        Transforms sample specific data to placements. Sequences` names are not allowed to have gaps ' '

        @param ssdDir: directory that contains sample specific data
        @param scafContigFile: scaffold contig mapping (tab separated) if None then all sequences are considered as contigs

        @return: list of pairs <contigName, assigned_ncbid>
    """

    #collect map: scaffold -> list of contigs
    if scafContigFile != None:
        scafToContigs = toScafContigMap(scafContigFile)
    else:
        scafToContigs = dict([])

    outList = []
    placedContigs = set([])

    for filePath in glob.glob(os.path.join(os.path.normpath(ssdDir),r'*.f[an][sa]')):
        ncbid = int(re.sub(r'^.*[^0-9]([0-9]+)\.[0-9]+\.f[an][sa]$',r'\1' ,filePath)) #int
        try:
            f = open(os.path.normpath(filePath),'r')
        except Exception:
            print "Cannot open file:", filePath
            raise
        else:
            for line in f:
                line = common.noNewLine(line)
                if re.match('>', line):
                    name = re.sub(r'^([^ \t]+)[ \t]*.*$',r'\1',line.replace('>',''))
                    if name in scafToContigs:
                        contigsList = scafToContigs[name]
                    else:
                        contigsList = [name]
                    for contig in contigsList:
                        if contig in placedContigs:
                            print str('contig "' + contig + '" has already been placed')
                        else:
                            placedContigs.add(contig)
                            outList.append([contig, ncbid])
        #count also BP for each contig!!!

    return outList
Esempio n. 24
0
 def mothurPredToTabSepPred(self, mothurPredFileName, outPredFileName):
     """
         Transforms the mothur output prediction file (*.taxonomy) to the tab separated prediction file
         seqName tab ncbid tab weight tab tag.

         A line is considered only if it starts with "<number>_<number>_<number>_<number>". The prediction
         is the last but one ';' separated item of the placement (all 'unclassified;' items are removed
         first); the line is skipped if there is no such item or if the item does not start with a number.
         The entries are separated by newlines, there is no newline at the end of the output.
         Both files are closed also when an error occurs.

         @param mothurPredFileName: path of the mothur prediction file
         @param outPredFileName: path of the output file

         @raise Exception: errors are reported to the standard error output and re-raised
     """
     try:
         fr = open(os.path.normpath(mothurPredFileName), 'r')
     except Exception:
         sys.stderr.write("Cannot open file:" + mothurPredFileName + '\n')
         raise
     try:
         fw = None  # remains None if the output file cannot be opened
         try:
             fw = open(os.path.normpath(outPredFileName), 'w')
             inputLineNr = 0  # reported in the error message
             entryCount = 0  # entries written so far
             for line in fr:
                 inputLineNr += 1
                 line = common.noNewLine(line)
                 try:
                     if not re.match(r'^[0-9]+_[0-9]+_[0-9]+_[0-9]+.*', line):
                         continue
                     name = re.sub(r'([0-9]+_[0-9]+)_[0-9]+_[0-9]+_[\+\-\t ]+.*', r'\1', line)
                     tag = re.sub(r'[0-9]+_[0-9]+_([0-9]+_[0-9]+_[\+\-]+)[\t ]+.*', r'\1', line)
                     placementList = re.sub(r'[0-9]+_[0-9]+_[0-9]+_[0-9]+_[\+\-\t ]+(.*)', r'\1',
                                            line.replace('unclassified;', '')).rsplit(';')
                     if len(placementList) < 2:
                         continue
                     placement = placementList[-2]
                     try:
                         clade = int(re.sub(r'([0-9]+)\(.*', r'\1', placement))
                     except ValueError:
                         continue
                     weight = float(re.sub(r'[0-9]+\(([0-9\.]+)\)', r'\1', placement))
                     # a newline separates (does not terminate) the entries
                     if entryCount > 0:
                         fw.write('\n')
                     fw.write(name + '\t' + str(clade) + '\t' + str(weight) + '\t' + str(tag))
                     entryCount += 1
                 except Exception:
                     sys.stderr.write('Cannot parse line: ' + str(inputLineNr) + ' in file: ' + mothurPredFileName + '\n')
                     raise
         except Exception:
             sys.stderr.write("Cannot write to file:" + outPredFileName + '\n')
             raise
         finally:
             if fw is not None:
                 fw.close()
     finally:
         fr.close()
Esempio n. 25
0
def readPPSOutput(sequences, taxonomy, inputFastaIdsPPSFile, overwriteAllPlacements=False):
    """
        Reads the output file of PPS and for each sequence decides:
        if overwriteAllPlacements=True is, then the sequence is placed according to the PPS file regardless of its
        previous placement
        if overwriteAllPlacements=False then if a sequence is placed to a less specific rank, than PPS suggests then
        the sequence is placed according to the PPS file
    """

    infile = str(inputFastaIdsPPSFile + ".out")
    try:
        f = open(os.path.normpath(infile), "r")
    except Exception:
        print "Cannot open file:", infile
        raise
    else:
        # i = 0
        for line in f:
            line = common.noNewLine(line)
            if re.match(r"^[0-9]+_[0-9]+.*[^0-9]+[0-9]+[^0-9]*$", line):
                scaffoldId = int(re.sub(r"^([0-9]+)_[0-9]+.*[^0-9]+[0-9]+[^0-9]*$", r"\1", line))
                contigId = int(re.sub(r"^[0-9]+_([0-9]+).*[^0-9]+[0-9]+[^0-9]*$", r"\1", line))
                ncbid = int(re.sub(r"^[0-9]+_[0-9]+.*[^0-9]+([0-9]+)[^0-9]*$", r"\1", line))
                weight = None  # the weight is not yet defined !!!
                if ncbid != 1:
                    # print line, ":", scaffoldId, contigId, ncbid
                    taxPathDictPPS = taxonomy.getPathToRoot(ncbid)
                    if taxPathDictPPS.keys() >= 1:
                        taxPathDictCurrent = sequences.getSequence(contigId).getTaxonomyPath()
                        if taxPathDictCurrent == None:
                            sequences.setTaxonomyPath(contigId, scaffoldId, taxPathDictPPS, weight)  # weight = None !!!
                            # i += 1
                        else:
                            if (overwriteAllPlacements) or (taxPathDictPPS.keys() > taxPathDictCurrent.keys()):
                                sequences.setTaxonomyPathOverride(
                                    contigId, scaffoldId, taxPathDictPPS, weight
                                )  # weight = None !!!
                                # i += 1
        # print "placed seq by PPS:", i

    finally:
        f.close()
Esempio n. 26
0
    def writePlacementsOut(self, outFile, taxaRanks,
                           outputFileContigSubPattern):

        try:
            f = open(os.path.normpath(outFile), 'w')
            f.write('# SEQUENCEID	TAXID')
            # k = 0

            for seq in self.sequences:

                taxPathDict = seq.getTaxonomyPath()
                ncbid = 1
                for rank in taxaRanks:
                    if ((taxPathDict is not None) and (rank in taxPathDict)):
                        ncbid = taxPathDict[rank].ncbid
                    else:
                        break

                if ncbid == 1:
                    continue

                entry = (noNewLine(
                    re.sub(outputFileContigSubPattern, r'\1', seq.name)) +
                         '\t' + str(ncbid))

                # if k == 0:
                #     f.write(entry)
                #     k += 1
                # else:
                f.write('\n' + entry)

        except Exception:
            print "Cannot create a file or write to it:", outFile
            raise
        finally:
            f.close()
Esempio n. 27
0
 def parse(self, record):
     """
         Appends the tuple (id, sequence) of the record to self._seqToList, both are converted
         to strings and the sequence string is passed through noNewLine.
     """
     entry = (str(record.id), noNewLine(str(record.seq)))
     self._seqToList.append(entry)
Esempio n. 28
0
def ppsOutToPPOut(ppsOutFile, outPPOutFile, taxaRanks, taxonomy):
    """
        Transforms the PPS out file to a compatible PPS PP.out file.
    """
    print ppsOutFile

    #contig file to an ncbid
    contigToNcbid = dict([])
    try:
        f = open(os.path.normpath(ppsOutFile),'r')
    except Exception:
        print "Cannot open file:", ppsOutFile
        raise
    else:
        for line in f:
            line = common.noNewLine(line)
            contig = re.sub(r'^[ ]*([^\t]+)\t.*$',r'\1', line)
            try:
                ncbid = int(re.sub(r'^.*\t([0-9]+)[ \t]*$',r'\1', line))
            except Exception:
                print 'line skipped:', line
                continue
            contigToNcbid[contig] = ncbid
            #print str('|' + contig + '|' + str(ncbid) + '|')

    try:
        f = open(os.path.normpath(outPPOutFile), 'w')
        f.write('#Translate output to PP.out format from: ' + ppsOutFile + '\n#\n'),
        header = str('#ID' + '\t' + 'root')
        for rank in taxaRanks:
            header += str('\t' + rank)
        f.write(header)

        for contig in contigToNcbid:
            taxPathDict = taxonomy.getPathToRoot(contigToNcbid[contig])
            entry = str('\n' + contig)
            if taxPathDict == None:
                entry += str('\t')
            else:
                entry += str('\t' + 'root')
            for rank in taxaRanks:
                if (taxPathDict != None) and (rank in taxPathDict) and (not taxPathDict[rank].isCopy()):
                    entry += str('\t' + taxPathDict[rank].name)
                else:
                    entry += '\t'
            f.write(entry)
    except Exception:
        print "Cannot create a file or write to it:", outPPOutFile
        raise
    finally:
        f.close()


    def writePlacementsPPOut(self, outFile, taxaRanks, outputFileContigSubPattern):

        try:
            f = open(os.path.normpath(outFile), 'w')

            f.write('#Output of pPPS\n#\n'),
            header = str('#ID' + '\t' + 'root')
            for rank in taxaRanks:
                header += str('\t' + rank)
            f.write(header)

            for seq in self.sequences:
                entry = str('\n' + re.sub(outputFileContigSubPattern, r'\1' , seq.name))
                taxPathDict = seq.getTaxonomyPath()
                if taxPathDict == None:
                    entry += str('\t')
                else:
                    entry += str('\t' + 'root')
                for rank in taxaRanks:
                    if (taxPathDict != None) and (rank in taxPathDict) and (not taxPathDict[rank].isCopy()):
                        entry += str('\t' + taxPathDict[rank].name)
                    else:
                        entry += '\t'
                f.write(entry)
        except Exception:
            print "Cannot create a file or write to it:", outFile
            raise
        finally:
            f.close()
Esempio n. 29
0
 def parse(self, record):
     """
         Appends the tuple (id, sequence) of the record to self._seqToList, both are converted
         to strings and the sequence string is passed through noNewLine.
     """
     entry = (str(record.id), noNewLine(str(record.seq)))
     self._seqToList.append(entry)
Esempio n. 30
0
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir):
    """
        Reads all sequences. For each taxonId creates a file that contain all sequences
        mapped to this taxonId. If a seqId appears more than one it is ignored since
        acession numbers are unique.

        @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId
        @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq
    """
    taxonIdToOutBuffer = {}
    seqIdSet = set()

    totalSeqCount = 0
    totalStoredSeqCount = 0
    totalIdenticalSeqCount = 0

    for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList):
        print 'processing', mapFilePath, fastaFilePath
        seqCount = 0
        storedSeqCount = 0

        seqIdToSeq = fasta.fastaFileToDict(fastaFilePath)
        seqIdToNcbidList = csv.getMapping(mapFilePath,
                                          0,
                                          1,
                                          sep='\t',
                                          comment='#')

        for seqId, seq in seqIdToSeq.iteritems():
            seqCount += 1
            if seqId in seqIdSet:
                totalIdenticalSeqCount += 1
                continue
            else:
                seqIdSet.add(seqId)

            taxonId = seqIdToNcbidList[seqId][0]

            if taxonId not in taxonIdToOutBuffer:
                outBuffer = csv.OutFileBuffer(
                    os.path.join(outputDir, str(str(taxonId) + '.fna')))
                taxonIdToOutBuffer[taxonId] = outBuffer

            taxonIdToOutBuffer[taxonId].writeText(
                str('>' + seqId + '\n' + seq + '\n'))
            taxonIdToOutBuffer[taxonId].close()
            storedSeqCount += 1

            if len(string.replace(common.noNewLine(seq), 'N', '')) == 0:
                print 'zeros', seqId, fastaFilePath, len(common.noNewLine(seq))

        # for buff in taxonIdToOutBuffer.values():
        #     buff.close()

        print 'totalSeq, storedSeq', seqCount, storedSeqCount
        totalSeqCount += seqCount
        totalStoredSeqCount += storedSeqCount

    print 'totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount', totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount

    print 'sequences merged'