def mergeSequences(mapFilePathList, fastaFilePathList, outputDir): """ Reads all sequences. For each taxonId creates a file that contain all sequences mapped to this taxonId. If a seqId appears more than one it is ignored since acession numbers are unique. @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq """ taxonIdToOutBuffer = {} seqIdSet = set() totalSeqCount = 0 totalStoredSeqCount = 0 totalIdenticalSeqCount = 0 for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList): print 'processing', mapFilePath, fastaFilePath seqCount = 0 storedSeqCount = 0 seqIdToSeq = fasta.fastaFileToDict(fastaFilePath) seqIdToNcbidList = csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#') for seqId, seq in seqIdToSeq.iteritems(): seqCount += 1 if seqId in seqIdSet: totalIdenticalSeqCount += 1 continue else: seqIdSet.add(seqId) taxonId = seqIdToNcbidList[seqId][0] if taxonId not in taxonIdToOutBuffer: outBuffer = csv.OutFileBuffer(os.path.join(outputDir, str(str(taxonId) + '.fna'))) taxonIdToOutBuffer[taxonId] = outBuffer taxonIdToOutBuffer[taxonId].writeText(str('>' + seqId + '\n' + seq + '\n')) taxonIdToOutBuffer[taxonId].close() storedSeqCount += 1 if len(string.replace(common.noNewLine(seq),'N','')) == 0: print 'zeros', seqId, fastaFilePath, len(common.noNewLine(seq)) # for buff in taxonIdToOutBuffer.values(): # buff.close() print 'totalSeq, storedSeq', seqCount, storedSeqCount totalSeqCount += seqCount totalStoredSeqCount += storedSeqCount print 'totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount', totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount print 'sequences merged'
def __init__(self, id, name, seq):
    """
        Initializes a sequence entry; the sequence string is kept only in a zlib-compressed form.

        @param id: identifier stored as self.id
        @param name: sequence name, stored after it was passed through noNewLine
        @param seq: sequence string; it is passed through noNewLine before it is measured, hashed and compressed
    """
    self.id = id
    self.name = noNewLine(name)

    cleanSeq = noNewLine(seq)
    self.seqBp = len(removeNonDna(cleanSeq))  # length of the result of removeNonDna
    self._seqCompressed = zlib.compress(cleanSeq)
    self._hash = hash(cleanSeq.upper())  # hash of the upper-cased sequence

    # no placement yet
    self._taxPathDict = None
    self._placementWeight = None

    # candidate placements (parallel lists)
    self._candidateTaxPathDictList = []
    self._candidateTaxPathDictWeightsList = []
    self._candidateTaxPathDictSourceList = []  # where does this prediction come from
    self._candidateTaxPathDictTagList = []

    self.scaffold = None
    self._removeNonDna = False
def scafToContigOutputPPFormat(scafContigFile, scafPPSOutFile, contigPPSOutFile):
    """
        Takes scaffold-contigs mapping and scaffold placement (PP.out), outputs contigs placement (PP.out)

        Empty lines and lines starting with '#' are copied unchanged. Every other line is split into the
        scaffold name (first column) and the rest of the line; the rest is written once for each contig of
        the scaffold. Scaffolds without a mapping are reported on the standard output and skipped.
        All files are closed before the function returns, also when an error occurs.

        @param scafContigFile: tab separated scaffold-contigs mapping (scaffold \t contig)
        @param scafPPSOutFile: scaffold predictions (PPS output file)
        @param contigPPSOutFile: contigs predictions (as if it was a PPS output file)
        @raise Exception: an error raised while opening one of the files is re-raised
    """
    # collect map: scaffold -> list of contigs
    scafToContigs = {}
    try:
        f = open(os.path.normpath(scafContigFile), 'r')
    except Exception:
        print("Cannot open file: %s" % (scafContigFile,))
        raise
    try:
        for line in f:
            line = common.noNewLine(line)
            scaffold = re.sub(r'^[ ]*([^ \t]+)\t[^ \t]*', r'\1', line)
            contig = re.sub(r'^[ ]*[^ \t]+\t([^ \t]*)', r'\1', line)
            scafToContigs.setdefault(scaffold, []).append(contig)
    finally:
        f.close()

    fr = None
    fw = None
    try:
        try:
            fr = open(os.path.normpath(scafPPSOutFile), 'r')
            fw = open(os.path.normpath(contigPPSOutFile), 'w')
        except Exception:
            print("Cannot open one of the files: %s %s" % (scafPPSOutFile, contigPPSOutFile))
            raise

        for line in fr:
            line = common.noNewLine(line)
            if len(line) == 0 or re.match('#', line):
                # comments and empty lines are passed through
                fw.write(line + '\n')
                continue
            scaffold = re.sub(r'^[ ]*([^ \t]+)[ \t]*.*$', r'\1', line)
            assignment = re.sub(r'^[ ]*[^ \t]+[ \t]*(.*$)', r'\1', line)
            if scaffold in scafToContigs:
                for contig in scafToContigs[scaffold]:
                    fw.write(str(contig + '\t' + assignment + '\n'))
            else:
                print('there is not scaffold-contigs mapping for scaffold: %s' % (scaffold,))
    finally:
        # close whatever was opened, the output first so that it gets flushed
        try:
            if fw is not None:
                fw.close()
        finally:
            if fr is not None:
                fr.close()
def _setCandidatePlacement(self, sequences, taxonomy, predFileName, source):
    """
        Reads a tab separated prediction file and registers each prediction as a candidate placement.

        Only lines of the form "<scaffoldId>_<contigId> \t <ncbid> \t <weight> \t <tag>" are considered, all
        other lines are ignored. Predictions with ncbid 1 are skipped. For every other prediction the
        path to the root is looked up in the taxonomy; if a non-empty path is found it is passed to
        sequences.setCandidateTaxonomyPath, otherwise a message is written to the standard error output.

        @param sequences: object that offers setCandidateTaxonomyPath(contigId, scaffoldId, taxPathDict, weight, source, tag)
        @param taxonomy: object that offers getPathToRoot(ncbid)
        @param predFileName: prediction file
        @param source: passed unchanged to setCandidateTaxonomyPath
        @return: set of contig ids for which a candidate placement was set
        @raise Exception: an error raised while opening the file is re-raised
    """
    # one pattern with five groups instead of five substitutions with the same pattern
    linePattern = re.compile(r'^([0-9]+)_([0-9]+)\t([0-9]+)\t([0-9\.]+)\t([^\t]+)$')
    assignedIds = set()
    try:
        f = open(os.path.normpath(predFileName), 'r')
    except Exception:
        print("Cannot open file: %s" % (predFileName,))
        raise
    # the file is closed only if it was opened, so an open error is not masked
    try:
        for line in f:
            match = linePattern.match(common.noNewLine(line))
            if match is None:
                continue
            scaffoldId = int(match.group(1))
            contigId = int(match.group(2))
            ncbid = int(match.group(3))
            weight = float(match.group(4))
            tag = str(match.group(5))
            if ncbid == 1:
                continue
            taxPathDict = taxonomy.getPathToRoot(ncbid)
            # the number of entries is tested (a list compared to an int is always "greater" in Python 2)
            if taxPathDict is not None and len(taxPathDict) >= 1:
                sequences.setCandidateTaxonomyPath(contigId, scaffoldId, taxPathDict, weight, source, tag)
                assignedIds.add(contigId)
            else:
                sys.stderr.write(str('No taxonomic path found for ncbid: ' + str(ncbid)) + '\n')
    finally:
        f.close()
    return assignedIds
def fastaFileToDictWholeNames(filePath): """ Reads a fasta file and returns mapping: seqName -> sequence the whole sequence name is used as seqName!!! (even if it contains space) """ seqIdToSeq = {} f = None try: if filePath.endswith('.gz'): f = gzip.open(os.path.normpath(filePath), mode='r') else: f = open(os.path.normpath(filePath), 'r') except Exception: print "Cannot open file:", filePath raise else: name = '' seq = '' for line in f: line = noNewLine(line) if re.match('>', line): if seq != '': assert name != '' seqIdToSeq[name] = seq seq = '' name = line.replace('>', '') else: seq += line if seq != '': assert name != '' seqIdToSeq[name] = seq finally: if f is not None: f.close() return seqIdToSeq
def forEachLine(filePath, parser):
    """
        For each line of the file call the parser, at the end call the finalize method of the parser if it`s
        defined.

        A parser without a finalize method is accepted. An exception raised by parser.finalize() itself is
        not suppressed, it is propagated to the caller.

        @param filePath: file to be read
        @param parser: object with method parse(line) and optionally a method finalize()
        @return: the parser
        @raise Exception: errors raised while opening or reading the file (or by the parser) are re-raised
    """
    try:
        f = open(os.path.normpath(filePath), 'r')
    except Exception:
        sys.stderr.write('Cannot open a file for reading: ' + filePath)
        raise

    try:
        for line in f:
            parser.parse(noNewLine(line))
    except Exception:
        sys.stderr.write('Cannot read from file: ' + filePath)
        raise
    finally:
        f.close()

    # only a missing attribute is tolerated here, errors inside finalize() are real errors
    finalize = getattr(parser, 'finalize', None)
    if isinstance(finalize, types.MethodType):
        finalize()
    return parser
def toScafContigMap(scafContigFile):
    """
        Reads scaffold contig mapping.

        The file is closed before the function returns, also when an error occurs while reading.

        @param scafContigFile: scaffold-contig mapping (tab separated)
        @return: map: scaffold -> list of contigs
        @raise Exception: an error raised while opening the file is re-raised
    """
    scafToContigs = {}
    try:
        f = open(os.path.normpath(scafContigFile), 'r')
    except Exception:
        print("Cannot open file: %s" % (scafContigFile,))
        raise
    try:
        for line in f:
            line = noNewLine(line)
            # only a tab ends a name, i.e. the names may contain spaces
            scaffold = re.sub(r'^[ ]*([^\t]+)\t[^\t]*', r'\1', line)
            contig = re.sub(r'^[ ]*[^\t]+\t([^\t]*)', r'\1', line)
            scafToContigs.setdefault(scaffold, []).append(contig)
    finally:
        f.close()
    return scafToContigs
def _readContigsScaffolds(self, filePath, readContigs=True):
    """
        Read contigs or scaffolds from a file.

        The file is read as fasta: a line starting with '>' is a header (stored without the '>' characters),
        the following lines are concatenated to the sequence. A header without sequence data is not stored.

        @param filePath: file to be read
        @param readContigs: if True each record is stored via self._addSeq(name, seq),
            otherwise via self._addScaff(name, None, seq)
        @raise Exception: an error raised while opening the file is re-raised (it is not masked by the cleanup)
    """
    def store(name, seq):
        # stores one record, records without a sequence are skipped
        if seq == '':
            return
        assert name != ''
        if readContigs:
            self._addSeq(name, seq)
        else:
            self._addScaff(name, None, seq)

    try:
        f = open(os.path.normpath(filePath), 'r')
    except Exception:
        print("Cannot open file: %s" % (filePath,))
        raise
    # the file is closed only if it was opened
    try:
        name = ''
        seq = ''
        for line in f:
            line = noNewLine(line)
            if re.match('>', line):
                store(name, seq)  # the record that ends here
                seq = ''
                name = line.replace('>', '')
            else:
                seq += line
        store(name, seq)  # the last record
    finally:
        f.close()
def toScafContigMap(scafContigFile):
    """
        Reads scaffold contig mapping.

        The file is closed before the function returns, also when an error occurs while reading.

        @param scafContigFile: scaffold-contig mapping (tab separated)
        @return: map: scaffold -> list of contigs
        @raise Exception: an error raised while opening the file is re-raised
    """
    scafToContigs = {}
    try:
        f = open(os.path.normpath(scafContigFile), 'r')
    except Exception:
        print("Cannot open file: %s" % (scafContigFile,))
        raise
    try:
        for line in f:
            line = noNewLine(line)
            # only a tab ends a name, i.e. the names may contain spaces
            scaffold = re.sub(r'^[ ]*([^\t]+)\t[^\t]*', r'\1', line)
            contig = re.sub(r'^[ ]*[^\t]+\t([^\t]*)', r'\1', line)
            scafToContigs.setdefault(scaffold, []).append(contig)
    finally:
        f.close()
    return scafToContigs
def writePlacementsOut(self, outFile, taxaRanks, outputFileContigSubPattern):
    """
        Writes the placements of self.sequences to a file, one line "<sequence id> \t <ncbid>" per sequence.

        For each sequence the ranks are followed in the given order as long as they are contained in the
        taxonomy path of the sequence; the ncbid of the last rank reached is written. Sequences without
        such a rank (ncbid stays 1) are skipped. The file starts with a header line and does not end with
        a new line character.

        @param outFile: output file
        @param taxaRanks: list of ranks ordered from the most general one
        @param outputFileContigSubPattern: regular expression whose first group of the sequence name is written
        @raise Exception: an error raised while opening or writing the file is re-raised
    """
    f = None  # stays None if the file cannot be opened, so that the cleanup does not mask the error
    try:
        f = open(os.path.normpath(outFile), 'w')
        f.write('# SEQUENCEID TAXID')
        for seq in self.sequences:
            taxPathDict = seq.getTaxonomyPath()
            ncbid = 1
            if taxPathDict is not None:
                for rank in taxaRanks:
                    if rank not in taxPathDict:
                        break
                    ncbid = taxPathDict[rank].ncbid
            if ncbid == 1:
                continue
            entry = noNewLine(re.sub(outputFileContigSubPattern, r'\1', seq.name)) + '\t' + str(ncbid)
            f.write('\n' + entry)
    except Exception:
        print("Cannot create a file or write to it: %s" % (outFile,))
        raise
    finally:
        if f is not None:
            f.close()
def _readContigsScaffolds(self, filePath, readContigs=True):
    """
        Read contigs or scaffolds from a file.

        The file is read as fasta: a line starting with '>' is a header (stored without the '>' characters),
        the following lines are concatenated to the sequence. A header without sequence data is not stored.

        @param filePath: file to be read
        @param readContigs: if True each record is stored via self._addSeq(name, seq),
            otherwise via self._addScaff(name, None, seq)
        @raise Exception: an error raised while opening the file is re-raised (it is not masked by the cleanup)
    """
    def store(name, seq):
        # stores one record, records without a sequence are skipped
        if seq == '':
            return
        assert name != ''
        if readContigs:
            self._addSeq(name, seq)
        else:
            self._addScaff(name, None, seq)

    try:
        f = open(os.path.normpath(filePath), 'r')
    except Exception:
        print("Cannot open file: %s" % (filePath,))
        raise
    # the file is closed only if it was opened
    try:
        name = ''
        seq = ''
        for line in f:
            line = noNewLine(line)
            if re.match('>', line):
                store(name, seq)  # the record that ends here
                seq = ''
                name = line.replace('>', '')
            else:
                seq += line
        store(name, seq)  # the last record
    finally:
        f.close()
def loadDictFromAFile(filePath):
    """
        Returns a dictionary that is stored in a file.

        Each line must consist of exactly two tab separated integers. If a key appears on several lines,
        the items are collected in the order in which they appear.

        @param filePath: a file in which a dictionary is stored in format: (key tab item)
        @return: dict that represents mapping: (key -> list of items)
        @raise Exception: an error raised while opening, reading or parsing the file is re-raised
    """
    f = None  # stays None if the file cannot be opened, so that the cleanup does not mask the error
    try:
        dictOfLists = {}
        f = open(os.path.normpath(filePath), 'r')
        for line in f:
            pair = re.findall('[^\t]+', common.noNewLine(line))
            assert len(pair) == 2, str(
                'There are not two values separated by \t at line: ' + line)
            key = int(pair[0])
            val = int(pair[1])
            dictOfLists.setdefault(key, []).append(val)
        return dictOfLists
    except Exception:
        # this function only reads, the former message spoke about creating/writing a file
        print("Cannot read a dictionary from file: %s" % (filePath,))
        raise
    finally:
        if f is not None:
            f.close()
def mothurPredToTabSepPred(self, mothurPredFileName, outPredFileName):
    """
        Transforms the mothur output prediction file (*.taxonomy) to the tab separated prediction file:
        seqName tab ncbid tab weight tab tag.

        Only lines that start with four numbers joined by '_' are considered. The placement is taken from
        the last but one ';' separated entry (after all 'unclassified;' entries were removed); lines with
        less than two entries or with a placement that does not start with a number are skipped.
        The output file does not end with a new line character.

        @param mothurPredFileName: mothur prediction file
        @param outPredFileName: output file
        @raise Exception: errors raised while opening, parsing or writing the files are re-raised
    """
    try:
        fr = open(os.path.normpath(mothurPredFileName), 'r')
    except Exception:
        sys.stderr.write("Cannot open file:" + mothurPredFileName + '\n')
        raise

    fw = None  # stays None if the output cannot be opened, so that the cleanup does not mask the error
    try:
        fw = open(os.path.normpath(outPredFileName), 'w')
        lineCount = 0  # number of entries written so far
        for line in fr:
            line = common.noNewLine(line)
            try:
                if not re.match(r'^[0-9]+_[0-9]+_[0-9]+_[0-9]+.*', line):
                    continue
                name = re.sub(r'([0-9]+_[0-9]+)_[0-9]+_[0-9]+_[\+\-\t ]+.*', r'\1', line)
                tag = re.sub(r'[0-9]+_[0-9]+_([0-9]+_[0-9]+_[\+\-]+)[\t ]+.*', r'\1', line)
                placementList = re.sub(r'[0-9]+_[0-9]+_[0-9]+_[0-9]+_[\+\-\t ]+(.*)', r'\1',
                                       line.replace('unclassified;', '')).rsplit(';')
                if len(placementList) < 2:
                    continue
                placement = placementList[-2]
                try:
                    clade = int(re.sub(r'([0-9]+)\(.*', r'\1', placement))
                except ValueError:
                    continue
                weight = float(re.sub(r'[0-9]+\(([0-9\.]+)\)', r'\1', placement))
                lineCount += 1
                entry = name + '\t' + str(clade) + '\t' + str(weight) + '\t' + str(tag)
                # entries are separated by (not ended with) a new line
                if lineCount == 1:
                    fw.write(entry)
                else:
                    fw.write('\n' + entry)
            except Exception:
                sys.stderr.write('Cannot parse line: ' + str(lineCount) + ' in file: ' + mothurPredFileName + '\n')
                raise
    except Exception:
        sys.stderr.write("Cannot write to file:" + outPredFileName + '\n')
        raise
    finally:
        # the input is closed even if the output was not opened or cannot be closed
        try:
            if fw is not None:
                fw.close()
        finally:
            fr.close()
def readPPSOutput(sequences, taxonomy, inputFastaIdsPPSFile, overwriteAllPlacements=False):
    """
        Reads the output file of PPS (inputFastaIdsPPSFile + '.out') and for each sequence decides:
        if overwriteAllPlacements=True is, then the sequence is placed according to the PPS file regardless of
        its previous placement
        if overwriteAllPlacements=False then if a sequence is placed to a less specific rank (its taxonomy path
        has fewer entries), than PPS suggests then the sequence is placed according to the PPS file

        Only lines that start with "<scaffoldId>_<contigId>" and contain a further number are considered, the
        last number of a line is taken as ncbid. Predictions with ncbid 1 and predictions for which the
        taxonomy returns no (or an empty) path are skipped. The weight of the placements is None.

        @param sequences: object that offers getSequence, setTaxonomyPath and setTaxonomyPathOverride
        @param taxonomy: object that offers getPathToRoot(ncbid)
        @param inputFastaIdsPPSFile: path of the PPS file without the '.out' suffix
        @param overwriteAllPlacements: whether PPS placements overwrite all previous placements
        @raise Exception: an error raised while opening the file is re-raised
    """
    infile = str(inputFastaIdsPPSFile + '.out')
    # one pattern with three groups instead of three substitutions with the same pattern
    linePattern = re.compile(r'^([0-9]+)_([0-9]+).*[^0-9]+([0-9]+)[^0-9]*$')
    try:
        f = open(os.path.normpath(infile), 'r')
    except Exception:
        print("Cannot open file: %s" % (infile,))
        raise
    # the file is closed only if it was opened, so an open error is not masked
    try:
        for line in f:
            match = linePattern.match(common.noNewLine(line))
            if match is None:
                continue
            scaffoldId = int(match.group(1))
            contigId = int(match.group(2))
            ncbid = int(match.group(3))
            weight = None  # the weight is not yet defined !!!
            if ncbid == 1:
                continue
            taxPathDictPPS = taxonomy.getPathToRoot(ncbid)
            if taxPathDictPPS is None or len(taxPathDictPPS) < 1:
                continue
            taxPathDictCurrent = sequences.getSequence(contigId).getTaxonomyPath()
            if taxPathDictCurrent is None:
                sequences.setTaxonomyPath(contigId, scaffoldId, taxPathDictPPS, weight)
            elif overwriteAllPlacements or len(taxPathDictPPS) > len(taxPathDictCurrent):
                # the lengths of the paths are compared (not the lists of keys)
                sequences.setTaxonomyPathOverride(contigId, scaffoldId, taxPathDictPPS, weight)
    finally:
        f.close()
def ppsOut2Placements(ppsOutFile, scafContigFile=None):
    """
        Transforms a PPS assignments to a list of pairs <contigName, assigned_ncbid>

        If a name is contained in the scaffold-contig mapping, the assignment is listed for each contig of
        the scaffold, otherwise for the name itself. A last column "-1" is stored as ncbid 1.
        The file is closed before the function returns, also when an error occurs.

        @param ppsOutFile: PPS output file where the first column is the contig/scaffold name and the last column
            is ncbid
        @param scafContigFile: scaffold contig mapping (tab separated) if None then all sequences are considered
            as contigs
        @return: list of pairs <contigName, assigned_ncbid>
        @raise Exception: an error raised while opening the file or parsing a line is re-raised
    """
    if scafContigFile is not None:
        scafToContigs = toScafContigMap(scafContigFile)
    else:
        scafToContigs = {}

    outList = []
    try:
        f = open(os.path.normpath(ppsOutFile), 'r')
    except Exception:
        print("Cannot open file: %s" % (ppsOutFile,))
        raise
    try:
        lineCounter = 0
        for line in f:
            lineCounter += 1
            line = common.noNewLine(line)
            name = re.sub(r'^([^ \t]+)[ \t]+.*[0-9]+[ \t]*$', r'\1', line)
            try:
                ncbid = int(re.sub(r'^[^ \t]+.*[ \t]+([0-9]+)[ \t]*$', r'\1', line))
            except Exception:
                # the last column is not a plain number, accept "-1" (stored as 1)
                try:
                    ncbid = abs(int(re.sub(r'^[^ \t]+.*[ \t]+(-1)[ \t]*$', r'\1', line)))
                except Exception:
                    print('ppsOut2Placements: cannot parse placement for line nr: %s line: %s'
                          % (lineCounter, line))
                    raise
            if name in scafToContigs:
                for contig in scafToContigs[name]:
                    outList.append([contig, ncbid])
            else:
                outList.append([name, ncbid])
    finally:
        f.close()
    return outList
def printStatDbk(): """ Print statistics of a DBK file. """ seqIdSet = set() taxonSet = set() cumulativeLen = 0 recordCount = 0 zeros = 0 # for record in SeqIO.parse(sys.stdin, "genbank"): recordCount += 1 seqId = record.id if seqId in seqIdSet: print seqId, 'already in set', seqId else: seqIdSet.add(seqId) seq = str(record.seq) cumulativeLen += len(seq) if len(string.replace(common.noNewLine(seq), 'N', '')) == 0: zeros += 1 taxonId = None for feature in record.features: if feature.type == "source": for xrefentry in feature.qualifiers["db_xref"]: (key, val) = xrefentry.split(":") if key == "taxon": taxonId = int(val) break if taxonId is not None: break if taxonId is None: print 'could not find taxonId for', seqId else: taxonSet.add(taxonId) print 'record count', recordCount print 'seq count', len(seqIdSet) print 'taxon id count', len(taxonSet) if len(seqIdSet) > 0: print 'avg. seq. len', cumulativeLen / len(seqIdSet) print 'zeros', zeros
def __init__(self, id, name, contig, scaffoldSeq):
    """
        Initializes a scaffold entry.

        @param id: identifier stored as self.id
        @param name: scaffold name
        @param contig: the first contig of the scaffold or None (then the list of contigs stays empty)
        @param scaffoldSeq: sequence of the scaffold or None; if given, it is passed through noNewLine and
            kept only in a zlib-compressed form (self._scaffCompressed is not set otherwise)
    """
    self.id = id
    self.name = name
    self._taxPathDict = None
    self.contigs = []
    self._removeNonDna = False

    # identity tests: a comparison operator defined by the contig object is never involved
    if contig is not None:
        self.contigs.append(contig)

    if scaffoldSeq is not None:
        seq = noNewLine(scaffoldSeq)
        self.seqBp = len(removeNonDna(seq))  # length of the result of removeNonDna
        self._scaffCompressed = zlib.compress(seq)
        self._hash = hash(seq.upper())
        self._scaffDef = True
    else:
        # the scaffold sequence is not defined
        self._scaffDef = False
        self._hash = None
        self.seqBp = 0
def ssd2Placements(ssdDir, scafContigFile=None):
    """
        Transforms sample specific data to placements. Sequences` names are not allowed to have gaps ' '

        All files of the directory that match "*.f[an][sa]" are read; the ncbid is the number in front of
        ".<number>.<suffix>" at the end of the file path. Each header line (starting with '>') assigns the
        sequence (or all contigs of the scaffold, if the name is contained in the mapping) to the ncbid of
        the file. A contig that was already placed is reported on the standard output and not placed again.
        Each file is closed as soon as it was read.

        @param ssdDir: directory that contains sample specific data
        @param scafContigFile: scaffold contig mapping (tab separated) if None then all sequences are considered
            as contigs
        @return: list of pairs <contigName, assigned_ncbid>
        @raise Exception: an error raised while opening a file is re-raised
    """
    # collect map: scaffold -> list of contigs
    if scafContigFile is not None:
        scafToContigs = toScafContigMap(scafContigFile)
    else:
        scafToContigs = {}

    outList = []
    placedContigs = set()
    for filePath in glob.glob(os.path.join(os.path.normpath(ssdDir), r'*.f[an][sa]')):
        ncbid = int(re.sub(r'^.*[^0-9]([0-9]+)\.[0-9]+\.f[an][sa]$', r'\1', filePath))
        try:
            f = open(os.path.normpath(filePath), 'r')
        except Exception:
            print("Cannot open file: %s" % (filePath,))
            raise
        try:
            for line in f:
                line = common.noNewLine(line)
                if not re.match('>', line):
                    continue
                name = re.sub(r'^([^ \t]+)[ \t]*.*$', r'\1', line.replace('>', ''))
                # a name without a mapping is a contig itself
                for contig in scafToContigs.get(name, [name]):
                    if contig in placedContigs:
                        print(str('contig "' + contig + '" has already been placed'))
                    else:
                        placedContigs.add(contig)
                        outList.append([contig, ncbid])
        finally:
            f.close()
    return outList
def mothurPredToTabSepPred(self, mothurPredFileName, outPredFileName):
    """
        Transforms the mothur output prediction file (*.taxonomy) to the tab separated prediction file:
        seqName tab ncbid tab weight tab tag.

        Only lines that start with four numbers joined by '_' are considered. The placement is taken from
        the last but one ';' separated entry (after all 'unclassified;' entries were removed); lines with
        less than two entries or with a placement that does not start with a number are skipped.
        The output file does not end with a new line character.

        @param mothurPredFileName: mothur prediction file
        @param outPredFileName: output file
        @raise Exception: errors raised while opening, parsing or writing the files are re-raised
    """
    try:
        fr = open(os.path.normpath(mothurPredFileName), 'r')
    except Exception:
        sys.stderr.write("Cannot open file:" + mothurPredFileName + '\n')
        raise

    fw = None  # stays None if the output cannot be opened, so that the cleanup does not mask the error
    try:
        fw = open(os.path.normpath(outPredFileName), 'w')
        lineCount = 0  # number of entries written so far
        for line in fr:
            line = common.noNewLine(line)
            try:
                if not re.match(r'^[0-9]+_[0-9]+_[0-9]+_[0-9]+.*', line):
                    continue
                name = re.sub(r'([0-9]+_[0-9]+)_[0-9]+_[0-9]+_[\+\-\t ]+.*', r'\1', line)
                tag = re.sub(r'[0-9]+_[0-9]+_([0-9]+_[0-9]+_[\+\-]+)[\t ]+.*', r'\1', line)
                placementList = re.sub(r'[0-9]+_[0-9]+_[0-9]+_[0-9]+_[\+\-\t ]+(.*)', r'\1',
                                       line.replace('unclassified;', '')).rsplit(';')
                if len(placementList) < 2:
                    continue
                placement = placementList[-2]
                try:
                    clade = int(re.sub(r'([0-9]+)\(.*', r'\1', placement))
                except ValueError:
                    continue
                weight = float(re.sub(r'[0-9]+\(([0-9\.]+)\)', r'\1', placement))
                lineCount += 1
                entry = name + '\t' + str(clade) + '\t' + str(weight) + '\t' + str(tag)
                # entries are separated by (not ended with) a new line
                if lineCount == 1:
                    fw.write(entry)
                else:
                    fw.write('\n' + entry)
            except Exception:
                sys.stderr.write('Cannot parse line: ' + str(lineCount) + ' in file: ' + mothurPredFileName + '\n')
                raise
    except Exception:
        sys.stderr.write("Cannot write to file:" + outPredFileName + '\n')
        raise
    finally:
        # the input is closed even if the output was not opened or cannot be closed
        try:
            if fw is not None:
                fw.close()
        finally:
            fr.close()
def readPPSOutput(sequences, taxonomy, inputFastaIdsPPSFile, overwriteAllPlacements=False):
    """
        Reads the output file of PPS (inputFastaIdsPPSFile + ".out") and for each sequence decides:
        if overwriteAllPlacements=True is, then the sequence is placed according to the PPS file regardless of
        its previous placement
        if overwriteAllPlacements=False then if a sequence is placed to a less specific rank (its taxonomy path
        has fewer entries), than PPS suggests then the sequence is placed according to the PPS file

        Only lines that start with "<scaffoldId>_<contigId>" and contain a further number are considered, the
        last number of a line is taken as ncbid. Predictions with ncbid 1 and predictions for which the
        taxonomy returns no (or an empty) path are skipped. The weight of the placements is None.

        @param sequences: object that offers getSequence, setTaxonomyPath and setTaxonomyPathOverride
        @param taxonomy: object that offers getPathToRoot(ncbid)
        @param inputFastaIdsPPSFile: path of the PPS file without the ".out" suffix
        @param overwriteAllPlacements: whether PPS placements overwrite all previous placements
        @raise Exception: an error raised while opening the file is re-raised
    """
    infile = str(inputFastaIdsPPSFile + ".out")
    # one pattern with three groups instead of three substitutions with the same pattern
    linePattern = re.compile(r"^([0-9]+)_([0-9]+).*[^0-9]+([0-9]+)[^0-9]*$")
    try:
        f = open(os.path.normpath(infile), "r")
    except Exception:
        print("Cannot open file: %s" % (infile,))
        raise
    # the file is closed only if it was opened, so an open error is not masked
    try:
        for line in f:
            match = linePattern.match(common.noNewLine(line))
            if match is None:
                continue
            scaffoldId = int(match.group(1))
            contigId = int(match.group(2))
            ncbid = int(match.group(3))
            weight = None  # the weight is not yet defined !!!
            if ncbid == 1:
                continue
            taxPathDictPPS = taxonomy.getPathToRoot(ncbid)
            if taxPathDictPPS is None or len(taxPathDictPPS) < 1:
                continue
            taxPathDictCurrent = sequences.getSequence(contigId).getTaxonomyPath()
            if taxPathDictCurrent is None:
                sequences.setTaxonomyPath(contigId, scaffoldId, taxPathDictPPS, weight)
            elif overwriteAllPlacements or len(taxPathDictPPS) > len(taxPathDictCurrent):
                # the lengths of the paths are compared (not the lists of keys)
                sequences.setTaxonomyPathOverride(contigId, scaffoldId, taxPathDictPPS, weight)
    finally:
        f.close()
def writePlacementsOut(self, outFile, taxaRanks, outputFileContigSubPattern):
    """
        Writes the placements of self.sequences to a file, one line "<sequence id> \t <ncbid>" per sequence.

        For each sequence the ranks are followed in the given order as long as they are contained in the
        taxonomy path of the sequence; the ncbid of the last rank reached is written. Sequences without
        such a rank (ncbid stays 1) are skipped. The file starts with a header line and does not end with
        a new line character.

        @param outFile: output file
        @param taxaRanks: list of ranks ordered from the most general one
        @param outputFileContigSubPattern: regular expression whose first group of the sequence name is written
        @raise Exception: an error raised while opening or writing the file is re-raised
    """
    f = None  # stays None if the file cannot be opened, so that the cleanup does not mask the error
    try:
        f = open(os.path.normpath(outFile), 'w')
        f.write('# SEQUENCEID TAXID')
        for seq in self.sequences:
            taxPathDict = seq.getTaxonomyPath()
            ncbid = 1
            if taxPathDict is not None:
                for rank in taxaRanks:
                    if rank not in taxPathDict:
                        break
                    ncbid = taxPathDict[rank].ncbid
            if ncbid == 1:
                continue
            entry = noNewLine(re.sub(outputFileContigSubPattern, r'\1', seq.name)) + '\t' + str(ncbid)
            f.write('\n' + entry)
    except Exception:
        print("Cannot create a file or write to it: %s" % (outFile,))
        raise
    finally:
        if f is not None:
            f.close()
def parse(self, record):
    """
        Appends the pair (record id, record sequence) to self._seqToList; both entries are converted to
        strings and the sequence is passed through noNewLine.

        @param record: object with attributes id and seq
    """
    seqId = str(record.id)
    seqText = noNewLine(str(record.seq))
    self._seqToList.append((seqId, seqText))
def ppsOutToPPOut(ppsOutFile, outPPOutFile, taxaRanks, taxonomy):
    """
        Transforms the PPS out file to a compatible PPS PP.out file.

        The first tab separated column of each line is taken as the contig name, the number at the end of
        the line as its ncbid; lines without such a number are reported on the standard output and skipped.
        For each contig one line is written: the name, the 'root' column (empty if the taxonomy returns no
        path) and one column per rank with the name of the taxon (empty if the rank is not contained in the
        path or if the entry is a copy). The output does not end with a new line character.

        @param ppsOutFile: PPS output file
        @param outPPOutFile: output file in the PP.out format
        @param taxaRanks: list of ranks that define the columns
        @param taxonomy: object that offers getPathToRoot(ncbid)
        @raise Exception: errors raised while opening or writing the files are re-raised
    """
    print(ppsOutFile)

    # contig file to an ncbid
    contigToNcbid = {}
    try:
        fr = open(os.path.normpath(ppsOutFile), 'r')
    except Exception:
        print("Cannot open file: %s" % (ppsOutFile,))
        raise
    try:
        for line in fr:
            line = common.noNewLine(line)
            contig = re.sub(r'^[ ]*([^\t]+)\t.*$', r'\1', line)
            try:
                ncbid = int(re.sub(r'^.*\t([0-9]+)[ \t]*$', r'\1', line))
            except Exception:
                print('line skipped: %s' % (line,))
                continue
            contigToNcbid[contig] = ncbid
    finally:
        fr.close()  # the input is closed before the output is opened

    fw = None  # stays None if the file cannot be opened, so that the cleanup does not mask the error
    try:
        fw = open(os.path.normpath(outPPOutFile), 'w')
        fw.write('#Translate output to PP.out format from: ' + ppsOutFile + '\n#\n')
        header = str('#ID' + '\t' + 'root')
        for rank in taxaRanks:
            header += str('\t' + rank)
        fw.write(header)

        for contig in contigToNcbid:
            taxPathDict = taxonomy.getPathToRoot(contigToNcbid[contig])
            entry = str('\n' + contig)
            if taxPathDict is None:
                entry += '\t'
            else:
                entry += '\t' + 'root'
            for rank in taxaRanks:
                if (taxPathDict is not None) and (rank in taxPathDict) and (not taxPathDict[rank].isCopy()):
                    entry += str('\t' + taxPathDict[rank].name)
                else:
                    entry += '\t'
            fw.write(entry)
    except Exception:
        print("Cannot create a file or write to it: %s" % (outPPOutFile,))
        raise
    finally:
        if fw is not None:
            fw.close()


def writePlacementsPPOut(self, outFile, taxaRanks, outputFileContigSubPattern):
    """
        Writes the placements of self.sequences to a file in the PP.out format.

        For each sequence one line is written: the first group of outputFileContigSubPattern matched
        against the sequence name, the 'root' column (empty if the sequence has no taxonomy path) and one
        column per rank with the name of the taxon (empty if the rank is not contained in the path or if
        the entry is a copy). The output does not end with a new line character.

        @param outFile: output file
        @param taxaRanks: list of ranks that define the columns
        @param outputFileContigSubPattern: regular expression whose first group of the sequence name is written
        @raise Exception: an error raised while opening or writing the file is re-raised
    """
    f = None  # stays None if the file cannot be opened, so that the cleanup does not mask the error
    try:
        f = open(os.path.normpath(outFile), 'w')
        f.write('#Output of pPPS\n#\n')
        header = str('#ID' + '\t' + 'root')
        for rank in taxaRanks:
            header += str('\t' + rank)
        f.write(header)

        for seq in self.sequences:
            entry = str('\n' + re.sub(outputFileContigSubPattern, r'\1', seq.name))
            taxPathDict = seq.getTaxonomyPath()
            if taxPathDict is None:
                entry += '\t'
            else:
                entry += '\t' + 'root'
            for rank in taxaRanks:
                if (taxPathDict is not None) and (rank in taxPathDict) and (not taxPathDict[rank].isCopy()):
                    entry += str('\t' + taxPathDict[rank].name)
                else:
                    entry += '\t'
            f.write(entry)
    except Exception:
        print("Cannot create a file or write to it: %s" % (outFile,))
        raise
    finally:
        if f is not None:
            f.close()
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir): """ Reads all sequences. For each taxonId creates a file that contain all sequences mapped to this taxonId. If a seqId appears more than one it is ignored since acession numbers are unique. @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq """ taxonIdToOutBuffer = {} seqIdSet = set() totalSeqCount = 0 totalStoredSeqCount = 0 totalIdenticalSeqCount = 0 for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList): print 'processing', mapFilePath, fastaFilePath seqCount = 0 storedSeqCount = 0 seqIdToSeq = fasta.fastaFileToDict(fastaFilePath) seqIdToNcbidList = csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#') for seqId, seq in seqIdToSeq.iteritems(): seqCount += 1 if seqId in seqIdSet: totalIdenticalSeqCount += 1 continue else: seqIdSet.add(seqId) taxonId = seqIdToNcbidList[seqId][0] if taxonId not in taxonIdToOutBuffer: outBuffer = csv.OutFileBuffer( os.path.join(outputDir, str(str(taxonId) + '.fna'))) taxonIdToOutBuffer[taxonId] = outBuffer taxonIdToOutBuffer[taxonId].writeText( str('>' + seqId + '\n' + seq + '\n')) taxonIdToOutBuffer[taxonId].close() storedSeqCount += 1 if len(string.replace(common.noNewLine(seq), 'N', '')) == 0: print 'zeros', seqId, fastaFilePath, len(common.noNewLine(seq)) # for buff in taxonIdToOutBuffer.values(): # buff.close() print 'totalSeq, storedSeq', seqCount, storedSeqCount totalSeqCount += seqCount totalStoredSeqCount += storedSeqCount print 'totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount', totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount print 'sequences merged'