def ppsOut2Placements(ppsOutFile, scafContigFile=None):
    """
    Transforms PPS assignments to a list of pairs <contigName, assigned_ncbid>.

    Every line of the PPS output must start with the contig/scaffold name and
    end with the assigned ncbid, i.e. a non-negative integer or -1 (a trailing
    -1 is stored as ncbid 1). If a name is a key of the scaffold-contig
    mapping, one pair is emitted for each contig of that scaffold instead of
    a pair for the name itself.

    @param ppsOutFile: PPS output file where the first column is the
        contig/scaffold name and the last column is ncbid
    @param scafContigFile: scaffold contig mapping (tab separated), if None
        then all sequences are considered as contigs
    @return: list of pairs [contigName, assigned_ncbid] in file order
    @raise Exception: the original exception is re-raised (after a message is
        printed) if the file cannot be opened or the ncbid of a line cannot
        be parsed
    """
    if scafContigFile is not None:
        scafToContigs = toScafContigMap(scafContigFile)
    else:
        scafToContigs = {}

    # Loop-invariant patterns. When a pattern does not match, sub() returns the
    # line unchanged, which is what makes the int() conversions below fail.
    namePattern = re.compile(r'^([^ \t]+)[ \t]+.*[0-9]+[ \t]*$')
    ncbidPattern = re.compile(r'^[^ \t]+.*[ \t]+([0-9]+)[ \t]*$')
    minusOnePattern = re.compile(r'^[^ \t]+.*[ \t]+(-1)[ \t]*$')

    outList = []
    try:
        f = open(os.path.normpath(ppsOutFile), 'r')
    except Exception:
        # Single-argument print gives the same output on Python 2 and 3.
        print("Cannot open file: %s" % (ppsOutFile,))
        raise

    # The context manager closes the file even if parsing a line fails.
    with f:
        for lineCounter, line in enumerate(f, 1):
            line = common.noNewLine(line)
            name = namePattern.sub(r'\1', line)
            try:
                ncbid = int(ncbidPattern.sub(r'\1', line))
            except ValueError:
                # The last column is not a plain non-negative integer; the only
                # other accepted value is -1, which is stored as abs(-1) == 1.
                try:
                    ncbid = abs(int(minusOnePattern.sub(r'\1', line)))
                except ValueError:
                    print('ppsOut2Placements: cannot parse placement for line nr: %s line: %s'
                          % (lineCounter, line))
                    raise

            if name in scafToContigs:
                # The name denotes a scaffold: place each of its contigs.
                for contig in scafToContigs[name]:
                    outList.append([contig, ncbid])
            else:
                outList.append([name, ncbid])
    return outList
def ssd2Placements(ssdDir, scafContigFile=None):
    """
    Transforms sample specific data to placements.

    All files in ssdDir matching '*.f[an][sa]' are read. The ncbid of a file
    is taken from its name, which is expected to end with
    '<ncbid>.<number>.<extension>'. Every line starting with '>' is treated as
    a sequence header; the sequence name is its first whitespace-delimited
    token with all '>' characters removed, so sequence names are not allowed
    to contain gaps ' '. If a name is a key of the scaffold-contig mapping, it
    is replaced by the contigs of that scaffold.

    A contig is placed only once: the first occurrence encountered is kept and
    a message is printed for each later occurrence.

    @param ssdDir: directory that contains sample specific data
    @param scafContigFile: scaffold contig mapping (tab separated), if None
        then all sequences are considered as contigs
    @return: list of pairs [contigName, assigned_ncbid]
    @raise ValueError: if no ncbid can be parsed from a matching file name
    @raise Exception: the original exception is re-raised (after a message is
        printed) if a file cannot be opened
    """
    # collect map: scaffold -> list of contigs
    if scafContigFile is not None:
        scafToContigs = toScafContigMap(scafContigFile)
    else:
        scafToContigs = {}

    # Loop-invariant patterns. When a pattern does not match, sub() returns
    # its input unchanged.
    ncbidPattern = re.compile(r'^.*[^0-9]([0-9]+)\.[0-9]+\.f[an][sa]$')
    namePattern = re.compile(r'^([^ \t]+)[ \t]*.*$')

    outList = []
    placedContigs = set()
    for filePath in glob.glob(os.path.join(os.path.normpath(ssdDir), r'*.f[an][sa]')):
        ncbid = int(ncbidPattern.sub(r'\1', filePath))
        try:
            f = open(os.path.normpath(filePath), 'r')
        except Exception:
            # Single-argument print gives the same output on Python 2 and 3.
            print("Cannot open file: %s" % (filePath,))
            raise

        # Close each file as soon as it has been read, so the number of open
        # handles does not grow with the number of files.
        with f:
            for line in f:
                line = common.noNewLine(line)
                if not line.startswith('>'):
                    continue  # only header lines carry sequence names
                name = namePattern.sub(r'\1', line.replace('>', ''))
                if name in scafToContigs:
                    contigsList = scafToContigs[name]
                else:
                    contigsList = [name]
                for contig in contigsList:
                    if contig in placedContigs:
                        print('contig "' + contig + '" has already been placed')
                    else:
                        placedContigs.add(contig)
                        outList.append([contig, ncbid])
                # TODO: count also BP for each contig
    return outList