def spliceFromCoords( genomeFile, coordFile, obsFile ):
    """
    Copy every sequence of 'genomeFile' into 'obsFile', removing the regions listed in 'coordFile'.

    Sequences whose header has no entry in the coordinate file are written unchanged.
    For the others, each region [getMin(), getMax()] is cut out; the slicing below
    treats these bounds as 1-based, inclusive positions.

    @param genomeFile: name of the input file, read sequence by sequence via Bioseq.read()
    @param coordFile: name of the file given to MapUtils.getDictPerSeqNameFromMapFile()
    @param obsFile: name of the output file (opened in "w" mode, hence overwritten)
    """
    # 'with' guarantees that both handlers are closed even if reading, parsing or writing fails
    with open( genomeFile, "r" ) as genomeFileHandler, open( obsFile, "w" ) as obsFileHandler:
        dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )

        while True:
            bs = Bioseq()
            bs.read( genomeFileHandler )
            if bs.sequence is None:
                break
            if bs.header in dChr2Maps:
                lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
                lKeptChunks = []
                currentSite = 0   # 0-based index of the first base not yet handled
                for iMap in lCoords:
                    minSplice = iMap.getMin() - 1
                    if minSplice > currentSite:
                        lKeptChunks.append( bs.sequence[ currentSite : minSplice ] )
                    # never move backwards: a region nested in a previous one (e.g. [10,20]
                    # after [1,100]) must not re-expose bases that were already spliced out
                    currentSite = max( currentSite, iMap.getMax() )
                lKeptChunks.append( bs.sequence[ currentSite : ] )
                bs.sequence = "".join( lKeptChunks )
            bs.write( obsFileHandler )
def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
    """
    Write the sequences of 'inFileName' into 'outFileName', sorted by increasing length.

    Each sequence is first saved in its own temporary file in the current working
    directory, named '<length>bp_<rank>nb'. The temporary files are then concatenated
    into the output file by increasing length and removed. Sequences of equal length
    keep their input order. If the input contains no sequence, no output file is created.

    @param inFileName: name of the input file, read sequence by sequence via Bioseq.read()
    @param outFileName: name of the output file (removed first if it already exists)
    @param verbose: verbosity level (>0: progress message, >1: one line per sequence)
    @return: 0 on success
    @note: calls sys.exit(1) if the input file doesn't exist or if the concatenation fails
    """
    # local import: the module-level imports of this file are not visible from this function
    import shutil

    if verbose > 0:
        print( "sort sequences by increasing length" )
        sys.stdout.flush()
    if not os.path.exists( inFileName ):
        print( "ERROR: file '%s' doesn't exist" % ( inFileName ) )
        sys.exit(1)

    # read each seq one by one
    # save them in distinct temporary files
    # with their length in the name
    # The temporary files are recorded here rather than found afterwards with a glob,
    # so that unrelated files matching '*bp_*nb' are neither concatenated nor deleted.
    lTmpFiles = []   # list of ( length, temporary file name )
    bs = Bioseq()
    countSeq = 0
    with open( inFileName, "r" ) as inFileHandler:
        while True:
            bs.read( inFileHandler )
            if bs.header is None:
                break
            countSeq += 1
            seqLength = bs.getLength()
            tmpFile = "%ibp_%inb" % ( seqLength, countSeq )
            # appendBioseqInFile() appends: drop any stale file left by a previous run
            if os.path.exists( tmpFile ):
                os.remove( tmpFile )
            bs.appendBioseqInFile( tmpFile )
            lTmpFiles.append( ( seqLength, tmpFile ) )
            if verbose > 1:
                print( "%s (%i bp) saved in '%s'" % ( bs.header, seqLength, tmpFile ) )
            bs.header = ""
            bs.sequence = ""

    # sort temporary files by sequence length (stable sort)
    # concatenate them into the output file
    if os.path.exists( outFileName ):
        os.remove( outFileName )
    lTmpFiles.sort( key=lambda lengthAndName: lengthAndName[0] )
    for seqLength, tmpFile in lTmpFiles:
        # plain file copy instead of a shell 'cat': safe with spaces and shell
        # metacharacters in file names, and doesn't depend on an external program
        try:
            with open( tmpFile, "rb" ) as tmpFileHandler, open( outFileName, "ab" ) as outFileHandler:
                shutil.copyfileobj( tmpFileHandler, outFileHandler )
        except ( IOError, OSError ):
            print( "ERROR while concatenating '%s' with '%s'" % ( tmpFile, outFileName ) )
            sys.exit(1)
        os.remove( tmpFile )

    return 0
def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 ):
    """
    Build a consensus from the aligned sequences, with at most one symbol per site (column).

    For each site, the most frequent non-gap symbol is chosen (the first one met in
    case of a tie). The site is kept only if it has at least 'minNbNt' non-gap
    symbols. Sites without any non-gap symbol are always discarded and reported in
    a warning.

    @param minNbNt: minimum number of non-gap symbols required to keep a site;
                    lowered to (nb of sequences - 1) when greater or equal to the nb of sequences
    @param minPropNt: if > 0, a site gets "N" when its most frequent symbol represents
                      less than this proportion of the non-gap symbols (must be below 1.0)
    @param verbose: verbosity level (>0: summary, >1: details per site)
    @return: Bioseq instance with the consensus, or None if no consensus can be built
             (no site kept, or proportion of N's >= 40%)
    @note: calls sys.exit(1) if there are less than 2 sequences or if minPropNt >= 1.0
    """
    maxPropN = 0.40  # discard consensus if more than 40% of N's

    nbInSeq = self.getSize()
    if verbose > 0:
        print( "nb of aligned sequences: %i" % ( nbInSeq ) )
        sys.stdout.flush()
    if nbInSeq < 2:
        print( "ERROR: can't make a consensus with less than 2 sequences" )
        sys.exit(1)
    if minNbNt >= nbInSeq:
        minNbNt = nbInSeq - 1
        print( "minNbNt=%i" % ( minNbNt ) )
    if minPropNt >= 1.0:
        print( "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt ) )
        sys.exit(1)

    lOccPerSite = self.getListOccPerSite()
    nbSites = len(lOccPerSite)
    if verbose > 0:
        print( "nb of sites: %i" % ( nbSites ) )
        sys.stdout.flush()

    lConsensus = []   # one symbol per kept site, joined at the end
    nbRmvColumns = 0

    # for each site (i.e. each column of the MSA)
    for countSites, dNt2Occ in enumerate( lOccPerSite, 1 ):
        if verbose > 1:
            print( "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ), nbSites ) )
            sys.stdout.flush()

        occMaxNt = 0   # occurrences of the predominant nucleotide at this site
        lBestNt = []
        nbNt = 0   # total nb of non-gap symbols at this site

        # for each distinct symbol at this site (A, T, G, C, N, -,...)
        for symbol, occ in dNt2Occ.items():
            if symbol == "-":
                continue
            nbNt += occ
            if verbose > 1:
                print( "%s: %i" % ( symbol, occ ) )
            if occ > occMaxNt:
                occMaxNt = occ
                lBestNt = [ symbol ]
            elif occ == occMaxNt:
                lBestNt.append( symbol )

        if nbNt == 0:
            # some MSA programs can remove some sequences (e.g. Muscle after Recon)
            # or when using Refalign (non-alignable TE fragments put together via a refseq)
            # Such a site has no nucleotide to report: skip it, otherwise the symbol
            # chosen for the previous site (or an unbound name) would be appended.
            nbRmvColumns += 1
            continue

        bestNt = lBestNt[0]

        # if the predominant nucleotide occurs in less than x% of the sequences, put a "N"
        if minPropNt > 0.0 and float(occMaxNt)/float(nbNt) < minPropNt:
            bestNt = "N"

        if int(nbNt) >= int(minNbNt):
            lConsensus.append( bestNt )
            if verbose > 1:
                print( "-> %s" % ( bestNt ) )

    seqConsensus = "".join( lConsensus )

    if nbRmvColumns:
        print( "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 ) )
        sys.stdout.flush()
    # checked whatever the reason why no site was kept, so that the proportion
    # of N's below is never computed on an empty consensus (division by zero)
    if seqConsensus == "":
        print( "WARNING: no consensus can be built (no sequence left)" )
        return None

    propN = seqConsensus.count("N") / float(len(seqConsensus))
    if propN >= maxPropN:
        print( "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 ) )
        return None
    elif propN >= maxPropN * 0.5:
        print( "WARNING: %i%% of N's" % ( propN * 100 ) )

    consensus = Bioseq()
    consensus.sequence = seqConsensus
    consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )

    if verbose > 0:
        statEntropy = self.getEntropy( verbose - 1 )
        print( "entropy: %s" % ( statEntropy.stringQuantiles() ) )
        sys.stdout.flush()

    return consensus