def run(self):
    """Summarize one sample's statistics across all sampling sizes.

    Loads every gzip-compressed pickle listed by ``iseqlib.getfiles`` for
    ``self.indir``; the base name of each file is parsed as the integer
    sampling size.  Two outputs are produced:

    * ``<self.outdir>/<self.name>.txt`` -- a tab-separated table whose
      header is ``Index`` followed by a ``<size>``/``Std`` column pair per
      sampling size (ascending), and whose rows are the metrics in
      ``self.metrics`` with ``stats[metric]`` and ``stats[metric + "Std"]``.
    * ``<self.tempOutdir>/<self.name>.pickle`` -- the gzip-compressed pickle
      of the ``size -> stats`` dictionary.

    Raises:
        ValueError: if a file's base name cannot be parsed as an integer.
    """
    ext = 'pickle'
    sizes = []
    size2stats = {}  # key = sampling size, val = averaged stats
    for filename in iseqlib.getfiles(self.indir, ext):
        # str.rstrip() strips a character set, not a suffix: the first call
        # stops at the '.' separating base name and extension, the second
        # call then removes that separator.
        size = int(filename.rstrip(ext).rstrip('.'))
        sizes.append(size)
        # NOTE(review): pickle executes arbitrary code on load; these files
        # are presumably written by an earlier pipeline step -- confirm.
        # The handle is closed explicitly instead of being left to the
        # garbage collector.
        with gzip.open(os.path.join(self.indir, filename), "rb") as handle:
            size2stats[size] = pickle.load(handle)
    sizes.sort()

    # Output summary file of the sample.
    metrics = self.metrics
    outfile = os.path.join(self.outdir, "%s.txt" % self.name)
    with open(outfile, 'w') as f:
        f.write("Index")
        for size in sizes:
            f.write("\t%d\tStd" % size)
        for metric in metrics:
            f.write("\n%s" % metric)
            for size in sizes:
                stats = size2stats[size]
                f.write("\t%f\t%f" % (stats[metric], stats[metric + "Std"]))
        f.write("\n")

    # Pickle size2stats to the temporary output directory.
    picklefile = os.path.join(self.tempOutdir, "%s.pickle" % self.name)
    with gzip.open(picklefile, "wb") as handle:
        pickle.dump(size2stats, handle)
def run(self):
    """Schedule the sampling (or direct analysis) of every pickled sample.

    For each ``*.pickle`` file listed by ``iseqlib.getfiles`` for
    ``self.indir`` a directory ``<globalTempDir>/<sample>`` is created and:

    * if ``self.options.sampling`` is set, ``self.options.numsam``
      ``Sampling`` child targets are added, each with its own numbered
      sub-directory ``<sample>/<i>``;
    * otherwise the sample is optionally filtered in place by the selected
      V and/or J genes (``self.options.vs`` / ``self.options.js``) and one
      ``Analyses`` child target is added with output directory
      ``<sample>/0``.

    ``AverageResults`` is registered as the follow-on target.
    """
    globalTempDir = self.getGlobalTempDir()
    ext = "pickle"
    files = iseqlib.getfiles(self.indir, ext)
    # Strip only the final extension: taking the text before the FIRST dot
    # would truncate sample names that themselves contain dots, and the
    # path rebuilt below would then point at a non-existent file.
    samples = [os.path.splitext(filename)[0] for filename in files]

    for sample in samples:
        samplefile = os.path.join(self.indir, "%s.%s" % (sample, ext))
        sampledir = os.path.join(globalTempDir, sample)
        system("mkdir -p %s" % sampledir)

        if self.options.sampling:
            # Sampling a number of times, one directory per repetition.
            for i in range(self.options.numsam):
                samplingdir = os.path.join(sampledir, "%d" % i)
                system("mkdir -p %s" % samplingdir)
                self.addChildTarget(
                    Sampling(samplefile, samplingdir, self.options))
        else:
            tempoutdir = os.path.join(sampledir, "0")
            system("mkdir -p %s" % tempoutdir)
            # Filtering if selected Vs and/or selected Js were specified.
            if self.options.vs or self.options.js:
                # NOTE(review): pickle executes arbitrary code on load;
                # presumably these files come from an earlier pipeline
                # step -- confirm.
                with gzip.open(samplefile, "rb") as handle:
                    sampleObj = pickle.load(handle)
                subsample = iseqlib.filterSampleByGenes(
                    sampleObj, self.options.vs, self.options.js)
                # Remove without going through a shell so that paths with
                # spaces or shell metacharacters are handled correctly.
                os.remove(samplefile)
                with gzip.open(samplefile, "wb") as handle:
                    pickle.dump(subsample, handle)
            self.addChildTarget(
                Analyses(samplefile, tempoutdir, self.options))

    # Calculate means & standard deviations of samplings.
    self.setFollowOnTarget(AverageResults(globalTempDir, self.options))
def run(self):
    """Add one ReadFasta child per input fasta file, then chain the analyses.

    The ``fa`` files listed by ``iseqlib.getfiles`` for
    ``self.options.indir`` are each handed to a ``ReadFasta`` child target
    together with the global temp directory and
    ``self.options.minReadCount``.  ``SamplingAndAnalyses`` is registered as
    the follow-on target.
    """
    fasta_names = iseqlib.getfiles(self.options.indir, 'fa')
    tempdir = self.getGlobalTempDir()

    for fasta_name in fasta_names:
        fasta_path = os.path.join(self.options.indir, fasta_name)
        child = ReadFasta(fasta_path, tempdir, self.options.minReadCount)
        self.addChildTarget(child)

    follow_on = SamplingAndAnalyses(tempdir, self.options)
    self.setFollowOnTarget(follow_on)
def run(self):
    """Schedule the single-sample (diversity) analyses of every sample.

    Creates ``<self.options.outdir>/diversity``, adds one
    ``SampleSingleAnalyses`` child target per ``*.pickle`` file listed by
    ``iseqlib.getfiles`` for ``self.samdir`` and registers ``SummarySingle``
    as the follow-on target.
    """
    diversity_dir = os.path.join(self.options.outdir, "diversity")
    system("mkdir -p %s" % diversity_dir)
    tempdir = self.getGlobalTempDir()

    suffix = 'pickle'
    for picklename in iseqlib.getfiles(self.samdir, suffix):
        # Each sample: drop the extension characters, then the separator dot.
        samplename = picklename.rstrip(suffix).rstrip('.')
        child = SampleSingleAnalyses(
            tempdir, samplename, self.samdir, self.options, diversity_dir)
        self.addChildTarget(child)

    summary = SummarySingle(
        tempdir, diversity_dir, self.options.diversityIndices)
    self.setFollowOnTarget(summary)
def filterSamples(indir, vs, js):
    """Filter every pickled sample in *indir* by selected V and/or J genes.

    Each gzip-compressed ``*.pickle`` file listed by ``iseqlib.getfiles`` is
    loaded, passed through ``iseqlib.filterSampleByGenes`` and written back
    to the same path.  Nothing is done when both *vs* and *js* are empty.

    Args:
        indir: directory containing the pickled samples.
        vs: selected V genes; a falsy value means no V selection.
        js: selected J genes; a falsy value means no J selection.

    Returns:
        None.
    """
    if not vs and not js:
        return

    ext = 'pickle'
    for filename in iseqlib.getfiles(indir, ext):
        filepath = os.path.join(indir, filename)
        # NOTE(review): pickle executes arbitrary code on load; presumably
        # these files are produced by the pipeline itself -- confirm.
        with gzip.open(filepath, "rb") as handle:
            sample = pickle.load(handle)
        subsample = iseqlib.filterSampleByGenes(sample, vs, js)
        # Remove without going through a shell so that paths with spaces or
        # shell metacharacters are handled correctly.
        os.remove(filepath)
        with gzip.open(filepath, "wb") as handle:
            pickle.dump(subsample, handle)
    return
def run(self):
    """Add one Sampling child per pickled sample, then chain the analyses.

    For each ``*.pickle`` file listed by ``iseqlib.getfiles`` for
    ``self.indir`` a ``Sampling`` child target is added that receives the
    input path and ``<globalTempDir>/<sample>.pickle`` as its output path.
    ``Analyses`` is registered as the follow-on target.
    """
    tempdir = self.getGlobalTempDir()
    ext = "pickle"
    picklenames = iseqlib.getfiles(self.indir, ext)

    # Sample name = file name without its last dot-separated component.
    names = []
    for picklename in picklenames:
        pieces = picklename.split('.')
        names.append('.'.join(pieces[:-1]))

    for name in names:
        source = os.path.join(self.indir, "%s.%s" % (name, ext))
        target = os.path.join(tempdir, "%s.pickle" % name)  # temp/sample.pickle
        self.addChildTarget(Sampling(source, target, self.options))

    follow_on = Analyses(tempdir, self.outdir, self.options)
    self.setFollowOnTarget(follow_on)
def run(self):
    """Write the summary tables of the single-sample statistics.

    Loads one gzip-compressed ``size -> stats`` pickle per sample from
    ``self.indir`` (the sample name is the file's base name) and writes two
    families of tab-separated files into ``self.outdir``:

    * ``<metric>.txt`` for every metric in ``self.metrics``: one row per
      sample, one ``<size>``/``Std`` column pair per sampling size;
    * ``<size>.txt`` for every sampling size seen: one row per sample, one
      ``<metric>``/``Std`` column pair per metric.

    A sample that has no statistics for a given size gets ``NA`` in both
    cells.  Rows follow the iteration order of the internal dictionary.
    """
    ext = 'pickle'
    sample2size2stats = {}
    seen_sizes = set()  # a set keeps the de-duplication linear
    for filename in iseqlib.getfiles(self.indir, ext):
        # str.rstrip() strips a character set, not a suffix: the first call
        # stops at the '.' before the extension, the second removes it.
        name = filename.rstrip(ext).rstrip('.')
        # NOTE(review): pickle executes arbitrary code on load; presumably
        # these files are produced by the pipeline itself -- confirm.
        with gzip.open(os.path.join(self.indir, filename), "rb") as handle:
            size2stats = pickle.load(handle)
        seen_sizes.update(size2stats)
        sample2size2stats[name] = size2stats
    sizes = sorted(seen_sizes)

    metrics = self.metrics

    # Summary of each statistic (one file per statistic, where
    # row = sample and columns = sampling sizes).
    for metric in metrics:
        std_key = metric + "Std"
        outfile = os.path.join(self.outdir, "%s.txt" % metric)
        with open(outfile, 'w') as f:
            f.write("Sample")
            for size in sizes:
                f.write("\t%d\tStd" % size)
            # items() instead of iteritems(): works on Python 2 and 3.
            for sample, size2stats in sample2size2stats.items():
                f.write("\n%s" % sample)
                for size in sizes:
                    if size in size2stats:
                        stats = size2stats[size]
                        f.write("\t%f\t%f" % (stats[metric], stats[std_key]))
                    else:
                        f.write("\tNA\tNA")
            f.write("\n")

    # Summary of all statistics for each sampling size (one file per
    # sampling size, where row = sample and columns = statistics).
    for size in sizes:
        outfile = os.path.join(self.outdir, "%d.txt" % size)
        with open(outfile, 'w') as f:
            f.write("Sample")
            for metric in metrics:
                f.write("\t%s\tStd" % metric)
            for sample, size2stats in sample2size2stats.items():
                f.write("\n%s" % sample)
                for metric in metrics:
                    if size in size2stats:
                        stats = size2stats[size]
                        f.write("\t%f\t%f"
                                % (stats[metric], stats[metric + "Std"]))
                    else:
                        f.write("\tNA\tNA")
            f.write("\n")
def run(self):
    """Add one ReadFasta child per input fasta file, then chain the analyses.

    Every ``fa`` file listed by ``iseqlib.getfiles`` for
    ``self.options.indir`` is handed to a ``ReadFasta`` child target
    together with the global temp directory and the sample name (the file
    name without its extension).  ``Analyses`` is registered as the
    follow-on target.
    """
    suffix = 'fa'
    fasta_names = iseqlib.getfiles(self.options.indir, suffix)
    tempdir = self.getGlobalTempDir()

    # Read input fasta files; the children receive globalTempDir as target.
    for fasta_name in fasta_names:
        # Drop the extension characters, then the separator dot.
        without_ext = fasta_name.rstrip(suffix)
        sample_name = without_ext.rstrip('.')
        fasta_path = os.path.join(self.options.indir, fasta_name)
        self.addChildTarget(ReadFasta(fasta_path, tempdir, sample_name))

    # After the fastas have been read, move on to the analyses.
    self.setFollowOnTarget(Analyses(tempdir, self.options))
def run(self):
    """Schedule the pairwise (similarity) analyses of all sample pairs.

    Creates ``<self.options.outdir>/similarity``, adds one
    ``SamplePairAnalyses`` child target for every unordered pair of samples
    found as ``*.pickle`` files in ``self.samdir`` and registers
    ``SummaryPair`` as the follow-on target.
    """
    similarity_dir = os.path.join(self.options.outdir, 'similarity')
    system("mkdir -p %s" % similarity_dir)
    tempdir = self.getGlobalTempDir()

    suffix = 'pickle'
    picklenames = iseqlib.getfiles(self.samdir, suffix)
    names = []
    for picklename in picklenames:
        # Drop the extension characters, then the separator dot.
        names.append(picklename.rstrip(suffix).rstrip('.'))

    # Pair every sample with each sample that comes after it in the list.
    for position, first in enumerate(names):
        for second in names[position + 1:]:
            child = SamplePairAnalyses(
                tempdir, first, second, self.samdir, self.options,
                similarity_dir)
            self.addChildTarget(child)

    summary = SummaryPair(
        tempdir, similarity_dir, self.options.similarityIndices)
    self.setFollowOnTarget(summary)
def run(self):
    """Average the pair statistics of all samplings of one sampling size.

    Loads every gzip-compressed pickle listed by ``iseqlib.getfiles`` for
    ``self.indir``, computes for each metric in ``self.metrics`` the mean
    and the standard deviation over the loaded stats objects, stores them
    in a fresh ``PairSamplingStats`` under ``metric`` and ``metric + "Std"``
    and pickles it, gzip-compressed, to
    ``<self.outdir>/<self.samplingsize>.pickle``.
    """
    ext = 'pickle'
    statsList = []
    for filename in iseqlib.getfiles(self.indir, ext):
        # NOTE(review): pickle executes arbitrary code on load; presumably
        # these files are produced by the pipeline itself -- confirm.
        # The handle is closed explicitly instead of being left to the
        # garbage collector.
        with gzip.open(os.path.join(self.indir, filename), "rb") as handle:
            statsList.append(pickle.load(handle))

    avrstats = PairSamplingStats()  # initialize avrstats

    # Calculate mean and std using numpy; np.std is called with its default
    # ddof=0, i.e. the population standard deviation.
    for metric in self.metrics:
        vals = [stats[metric] for stats in statsList]
        avrstats[metric] = np.mean(vals)
        avrstats[metric + "Std"] = np.std(vals)

    # Pickle the average stats of this sampling size.
    picklefile = os.path.join(self.outdir, "%d.pickle" % self.samplingsize)
    with gzip.open(picklefile, "wb") as handle:
        pickle.dump(avrstats, handle)
def run(self):
    """Average the single-sample statistics of all samplings of one size.

    Loads every gzip-compressed pickle listed by ``iseqlib.getfiles`` for
    ``self.indir``, computes for each metric in ``self.metrics`` the mean
    and the standard deviation over the loaded stats objects, stores them
    in a fresh ``SingleSamplingStats`` under ``metric`` and
    ``metric + "Std"`` and pickles it, gzip-compressed, to
    ``<self.outdir>/<self.samplingsize>.pickle``.
    """
    ext = 'pickle'
    statsList = []
    for filename in iseqlib.getfiles(self.indir, ext):
        # NOTE(review): pickle executes arbitrary code on load; presumably
        # these files are produced by the pipeline itself -- confirm.
        # The handle is closed explicitly instead of being left to the
        # garbage collector.
        with gzip.open(os.path.join(self.indir, filename), "rb") as handle:
            statsList.append(pickle.load(handle))

    avrstats = SingleSamplingStats()  # initialize avrstats

    # Calculate mean and std using numpy; np.std is called with its default
    # ddof=0, i.e. the population standard deviation.
    for metric in self.metrics:
        vals = [stats[metric] for stats in statsList]
        avrstats[metric] = np.mean(vals)
        avrstats[metric + "Std"] = np.std(vals)

    # Pickle the average stats of this sampling size.
    picklefile = os.path.join(self.outdir, "%d.pickle" % self.samplingsize)
    with gzip.open(picklefile, "wb") as handle:
        pickle.dump(avrstats, handle)
def run(self):
    """Write the pairwise summary matrices of the sample-pair statistics.

    Loads one gzip-compressed ``size -> stats`` pickle per sample pair from
    ``self.indir``; the file's base name is split on ``-`` and its first two
    fields are taken as the names of the paired samples.  For every metric
    in ``self.metrics`` and every sampling size seen, a tab-separated file
    ``<self.outdir>/<metric>-<size>.txt`` is written: rows and column pairs
    (``<sample>``/``Std``) are the sorted sample names, and each cell pair
    holds ``stats[metric]`` and ``stats[metric + "Std"]`` of that pair.
    The diagonal, unknown pairs and pairs lacking the sampling size are
    written as ``-``.

    Raises:
        IndexError: if a file's base name contains no ``-``.
    """
    ext = 'pickle'
    sample2mate2size2stats = {}
    seen_sizes = set()  # a set keeps the de-duplication linear
    for filename in iseqlib.getfiles(self.indir, ext):
        # str.rstrip() strips a character set, not a suffix: the first call
        # stops at the '.' before the extension, the second removes it.
        name = filename.rstrip(ext).rstrip('.')
        # NOTE(review): a sample name that itself contains '-' would be
        # split in the wrong place -- confirm the naming convention.
        pair = name.split('-')
        s1 = pair[0]
        s2 = pair[1]
        # NOTE(review): pickle executes arbitrary code on load; presumably
        # these files are produced by the pipeline itself -- confirm.
        with gzip.open(os.path.join(self.indir, filename), "rb") as handle:
            size2stats = pickle.load(handle)
        seen_sizes.update(size2stats)
        # Register the statistics in both directions so that the output
        # matrix is symmetric.
        sample2mate2size2stats.setdefault(s1, {})[s2] = size2stats
        sample2mate2size2stats.setdefault(s2, {})[s1] = size2stats
    sizes = sorted(seen_sizes)

    # Summary of each statistic (one file per statistic and sampling size,
    # where rows = samples and columns = samples).
    samples = sorted(sample2mate2size2stats)
    for metric in self.metrics:
        std_key = metric + "Std"
        for size in sizes:
            outfile = os.path.join(self.outdir, "%s-%d.txt" % (metric, size))
            with open(outfile, 'w') as f:
                f.write("Sample")
                for sample in samples:
                    f.write("\t%s\tStd" % sample)
                for sample in samples:
                    f.write("\n%s" % sample)
                    mate2size2stats = sample2mate2size2stats[sample]
                    for mate in samples:
                        # The diagonal is always blank, even if a
                        # self-pair file happened to exist.
                        size2stats = None
                        if mate != sample:
                            size2stats = mate2size2stats.get(mate)
                        if size2stats is not None and size in size2stats:
                            stat = size2stats[size]
                            f.write("\t%f\t%f" % (stat[metric], stat[std_key]))
                        else:
                            f.write("\t-\t-")
                f.write("\n")