def subsampling_BowTie_n_inhouse(file_iter, di_method, df_output):
    """
    Run subsampling for BowTie+inhouse (inhouse must already have been
    pre-processed for eligibility).

    For every (sample, file) pair and every size in SUBSAMPLE_SIZES, draw a
    subsample split proportionally between the BowTie-eligible and the
    inhouse-eligible read pools, accumulate the counts into one 5x520 vector,
    write the resulting DF to <df_output>, and print one CSV line per
    subsample to stdout:

        <sample>,<size>,<comma-separated DI>

    Relies on module-level settings: refmap, phred_cutoff, min_length,
    max_degen, ecoli_lo, ecoli_hi, SUBSAMPLE_SIZES, valid_DI_pos,
    inhouse_eligible (a filename template) and L2 (presumably a position
    mapping indexed by ecoli_lo/ecoli_hi -- TODO confirm).
    """
    runner = DiversityIndexRunner()
    # 5 rows x 520 positions count matrix, reused across all subsamples
    seqvec = np.zeros((5, 520), dtype=np.int)
    print >> sys.stderr, "DFs will be written to {0}....".format(df_output)
    h = open(df_output, 'w')
    try:  # FIX: guarantee the DF output handle is closed even if a sample fails midway
        w = DF.DFWriter(h)
        for sample, file in file_iter:
            eligible_bowtie = hello.subsample_reads_BowTie_prepare(
                    file, refmap, phred_cutoff, min_length, ecoli_lo, ecoli_hi)
            # FIX: close the pickle handle explicitly instead of leaking it
            pickle_f = open(inhouse_eligible.format(sample, phred_cutoff, min_length,
                    max_degen, L2[ecoli_lo], L2[ecoli_hi]))
            try:
                eligible_inhouse = load(pickle_f)
            finally:
                pickle_f.close()
            print >> sys.stderr, "eligible reads for {0}: bowtie {1}, inhouse {2}".format(sample, \
                    len(eligible_bowtie), len(eligible_inhouse))
            # fraction of each subsample drawn from the BowTie-aligned pool
            p = len(eligible_bowtie) * 1. / (len(eligible_bowtie) + len(eligible_inhouse))
            for size in SUBSAMPLE_SIZES:
                print >> sys.stderr, sample, size
                seqvec[:] = 0  # reset the shared buffer; both subsamplers accumulate into it
                hello.subsample_reads_BowTie(file, refmap, seqvec, eligible_bowtie,
                        int(size * p))
                hello.subsample_reads_inhouse(refmap, seqvec, eligible_inhouse,
                        phred_cutoff, min_length, size - int(size * p))
                df = Read.ReadDF(sample, refmap)
                df.len = 520
                df.assign_vec(seqvec)
                df.add_annotation('size', size)
                w.write(df)
                di = runner.run(df, method=di_method, threshold=0,
                        vec_pre_normalized=False, ignoreN=True)[valid_DI_pos]
                print("{0},{1},{2}".format(sample, size, ",".join(map(str, di))))
    finally:
        h.close()
def main(file_iter, output_df_filename, log_f):
    """
    Gather in-house-aligned reads for each sample, write the per-sample DF
    to <output_df_filename>, and print Simpson diversity indices to stdout
    as CSV:

        <sample>,<comma-separated DI>

    The run parameters (phred_cutoff, min_length, max_degen, ecoli_lo,
    ecoli_hi) come from module level and are echoed to log_f for the record.
    """
    log_f.write("phred cutoff:{0}\n".format(phred_cutoff))
    log_f.write("min length:{0}\n".format(min_length))
    log_f.write("max degen (if used):{0}\n".format(max_degen))
    log_f.write("use ecoli range {0}-{1}\n".format(ecoli_lo, ecoli_hi))
    f = open(output_df_filename, 'w')
    try:  # FIX: close the DF output even if a sample raises midway
        dfwriter = DF.DFWriter(f)
        # FIX: runner construction is loop-invariant; hoisted out of the loop
        runner = DiversityIndexRunner()
        for sample, file in file_iter:
            print >> sys.stderr, "processing {0}.........".format(sample)
            # fresh 5x520 count matrix per sample
            seqvec = np.zeros((5, 520), dtype=np.int)
            # --------------- for in-house aligned ------------- #
            # NOTE: the original kept disabled code here for (a) bunzip2-ing and
            # globbing inhouse files and (b) a BowTie-aligned variant using
            # hello.gather_reads_BowTie with the same logging.
            used, discarded = hello.gather_reads_inhouse(file, refmap, seqvec,
                    phred_cutoff, min_length, max_degen, ecoli_lo, ecoli_hi)
            print >> sys.stderr, file, used, discarded
            log_f.write("FILE:{0} USED:{1} DISCARDED:{2}\n".format(file, used, discarded))
            df = Read.ReadDF(sample, refmap)
            df.len = 520
            df.assign_vec(seqvec)
            dfwriter.write(df)
            di = runner.run(df, method='Simpson', threshold=0,
                    vec_pre_normalized=False, ignoreN=True)[valid_DI_pos]
            print("{0},{1}".format(sample, ",".join(map(str, di))))
    finally:
        f.close()
class Cluster:
    """
    Agglomerative (UPGMA-style) clustering of samples by their diversity
    index (DI) vectors.

    Two ways to initialize:
      * __init__ with a list of DF objects -- DIs are computed here via
        DiversityIndexRunner;
      * __init__ with df_list=None followed by init_from_di_list with a
        precomputed {sample_name: DI array} dict.

    State:
      X      -- (m x n) matrix, row i = DI vector of sample i
      _dist  -- (m x m) symmetric distance matrix; diagonal and retired
                rows/columns are set to inf
      trees  -- working forest; merged entries are replaced by a Tree,
                absorbed entries become None
    """

    def __init__(self, df_list, **kwargs):
        """
        df_list: list of DF objects, or None (call init_from_di_list later).
        kwargs: mask (default 1.), method (default 'Simpson'),
                threshold (default 10) -- passed to DiversityIndexRunner.
        """
        if df_list is None:
            print >> sys.stderr, "called with a None, I hope you know what you're doing!"
            print >> sys.stderr, "calling init_from_di_list later perhaps?"
            self.df_list = None
            return
        self.df_list = df_list
        self.original_names = [df.name for df in self.df_list]
        self.mask = kwargs['mask'] if 'mask' in kwargs else 1.
        self.method = kwargs['method'] if 'method' in kwargs else 'Simpson'
        self.threshold = kwargs['threshold'] if 'threshold' in kwargs else 10
        self.m = len(df_list)     # number of samples (rows)
        self.n = df_list[0].len   # length of the DF vectors (columns)
        self.runner = DiversityIndexRunner(self.mask)
        # one single-leaf tree per sample; merging replaces entries in place
        self.trees = [tree.Leaf(self.df_list[i].name) for i in xrange(self.m)]
        self.X = np.zeros((self.m, self.n), dtype=np.float)
        for i, df in enumerate(self.df_list):
            #print >> sys.stderr, "normalizing {0}....".format(df.name)
            # normalize in place so later runner.run calls can use
            # vec_pre_normalized=True
            df.normalized_vec(ignoreN=True)
            di = self.runner.run(df, method=self.method, threshold=self.threshold, \
                    vec_pre_normalized=True, ignoreN=True)
            self.X[i, :] = di
        # calculate the initial distance matrix
        self._dist = np.zeros((self.m, self.m), dtype=np.float)
        for i in xrange(self.m):
            # inf on the diagonal so argmin never picks a self-pair
            self._dist[i, i] = float("inf")
            for j in xrange(i+1, self.m):
                # method 1: Euclidean distance between DIs
                d = math.sqrt(sum(x**2 for x in self.X[i,:]-self.X[j,:]))
                # method 2: sum of sum of distances squared between DFs
                #d = self.df_list[i].get_vec_diff_sqsum(self.df_list[j])
                self._dist[i, j] = d
                self._dist[j, i] = d

    def init_from_di_list(self, di_dict, **kwargs):
        """
        alternative __init__ taking a dict sample_name --> array of DI as input
        if this init is used, we're expecting to run UPGMA clustering

        NOTE: this path sets no self.df_list and no self.runner; it relies on
        run_one_cluster_step skipping the DF-merge branch (df_list is None).
        Assumes __init__ was called with df_list=None first -- TODO confirm.
        """
        self.original_names = di_dict.keys()
        self.original_names.sort()
        self.mask = kwargs['mask'] if 'mask' in kwargs else 1.
        self.method = kwargs['method'] if 'method' in kwargs else 'Simpson'
        self.threshold = kwargs['threshold'] if 'threshold' in kwargs else 10
        self.m = len(di_dict)
        # n taken from an arbitrary entry; all DI arrays must share a length
        self.n = len(di_dict.itervalues().next())
        self.trees = [tree.Leaf(x) for x in self.original_names]
        self.X = np.zeros((self.m, self.n), dtype=np.float)
        # fill up X using di_dict
        for i in xrange(self.m):
            self.X[i] = di_dict[self.original_names[i]]
        self._dist = np.zeros((self.m, self.m), dtype=np.float)
        for i in xrange(self.m):
            self._dist[i, i] = float("inf")
            for j in xrange(i+1, self.m):
                # method 1: Euclidean distance between DIs
                d = math.sqrt(sum(x**2 for x in self.X[i,:]-self.X[j,:]))
                # method 2: sum of sum of distances squared between DFs
                #d = self.df_list[i].get_vec_diff_sqsum(self.df_list[j])
                self._dist[i, j] = d
                self._dist[j, i] = d

    def write_DI(self, output_filename, mask=None):
        """
        Write one CSV line per sample: <name>,<DI values>.
        mask, if given, is an index/slice selecting a subset of DI columns.
        """
        with open(output_filename, 'w') as f:
            for i, name in enumerate(self.original_names):
                di = self.X[i, ] if mask is None else self.X[i, mask]
                f.write(name + ',')
                di.tofile(f, sep=",")
                f.write('\n')

    def run_one_cluster_step(self):
        """
        Merge the closest pair of active clusters (i, j): join their trees
        with branch length dist/2, fold DF j into DF i (when DFs are
        available), update row/column i by the UPGMA average-linkage rule,
        then retire j by setting its distances to inf.

        Raises StopIteration when every pairwise distance is inf (all merged).
        """
        # unflatten the flat argmin index; relies on Python 2 integer '/'
        # (would need '//' under Python 3)
        d = self._dist.argmin()
        i, j = d / self.m, d % self.m
        _min_val = self._dist[i, j]
        if _min_val == float("inf"):
            raise StopIteration, "done!"
        #print >> sys.stderr, "combining {0} and {1}".format(self.trees[i], self.trees[j])
        # merge j into i
        # leaf counts BEFORE the merge -- needed as UPGMA weights below
        size_i = len(self.trees[i].get_leaves())
        size_j = len(self.trees[j].get_leaves())
        t = tree.Tree()
        t.add_edge((self.trees[i], 0, _min_val/2)) # (subtree-i, bootstrap=0, branch length=dist)
        t.add_edge((self.trees[j], 0, _min_val/2))
        self.trees[i] = t
        self.trees[j] = None
        # NEW!!! instead of just adding df_list[j] to df_list[i], normalize the counts FIRST!!!
        # (skipped entirely on the init_from_di_list path, where df_list is None)
        if self.df_list is not None:
            self.df_list[i].normalized_vec_add(self.df_list[j], vec_pre_normalized=True, ignoreN=True)
            # print "before", self.X[i, ]
            self.X[i] = self.runner.run(self.df_list[i], method=self.method, threshold=self.threshold,\
                    vec_pre_normalized=True, ignoreN=True)
            # print("merged {0} and {1}".format(i, j))
            # print "new vec is now", self.X[i, ]
        # self._dist[j, :] = float("inf")
        # self._dist[:, j] = float("inf")
        # NOTE: row/col j must stay finite through this loop -- the UPGMA
        # update below reads self._dist[k, j]
        for k in xrange(self.m):
            if k==i or k==j or self.trees[k] is None: continue
            # method 1:
            #d = math.sqrt(sum(x**2 for x in self.X[i,:]-self.X[k,:]))
            # method 2:
            #d = self.df_list[i].get_vec_diff_sqsum(self.df_list[k])
            # method 3: UPGMA
            d = (self._dist[k, i] * size_i + self._dist[k, j] * size_j) / (size_i + size_j)
            # method 4: complete linkage
            #d = max(self._dist[k, i], self._dist[k, j])
            #print >> sys.stderr, "using Euclidean dist: {0}, using vecdiff: {1}".format(\
            #        math.sqrt(sum(x**2 for x in self.X[i,:]-self.X[k,:])), d)
            self._dist[i, k] = d
            self._dist[k, i] = d
        # retire cluster j only after all UPGMA updates used its distances
        self._dist[j, :] = float("inf")
        self._dist[:, j] = float("inf")

    def run_till_end(self):
        """
        Run clustering to completion.  len(self.trees) never shrinks (merged
        slots become None), so termination comes from the StopIteration that
        run_one_cluster_step raises once all distances are inf.
        """
        while len(self.trees) > 1:
            try:
                self.run_one_cluster_step()
            except StopIteration:
                break