# aliased numpy imports used by this snippet
from numpy import abs as np_abs, argsort as np_argsort, std as np_std

def expandSelection(self, startIndex, vals, stdevCutoff=0.05, maxSpread=0.1):
    """Expand a selection left and right from a starting index in a list of values

    Keep expanding while the stdev of the selected values stays below the cutoff
    and each new value lies within maxSpread of the starting value.
    Return a sorted list of indices into the original list.
    """
    ret_list = [startIndex]  # this is what we will give back
    start_val = vals[startIndex]
    value_store = [start_val]

    sorted_indices = np_argsort(vals)
    max_index = len(vals)

    # set the upper and lower to point to the position
    # where the start resides
    lower_index = 0
    upper_index = 0
    for i in range(max_index):
        if sorted_indices[i] == startIndex:
            break
        lower_index += 1
        upper_index += 1

    do_lower = True
    do_upper = True
    max_index -= 1

    while do_lower or do_upper:
        if do_lower:
            do_lower = False
            if lower_index > 0:
                try_val = vals[sorted_indices[lower_index - 1]]
                if np_abs(try_val - start_val) < maxSpread:
                    try_array = value_store + [try_val]
                    if np_std(try_array) < stdevCutoff:
                        value_store = try_array
                        lower_index -= 1
                        ret_list.append(sorted_indices[lower_index])
                        do_lower = True
        if do_upper:
            do_upper = False
            if upper_index < max_index:
                try_val = vals[sorted_indices[upper_index + 1]]
                if np_abs(try_val - start_val) < maxSpread:
                    try_array = value_store + [try_val]
                    if np_std(try_array) < stdevCutoff:
                        value_store = try_array
                        upper_index += 1
                        ret_list.append(sorted_indices[upper_index])
                        do_upper = True

    return sorted(ret_list)
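
# Usage sketch (hypothetical harness; the class this method belongs to is not
# shown, so we bind it to a throwaway object to exercise it standalone).
class _ExpandDemo:
    expandSelection = expandSelection

_vals = [0.30, 0.12, 0.11, 0.13, 0.10, 0.90]
# expanding from index 2 (value 0.11) picks up the tight run of ~0.1 values;
# 0.30 and 0.90 fall outside maxSpread and are rejected
print([int(i) for i in _ExpandDemo().expandSelection(2, _vals)])  # [1, 2, 3, 4]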

from numpy import argsort as np_argsort, take as np_take

def get_filtered(self, condition, sort_by_col=None):
    """Filter and optionally sort asset metrics by condition

    :param condition: boolean array
        (example: (mf['some_metric'] > 0) & (mf['another_metric'] == 1))
    :param sort_by_col: (optional) column name to sort results by
        (sort order is always ascending)
    :return: tuple of arrays (sorted_assets_array, sorted_metrics_data_matrix)
    """
    flt_idx = self._indexes[condition]
    _flt_data = self._data.take(flt_idx, axis=0)

    if sort_by_col is not None:
        col_id = self._columns[sort_by_col]
        srt_idx = np_argsort(_flt_data[:, col_id])
        orig_sorted_idx = np_take(flt_idx, srt_idx, axis=0)
        return np_take(self._assets_list, orig_sorted_idx), np_take(_flt_data, srt_idx, axis=0)
    else:
        return np_take(self._assets_list, flt_idx), self._data[flt_idx, :]
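
# Minimal harness (hypothetical; mirrors the attributes get_filtered relies
# on: _indexes, _data, _columns and _assets_list) to show the call shape.
from numpy import arange as np_arange, array as np_array

class _MetricsDemo:
    get_filtered = get_filtered

    def __init__(self, assets, columns, data):
        self._assets_list = np_array(assets)
        self._columns = {c: i for i, c in enumerate(columns)}
        self._data = np_array(data, dtype=float)
        self._indexes = np_arange(len(assets))

_mf = _MetricsDemo(['AAA', 'BBB', 'CCC'], ['ret', 'vol'],
                   [[0.02, 0.3], [-0.01, 0.2], [0.05, 0.1]])
# keep assets with positive 'ret', sorted ascending by 'vol'
assets, data = _mf.get_filtered(_mf._data[:, 0] > 0, sort_by_col='vol')
print(assets)  # ['CCC' 'AAA']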

from numpy import (append as np_append, argmax as np_argmax,
                   argsort as np_argsort, array as np_array)

def findArrayCenter(self, vals):
    """Find the center of the numpy array vals, return the index of the center"""
    # parameters
    delta = 0
    bounce_amount = 0.1
    height = 0
    last_val = 0

    working = np_array([])
    final_index = -1

    # sort and normalise between 0 -> 1
    sorted_indices = np_argsort(vals)
    # wrap in an array: the original built a plain list here, which breaks
    # the in-place -= and /= below
    vals_sorted = np_array([vals[i] for i in sorted_indices])
    vals_sorted -= vals_sorted[0]
    if vals_sorted[-1] != 0:
        vals_sorted /= vals_sorted[-1]

    # run through in one direction
    for val in vals_sorted:
        # calculate delta
        delta = val - last_val
        # reduce the current value according to the delta value
        height = self.reduceViaDelta(height, bounce_amount, delta)
        # bounce the ball up
        height += bounce_amount

        # store the height
        working = np_append(working, height)
        final_index += 1

        # save the last val
        last_val = val

    height = 0
    last_val = 0

    # run through in the reverse direction
    vals_sorted = vals_sorted[::-1]
    for val in vals_sorted:
        if last_val == 0:
            delta = 0
        else:
            delta = last_val - val
        height = self.reduceViaDelta(height, bounce_amount, delta)
        height += bounce_amount
        # add to the old heights
        working[final_index] += height
        final_index -= 1
        last_val = val

    # find the original index!
    return sorted_indices[np_argmax(working)]
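
# Usage sketch (hypothetical harness). reduceViaDelta is not part of this
# snippet, so the demo supplies a stand-in that knocks the "ball" down in
# proportion to the gap between consecutive sorted values: large gaps erase
# the accumulated height, dense runs let it build, and argmax therefore lands
# in the densest region of the array.
class _CenterDemo:
    findArrayCenter = findArrayCenter

    def reduceViaDelta(self, height, bounce_amount, delta):
        # stand-in only: shrink the height as the gap (delta) grows
        return height * max(0.0, 1.0 - delta / bounce_amount)

_vals = np_array([0.1, 0.5, 0.52, 0.51, 0.53, 0.9])
print(_CenterDemo().findArrayCenter(_vals))  # an index inside the dense 0.5x run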

from numpy import ceil as np_ceil, sort as np_sort, argsort as np_argsort
from operator import itemgetter as operator_itemgetter
from pickle import load as pickle_load

# note: get_score is assumed to be defined elsewhere in this module

def find_max_neig(neig_list, g1, perc, model, scaler, inputs):
    n_maxs = len(neig_list)
    if n_maxs == 0:
        return None
    if n_maxs > 10:
        n_maxs = int(np_ceil(perc * len(neig_list)))

    # sort neighbours by edge weight (ascending) and keep the n_maxs heaviest
    neig_key_list = [k for k in neig_list]
    neig_wt_list = [float(neig_list[k]['weight']) for k in neig_list]
    sorted_ind = np_argsort(neig_wt_list)
    sorted_wts = [{'weight': val} for val in np_sort(neig_wt_list)][-n_maxs:]
    sorted_neig_keys = [neig_key_list[i] for i in sorted_ind][-n_maxs:]
    imp_neigs = dict(zip(sorted_neig_keys, sorted_wts))
    folNm = inputs['folNm']

    if len(imp_neigs) == 1:
        imp_neig = list(imp_neigs.keys())[0]
        node_to_add = imp_neig
        # add all edges of the new node to the original graph
        with open(folNm + "/" + node_to_add, 'rb') as f:
            its_neig_list = pickle_load(f)
        orig_nodes = g1.nodes()
        all_nodesWedges = set(orig_nodes).intersection(its_neig_list)
        for node in all_nodesWedges:
            wt = its_neig_list[node]
            wt_edge = wt['weight']
            g1.add_edge(node_to_add, node, weight=wt_edge)
        # score the candidate, then remove it again
        (score_imp_neig, comp_bool) = get_score(g1, model, scaler, inputs['model_type'])
        g1.remove_node(node_to_add)
    else:
        scores = {}
        for neig in imp_neigs:
            # add the candidate node to the graph
            node_to_add = neig
            # add all edges of the new node to the original graph
            with open(folNm + "/" + node_to_add, 'rb') as f:
                its_neig_list = pickle_load(f)
            orig_nodes = g1.nodes()
            all_nodesWedges = set(orig_nodes).intersection(its_neig_list)
            for node in all_nodesWedges:
                wt = its_neig_list[node]
                wt_edge = wt['weight']
                g1.add_edge(node_to_add, node, weight=wt_edge)
            # score the candidate, then remove it again
            (score_curr, comp_bool) = get_score(g1, model, scaler, inputs['model_type'])
            scores[neig] = score_curr
            g1.remove_node(node_to_add)
        # keep the highest-scoring candidate
        imp_neig = max(iter(scores.items()), key=operator_itemgetter(1))[0]
        score_imp_neig = scores[imp_neig]

    return (imp_neig, score_imp_neig)
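
# Standalone sketch of the selection step above (hypothetical data): keep the
# ceil(perc * n) heaviest neighbours by edge weight via argsort.
_neig_list = {'a': {'weight': 0.2}, 'b': {'weight': 0.9}, 'c': {'weight': 0.5}}
_perc = 0.5
_keys = list(_neig_list)
_wts = [float(_neig_list[k]['weight']) for k in _keys]
_top = int(np_ceil(_perc * len(_wts)))     # ceil(0.5 * 3) = 2
_order = np_argsort(_wts)
print([_keys[i] for i in _order][-_top:])  # ['c', 'b'] -- ascending by weight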

from numpy import (arange as np_arange, argmin as np_argmin,
                   argsort as np_argsort, array as np_array,
                   log10 as np_log10, transpose as np_transpose)
from scipy.spatial.distance import cdist, squareform

def shuffleBAMs(self):
    """Make the data transformation deterministic by reordering the bams"""
    # first we should make a subset of the total data
    # we'd like to take it down to about 1500 or so RI's
    # but we'd like to do this in a repeatable way
    ideal_contig_num = 1500
    sub_cons = np_arange(len(self.indices))  # index array, not a bare range
    while len(sub_cons) > ideal_contig_num:
        # select every second contig when sorted by norm cov
        cov_sorted = np_argsort(self.normCoverages[sub_cons])
        sub_cons = np_array([sub_cons[cov_sorted[i * 2]]
                             for i in np_arange(int(len(sub_cons) / 2))])
        if len(sub_cons) > ideal_contig_num:
            # select every second contig when sorted by mer PC1
            mer_sorted = np_argsort(self.kmerNormPC1[sub_cons])
            sub_cons = np_array([sub_cons[mer_sorted[i * 2]]
                                 for i in np_arange(int(len(sub_cons) / 2))])

    # now that we have a subset, calculate the distance between
    # each of the untransformed vectors
    # log shift the coverages towards the origin
    sub_covs = np_transpose(
        [self.covProfiles[i] * (np_log10(self.normCoverages[i]) / self.normCoverages[i])
         for i in sub_cons])
    sq_dists = cdist(sub_covs, sub_covs, 'cityblock')
    dists = squareform(sq_dists)  # condensed form; argmin below indexes into this

    # initialise a list of left, right neighbours
    lr_dict = {}
    for i in range(self.numStoits):
        lr_dict[i] = []

    too_big = 10000
    while True:
        closest = np_argmin(dists)
        if dists[closest] == too_big:
            break
        (i, j) = self.small2indices(closest, self.numStoits - 1)
        lr_dict[j].append(i)
        lr_dict[i].append(j)

        # mark these guys as neighbours
        if len(lr_dict[i]) == 2:
            # no more than 2 neighbours
            sq_dists[i, :] = too_big
            sq_dists[:, i] = too_big
            sq_dists[i, i] = 0.0
        if len(lr_dict[j]) == 2:
            # no more than 2 neighbours
            sq_dists[j, :] = too_big
            sq_dists[:, j] = too_big
            sq_dists[j, j] = 0.0

        # fix the dist matrix
        sq_dists[j, i] = too_big
        sq_dists[i, j] = too_big
        dists = squareform(sq_dists)

    # now make the ordering
    ordering = [0, lr_dict[0][0]]
    done = 2
    while done < self.numStoits:
        last = ordering[done - 1]
        if lr_dict[last][0] == ordering[done - 2]:
            ordering.append(lr_dict[last][1])
            last = lr_dict[last][1]
        else:
            ordering.append(lr_dict[last][0])
            last = lr_dict[last][0]
        done += 1

    # reshuffle the contig order!
    # yay for bubble sort!
    working = np_arange(self.numStoits)
    for i in range(1, self.numStoits):
        # where is this guy in the list
        loc = list(working).index(ordering[i])
        if loc != i:
            # swap the columns
            self.covProfiles[:, [i, loc]] = self.covProfiles[:, [loc, i]]
            self.stoitColNames[[i, loc]] = self.stoitColNames[[loc, i]]
            working[[i, loc]] = working[[loc, i]]
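
# Standalone sketch of the nearest-pair step above (hypothetical points).
# small2indices is not shown in this snippet; here numpy's triu_indices maps
# the condensed index returned by squareform back to a (row, col) pair.
from numpy import triu_indices

_pts = np_array([[0.0, 0.0], [5.0, 5.0], [0.1, 0.0]])
_sq = cdist(_pts, _pts, 'cityblock')
_condensed = squareform(_sq)                 # upper triangle, flattened
_k = np_argmin(_condensed)                   # position of the closest pair
_rows, _cols = triu_indices(len(_pts), k=1)
print(_rows[_k], _cols[_k], _condensed[_k])  # 0 2 0.1 -- points 0 and 2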