def dataSubset(fittingData, numDatapoints, seed=345, maxNumIndepParams=None):
    """
    By default, add one timepoint for each independent parameter first,
    then increase the number of timepoints per independent parameter.

    Timepoints are added randomly for each independent parameter.
    Independent parameters are added in the order of indepParamsList.
    """
    scipy.random.seed(seed)
    subset = []
    numIndepParams = len(fittingData)
    if maxNumIndepParams is None:
        maxNumIndepParams = numIndepParams
    numDatapoints = int(numDatapoints)
    for i in range(min(numDatapoints, maxNumIndepParams)):
        varNames = scipy.sort(fittingData[i].keys())
        allTimes = scipy.sort(fittingData[i][varNames[0]].keys())
        possibleIndices = range(len(allTimes))
        scipy.random.shuffle(possibleIndices)
        # Distribute the datapoints as evenly as possible over independent parameters
        N = numDatapoints / maxNumIndepParams
        if i < numDatapoints % maxNumIndepParams:
            N += 1
        timeIndices = possibleIndices[:N]
        times = scipy.array(allTimes)[timeIndices]
        s = {}
        for var in varNames:
            s[var] = dict([(t, fittingData[i][var][t]) for t in times])
        subset.append(s)
    return subset
def check(x):
    y = sl.canonicalise(x)
    yr = y[0, :]
    yc = y[:, 0]
    # The first row and first column of the canonical form must be sorted
    assert all(yr == sc.sort(yr))
    assert all(yc == sc.sort(yc))
def thresholdFromNumNonzero(mat, numNonzero, sym=False, useAbs=True,
                            aboveDiagOnly=False):
    """
    Things get complicated if the matrix elements are not all distinct...

    sym : If True, treat the matrix as symmetric, and count only nonzero
          elements at or below the diagonal.
    """
    if sym:
        mat = scipy.tri(len(mat)) * mat
    if useAbs:
        absMat = abs(mat)
    else:
        absMat = mat
    if not aboveDiagOnly:
        flatAbsMat = scipy.sort(arrayFlatten(absMat))[::-1]
    else:
        flatAbsMat = scipy.sort(aboveDiagFlat(absMat))[::-1]
    if numNonzero < 1:
        return scipy.inf
    elif numNonzero == len(flatAbsMat):
        if useAbs:
            return 0.
        else:
            return flatAbsMat[-1]
    elif numNonzero > len(flatAbsMat):
        raise Exception("Desired numNonzero > number of matrix elements.")
    # Threshold halfway between the numNonzero-th largest element and the next one
    return scipy.mean([flatAbsMat[numNonzero], flatAbsMat[numNonzero - 1]])
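# The helpers arrayFlatten and aboveDiagFlat above come from the same module.
# A minimal standalone sketch of the default path (plain NumPy, hypothetical
# 3x3 matrix): when all magnitudes are distinct, the returned threshold keeps
# exactly numNonzero entries.
import numpy as np

mat = np.array([[0.1, -0.8, 0.3],
                [0.2,  0.0, 0.9],
                [0.4,  0.5, -0.2]])
numNonzero = 2

flat = np.sort(np.abs(mat).ravel())[::-1]
threshold = np.mean([flat[numNonzero], flat[numNonzero - 1]])   # 0.65

sparse = np.where(np.abs(mat) >= threshold, mat, 0.)
print(np.count_nonzero(sparse))   # 2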
def quantify_intron_retention(event, gene, counts_segments, counts_edges, counts_seg_pos):

    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    seg_shape = segs.seg_edges.shape
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon1 = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    idx_exon2 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) & (sg.vertices[1, :] == event.exons1[1, 1]))[0]

    ### find segments corresponding to exons
    seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
    seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])

    seg_all = sp.arange(seg_exon1[0], seg_exon2[-1])
    seg_intron = sp.setdiff1d(seg_all, seg_exon1)
    seg_intron = sp.setdiff1d(seg_intron, seg_exon2)
    assert(seg_intron.shape[0] > 0)

    ### compute exon coverages as mean of position wise coverage
    # intron_cov
    cov[0] = sp.sum(counts_segments[seg_intron] * seg_lens[seg_intron]) / sp.sum(seg_lens[seg_intron])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon1[-1], seg_exon2[0]], seg_shape, order=order) + offset)[0]
    cov[1] = counts_edges[idx, 1]

    return cov
def plot_tuning_curves(data_flags): # First entries are for mu_dSs, second are for tuning_width #plot_vars = [[0, 9, 18], [7, 11, 19]] plot_vars = [[0, 1, 2], [0, 1, 2]] cmaps = [[cm.Greys, cm.Purples, cm.Blues], [cm.Greys, cm.Purples, cm.Blues]] for data_idx, data_flag in enumerate(data_flags): list_dict = read_specs_file(data_flag) for key in list_dict: exec("%s = list_dict[key]" % key) tuning_curve_data = load_tuning_curve(data_flag) tuning_curve = tuning_curve_data['tuning_curve'] epsilons = tuning_curve_data['epsilons'] if data_idx == 0: fig, plot_dims, axes_tuning, axes_eps, axes_signal = \ tuning_curve_plot_epsilon(plot_vars, iter_vars, params) for idx, idx_var in enumerate(plot_vars[0]): for idy, idy_var in enumerate(plot_vars[1]): colors = cmaps[data_idx][idx](sp.linspace( 0.75, 0.3, params['Mm'])) for iM in range(params['Mm']): axes_tuning[idx, idy].plot(sp.arange(params['Nn'] / 2), sp.sort(tuning_curve[idx_var, idy_var, ::2, iM]), color=colors[iM], linewidth=0.7, zorder=params['Mm'] - iM) axes_tuning[idx, idy].plot(sp.arange(params['Nn'] / 2 - 1, params['Nn'] - 1), sp.sort(tuning_curve[idx_var, idy_var, 1::2, iM])[::-1], color=colors[iM], linewidth=0.7, zorder=params['Mm'] - iM) axes_eps[idy].plot(range(params['Mm']), epsilons[idx_var, idy_var], color=colors[4], linewidth=1.5, zorder=0) for iM in range(params['Mm']): axes_eps[idy].scatter(iM, epsilons[idx_var, idy_var][iM], c=colors[iM], s=3) save_tuning_curve_fig(fig, data_flag)
def quantify_mutex_exons(event, gene, counts_segments, counts_edges):

    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    seg_shape = segs.seg_edges.shape[0]
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons1[-1, 0]) & (sg.vertices[1, :] == event.exons1[-1, 1]))[0]
    idx_exon1 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) & (sg.vertices[1, :] == event.exons1[1, 1]))[0]
    idx_exon2 = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]

    ### find segments corresponding to exons
    seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
    seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
    seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
    seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])

    # exon1 cov
    cov[0] = sp.sum(counts_segments[seg_exon1] * seg_lens[seg_exon1]) / sp.sum(seg_lens[seg_exon1])
    # exon2 cov
    cov[1] = sp.sum(counts_segments[seg_exon2] * seg_lens[seg_exon2]) / sp.sum(seg_lens[seg_exon2])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon1_conf
    idx1 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exon1[0]], seg_shape, order=order) + offset)[0]
    if len(idx1.shape) > 0 and idx1.shape[0] > 0:
        cov[0] += counts_edges[idx1[0], 1]
    # exon_pre_exon2_conf
    idx2 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exon2[0]], seg_shape, order=order) + offset)[0]
    if len(idx2.shape) > 0 and idx2.shape[0] > 0:
        cov[1] += counts_edges[idx2[0], 1]
    # exon1_exon_aft_conf
    idx3 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon1[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    if len(idx3.shape) > 0 and idx3.shape[0] > 0:
        cov[0] += counts_edges[idx3[0], 1]
    # exon2_exon_aft_conf
    idx4 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon2[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    if len(idx4.shape) > 0 and idx4.shape[0] > 0:
        cov[1] += counts_edges[idx4[0], 1]

    return cov
def _remdup(a, amax=None):
    """Check vector a for duplicate entries; return 1 if any are found, else 0."""
    # Sort a copy so duplicates become adjacent (scipy.sort does not sort in place)
    a = scipy.sort(a)
    flag = 0
    for x in range(1, len(a)):
        if a[x - 1] == a[x]:
            flag = 1
    return flag
def plot_tuning_curves(data_flags): # First entries are for mu_dSs, second are for Kk2 diversity plot_vars = [[0, 9, 19], [8, 15, 19]] cmaps = [[cm.Greys, cm.Purples, cm.Blues], [cm.Greys, cm.Purples, cm.Blues]] for data_idx, data_flag in enumerate(data_flags): list_dict = read_specs_file(data_flag) for key in list_dict: exec("%s = list_dict[key]" % key) tuning_curve_data = load_tuning_curve(data_flag) tuning_curve = tuning_curve_data['tuning_curve'] epsilons = tuning_curve_data['epsilons'] Kk2s = tuning_curve_data['Kk2s'] if data_idx == 0: fig, plot_dims, axes_tuning, axes_Kk2, axes_signal = \ tuning_curve_plot_Kk2(plot_vars, iter_vars, params) for idx, idx_var in enumerate(plot_vars[0]): for idy, idy_var in enumerate(plot_vars[1]): colors = cmaps[data_idx][idx](sp.linspace( 0.75, 0.3, params['Mm'])) for iM in range(params['Mm']): axes_tuning[idx, idy].plot(sp.arange(params['Nn'] / 2), sp.sort(tuning_curve[idx_var, idy_var, ::2, iM]), color=colors[iM], linewidth=0.7, zorder=params['Mm'] - iM) axes_tuning[idx, idy].plot(sp.arange(params['Nn'] / 2 - 1, params['Nn'] - 1), sp.sort(tuning_curve[idx_var, idy_var, 1::2, iM])[::-1], color=colors[iM], linewidth=0.7, zorder=params['Mm'] - iM) if idx == 0: sorted_idxs = sp.argsort( sp.std(Kk2s[0, idy_var, :, :], axis=1)) axes_Kk2[idy].imshow(Kk2s[0, idy_var, sorted_idxs, :].T, aspect=0.3, cmap='bone', rasterized=True) save_tuning_curve_fig(fig, data_flag)
def quantify_intron_retention(event, gene, counts_segments, counts_edges, counts_seg_pos, CFG): cov = sp.zeros((2, ), dtype='float') sg = gene.splicegraph segs = gene.segmentgraph if CFG['is_matlab']: seg_lens = segs[0, 0][1, :] - segs[0, 0][0, :] seg_shape = segs[0, 2].shape order = 'F' offset = 1 ### find exons corresponding to event idx_exon1 = sp.where((sg[0, 0][0, :] == event.exon1[0]) & (sg[0, 0][1, :] == event.exon1[1]))[0] idx_exon2 = sp.where((sg[0, 0][0, :] == event.exon2[0]) & (sg[0, 0][1, :] == event.exon2[1]))[0] ### find segments corresponding to exons seg_exon1 = sp.sort(sp.where(segs[0, 1][idx_exon1, :])[1]) seg_exon2 = sp.sort(sp.where(segs[0, 1][idx_exon2, :])[1]) else: seg_lens = segs.segments[1, :] - segs.segments[0, :] seg_shape = segs.seg_edges.shape order = 'C' offset = 0 ### find exons corresponding to event idx_exon1 = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) & (sg.vertices[1, :] == event.exons1[0, 1]))[0] idx_exon2 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) & (sg.vertices[1, :] == event.exons1[1, 1]))[0] ### find segments corresponding to exons seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1]) seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1]) seg_all = sp.arange(seg_exon1[0], seg_exon2[-1]) seg_intron = sp.setdiff1d(seg_all, seg_exon1) seg_intron = sp.setdiff1d(seg_intron, seg_exon2) assert (seg_intron.shape[0] > 0) ### compute exon coverages as mean of position wise coverage # intron_cov cov[0] = sp.sum(counts_segments[seg_intron] * seg_lens[seg_intron]) / sp.sum(seg_lens[seg_intron]) ### check intron confirmation as sum of valid intron scores ### intron score is the number of reads confirming this intron # intron conf idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index( [seg_exon1[-1], seg_exon2[0]], seg_shape, order=order) + offset)[0] cov[1] = counts_edges[idx, 1] return cov
def make_unique_by_strain(event_list): # event_list = make_unique_by_strain(event_list) rm_idx = [] for i in range(1, event_list.shape[0]): if i % 1000 == 0: print '.', if i % 10000 == 0: print '%i' % i old_coords = event_list[i - 1].get_coords(trafo=True) curr_coords = event_list[i].get_coords(trafo=True) if old_coords.shape[0] == curr_coords.shape[0] and sp.all( old_coords == curr_coords): ### assertion that we did everything right if event_list[i - 1].chr == event_list[i].chr: assert (event_list[i - 1].strand == event_list[i].strand) assert (event_list[i].strain.shape[0] == 1) else: assert (event_list[i - 1].gene_name != event_list[i].gene_name) idx = sp.where(event_list[i - 1].strain == event_list[i].strain[0])[0] if idx.shape[0] > 0: assert (idx.shape[0] == 1) assert (sp.all(event_list[i].get_coords( trafo=True) == event_list[i - 1].get_coords(trafo=True))) if not event_list[i].gene_name[0] in event_list[i - 1].gene_name: event_list[ i - 1].gene_name = sp.r_[event_list[i - 1].gene_name, [event_list[i].gene_name[0]]] event_list[i] = event_list[i - 1] else: event_list[i].strain = sp.r_[[event_list[i - 1].strain[0]], event_list[i].strain] assert (sp.all( sp.sort(event_list[i].strain) == sp.sort( sp.unique(event_list[i].strain)))) ### TODO !!!!!!!!!!!!! make sure that we keep different coordinates if the strains differ ... if not event_list[i].gene_name[0] in event_list[i - 1].gene_name: event_list[i].gene_name = sp.r_[ event_list[i - 1].gene_name, [event_list[i].gene_name[0]]] rm_idx.append(i - 1) print 'events dropped: %i' % len(rm_idx) keep_idx = sp.where(~sp.in1d(sp.arange(event_list.shape[0]), rm_idx))[0] event_list = event_list[keep_idx] return event_list
def _test_cg(self, tname, cg, A, b, expect):
    logging.info(f'{tname} with {cg.__name__}')
    expect = sp.sort(expect)
    actual = sp.sort(cg(A, b))
    logging.debug(f'{tname} A\n{str(A)}')
    logging.debug(f'{tname} b\n{str(b)}')
    logging.debug(f'{tname} expect\n{str(expect)}')
    logging.debug(f'{tname} actual\n{str(actual)}')
    # Total seconds over 1000 runs equals average milliseconds per call
    elapsed = timeit(lambda: cg(A, b), number=1000)
    logging.info(f'{tname} {elapsed:.3}ms')
    abserrorsum = sp.sum(sp.absolute(expect - actual))
    logging.info(f'{tname} sum of absolute errors = {abserrorsum:.3}')
def average_EF(structure, distance_function, feature=True, pca=False, percent=[0.25], name=False): """ Compute the average enrichment factor of a given structure, over the set of actives Does it faster because of a vectorisation process (old version below) :param structure: the structure to access, in data/ :param name: the name of the embedding csv to use :return: the average EF for this structure as a dict percent : EF """ # not very useful, just put the name of the embedding csv. Otherwise it will get the baseline emebeddings if not name: name = '_feature={}_pca={}.csv'.format(feature, pca) # META Get the Path of the embeddings csv_actives_dir = os.path.join('../data/embeddings', structure) csv_decoys_dir = os.path.join('../data/embeddings', structure) csv_actives_path = os.path.join(csv_actives_dir, 'csv_actives' + name) csv_decoys_path = os.path.join(csv_decoys_dir, 'csv_decoys' + name) features = (4 * feature + 1) * 12 actives_values = pd.read_csv(csv_actives_path, usecols=range(2, features + 1), dtype=float) actives_values = np.array(actives_values) decoys_values = pd.read_csv(csv_decoys_path, usecols=range(2, features + 1), dtype=float) decoys_values = np.array(decoys_values) # Compute the DM (bottleneck) actives_dist = distance_matrix(actives_values, actives_values, distance_function) decoys_dist = distance_matrix(actives_values, decoys_values,distance_function) # Sort and compute the EF enrichments = {} for perc in percent: enrichments[perc] = [] for i in range(actives_dist.shape[1]): list_actives = np.sort(actives_dist[i])[:-1] list_decoys = np.sort(decoys_dist[i]) total = np.sort(np.append(list_decoys, list_actives)) a, d, tot = len(list_actives), len(list_decoys), len(total) # avoid to recompute this for all percent for perc in percent: threshold_index = tot - int(perc / 100 * tot) - 1 threshold = total[threshold_index] selected_actives = [x for x in list_actives if x >= threshold] selected_decoys = [x for x in list_decoys if x >= threshold] sa, sd = len(selected_actives), len(selected_decoys) stot = sa + sd numerator = sa / stot denominator = a / tot enrichments[perc].append(numerator / denominator) for perc in percent: enrichments[perc] = np.mean(enrichments[perc]) return enrichments
def set_ordered_temporal_adaptation_rate(self): """ Set a spread of adaptation rates, possibly ordered by activity levels. The spread is incorporated when temporal_adaptation_rate_sigma is nonzero, and this spread gives a factor change, i.e. beta --> beta*10^{-sigma, sigma}. Various ordering schemes are given. """ try: self.dYy self.Yy self.Yy0 except AttributeError: print 'Must run set_measured_activity(...) before calling '\ 'set_ordered_temporal_adaptation_rate(...)' sp.random.seed(self.temporal_adaptation_rate_seed) exp_spread = sp.random.normal(-self.temporal_adaptation_rate_sigma, self.temporal_adaptation_rate_sigma, self.Mm) self.temporal_adaptation_rate_vector = self.temporal_adaptation_rate*\ 10.**exp_spread # Order the adaptation rates by activity levels if self.temporal_adaptation_rate_ordering == 'random': pass elif self.temporal_adaptation_rate_ordering == 'increasing_Yy': sorted_idxs = self.Yy.argsort() idx_ranks = sorted_idxs.argsort() self.temporal_adaptation_rate_vector = \ sp.sort(self.temporal_adaptation_rate_vector)[idx_ranks] elif self.temporal_adaptation_rate_ordering == 'increasing_dYy': sorted_idxs = self.dYy.argsort() idx_ranks = sorted_idxs.argsort() self.temporal_adaptation_rate_vector = \ sp.sort(self.temporal_adaptation_rate_vector)[idx_ranks] elif self.temporal_adaptation_rate_ordering == 'decreasing_Yy': sorted_idxs = self.Yy.argsort()[::-1] idx_ranks = sorted_idxs.argsort() self.temporal_adaptation_rate_vector = \ sp.sort(self.temporal_adaptation_rate_vector)[idx_ranks] elif self.temporal_adaptation_rate_ordering == 'decreasing_dYy': sorted_idxs = self.dYy.argsort()[::-1] idx_ranks = sorted_idxs.argsort() self.temporal_adaptation_rate_vector = \ sp.sort(self.temporal_adaptation_rate_vector)[idx_ranks] else: print "\ntemporal_adaptation_rate_ordering not set to "\ "a valid string; use 'random', 'increasing_Yy', "\ "'increasing_dYy', 'decreasing_Yy', or 'decreasing_dYy'" quit()
def QQPlot(arguments,pv,unique_pv,fname): font_size = 18 mpl.rcParams['font.family']="sans-serif" mpl.rcParams['font.sans-serif']="Arial" mpl.rcParams['font.size']=font_size #mpl.rcParams['figure.dpi'] = 300 mpl.rcParams['font.weight']='medium' mpl.rcParams['figure.facecolor'] = 'white' mpl.rcParams['lines.linewidth'] = 1 mpl.rcParams['axes.facecolor'] = 'white' mpl.rcParams['patch.edgecolor'] = 'white' mpl.rcParams['grid.linestyle'] = '-' mpl.rcParams['grid.color'] = 'LightGray' if arguments.ignore!=None: if arguments.ignore in fname: return if arguments.distinct: pv = unique_pv pl.figure(figsize=(5,5)) pv_uni = (sp.arange(1.0/float(pv.shape[0]),1,1.0/float(pv.shape[0]+1))) pl.plot(-sp.log10(pv_uni),-sp.log10(sp.sort(pv_uni)),'b--') pl.ylim(0,(-sp.log10(pv[:])).max()+1) pl.plot(-sp.log10(pv_uni),-sp.log10(sp.sort(pv[:],axis=0)),'.',color="#F68E55",markersize=12,markeredgewidth=0,alpha=1) #plot theoretical expectations if arguments.estpv: datapoints=10**(sp.arange(sp.log10(0.5),sp.log10(pv.shape[0]-0.5)+0.1,0.1)) beta_alpha=sp.zeros(datapoints.shape[0]) beta_nalpha=sp.zeros(datapoints.shape[0]) beta_tmp=sp.zeros(datapoints.shape[0]) for n in xrange(datapoints.shape[0]): m=datapoints[n] beta_tmp[n]=stats.beta.ppf(0.5,m,pv.shape[0]-m) beta_alpha[n]=stats.beta.ppf(0.05,m,pv.shape[0]-m) beta_nalpha[n]=stats.beta.ppf(1-0.05,m,pv.shape[0]-m) estimated_pvals=datapoints/pv.shape[0] lower_bound = -sp.log10(estimated_pvals-(beta_tmp-beta_alpha)) upper_bound = -sp.log10(estimated_pvals+(beta_nalpha-beta_tmp)) pl.fill_between(-sp.log10(estimated_pvals),lower_bound,upper_bound,color='#00BFF3',alpha=0.4,linewidth=0) if arguments.title: pl.title("Phenotype: %s"%(fname)) pl.xlabel('Expected $-log10(p-value)$') pl.ylabel('Observed $-log10(p-value)$') if arguments.gc: gc = sp.median(stats.chi2.isf(pv,1))/0.456 pl.text(4,1,"$\hat \lambda=%.2f$"%(gc)) remove_border() pl.subplots_adjust(left=0.14,bottom=0.13,right=0.97,top=0.95,wspace=0.45) pl.savefig(os.path.join(arguments.out,'qqplot_' + fname + '.' + arguments.iformat) ) pl.close()
def get_coords(self, trafo=False):
    if self.event_type != 'mult_exon_skip':
        if trafo:
            #return sp.sort(sp.unique(sp.c_[self.exons1_col.ravel(), self.exons2_col.ravel()]))
            return sp.sort(sp.r_[self.exons1_col.ravel(), self.exons2_col.ravel()])
        else:
            #return sp.sort(sp.unique(sp.c_[self.exons1.ravel(), self.exons2.ravel()]))
            return sp.sort(sp.r_[self.exons1.ravel(), self.exons2.ravel()])
    else:
        if trafo:
            return sp.sort(sp.r_[self.exons1_col.ravel()[:4], self.exons2_col.ravel()[-4:]])
        else:
            return sp.sort(sp.r_[self.exons1.ravel()[:4], self.exons2.ravel()[-4:]])
def quantify_mult_exon_skip(event, gene, counts_segments, counts_edges): cov = sp.zeros((2, ), dtype='float') sg = gene.splicegraph segs = gene.segmentgraph seg_lens = segs.segments[1, :] - segs.segments[0, :] seg_shape = segs.seg_edges.shape[0] order = 'C' offset = 0 ### find exons corresponding to event idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0] idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons2[-1, 0]) & (sg.vertices[1, :] == event.exons2[-1, 1]))[0] seg_exons = [] for i in range(1, event.exons2.shape[0] - 1): tmp = sp.where((sg.vertices[0, :] == event.exons2[i, 0]) & (sg.vertices[1, :] == event.exons2[i, 1]))[0] seg_exons.append(sp.where(segs.seg_match[tmp, :])[1]) ### find segments corresponding to exons seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1]) seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1]) seg_exons_u = sp.sort(sp.unique([x for sublist in seg_exons for x in sublist])) ### inner exons_cov cov[0] = sp.sum(counts_segments[seg_exons_u] * seg_lens[seg_exons_u]) / sp.sum(seg_lens[seg_exons_u]) ### check intron confirmation as sum of valid intron scores ### intron score is the number of reads confirming this intron # exon_pre_exon_conf idx1 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exons[0][0]], seg_shape, order=order) + offset)[0] if len(idx1.shape) > 0 and idx1.shape[0] > 0: cov[0] += counts_edges[idx1[0], 1] # exon_exon_aft_conf idx2 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exons[-1][-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0] if len(idx2.shape) > 0 and idx2.shape[0] > 0: cov[0] += counts_edges[idx2[0], 1] # exon_pre_exon_aft_conf idx3 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0] if len(idx3.shape) > 0 and idx3.shape[0] > 0: cov[1] = counts_edges[idx3[0], 1] for i in range(len(seg_exons) - 1): # sum_inner_exon_conf idx4 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exons[i][-1], seg_exons[i+1][0]], seg_shape, order=order) + offset)[0] if len(idx4.shape) > 0 and idx4.shape[0] > 0: cov[0] += counts_edges[idx4[0], 1] return cov
def find_holes(data): sample = data.copy() size = sample.size # Here's a little hack to "flatten" star boxes tmp = scipy.sort(sample) star_cutoff = scipy.median(tmp[-30:-10]) * 0.6 sample = scipy.where(sample > star_cutoff, star_cutoff, sample) derivative = deriv_1d(sample) derivative = ndimage.gaussian_filter1d(derivative, 3) derivative = abs(derivative) tmp = scipy.sort(derivative) avg = scipy.median(tmp[size / 8:size * 3 / 8]) sigma = tmp[size / 8:size * 3 / 8].std() threshold = avg + sigma * 100. edge = [] count = 0 while derivative.max() > threshold: start = derivative.argmax() - 7 end = derivative.argmax() + 8 if start < 0: start = 0 if end > derivative.size: end = derivative.size fit = find_peak(derivative[start:end]) if start > 7 and end < derivative.size - 7: edge.append(float(start) + fit[2]) start -= 3 end += 3 if start < 0: start = 0 if end > derivative.size: end = derivative.size derivative[start:end] = 0. edge.sort() return edge, threshold, star_cutoff
def NGorN50(file_path='contigs.txt', genomesize=None):
    contigs, num_contig = file_parser(file_path)
    print("Total number of contigs: %d " % (num_contig))  # Expect 20

    # Sort the contigs in reverse (descending) order, e.g. array([79, 23, 10])
    contigs = scipy.sort(contigs)[::-1]
    #print(contigs)

    # Calculate the sum to compare against for N50 or NG50
    if genomesize == None:
        contig_sum = contigs.sum() / 2
        print("50 Contig Sum is: %d" % (contig_sum))
    else:
        contig_sum = int(genomesize) / 2
        print("50 Genome Size specified: %d" % (contig_sum))

    for counter in range(1, num_contig + 1):
        # TODO: Consider memoizing this if you need to reuse this script for
        # large contigs for performance gains.
        # Check the accumulated sum against the comparison
        if contigs[0:counter].sum() > contig_sum:
            print("Partial Contig Sum is: %d, with counter: %d, and contig length %d"
                  % (contigs[0:counter].sum(), counter, contigs[counter - 1]))
            # Only need to find the first case
            break
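# A quick worked check of the N50 logic above, using the hypothetical contig
# lengths from the comment ([79, 23, 10]) and NumPy instead of file_parser:
import numpy as np

contigs = np.sort(np.array([79, 23, 10]))[::-1]
half = contigs.sum() / 2                       # 56.0
cumulative = np.cumsum(contigs)                # [79, 102, 112]
print(contigs[np.argmax(cumulative > half)])   # 79, the N50 contig length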
def traj_ensemble_quantiles(traj_set, quantiles=(0.025, 0.5, 0.975)): """ Return a list of trajectories, each one corresponding the a given passed-in quantile. """ all_values = scipy.array([traj.values for traj in traj_set]) sorted_values = scipy.sort(all_values, 0) q_trajs = [] for q in quantiles: # Calculate the index corresponding to this quantile. The q is because # Python arrays are 0 indexed index = q * (len(sorted_values) - 1) below = int(scipy.floor(index)) above = int(scipy.ceil(index)) if above == below: q_values = sorted_values[below] else: # Linearly interpolate... q_below = (1.0 * below) / (len(sorted_values) - 1) q_above = (1.0 * above) / (len(sorted_values) - 1) q_values = sorted_values[below] + (q - q_below) * ( sorted_values[above] - sorted_values[below]) / (q_above - q_below) q_traj = copy.deepcopy(traj_set[0]) q_traj.values = q_values q_trajs.append(q_traj) return q_trajs
def bulk_bands_calculator(self, s, sub, kx, ky, kz):
    '''
    Calculate the band energies for the specified kx, ky, and kz values.
    The 3x3 Hamiltonian for wurtzite crystals is used for the valence,
    while a 1x1 Hamiltonian is used for the conduction band. The model is
    from the chapter by Vurgaftman and Meyer in the book by Piprek.
    '''
    E = scipy.zeros((4, len(s.Eg0)))
    E[0, :] = s.Eg0+s.delcr+s.delso/3+\
              hbar**2/(2*s.mepara)*(kx**2+ky**2)+\
              hbar**2/(2*s.meperp)*(kz**2)+\
              (s.a1+s.D1)*s.epszz+(s.a2+s.D2)*(s.epsxx+s.epsyy)
    L = hbar**2/(2*m0)*(s.A1*kz**2+s.A2*(kx+ky)**2)+\
        s.D1*s.epszz+s.D2*(s.epsxx+s.epsyy)
    T = hbar**2/(2*m0)*(s.A3*kz**2+s.A4*(kx+ky)**2)+\
        s.D3*s.epszz+s.D4*(s.epsxx+s.epsyy)
    F = s.delcr + s.delso / 3 + L + T
    G = s.delcr - s.delso / 3 + L + T
    K = hbar**2 / (2 * m0) * s.A5 * (kx + 1j * ky)**2 + s.D5 * (s.epsxx - s.epsyy)
    H = hbar**2 / (2 * m0) * s.A6 * (kx + 1j * ky) * kz + s.D6 * (s.epsxz)
    d = scipy.sqrt(2) * s.delso / 3
    for ii in range(len(s.Eg0)):
        mat = scipy.matrix([[F[ii], K[ii], -1j * H[ii]],
                            [K[ii], G[ii], -1j * H[ii] + d[ii]],
                            [-1j * H[ii], -1j * H[ii] + d[ii], L[ii]]])
        w, v = scipy.linalg.eig(mat)
        # Valence band energies, sorted in descending order
        E[1:, ii] = scipy.flipud(scipy.sort(scipy.real(w)))
    return E
def _get_Voronoi_edges(vor):
    r"""
    Given a Voronoi object as produced by the scipy.spatial.Voronoi class,
    this function calculates the start and end points of each edge in the
    Voronoi diagram, in terms of the vertex indices used by the received
    Voronoi object.

    Parameters
    ----------
    vor : scipy.spatial.Voronoi object

    Returns
    -------
    An N-by-2 array of vertex indices, indicating the start and end points of
    each edge in the Voronoi diagram.  These vertex indices can be used to
    index straight into the ``vor.vertices`` array to get spatial positions.
    """
    edges = [[], []]
    for facet in vor.ridge_vertices:
        # Create a closed cycle of vertices that define the facet
        edges[0].extend(facet[:-1] + [facet[-1]])
        edges[1].extend(facet[1:] + [facet[0]])
    edges = sp.vstack(edges).T  # Convert to scipy-friendly format
    mask = sp.any(edges == -1, axis=1)  # Identify edges at infinity
    edges = edges[~mask]  # Remove edges at infinity
    edges = sp.sort(edges, axis=1)  # Move all points to upper triangle
    # Remove duplicate pairs
    edges = edges[:, 0] + 1j * edges[:, 1]  # Convert to imaginary
    edges = sp.unique(edges)  # Remove duplicates
    edges = sp.vstack((sp.real(edges), sp.imag(edges))).T  # Back to real
    edges = sp.array(edges, dtype=int)
    return edges
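# A minimal usage sketch for _get_Voronoi_edges, assuming the helper is
# importable and using a hypothetical random point set:
import numpy as np
from scipy.spatial import Voronoi

np.random.seed(0)
points = np.random.rand(20, 2)
vor = Voronoi(points)

edges = _get_Voronoi_edges(vor)
print(edges.shape)                 # (N, 2): unique finite edges
segment = vor.vertices[edges[0]]   # spatial coordinates of the first edge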
def import_data(size=128):
    files = []
    orients = ["00F", "30L", "30R", "45L", "45R", "60L", "60R", "90L", "90R"]
    for orient in orients:
        _files = glob.glob(os.path.join(data_dir, "*/*_%s.jpg" % orient))
        files = files + _files
    files = sp.sort(files)

    D1id = []
    D2id = []
    Did = []
    Rid = []
    Y = sp.zeros([len(files), size, size, 3], dtype=sp.uint8)
    for _i, _file in enumerate(files):
        y = imread(_file)
        y = imresize(y, size=[size, size], interp="bilinear")
        Y[_i] = y
        # Parse donor and recording ids out of the file name
        fn = _file.split(".jpg")[0]
        fn = fn.split("/")[-1]
        did1, did2, rid = fn.split("_")
        Did.append(did1 + "_" + did2)
        Rid.append(rid)
    Did = sp.array(Did, dtype="|S100")
    Rid = sp.array(Rid, dtype="|S100")
    RV = {"Y": Y, "Did": Did, "Rid": Rid}
    return RV
def subsetsWithFits(fileNumString,onlyNew=False): """ Find data subsets (N) that have models that have been fit to all conditions. onlyNew (False) : Optionally include only subsets that have fits that are not included in the current combined fitProbs. """ fpd = loadFitProbData(fileNumString) saveFilename = fpd.values()[0]['saveFilename'] Nlist = [] for N in scipy.sort(fpd.keys()): # find models that have been fit to all conditions if len(fpd[N]['fitProbDataList']) == 1: fitModels = fpd[N]['fitProbDataList'][0]['logLikelihoodDict'].keys() else: fitModels = scipy.intersect1d([ fp['logLikelihoodDict'].keys() \ for fp in fpd[N]['fittingProblemList'] ]) if onlyNew: Nfilename = directoryPrefixNonly(fileNumString,N)+'/'+saveFilename fileExists = os.path.exists(Nfilename) if not fileExists: # no combined file exists if len(fitModels) > 0: Nlist.append(N) else: # check which fit models are currently included in the saved file fpMultiple = load(Nfilename) fitModelsSaved = fpMultiple.logLikelihoodDict.keys() if len(scipy.intersect1d(fitModels,fitModelsSaved)) < len(fitModels): Nlist.append(N) else: if len(fitModels) > 0: Nlist.append(N) return Nlist
def neo_p0(setup, *args):
    ntemps, nwalkers, nsteps = setup
    t = args[0]
    ndim = args[1]
    C = args[2]
    pos = sp.zeros((ntemps, nwalkers, ndim))
    for temp in range(ntemps):
        for j in range(ndim):
            boundaries = t[C[j]].lims
            fact = sp.absolute(boundaries[0] - boundaries[1]) / nwalkers
            rnd = sp.random.uniform(0.9, 0.9999)
            dif = sp.arange(nwalkers) * fact * sp.random.uniform(0.9, 0.9999)

            if (t[C[j]].cv and t[C[j]].tag() != 'Period'):
                for i in range(nwalkers):
                    pos[temp][i][j] = (boundaries[1] + 3 * boundaries[0]) / 4 + (dif[i] * 2. / 5. + fact / 2.0)
            elif t[C[j]].tag() == 'Jitter':
                jitt_ini = sp.sort(sp.fabs(sp.random.normal(0, 1, nwalkers))) * 0.1
                dif = jitt_ini * sp.random.uniform(0.9, 0.9999)
                for i in range(nwalkers):
                    pos[temp][i][j] = boundaries[0] + (dif[i] + fact / 2.0)
                    pos[temp][i][j] *= 0.1
            else:
                for i in range(nwalkers):
                    pos[temp][i][j] = boundaries[0] + (dif[i] + fact / 2.0)
    return pos
def draw_graph(graph):
    # create networkx graph
    G = nx.Graph()

    ordered_node_list = scipy.sort([int(i[1::]) for i in graph])

    # add nodes
    #for node in graph:
    #    G.add_node(node)
    for num in ordered_node_list:
        G.add_node('n' + str(num))

    # add edges
    for i in graph:
        for j in graph[i][1::]:
            G.add_edge(i, j)

    colors = ['b', 'r', 'g', 'c', 'w', 'k']
    node_color = [colors[graph[node][0]] for node in graph]

    # draw graph
    #pos = nx.shell_layout(G)
    pos = nx.spring_layout(G, iterations=100)
    nx.draw(G, pos, node_color=node_color)

    # show graph
    plt.axis('off')
    plt.show()
def compute_accuracy(self):
    """Computes accuracy across the range in `self.date_range`.

    Returns: a pandas DataFrame with three columns corresponding to each kind
    of prediction method (PredPol, perfect prediction (god), and the baseline
    (naive_count)). The entries of each column are an array where the ith
    entry is the average accuracy over `self.date_range` when visiting i
    number of grid cells
    """
    accuracy = {
        method: sp.zeros((len(self.results), len(self.lambda_columns)))
        for method in ['predpol', 'god', 'naive_count']
    }
    naive_count = count_seen(self.pred_obj, self.pred_obj.train)['num_observed']
    for i, (lambda_col, actual_col) in self._iterator():
        actual_vals = self.results[actual_col].values
        # Perfect prediction visits the cells with the most actual events first
        accuracy['god'][:, i] = sp.sort(actual_vals)[::-1]

        sorted_idx = sp.argsort(self.results[lambda_col])[::-1]
        accuracy['predpol'][:, i] = actual_vals[sorted_idx]

        sorted_idx = sp.argsort(naive_count)[::-1]
        accuracy['naive_count'][:, i] = actual_vals[sorted_idx]
        naive_count += self.results[actual_col]

    # Compute CI and p-values here
    for k, v in accuracy.items():
        accuracy[k] = sp.sum(v, axis=1)
        accuracy[k] = sp.cumsum(accuracy[k] / sp.sum(accuracy[k]))
    return pd.DataFrame(accuracy)
def traj_ensemble_quantiles(traj_set, quantiles=(0.025, 0.5, 0.975)): """ Return a list of trajectories, each one corresponding the a given passed-in quantile. """ all_values = scipy.array([traj.values for traj in traj_set]) sorted_values = scipy.sort(all_values, 0) q_trajs = [] for q in quantiles: # Calculate the index corresponding to this quantile. The q is because # Python arrays are 0 indexed index = q * (len(sorted_values) - 1) below = int(scipy.floor(index)) above = int(scipy.ceil(index)) if above == below: q_values = sorted_values[below] else: # Linearly interpolate... q_below = (1.0*below)/(len(sorted_values)-1) q_above = (1.0*above)/(len(sorted_values)-1) q_values = sorted_values[below] + (q - q_below)*(sorted_values[above] - sorted_values[below])/(q_above - q_below) q_traj = copy.deepcopy(traj_set[0]) q_traj.values = q_values q_trajs.append(q_traj) return q_trajs
def set_step_stim(self):
    """
    Generate a random step stimulus with given density of steps and
    given discrete stimulus values.
    """
    assert self.step_stim_density < self.nT, "step_stim_density must be "\
        "less than number of time points, but nT = %s, density = %s" \
        % (self.nT, self.step_stim_density)

    self.stim = sp.zeros(self.nT)

    # Get points at which to switch the step stimulus
    sp.random.seed(self.step_stim_seed)
    switch_pts = sp.random.choice(self.nT, self.step_stim_density)
    switch_pts = sp.sort(switch_pts)

    # Set values in each inter-switch interval from step_stim_vals array
    sp.random.seed(self.step_stim_seed)
    for iT in range(self.step_stim_density - 1):
        stim_val = sp.random.choice(self.step_stim_vals)
        self.stim[switch_pts[iT]:switch_pts[iT + 1]] = stim_val

    # Fill in ends
    edge_vals = sp.random.choice(self.step_stim_vals, 2)
    self.stim[:switch_pts[0]] = edge_vals[0]
    self.stim[switch_pts[self.step_stim_density - 1]:] = edge_vals[1]

    self.Tt = sp.arange(0, self.dt * self.nT, self.dt)
def termFrequencyMatrix(directory, stopwords, termlist):
    """
    The student must code this.
    """
    filenames = sp.sort(os.listdir(directory))
    frequencyMatrix = sp.zeros((len(termlist), len(filenames)))
    for i in xrange(len(filenames)):
        frequencyMatrix[:, i] = termVector(directory + filenames[i], stopwords, termlist)
    return frequencyMatrix.astype(float)
def generateNoteLength(self):
    length = (60. / self.wavetempo) * self.time_freq_fs
    note_length = sp.array([2**i for i in range(5)]) / 4.
    note_length *= length
    # Dotted-note lengths (a note plus the next shorter note)
    note_huten = sp.array(
        [note_length[i-1] + note_length[i] for i in range(1, 5)])
    note_length = sp.r_[note_length, note_huten]
    note_length = sp.sort(note_length)

    note_length_pair = []
    for i in range(note_length.size):
        try:
            upper = (note_length[i+1] - note_length[i]) / 2
            upper += note_length[i]
        except IndexError:
            upper = note_length[i] * 2
        try:
            lower = note_length_pair[-1][1]
        except IndexError:
            lower = 0
        note_length_pair.append((lower, upper))

    if(self.output_form == 'MML'):
        note_name = ['16', '16.', '8', '8.', '4', '4.', '2', '2.', '1']
    elif(self.output_form == 'PMX'):
        note_name = ['1', '1d', '8', '8d', '4', '4d', '2', '2d', '0']
    return (note_name, note_length_pair)
def _verify_eqrm_flags(eqrm_flags): """ Check that the values in eqrm_flags are consistant with how EQRM works. Args: eqrm_flags: A DictKeyAsAttributes instance. """ if not allclose(eqrm_flags.atten_periods, sort(eqrm_flags.atten_periods)): raise AttributeSyntaxError( "Syntax Error: Period values are not ascending") if eqrm_flags.save_hazard_map == True and eqrm_flags.is_scenario == True: raise AttributeSyntaxError( 'Cannot save the hazard map for a scenario.') if eqrm_flags.atten_variability_method == 1 and \ eqrm_flags.run_type == 'risk_csm': raise AttributeSyntaxError( 'Cannot use spawning when doing a risk_csm simulation.') if eqrm_flags.amp_variability_method == 1: raise AttributeSyntaxError( 'Cannot spawn on amplification.') if eqrm_flags.event_set_handler == 'load' and \ eqrm_flags.event_set_load_dir is None: raise AttributeSyntaxError( 'event_set_load_dir must be set if event_set_handler is load.') if eqrm_flags.event_set_handler == 'load' and \ not os.path.exists(eqrm_flags.event_set_load_dir): raise AttributeSyntaxError( 'event_set_load_dir %s must exist if event_set_handler is load.' % eqrm_flags.event_set_load_dir) # Only do these checks if different from output_dir # (output_dir gets created if not exists) if eqrm_flags.data_array_storage != eqrm_flags.output_dir and \ not os.path.exists(eqrm_flags.data_array_storage): raise AttributeSyntaxError( 'data_array_storage %s must exist and be accessible from %s' % (eqrm_flags.data_array_storage, socket.gethostname())) if eqrm_flags.fault_source_tag is None and \ eqrm_flags.zone_source_tag is None: raise AttributeSyntaxError( 'Either fault_source_tag or zone_source_tag must be set.') # Check to see if a parameter is defined that is incompatible with the # defined run_type # Note: _add_default_values should have already dealt with adding # incompatible defaults for param in CONV_NEW: if not is_param_compatible(param, eqrm_flags): raise AttributeSyntaxError( "Attribute " + param['new_para'] + " not compatible with run_type=" + eqrm_flags['run_type'] + " - compatible run_type values are " + str(param['run_type']))
def __init__(self, name, data):
    assert isinstance(name, six.string_types)
    assert isinstance(data, (list, tuple))
    self.name = name
    self.data = data
    # Map parsed dates to values, and keep a sorted array of the dates
    self.by_date = dict([(datetime_from_date(dateutil.parser.parse(d)), v)
                         for (d, v) in self.data])
    self.sorted = sort(array(list(self.by_date.keys())))
def calculateEV_montecarlo(fFn, zRV, nDraws=10000):
    global g_montecarloDraws
    # Cache sorted draws from the random variable zRV
    if (not (zRV, nDraws) in g_montecarloDraws):
        g_montecarloDraws[(zRV, nDraws)] = scipy.sort(zRV.rvs(size=nDraws))
    draws = g_montecarloDraws[(zRV, nDraws)]
    vals = map(fFn, draws)
    EV = scipy.mean(vals)
    return EV
def calculateEV_montecarlo2(grid, fArray, zRV, nDraws=10000):
    global g_montecarloDraws
    # Cache sorted draws from the random variable zRV
    if (not (zRV, nDraws) in g_montecarloDraws):
        g_montecarloDraws[(zRV, nDraws)] = scipy.sort(zRV.rvs(size=nDraws))
    draws = g_montecarloDraws[(zRV, nDraws)]
    fn = linterp.LinInterp1D(grid, fArray)
    EV = fn.applySorted(draws) / nDraws
    return EV
def __init__(self, N, vectors, coverage_ratio=0.2): """ Performs exact nearest neighbour search on the data set. vectors can either be a numpy matrix with all the vectors as columns OR a python array containing the individual numpy vectors. """ # We need a dict from vector string representation to index self.vector_dict = {} self.N = N self.coverage_ratio = coverage_ratio # Get numpy array representation of input self.vectors = numpy_array_from_list_or_numpy_array(vectors) # Build map from vector string representation to vector for index in range(self.vectors.shape[1]): self.vector_dict[self.__vector_to_string( self.vectors[:, index])] = index # Get transposed version of vector matrix, so that the rows # are the vectors (needed by cdist) vectors_t = numpy.transpose(self.vectors) # Determine the indices of query vectors used for comparance # with approximated search. query_count = numpy.floor(self.coverage_ratio * self.vectors.shape[1]) self.query_indices = [] for k in range(int(query_count)): index = numpy.floor(k * (self.vectors.shape[1] / query_count)) index = min(index, self.vectors.shape[1] - 1) self.query_indices.append(int(index)) print('\nStarting exact search (query set size=%d)...\n' % query_count) # For each query vector get radius of closest N neighbours self.nearest_radius = {} self.exact_search_time_per_vector = 0.0 for index in self.query_indices: v = vectors_t[index, :].reshape(1, self.vectors.shape[0]) exact_search_start_time = time.time() D = cdist(v, vectors_t, 'euclidean') # Get radius of closest N neighbours self.nearest_radius[index] = scipy.sort(D)[0, N] # Save time needed for exact search exact_search_time = time.time() - exact_search_start_time self.exact_search_time_per_vector += exact_search_time print('\Done with exact search...\n') # Normalize search time self.exact_search_time_per_vector /= float(len(self.query_indices))
def __init__(self, N, vectors, coverage_ratio=0.2): """ Performs exact nearest neighbour search on the data set. vectors can either be a numpy matrix with all the vectors as columns OR a python array containing the individual numpy vectors. """ # We need a dict from vector string representation to index self.vector_dict = {} self.N = N self.coverage_ratio = coverage_ratio # Get numpy array representation of input self.vectors = numpy_array_from_list_or_numpy_array(vectors) # Build map from vector string representation to vector for index in range(self.vectors.shape[1]): self.vector_dict[self.__vector_to_string( self.vectors[:, index])] = index # Get transposed version of vector matrix, so that the rows # are the vectors (needed by cdist) vectors_t = numpy.transpose(self.vectors) # Determine the indices of query vectors used for comparance # with approximated search. query_count = numpy.floor(self.coverage_ratio * self.vectors.shape[1]) self.query_indices = [] for k in range(int(query_count)): index = numpy.floor(k*(self.vectors.shape[1]/query_count)) index = min(index, self.vectors.shape[1]-1) self.query_indices.append(int(index)) print '\nStarting exact search (query set size=%d)...\n' % query_count # For each query vector get radius of closest N neighbours self.nearest_radius = {} self.exact_search_time_per_vector = 0.0 for index in self.query_indices: v = vectors_t[index, :].reshape(1, self.vectors.shape[0]) exact_search_start_time = time.time() D = cdist(v, vectors_t, 'euclidean') # Get radius of closest N neighbours self.nearest_radius[index] = scipy.sort(D)[0, N] # Save time needed for exact search exact_search_time = time.time() - exact_search_start_time self.exact_search_time_per_vector += exact_search_time print '\Done with exact search...\n' # Normalize search time self.exact_search_time_per_vector /= float(len(self.query_indices))
def quantify_exon_skip(event, gene, counts_segments, counts_edges):

    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    seg_shape = segs.seg_edges.shape
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
    idx_exon = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
    idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons2[2, 0]) & (sg.vertices[1, :] == event.exons2[2, 1]))[0]

    ### find segments corresponding to exons
    seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
    seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
    seg_exon = sp.sort(sp.where(segs.seg_match[idx_exon, :])[1])

    # get inner exon cov
    cov[0] = sp.sum(counts_segments[seg_exon] * seg_lens[seg_exon]) / sp.sum(seg_lens[seg_exon])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon_conf
    idx1 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exon[0]], seg_shape, order=order) + offset)[0]
    cov[0] += counts_edges[idx1, 1]
    # exon_exon_aft_conf
    idx2 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    cov[0] += counts_edges[idx2, 1]
    # exon_pre_exon_aft_conf
    idx3 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    cov[1] = counts_edges[idx3, 1]

    return cov
def entropy2(values): """Calculate the entropy of vector values. values will be flattened to a 1d ndarray.""" values = sp.asarray(values).flatten() p = sp.diff(sp.c_[0,sp.diff(sp.sort(values)).nonzero(), values.size])/float(values.size) H = (p*sp.log2(p)).sum() return -H
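# A quick sanity check of entropy2, assuming sp is the scipy module imported
# at the top of the file (NumPy arrays work as inputs):
import numpy as np

print(entropy2(np.array([0, 1, 0, 1])))   # 1.0 bit: two equally likely values
print(entropy2(np.array([1, 2, 3, 4])))   # 2.0 bits: four equally likely values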
def dict_arrsort(in_dict):
    """Sort all arrays in a dictionary."""
    try:
        for k in in_dict.keys():
            if isinstance(in_dict[k], sp.ndarray):
                in_dict[k] = sp.sort(in_dict[k])
    finally:
        # Return the dictionary even if sorting one of the entries fails
        return in_dict
def filterBid(allids, sbids):
    '''
    Gets two id lists; returns the indices of allids that match sbids.
    '''
    if sp.unique(sbids).shape[0] != sbids.shape[0]:
        warnings.warn("Subset ids are not unique: Making them unique")
        sbids = sp.unique(sbids)
    if sp.unique(allids).shape[0] != allids.shape[0]:
        warnings.warn("Superset ids are not unique: Making them unique")
        allids = sp.unique(allids)
    if sp.sum(sp.sort(allids) == allids) != allids.shape[0]:
        warnings.warn("Superset ids are not sorted: Sorting them")
        allids = sp.sort(allids)
    if sp.sum(sp.sort(sbids) == sbids) != sbids.shape[0]:
        warnings.warn("Subset ids are not sorted: Sorting them")
        sbids = sp.sort(sbids)
    return sp.where(sp.in1d(allids, sbids))[0]
def generateNoteLength(self, tempo):
    length = 60 / tempo * self.fs * 4
    note_length = sp.array([2**i for i in range(5)]) / 4
    note_length *= length
    # Dotted-note lengths (a note plus the next shorter note)
    note_huten = sp.array(
        [note_length[i-1] + note_length[i] for i in range(1, 4)])
    note_length = sp.r_[note_length, note_huten]
    self.note_length = sp.sort(note_length)
    self.note_name = ['16', '16.', '8', '8.', '4', '4.', '2', '2.', '1']
def ECDF2(seq):
    """
    Calculate the Empirical Cumulative Distribution Function (ecdf)
    from a sequence 'seq'.
    """
    N = len(seq)
    sseq = sp.sort(seq)
    ranks = sp.stats.rankdata(sseq)
    ecdf = ranks / (N + 1)
    return ecdf
def __init__(self, data, sky, wave, wave2, skymodel): """ Plot data """ self.data = plt.plot(wave, data, c='b')[0] self.sky = plt.plot(wave2, sky, c='gray')[0] self.canvas = self.data.get_figure().canvas self.ax = self.data.get_axes() self.xdata = self.data.get_xdata().copy() self.start = [ data.copy(), sky.copy(), wave.copy(), wave2.copy(), skymodel ] self.skymodel = skymodel """ Get metadata (ie line locations) for arcs """ data = self.data.get_ydata() self.datapeaks = ndimage.maximum_filter(data, 9) tmp = scipy.sort(data) thresh = tmp[tmp.size * 0.95] cond = (data == self.datapeaks) & (data > thresh) self.datapeaks = scipy.where(cond)[0] self.datasel = self.datapeaks * 0 self.datalines = [] for peak in self.datapeaks: l = plt.axvline(self.xdata[peak], c='k') self.datalines.append(l) self.spec = self.data self.peaks = self.datapeaks self.selected = self.datasel self.lines = self.datalines """ Set useful flags """ self.domotion = False self.origx = None self.soln = None self.pick = False self.fitlines = None self.keyid = self.canvas.mpl_connect('key_press_event', self.key_press) self.connect() print """ Mouse Controls: - left button drags single lines (rescales spectrum!) - middle button drags all lines (or exits from pan/zoom modes) - right button selects/deselects lines Keyboard Commands: a - add new line (use mouse to select the line) m - fit a polynomial to the blue `solution' d - optimize the blue fit to the gray model (like m, but optimizes too) w - write the current state to disk r - read a saved state n - reset to the initial state q - quit (performs an `m' fit if no fit has been applied yet) """ plt.show()
def runFitAllParallelWorker(fileNumString,endTime=None,verbose=True): """ Each worker node runs this function to look for and perform work. endTime (None) : Stop work if endTime hours (wall time) have elapsed when completing a work unit. If None, continue indefinitely. """ # check that the fitProbData file exists if not fileNumString+"_fitProbData.dat" in os.listdir('.'): raise Exception, "fitProbData database file not found: "+str(fitProbDatFilename) # 9.24.2013 make sure SloppyCell C compiling is working if not testCcompiling(): raise Exception, "SloppyCell C compiling not working." if endTime is None: endTime = scipy.inf startWallTime = time.time() elapsedTimeHours = 0 while elapsedTimeHours < endTime: fitProbData = loadFitProbData(fileNumString) saveFilename = fitProbData.values()[0]['saveFilename'] numTimepointsList = scipy.sort(fitProbData.keys()) # () find a (condition,Np,model) triplet to work on conditioni,numTimepointsi,modelj = assignWork(fileNumString) numTimepoints = numTimepointsList[numTimepointsi] fitProb = loadFitProb(saveFilename,fileNumString,conditioni,numTimepoints) if verbose: print "runFitAllParallelWorker: Assigned work: condition",conditioni,\ ", numTimepoints",numTimepoints,", model index",modelj # set up smallerBestSeenParams if (numTimepointsi > 0) and \ (getState(fitProbData,conditioni,numTimepointsi-1,modelj) == 'finished'): smallerFitProb = loadFitProb(saveFilename,fileNumString,conditioni, numTimepointsList[numTimepointsi-1]) fitProb.smallerBestParamsDict = paramsDict(smallerFitProb) # fit the single model fitProb.fitAll(maxNumFit=modelj+1) # save the result in the individual fitProbDict file saveFitProb(fitProb,saveFilename,fileNumString,conditioni,numTimepoints) # save the result in the more general fitProbData file updateFitProbData(fitProb,fileNumString,conditioni,numTimepoints,modelj) if verbose: print "runFitAllParallelWorker: Finished work." elapsedTimeHours = (time.time() - startWallTime)/3600.
def measure(file,num=25): f = pyfits.open(file) data = f[0].data.astype(scipy.float64)/1000. size = data.size wave = spectools.wavelength(file) sorted = scipy.sort(data) zero = sorted[size/20:size/10].mean() sigma = sorted[size/20:size/10].std() thresh = zero+100*sigma count = 0 search = data.copy() vals = scipy.zeros(num) place = scipy.zeros(num) while count<num: max = search.max() if max<thresh: break pos = search.argmax() search[pos-5:pos+6] = 0. if pos<5 or pos+6>size: continue fitdata = scipy.zeros((11,2)) fitdata[:,0] = wave[pos-5:pos+6] fitdata[:,1] = data[pos-5:pos+6] par = scipy.zeros(4) par[1] = max par[2] = wave[pos] par[3] = wave[pos]-wave[pos-1] fit,chi2 = special_functions.ngaussfit(fitdata,par) if chi2>4: continue model = special_functions.ngauss(fitdata[:,0],fit) pylab.plot(wave[pos-5:pos+6],model) feature = wave[pos] width = fit[3]*299800/feature vals[count] = width place[count] = feature count += 1 args = place.argsort() vals = vals[args] place = place[args] return scipy.median(vals)
def g2ig(g):
    """
    Converts our graph representation to an igraph for plotting
    """
    t = scipy.where(CG2adj(g) == 1)
    l = zip(t[0], t[1])
    ig = igraph.Graph(l, directed=True)
    ig.vs["name"] = scipy.sort([u for u in g])
    ig.vs["label"] = ig.vs["name"]
    return ig
def makeFpdLean(fpd):
    """
    Modify in place to create a stripped-down version of fpd that
    doesn't include the models.
    """
    for N in scipy.sort(fpd.keys()):
        fp = fpd[N]
        for f in fp.fittingProblemList:
            f.fittingModelDict = {}
            f.fittingModelList = []
def ECDF(seq):
    """
    Calculate the Empirical Cumulative Distribution Function (ecdf) from a
    sequence 'seq'. A scipy interpolation object is returned.
    """
    N = len(seq)
    sseq = sp.sort(seq)
    ecdf = sp.linspace(1. / N, 1, N)
    return interp1d(sseq, ecdf, bounds_error=False)
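# A minimal usage sketch for ECDF, assuming sp and interp1d are imported at
# module level (sp.sort, sp.linspace, scipy.interpolate.interp1d):
import numpy as np

np.random.seed(0)
sample = np.random.normal(size=1000)

F = ECDF(sample)         # interp1d object mapping value -> empirical CDF
print(float(F(0.0)))     # ~0.5 for a standard normal sample
print(float(F(1.96)))    # ~0.975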
def entropy2(values): """Calculate the entropy of vector values. values will be flattened to a 1d ndarray.""" values = values.flatten() M = len(sp.unique(values)) p = sp.diff(sp.c_[sp.diff(sp.sort(values)).nonzero(), len(values)])/float(len(values)) H = -((p*sp.log2(p)).sum()) return H