def encode_by_tokens(self, graph_set):
    sentence_tokens = graph_set[0].get("tokens", [])
    sentence_encoded = [utils.get_idx(t, self._word2idx) for t in sentence_tokens]
    edges_encoded = []
    for g in graph_set:
        first_edge = graph.get_graph_first_edge(g)
        property_label = first_edge.get('label', '')
        edge_ids = [utils.get_idx(t, self._word2idx) for t in property_label.split()]
        edges_encoded.append(edge_ids)
    return sentence_encoded, edges_encoded
def get_interp_data(z_sample, maps, logfile=None, verbose=True):
    """
    Finds the two column density maps whose redshifts are the nearest
    lower and nearest higher redshifts to z_sample and returns their
    filenames.

    Parameters
    ----------
    z_sample : float
        The redshift of interest in the interpolation.
    maps : array or array-like
        The filenames of the column density maps.
    logfile :
        The file to write the logs.
    verbose :
        Whether to also print the log output.

    Returns
    -------
    map_low : str
        The filename of the map with the nearest lower redshift to z_sample.
    map_high : str
        The filename of the map with the nearest higher redshift to z_sample.
    """
    z_exist = np.empty(len(maps))
    for i in range(len(maps)):
        with h5py.File(maps[i], "r") as ds:
            z_exist[i] = ds["Header"].attrs["Redshift"]

    # Get indices of the maps bracketing z_sample
    idx_low, idx_high = utils.get_idx(z_sample, z_exist)

    # Get redshift of maps lower/higher than z_sample
    z_low, z_high = z_exist[idx_low], z_exist[idx_high]
    dist_low, dist_high = utils.z_to_mpc(z_low), utils.z_to_mpc(z_high)
    map_low, map_high = maps[idx_low], maps[idx_high]

    if logfile:
        wlog("{0:<10} {1:>10} {2:<10} {3:>10}".format(
            "idx_low", idx_low, "idx_high", idx_high), logfile, verbose)
        wlog("{0:<10} {1:>10.5} {2:<10} {3:>10.5}".format(
            "z_low", z_low, "z_high", z_high), logfile, verbose)
        wlog("{0:<10} {1:>9.5} {2:<10} {3:>9.5}".format(
            "dist_low", dist_low, "dist_high", dist_high), logfile, verbose)
        wlog("{0:<10} {1} {2:<10} {3}\n".format(
            "map_low", map_low, "map_high", map_high), logfile, verbose)

    return map_low, map_high
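# A minimal sketch of the bracketing helper assumed above (hypothetical --
# the real utils.get_idx may differ): return the indices of the nearest
# lower and nearest higher redshifts around z_sample.
def get_idx(z_sample, z_exist):
    order = np.argsort(z_exist)
    pos = np.searchsorted(z_exist[order], z_sample)
    return order[max(pos - 1, 0)], order[min(pos, len(order) - 1)]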
def encode_by_tokens(self, graph_set):
    sentence_tokens = graph_set[0].get("tokens", [])
    sentence_encoded = [utils.get_idx(t, self._word2idx) for t in sentence_tokens]
    graphs_encoded = []
    for g in graph_set:
        edges_encoded = []
        for edge in g.get('edgeSet', []):
            property_label = edge.get('label', '')
            edge_ids = [utils.get_idx(t, self._word2idx) for t in property_label.split()]
            edges_encoded.append(edge_ids)
        graphs_encoded.append(edges_encoded)
    return sentence_encoded, graphs_encoded
def encode_graphs(self, graph_set):
    graphs_encoded = []
    for g in graph_set:
        edges_encoded = []
        for edge in g.get('edgeSet', []):
            property_label = edge.get('label', '')
            edge_ids = [utils.get_idx(t, self._word2idx) for t in property_label.split()]
            edges_encoded.append(edge_ids)
        graphs_encoded.append(edges_encoded)
    return graphs_encoded
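# Illustrative inputs for the encoders above (hypothetical vocabulary and
# graph; only get_idx's lookup behaviour is assumed -- a token's index with
# an out-of-vocabulary fallback of 0):
#
#   word2idx  = {'<unk>': 0, 'who': 1, 'directed': 2, 'inception': 3, 'director': 4}
#   graph_set = [{'tokens': ['who', 'directed', 'inception'],
#                 'edgeSet': [{'label': 'director'}]}]
#
# encode_by_tokens(graph_set) would then return ([1, 2, 3], [[[4]]]) and
# encode_graphs(graph_set) would return [[[4]]].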
def feature_extraction(features, filtered, valid_labels, fs, window):
    # all feature calculations must end up the same size
    X = np.empty((len(valid_labels[window:]), 0))
    for i in features:
        if i == 'linelength':
            X = np.concatenate((X, linelength(filtered, window)), axis=1)
        if i == 'delta':
            X = np.concatenate(
                (X, bandpower(filtered, window, utils.get_idx(delta_band, window, fs), fs)),
                axis=1)
        if i == 'theta':
            X = np.concatenate(
                (X, bandpower(filtered, window, utils.get_idx(theta_band, window, fs), fs)),
                axis=1)
        if i == 'alpha':
            X = np.concatenate(
                (X, bandpower(filtered, window, utils.get_idx(alpha_band, window, fs), fs)),
                axis=1)
        if i == 'beta':
            X = np.concatenate(
                (X, bandpower(filtered, window, utils.get_idx(beta_band, window, fs), fs)),
                axis=1)
        if i == 'gamma':
            X = np.concatenate(
                (X, bandpower(filtered, window, utils.get_idx(gamma_band, window, fs), fs)),
                axis=1)
    # size of labels should be consistent with X.shape[0]
    y = valid_labels[window:]
    return X, y
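# A minimal sketch of the band-to-bin helper assumed above (hypothetical --
# the real utils.get_idx may differ): map a (low, high) band in Hz to FFT
# bin indices, assuming `window` is the window length in samples, so the
# bin spacing is fs / window Hz.
def get_idx(freq_band, window, fs):
    low, high = freq_band
    idx_start = int(np.floor(low * window / fs))
    idx_end = int(np.ceil(high * window / fs))
    return idx_start, idx_end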
def train(self):
    os.makedirs(self.summ_path + "model/")
    optimizer = keras.optimizers.Adam(0.00001)
    global_step = 0
    print("Training Started. Results and summary files are stored at", self.summ_path)
    for cur_epoch in trange(config.epoch):
        trn_sup_idx, trn_qry_idx, trn_lbl = utils.get_idx(lbl=self.trn_lbl)
        for cur_step in trange(0, config.iter_cnt, config.batch_size):
            cur_sup_dat = self.trn_dat[trn_sup_idx[cur_step:cur_step + config.batch_size]]
            cur_qry_dat = self.trn_dat[trn_qry_idx[cur_step:cur_step + config.batch_size]]
            cur_lbl = trn_lbl[cur_step:cur_step + config.batch_size]
            self.train_one_step(x_sup=cur_sup_dat, x_qry=cur_qry_dat, lbl=cur_lbl,
                                model=self.all_model, optim=optimizer,
                                vars=self.all_model.trainable_variables,
                                step=global_step, log=global_step % 10 == 0)
            global_step += 1
        val_sup_idx, val_qry_idx, val_lbl = utils.get_idx(lbl=self.val_lbl, iter_cnt=100)
        self.logger(x_sup=self.val_dat[val_sup_idx], x_qry=self.val_dat[val_qry_idx],
                    lbl=val_lbl, model=self.all_model, step=global_step)
        self.all_model.save_weights(self.summ_path + "model/%04d.h5" % cur_epoch)
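# A minimal sketch of the episode sampler assumed above (hypothetical -- the
# real utils.get_idx may differ): draw support/query index pairs from the
# label array and a binary pair label marking whether the classes match.
def get_idx(lbl, iter_cnt=config.iter_cnt):
    sup_idx = np.random.randint(0, len(lbl), size=iter_cnt)
    qry_idx = np.random.randint(0, len(lbl), size=iter_cnt)
    pair_lbl = (np.asarray(lbl)[sup_idx] == np.asarray(lbl)[qry_idx]).astype(np.float32)
    return sup_idx, qry_idx, pair_lbl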
def get_edge_feature_vector(self, edge):
    edge_kbid = edge.get('kbID')[:-1] if 'kbID' in edge else utils.unknown_el
    right_label_ids = [utils.get_idx(t, self._word2idx)
                       for t in edge.get('canonical_right', "").split()
                       ][:self._p.get('symbolic.features', {}).get("right.label", 0)]
    feature_vector = [self._property2idx.get(edge_kbid, 0),
                      self._property2idx.get(
                          edge['hopUp'][:-1] if 'hopUp' in edge else utils.all_zeroes, 0),
                      self._property2idx.get(
                          edge['hopDown'][:-1] if 'hopDown' in edge else utils.all_zeroes, 0),
                      self._modifier2idx.get(
                          "argmax" if "argmax" in edge
                          else "argmin" if "argmin" in edge
                          else "num" if "num" in edge
                          else "filter" if "filter" in edge
                          else utils.all_zeroes, 0),
                      self._type2idx.get(edge.get('type', utils.unknown_el), 0),
                      self._propertytype2idx.get(
                          edge['kbID'][-1] if 'kbID' in edge else utils.unknown_el, 0),
                      ] + right_label_ids
    assert len(feature_vector) <= self._feature_vector_size
    return feature_vector
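# Illustrative input (hypothetical values): for an edge such as
#   {'kbID': 'P31v', 'type': 'direct', 'canonical_right': 'barack obama'}
# the vector is [idx('P31'), hopUp id, hopDown id, modifier id,
# idx('direct'), property-type id of 'v'] followed by the word ids of
# 'barack obama', truncated to the configured right.label length.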
def longest_increasing_seq(A):
    """
    Longest Increasing Subsequence.

    Given a sequence of n real numbers A(1) ... A(n), determine a
    subsequence (not necessarily contiguous) of maximum length in which
    the values form a strictly increasing sequence.
    """
    if not A:
        return 0, []
    # D[i]: length of the longest increasing subsequence ending at i;
    # T[i]: index of the previous element of that subsequence (-1 = none)
    D, T = [1] * len(A), [-1] * len(A)
    maxLen, maxIdx = 1, 0
    for i in range(1, len(A)):
        for j in range(i):
            if A[i] > A[j] and D[i] < D[j] + 1:
                D[i] = D[j] + 1
                T[i] = j
        if D[i] > maxLen:
            maxLen = D[i]
            maxIdx = i
    l = utils.get_idx(maxIdx, T, [])
    return maxLen, [A[i] for i in l]
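# A minimal sketch of the path-reconstruction helper assumed above
# (hypothetical -- the real utils.get_idx may differ): follow the
# predecessor table T back from idx, accumulating indices in order.
def get_idx(idx, T, acc):
    if idx == -1:
        return acc
    return get_idx(T[idx], T, [idx] + acc)

# Example: longest_increasing_seq([3, 1, 4, 1, 5, 9, 2, 6]) -> (4, [3, 4, 5, 9])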
def test_get_idx(self):
    self.assertEqual(utils.get_idx(['a', 'b', 'c'], 1), 'b')
    self.assertEqual(utils.get_idx(pd.Series(['a', 'b', 'c']), 1), 'b')
def print_indices(f, freq_band, window, fs):
    idxStartBin, idxEndBin = utils.get_idx(freq_band, window, fs)
    f.write("val idxStartBin = %d\n" % idxStartBin)
    f.write("val idxEndBin = %d\n" % idxEndBin)
    f.write("\n\n")
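# Illustrative output, assuming the bin-index sketch after feature_extraction
# (window = 256 samples, fs = 256 Hz, delta band (0.5, 4.0) Hz):
#   val idxStartBin = 0
#   val idxEndBin = 4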
def plot_distributions(samples, output_dir, bin_sizes, plot_var, sig_tag,
                       weight_type='None', normalize=False, density=True,
                       log=True, file_name=''):
    if 'top' in sig_tag:
        tag = r'$t\bar{t}$'
    elif 'BSM' in sig_tag:
        tag = 'BSM'
    elif 'OoD' in sig_tag:
        tag = 'OoD'
    if 'OoD' in sig_tag:
        labels = {0: [tag, 'QCD'], 1: [tag + ' (weighted)', 'QCD (weighted)']}
    else:
        labels = {0: [tag, 'QCD'], 1: [tag + ' (cut)', 'QCD (cut)']}
    colors = ['tab:orange', 'tab:blue', 'tab:brown']
    alphas = [1, 0.5]
    xlabel = {'pt': '$p_t$', 'm': '$m$',
              'rljet_n_constituents': 'Number of constituents'}[plot_var]
    plt.figure(figsize=(13, 8))
    pylab.grid(True)
    axes = plt.gca()
    if not isinstance(samples, list):
        samples = [samples]
    for m in [0, 1]:
        for n in range(len(samples)):
            sample = samples[n]
            condition = (sample['JZW'] == -1 if m == 0 else
                         sample['JZW'] >= 0 if m == 1 else
                         sample['JZW'] >= -2)
            if not np.any(condition):
                continue
            variable = np.float32(sample[plot_var][condition])
            weights = sample['weights'][condition]
            if 'flat' in weight_type:
                min_val, max_val = max(0, np.min(variable)), np.max(variable)
            else:
                min_val, max_val = max(0, np.min(sample[plot_var])), np.max(sample[plot_var])
            bins = get_idx(max_val, bin_size=bin_sizes[plot_var],
                           min_val=min_val, integer=False, tuples=False)
            if normalize:
                weights *= 100 / np.sum(sample['weights'])
            if density:
                indices = np.searchsorted(bins, variable, side='right')
                weights /= np.take(np.diff(bins), np.minimum(indices, len(bins) - 1) - 1)
            pylab.hist(variable, bins, histtype='step', weights=weights,
                       color=colors[m], lw=2, log=log, alpha=alphas[n],
                       label=labels[n][m])
    if 'OoD' in sig_tag:
        if plot_var == 'm':
            pylab.xlim(0, 1200)
            pylab.ylim(1e0, 1e5)
        elif plot_var == 'pt':
            pylab.xlim(0, 3000)
            pylab.ylim(1e0, 1e5)
    elif 'Geneva' in sig_tag:
        if plot_var == 'm':
            pylab.xlim(0, 500)
            pylab.ylim(1e-2, 1e5)
        elif plot_var == 'pt':
            pylab.xlim(0, 2000)
            pylab.ylim(1e-2, 1e5)
    else:
        if plot_var == 'm':
            pylab.xlim(0, 500)
            pylab.ylim(1e0, 1e7)
        elif plot_var == 'pt':
            pylab.xlim(0, 2000)
            pylab.ylim(1e0, 1e7)
    axes.xaxis.set_minor_locator(ticker.AutoMinorLocator(10))
    if not log:
        axes.yaxis.set_minor_locator(ticker.AutoMinorLocator(10))
    plt.xlabel(xlabel + ' (GeV)', fontsize=24)
    y_label = ' density' if density else ''
    if normalize:
        y_label += ' (%)'
    elif sig_tag in ['top-UFO', 'BSM']:
        y_label += ' (' + r'58.5 fb$^{-1}$' + ')'
    plt.ylabel('Distribution' + y_label, fontsize=24)
    axes.tick_params(axis='both', which='major', labelsize=14)
    plt.legend(loc='upper right', ncol=1 if len(samples) == 1 else 2, fontsize=18)
    if file_name == '':
        file_name = (plot_var if plot_var == 'pt' else 'mass') + '_dist.png'
    file_name = output_dir + '/' + file_name
    print('Saving', format(plot_var, '2s'), 'distributions to:', file_name)
    plt.savefig(file_name)
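# A minimal sketch of the binning helper assumed above (hypothetical -- the
# real get_idx may differ; the integer and tuples flags are ignored here):
# uniform histogram bin edges from min_val up to at least max_val.
def get_idx(max_val, bin_size=1.0, min_val=0.0, integer=False, tuples=False):
    return np.arange(min_val, max_val + bin_size, bin_size)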
# NOTE: the opening of the first call is truncated in the source; the
# "supports.csv" filename is an assumption following the pattern below.
np.savetxt("generated_files/supports.csv", supports, fmt=fmt, delimiter=',')
np.savetxt("generated_files/alpha_vectors.csv", alpha_vector, fmt=fmt, delimiter=',')
np.savetxt("generated_files/intercepts.csv", intercept, fmt=fmt, delimiter=',')
for i in features:
    if i == 'delta':
        np.savetxt("generated_files/delta_index.csv",
                   utils.get_idx(fe.delta_band, window, fs),
                   fmt='%d', delimiter=',')
    if i == 'theta':
        np.savetxt("generated_files/theta_index.csv",
                   utils.get_idx(fe.theta_band, window, fs),
                   fmt='%d', delimiter=',')
    if i == 'alpha':
        np.savetxt("generated_files/alpha_index.csv",
                   utils.get_idx(fe.alpha_band, window, fs),
                   fmt='%d', delimiter=',')
    if i == 'beta':
        np.savetxt("generated_files/beta_index.csv",
                   utils.get_idx(fe.beta_band, window, fs),
                   fmt='%d', delimiter=',')
def get_data_for_interpolation(z_sample, redshift_arr, projections, logfile=None):
    """
    Finds the two projections with redshifts that are the nearest higher
    and nearest lower redshifts to z_sample and extracts their data.

    Parameters
    ----------
    z_sample : float
        The redshift of interest in the interpolation.
    redshift_arr : array or array-like
        The redshifts from the projections ordered by snapshot number.
    projections : array or array-like
        The filenames of the projections. These have the same indexing
        as redshift_arr.
    logfile :
        The file to write the logs.

    Returns
    -------
    data_low :
        The data of the projection with the nearest lower redshift to z_sample.
    dist_low :
        The comoving distance to the projection with the nearest lower
        redshift to z_sample.
    data_high :
        The data of the projection with the nearest higher redshift to z_sample.
    dist_high :
        The comoving distance to the projection with the nearest higher
        redshift to z_sample.
    """
    if logfile:
        logfile.write("\n-----------------")
        logfile.write("\nGetting Interpolation Data: z = {0:.5f}".format(z_sample))
        logfile.write("\n-----------------\n")

    idx_low, idx_high = utils.get_idx(z_sample, redshift_arr)
    z_low, z_high = redshift_arr[idx_low], redshift_arr[idx_high]
    dist_low, dist_high = utils.z_to_mpc(z_low), utils.z_to_mpc(z_high)
    proj_low, proj_high = projections[idx_low], projections[idx_high]
    data_low, data_high = h5py.File(proj_low, "r"), h5py.File(proj_high, "r")

    if logfile:
        logfile.write("{0:<10} {1:10}\n{2:<10} {3:10}\n".format(
            "idx_low", idx_low, "idx_high", idx_high))
        logfile.write("{0:<10} {1:10.5}\n{2:<10} {3:10.5}\n".format(
            "z_low", z_low, "z_high", z_high))
        logfile.write("{0:<10} {1:10.5}\n{2:<10} {3:10.5}\n".format(
            "dist_low", dist_low, "dist_high", dist_high))
        logfile.write("{0:<10} {1}\n{2:<10} {3}".format(
            "proj_low", proj_low, "proj_high", proj_high))

    return data_low, dist_low, data_high, dist_high
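# Illustrative use (hypothetical "DM" dataset name and a simple linear
# scheme): weight the two bracketing slices by comoving distance.
#   data_low, dist_low, data_high, dist_high = get_data_for_interpolation(
#       0.35, redshift_arr, projections)
#   w = (utils.z_to_mpc(0.35) - dist_low) / (dist_high - dist_low)
#   interp_slice = (1 - w) * data_low["DM"][:] + w * data_high["DM"][:]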
def encode_question(self, graph_set):
    sentence_tokens = graph_set[0].get("tokens", [])
    sentence_encoded = [utils.get_idx(t, self._word2idx) for t in sentence_tokens]
    return sentence_encoded