def train_val_test_split(data_df, number_judges, train_ratio=0.8, val_ratio=0.1,
                         verbose=0, toshuffle=True):
    starttime = time.time()
    sorted_all_data = data_df.sort_values(by='judge_embed_index')
    train_indexes = []
    val_indexes = []
    test_indexes = []
    currentiloc = 0
    for judge_index in range(number_judges):
        if verbose and judge_index % 500 == 0:
            print(judge_index, time.time() - starttime)
        cases_of_this_judge = sorted_all_data.loc[sorted_all_data['judge_embed_index'] == judge_index]
        number_cases = cases_of_this_judge.shape[0]
        n_of_train = int(number_cases * train_ratio)
        n_of_val = int(number_cases * val_ratio)
        nextiloc = currentiloc + number_cases
        indexes = list(range(currentiloc, nextiloc))
        if toshuffle:  # `toshuffle` was accepted but never used; gate the per-judge shuffle on it
            shuffle(indexes)  # in-place shuffle, assumes `from random import shuffle`
        train_indexes += indexes[:n_of_train]
        val_indexes += indexes[n_of_train:n_of_train + n_of_val]
        test_indexes += indexes[n_of_train + n_of_val:]
        currentiloc = nextiloc
    # The collected indexes are positions within the sorted frame, so select with
    # .iloc on sorted_all_data rather than .loc on the unsorted data_df.
    return (skshuffle(sorted_all_data.iloc[train_indexes]),
            skshuffle(sorted_all_data.iloc[val_indexes]),
            skshuffle(sorted_all_data.iloc[test_indexes]))
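# Hedged usage sketch, not part of the original code: a toy DataFrame with a
# 'judge_embed_index' column covering judges 0 and 1. It assumes the imports the
# function itself relies on (time, `from random import shuffle`,
# `from sklearn.utils import shuffle as skshuffle`) are already in scope.
import pandas as pd

toy_df = pd.DataFrame({
    'judge_embed_index': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
    'outcome':           [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
})
train_df, val_df, test_df = train_val_test_split(
    toy_df, number_judges=2, train_ratio=0.6, val_ratio=0.2)
print(len(train_df), len(val_df), len(test_df))  # 6 2 2: a 60/20/20 split per judge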
def shuffle(X, y=None):
    if isinstance(X, dict):
        # dict_values has no .append in Python 3, so materialise it as a list first
        to_shuffle = list(X.values())
        if y is not None:
            to_shuffle.append(y)
        shuffled = skshuffle(*to_shuffle)
        if y is not None:
            return dict(zip(X.keys(), shuffled[:-1])), shuffled[-1]
        else:
            return dict(zip(X.keys(), shuffled))
    else:
        return skshuffle(X, y)
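# Hedged usage sketch, not from the original source: demonstrates the dict branch,
# which co-shuffles every value array (and y) with one permutation while keeping the
# keys aligned. Assumes numpy and `from sklearn.utils import shuffle as skshuffle`.
import numpy as np

X = {'tokens': np.arange(5), 'lengths': np.arange(5) * 10}
y = np.arange(5) * 100
X_shuf, y_shuf = shuffle(X, y)
# Row i of X_shuf['tokens'], X_shuf['lengths'] and y_shuf still refer to the same sample.
print(X_shuf['tokens'], X_shuf['lengths'], y_shuf)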
def sorted_batch_iter(data, batch_size, shuffle=False, random_state=123):
    batch = []
    sorted_data = sorted(data, key=lambda x: len(x[0]), reverse=True)
    for line in sorted_data:
        batch.append(line)
        if len(batch) == batch_size:
            batch = skshuffle(batch, random_state=random_state)
            yield tuple(list(x) for x in zip(*batch))  # yield batch
            batch = []
    if batch:
        batch = skshuffle(batch, random_state=random_state)
        yield tuple(list(x) for x in zip(*batch))
def get_minibatches(X, mb_size, shuffle=True):
    """
    Generate minibatches from given dataset for training.

    Params:
    -------
    X: np.array of M x 3
        Contains the triplets from dataset. The entities and relations are
        translated to their unique indices.

    mb_size: int
        Size of each minibatch.

    shuffle: bool, default True
        Whether to shuffle the dataset before dividing it into minibatches.

    Returns:
    --------
    mb_iter: generator
        Example usage:
        --------------
        mb_iter = get_minibatches(X_train, mb_size)

        for X_mb in mb_iter:
            # do something with X_mb, the minibatch
    """
    X_shuff = np.copy(X)
    if shuffle:
        X_shuff = skshuffle(X_shuff)

    for i in range(0, X_shuff.shape[0], mb_size):
        yield X_shuff[i:i + mb_size]
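# Hedged usage sketch mirroring the docstring's example: iterate over minibatches of a
# toy (head, relation, tail) index array. Assumes numpy and
# `from sklearn.utils import shuffle as skshuffle`.
import numpy as np

X_train = np.arange(30).reshape(10, 3)          # 10 toy triplets
for X_mb in get_minibatches(X_train, mb_size=4):
    print(X_mb.shape)                           # (4, 3), (4, 3), then (2, 3)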
def get_data(path='../data', subset=True):
    # Read and merge the data
    df_fake = pd.read_csv(os.path.join(path, 'fake.csv'))
    df_real = pd.read_csv(os.path.join(path, 'real.csv'))
    df_fake['label'] = 1
    df_real['label'] = 0
    df_tot = pd.concat([df_fake, df_real])
    df_tot = skshuffle(df_tot, random_state=42)

    # Add text/title length
    df_tot['text_length'] = df_tot.apply(
        lambda x: len(x['text']) if not pd.isnull(x['text']) else -1, axis=1)
    df_tot['title_length'] = df_tot.apply(
        lambda x: len(x['title']) if not pd.isnull(x['title']) else -1, axis=1)

    if subset:
        # Subset the data
        df_tot = df_tot.dropna(subset=['text'])
        df_tot = df_tot[df_tot['text_length'] >= 10].copy()  # as they were nans
        df_tot = df_tot[df_tot['language'] == 'english']

    return df_tot
def _evaluate(self, embeddings, labels):
    shuffles = []
    for _ in range(self.num_shuffle):
        shuffles.append(skshuffle(embeddings, labels))

    all_results = defaultdict(list)
    training_percents = [0.1, 0.3, 0.5, 0.7, 0.9]
    for training_percent in training_percents:
        for shuf in shuffles:
            training_size = int(training_percent * self.num_graphs)
            X, y = shuf
            X_train = X[:training_size, :]
            y_train = y[:training_size]
            X_test = X[training_size:, :]
            y_test = y[training_size:]

            # clf = SVC()
            # clf.fit(X_train, y_train)
            params = {"C": [1e-3, 1e-2, 1e-1, 1, 10]}
            svc = SVC()
            clf = GridSearchCV(svc, params)
            clf.fit(X_train, y_train)

            preds = clf.predict(X_test)
            accuracy = f1_score(y_test, preds, average="micro")
            all_results[training_percent].append(accuracy)

    result = dict(
        (
            f"Accuracy {train_percent}",
            sum(all_results[train_percent]) / len(all_results[train_percent]),
        )
        for train_percent in sorted(all_results.keys())
    )
    print("Test Acc: ", list(result.values())[-1])
    return result
def clustering_preprocess(self, nu, N, limits=(0, 100000)):
    """ Preprocess the samples before clustering

    Preprocesses the list of frequencies at which significant peaks in the
    power spectrum were found. The binning factors are shuffled to prevent
    clustering along that axis (axis=1). The binning factors are scaled to
    range between 0 and 1.

    Parameters
    ----------
    nu : ndarray
        Frequency of peaks that satisfy the H0 test.
    N : ndarray
        Bin factors at which the significant peaks were selected.
    limits : tuple
        Lower and upper limits in nu to use for clustering. Samples beyond
        these limits are rejected.

    Returns
    -------
    X : ndarray
        Array of samples to be used by HDBscan
    """
    nuidx = (limits[0] < nu) & (nu < limits[1])
    Nscaler = MinMaxScaler().fit(N.reshape(-1, 1))
    Ns = skshuffle(Nscaler.transform(N[nuidx].reshape(-1, 1))).flatten()
    return np.vstack((nu[nuidx], Ns)).T
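# Hedged usage sketch, not from the original source: `detector` stands in for a
# hypothetical instance of the enclosing class. Assumes numpy, MinMaxScaler from
# sklearn.preprocessing, and `from sklearn.utils import shuffle as skshuffle`.
import numpy as np

nu = np.random.uniform(0.0, 300.0, size=1000)   # toy peak frequencies
N = np.random.randint(1, 64, size=1000)         # toy bin factors
X = detector.clustering_preprocess(nu, N, limits=(10, 250))
print(X.shape)  # (n_kept, 2): column 0 is nu, column 1 the scaled, shuffled bin factor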
def compute_regrets(X, y, shuffle=True, random_state=42):
    regrets = defaultdict(list)
    timings = defaultdict(list)
    if shuffle:
        X, y = skshuffle(X, y, random_state=random_state)

    n_samples, n_features = X.shape
    n_iterations = n_samples - 1
    iterations = np.arange(n_iterations)
    n_classes = int(y.max() + 1)
    classes = np.arange(n_classes)
    classifiers = get_classifiers_online(n_classes)

    for clf_name, clf in classifiers:
        assert hasattr(clf, "partial_fit")
        logging.info(" using %s" % clf_name)
        for i in tqdm(range(1, n_iterations)):
            x_train = X[i - 1].reshape(1, n_features)
            y_train = np.array([y[i - 1]])
            x_test = X[i].reshape(1, n_features)
            y_test = np.array([y[i]])
            t1 = time()
            clf.partial_fit(x_train, y_train, classes)
            t2 = time()
            y_pred = clf.predict_proba(x_test)
            test_loss = log_loss_single(y_test, y_pred)
            regrets[clf_name].append(test_loss)
            timings[clf_name].append(t2 - t1)
        if hasattr(clf, "clear"):
            clf.clear()
        del clf

    return iterations, regrets, timings
def next(self):
    diff = self.batch_index + self.batch_size - self.lenght_data
    if diff == self.batch_size:
        diff = 0
        self.batch_index = 0
    diff = max(diff, 0)
    if diff > 0:
        # wrap around: take the tail of the data plus `diff` items from the front
        x = np.concatenate((self.data[self.batch_index: self.batch_index + self.batch_size],
                            self.data[0: diff]))
        if self.labels is not None:
            y = np.concatenate((self.labels[self.batch_index: self.batch_index + self.batch_size],
                                self.labels[0:diff]))
        self.batch_index = diff
        if self.shuffle:
            if self.labels is None:
                np.random.shuffle(self.data)
            else:
                self.data, self.labels = skshuffle(self.data, self.labels)
    else:
        x = self.data[self.batch_index: self.batch_index + self.batch_size]
        if self.labels is not None:
            y = self.labels[self.batch_index: self.batch_index + self.batch_size]
        self.batch_index += self.batch_size
    if self.labels is not None:
        return x, y
    return x
def __call__(self, names_and_labels, shuffle=False):
    batches = []
    ids_and_names = []
    batch_size = self.batch_size

    rows = Parallel(n_jobs=self.n_jobs)(
        delayed(_process_item)(self, name)
        for name, label in names_and_labels)

    names_and_labels = [
        v for (v, row) in zip(names_and_labels, rows) if row is not None
    ]
    for id, (name, label) in enumerate(names_and_labels):
        ids_and_names.append((id, name))

    data = np.vstack([r for r in rows if r is not None])

    if shuffle:
        from sklearn.utils import shuffle as skshuffle
        names_and_labels, ids_and_names, data = skshuffle(
            names_and_labels, ids_and_names, data)

    labels_sorted = sorted(set(p[1] for p in names_and_labels))
    labels = [
        labels_sorted.index(label) for name, label in names_and_labels
    ]
    ids = [id for (id, fname) in ids_and_names]
    data = self.preprocess_data(data)
    data_mean = data.mean(axis=0)

    for batch_start in range(0, len(names_and_labels), batch_size):
        batch = {'data': None, 'labels': []}
        batch_end = batch_start + batch_size
        batch['data'] = data[batch_start:batch_end, :].T
        batch['labels'] = labels[batch_start:batch_end]
        batch['ids'] = ids[batch_start:batch_end]
        batches.append(batch)
        self.dot()

    for i, batch in enumerate(batches):
        path = os.path.join(self.output_path, 'data_batch_%s' % (i + 1))
        with open(path, 'wb') as f:
            cPickle.dump(batch, f, -1)
        self.dot()

    batches_meta = {}
    batches_meta['label_names'] = labels_sorted
    batches_meta['pack_columns'] = ['name']
    batches_meta['packs'] = [(os.path.basename(name),)
                             for id, name in ids_and_names]
    batches_meta['data_mean'] = data_mean.reshape(data_mean.shape[0], 1)
    batches_meta['num_vis'] = data.shape[0]
    batches_meta.update(self.more_meta)

    with open(os.path.join(self.output_path, 'batches.meta'), 'wb') as f:
        cPickle.dump(batches_meta, f, -1)
    self.dot()

    print
    print "Wrote to %s" % self.output_path
def evaluate(user_df, features_matrix):
    features_matrix = features_matrix[user_df["user_id"].to_list(), ]
    print(features_matrix.shape)
    nodesize = features_matrix.shape[0]
    label_matrix = user_df["age"]
    label_matrix = label_matrix.to_numpy()
    label_matrix = np.stack([label_matrix]).T - 1

    train_percent = 0.7
    random.seed(1)
    np.random.seed(1)
    res = []
    for i in range(4):
        t_1 = time.time()
        X, y = skshuffle(features_matrix, label_matrix)
        training_size = int(train_percent * nodesize)
        X_train = X[:training_size, :]
        y_train = y[:training_size, :]
        X_test = X[training_size:, :]
        y_test = y[training_size:, :]
        clf = LogisticRegression(random_state=0, solver="saga",
                                 multi_class="multinomial")
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        acc = (preds == y_test[:, 0]).sum() / len(y_test)
        res.append(acc)
        print(time.time() - t_1, "s")
    print("avg age acc:", sum(res) / len(res))
    print("min age acc:", min(res))
    print("max age acc:", max(res))
def _evaluate(self, embeddings, labels):
    shuffles = []
    for _ in range(self.num_shuffle):
        shuffles.append(skshuffle(embeddings, labels))

    all_results = defaultdict(list)
    training_percents = [0.1, 0.3, 0.5, 0.7, 0.9]
    for training_percent in training_percents:
        for shuf in shuffles:
            training_size = int(training_percent * self.num_graphs)
            X, y = shuf
            X_train = X[:training_size, :]
            y_train = y[:training_size]
            X_test = X[training_size:, :]
            y_test = y[training_size:]
            clf = SVC()
            clf.fit(X_train, y_train)
            preds = clf.predict(X_test)
            accuracy = accuracy_score(y_test, preds)
            all_results[training_percent].append(accuracy)

    return dict(
        (
            f"Accuracy {train_percent}",
            sum(all_results[train_percent]) / len(all_results[train_percent]),
        )
        for train_percent in sorted(all_results.keys())
    )
def train(self, method):
    trainRatio = np.arange(0.1, 1, 0.1)
    resultMicroMat, resultMacroMat = [], []
    for ratio in trainRatio:
        print('\r', "Training with {}% labeled nodes...".format(int(100 * ratio)),
              end='', flush=True)
        # Collect one row of scores per training ratio (10 runs each); the original
        # appended references to a single growing list, so the saved matrices were wrong.
        resultMicro, resultMacro = [], []
        for _ in np.arange(10):  # Train 10 times.
            X, y = skshuffle(self.featureMat, self.labelsMat)
            trainSize = int(ratio * X.shape[0])
            X_train = X[:trainSize]
            y_train = y[:trainSize]
            X_test = X[trainSize:]
            y_test = y[trainSize:]
            clf = TopKRanker(LogisticRegression(solver='liblinear'))
            clf.fit(X_train, y_train)
            topKList = np.diff(y_test.tocsr().indptr)
            preds = clf.predict(X_test, topKList)
            resultMicro.append(f1_score(y_test, preds, average='micro'))
            resultMacro.append(f1_score(y_test, preds, average='macro'))
        resultMicroMat.append(resultMicro)
        resultMacroMat.append(resultMacro)
    np.save('../data/{}/{}/results/resultsMicro_{}.npy'.format(
        self.graphName, method, self.name), np.array(resultMicroMat))
    np.save('../data/{}/{}/results/resultsMacro_{}.npy'.format(
        self.graphName, method, self.name), np.array(resultMacroMat))
    return np.array(resultMicroMat), np.array(resultMacroMat)
def main():
    group_edges_path = \
        '../data/blogcatelog/BlogCatalog-dataset/data/group-edges.csv'
    parser = ArgumentParser("Calculate BlogCatalog F1 score.")
    parser.add_argument("--emb_dir", default='../tf/deepwalk',
                        help="Path of directory containing embeddings.")
    parser.add_argument("--shuffles", default=2, type=int,
                        help='Number of shuffles.')
    args = parser.parse_args()

    # load and process embeddings
    embeds = np.load(args.emb_dir + '/id_emb.npy')
    id_map = {}
    for i in range(embeds.shape[0]):
        id_map[embeds[i][0]] = i
    features_matrix = embeds[[id_map[k] for k in range(NODE_NUM)]][:, 1:]

    # load and process labels
    labels = np.zeros((NODE_NUM, CLASS_NUM))
    with open(group_edges_path) as csvfile:
        lines = csv.reader(csvfile)
        for line in lines:
            node_id = int(line[0]) - 1
            group_id = int(line[1]) - 1
            labels[node_id][group_id] = 1

    # train and test args.shuffles times.
    shuffles = []
    for _ in range(args.shuffles):
        shuffles.append(skshuffle(features_matrix, labels))
    train_and_test(shuffles)
def _evaluate(self, features_matrix, label_matrix, num_shuffle):
    shuffles = []
    for _ in range(num_shuffle):
        shuffles.append(skshuffle(features_matrix, label_matrix))

    all_results = defaultdict(list)
    training_percents = [0.3, 0.5, 0.7, 0.9]
    for train_percent in training_percents:
        for shuf in shuffles:
            X, y = shuf
            training_size = int(train_percent * len(X))
            X_train = X[:training_size, :]
            y_train = y[:training_size, :]
            X_test = X[training_size:, :]
            y_test = y[training_size:, :]
            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train)
            # find out how many labels should be predicted
            top_k_list = list(map(int, y_test.sum(axis=1).T.tolist()[0]))
            preds = clf.predict(X_test, top_k_list)
            result = f1_score(y_test, preds, average="micro")
            all_results[train_percent].append(result)

    return dict(
        (
            f"Micro-F1 {train_percent}",
            sum(all_results[train_percent]) / len(all_results[train_percent]),
        )
        for train_percent in sorted(all_results.keys())
    )
def shuffle(self):
    """ Shuffles the data and labels. """

    self.data, self.labels = skshuffle(self.data, self.labels,
                                       random_state=self.seed)
    self.Helpers.logger.info("Data shuffled")
def shuffle(self):
    """Shuffle the dataset in place by drawing a random permutation of indices."""
    idx = list(range(len(self)))
    reindex = skshuffle(idx)
    self.change_idx(reindex)
def prepare(self, P_, N_, shuffle=True, verbose=False, max_parallel_process=8):
    """Prepare the data for training.

    Parameters
    ----------
    P_ : array-like of shape = [n_positive_samples, height, width]
        The positive samples.
    N_ : array-like of shape = [n_negative_samples, height, width]
        The negative samples.
    shuffle : bool
        Whether to shuffle the data or not.
    """
    assert np.shape(P_)[1:3] == np.shape(N_)[1:3], "Window sizes mismatch."
    _, self.detectWndH, self.detectWndW = np.shape(P_)

    self.features_cnt, descriptions = \
        self.Haarlike.determineFeatures(self.detectWndW, self.detectWndH)
    self.features_descriptions = descriptions[::-1]

    if shuffle:
        # If P_ is a list, this is faster than
        # P_ = np.array(skshuffle(P_, random_state=1))
        P_ = skshuffle(np.array(P_), random_state=1)
        N_ = skshuffle(np.array(N_), random_state=1)

    if verbose:
        print('Preparing positive data.')
    P = self._translate(P_, verbose=verbose,
                        max_parallel_process=max_parallel_process)
    if verbose:
        print('Preparing negative data.')
    N = self._translate(N_, verbose=verbose,
                        max_parallel_process=max_parallel_process)

    divlineP = int(len(P) * self.validset_rate)
    divlineN = int(len(N) * self.validset_rate)

    validset_X = np.concatenate((P[0:divlineP], N[0:divlineN]))
    validset_y = np.concatenate((
        np.ones(len(P[0:divlineP])),
        np.zeros(len(N[0:divlineN]))
    ))
    # validset_X, validset_y = skshuffle(validset_X, validset_y, random_state=1)

    P = P[divlineP:len(P)]
    N = N[divlineN:len(N)]

    self.P = P
    self.N = N
    self.validX = validset_X
    self.validy = validset_y
def shuffle(self):
    """Shuffle the stored property arrays in place with one shared permutation."""
    idx = list(range(len(self)))
    reindex = skshuffle(idx)
    self.props = {key: val[reindex] for key, val in self.props.items()}
    return
def get_minibatch(X, y, minibatch_size, shuffle=True):
    minibatches = []
    if shuffle:
        X, y = skshuffle(X, y)

    for i in range(0, X.shape[0], minibatch_size):
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]
        minibatches.append((X_mini, y_mini))

    return minibatches
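# Hedged usage sketch, not from the original source: split toy arrays into (X, y)
# minibatches. Assumes numpy and `from sklearn.utils import shuffle as skshuffle`.
import numpy as np

X = np.random.randn(10, 4)
y = np.arange(10)
for X_mini, y_mini in get_minibatch(X, y, minibatch_size=3):
    print(X_mini.shape, y_mini.shape)  # (3, 4) (3,) three times, then (1, 4) (1,)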
def evaluate():
    args = parse_args()
    features_matrix = load_embeddings(args.emb)
    print(features_matrix.shape)
    nodesize = features_matrix.shape[0]
    label_matrix = load_labels(args.label, nodesize)
    number_shuffles = args.shuffle

    shuffles = []
    for x in range(number_shuffles):
        shuffles.append(skshuffle(features_matrix, label_matrix))

    all_results = defaultdict(list)
    training_percents = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for train_percent in training_percents:
        for shuf in shuffles:
            X, y = shuf
            training_size = int(train_percent * nodesize)
            X_train = X[:training_size, :]
            y_train = y[:training_size, :]
            X_test = X[training_size:, :]
            y_test = y[training_size:, :]
            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train)
            # find out how many labels should be predicted
            top_k_list = list(map(int, y_test.sum(axis=1).T.tolist()[0]))
            preds = clf.predict(X_test, top_k_list)
            results = {}
            averages = ["micro", "macro", "samples", "weighted"]
            for average in averages:
                results[average] = f1_score(y_test, preds, average=average)
            all_results[train_percent].append(results)

    print('Results, using embeddings of dimensionality', X.shape[1])
    print('-------------------')
    print('Train percent:', 'average f1-score')
    for train_percent in sorted(all_results.keys()):
        av = 0
        stder = np.ones(number_shuffles)
        i = 0
        for x in all_results[train_percent]:
            stder[i] = x["micro"]
            i += 1
            av += x["micro"]
        av /= number_shuffles
        print(train_percent, ":", av)
def _evaluate(self, features_matrix, label_matrix, num_shuffle):
    # features_matrix, node2id = utils.load_embeddings(args.emb)
    # label_matrix = utils.load_labels(args.label, node2id, divi_str=" ")

    # shuffle, to create train/test groups
    shuffles = []
    for _ in range(num_shuffle):
        shuffles.append(skshuffle(features_matrix, label_matrix))

    # score each train/test group
    all_results_micro = defaultdict(list)
    all_results_macro = defaultdict(list)
    # training_percents = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    training_percents = [0.1, 0.3, 0.5, 0.7, 0.9]
    for train_percent in training_percents:
        for shuf in shuffles:
            X, y = shuf
            training_size = int(train_percent * self.num_nodes)
            X_train = X[:training_size, :]
            y_train = y[:training_size, :]
            X_test = X[training_size:, :]
            y_test = y[training_size:, :]
            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train)
            # find out how many labels should be predicted
            top_k_list = list(map(int, y_test.sum(axis=1).T.tolist()[0]))
            preds = clf.predict(X_test, top_k_list)
            result = f1_score(y_test, preds, average="micro")
            all_results_micro[train_percent].append(result)
            result = f1_score(y_test, preds, average="macro")
            all_results_macro[train_percent].append(result)
            # print("micro", result)

    micro_ = dict(
        (
            f"Micro-F1 {train_percent}",
            sum(all_results_micro[train_percent]) / len(all_results_micro[train_percent]),
        )
        for train_percent in sorted(all_results_micro.keys())
    )
    macro_ = dict(
        (
            f"Macro-F1 {train_percent}",
            sum(all_results_macro[train_percent]) / len(all_results_macro[train_percent]),
        )
        for train_percent in sorted(all_results_macro.keys())
    )
    micro_.update(macro_)
    return micro_
def getTrainList(self, batch_size=1000, shuffle=True):
    n_batch = self.n_train // batch_size
    batch_list = []
    X_shuffle, Y_shuffle = self.X_train.copy(), self.Y_train.copy()
    if shuffle:
        X_shuffle, Y_shuffle = skshuffle(self.X_train, self.Y_train)
    for i in range(n_batch):
        X, Y = (X_shuffle[batch_size * i:batch_size * (i + 1)],
                Y_shuffle[batch_size * i:batch_size * (i + 1)])
        X, Y = self.to_tensor(X, Y, self.device)
        batch_list.append((X, Y))
    return batch_list
def make_xgb(params, X_tr, y_tr, test, nr, rs=1):
    par = params.copy()
    par['seed'] = rs
    plst = list(par.items())  # materialise for Python 3, where .items() is a view
    np.random.seed(seed=rs + 123)
    num_round = nr
    X_tr, y_tr = skshuffle(X_tr, y_tr, random_state=rs + 123)
    noise = np.random.normal(0, 0.5, len(y_tr))
    dtrain = xgb.DMatrix(X_tr, label=y_tr)  # +noise)  # ,missing=-1.0)
    dtest = xgb.DMatrix(test)  # ,missing=-1.0)
    model = xgb.train(plst, dtrain, num_round)  # ,obj=KappaRelaxedObjective())
    pred = model.predict(dtest)
    return pred
def batch_iter(data, batch_size, shuffle=True):
    batch = []
    shuffled_data = np.copy(data)
    if shuffle:
        shuffled_data = skshuffle(shuffled_data)

    for line in shuffled_data:
        batch.append(line)
        if len(batch) == batch_size:
            yield tuple(list(x) for x in zip(*batch))  # yield batch
            batch = []
    if batch:
        yield tuple(list(x) for x in zip(*batch))
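# Hedged usage sketch, not from the original source: batch a toy list of
# (sentence, label) pairs. Note that np.copy turns the mixed pairs into a string
# array, so the labels come back as strings. Assumes numpy and
# `from sklearn.utils import shuffle as skshuffle`.
data = [('good movie', 1), ('bad movie', 0), ('great plot', 1),
        ('dull plot', 0), ('fine', 1)]
for sentences, labels in batch_iter(data, batch_size=2, shuffle=False):
    print(sentences, labels)  # e.g. ['good movie', 'bad movie'] ['1', '0']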
def shuffle(self):
    """Shuffle the stored properties in place, applying the same permutation to every entry."""
    idx = list(range(len(self)))
    reindex = skshuffle(idx)
    for key, val in self.props.items():
        if isinstance(val, list):
            self.props[key] = [val[i] for i in reindex]
        else:
            self.props[key] = val[reindex]
    return
def batch_iter(data, batch_size, shuffle=False, random_state=123):
    batch = []
    shuffled_data = np.copy(data)
    if shuffle:
        shuffled_data = skshuffle(shuffled_data, random_state=random_state)

    for line in shuffled_data:
        batch.append(line)
        if len(batch) == batch_size:
            yield tuple(list(x) for x in zip(*batch))  # yield batch
            batch = []
    if batch:
        yield tuple(list(x) for x in zip(*batch))
def generator(samples, newsize, batch_size=32, max_count=200):
    zero_thresh = 0.3
    augment_prob = .3
    zero_reject_range = 0.2
    crop_on = 1
    mult_camera = False
    increment_zero_thresh = 0
    loop_count = 0
    while 1:  # Loop forever so the generator never terminates
        print(' GETTING NEW SAMPLES ')
        print('loop_count = ', loop_count)
        np.random.shuffle(samples)
        # create binned version of larger sample base with flatter distribution
        bin_samples = distribute_samples(samples, max_count=max_count)
        num_samples = len(bin_samples)
        loop_count += 1
        if loop_count > 2 and increment_zero_thresh:
            zero_thresh += 0.1  # experimented with this, but not used in end
            print('zero_thresh = ', zero_thresh)
        for offset in range(0, num_samples, batch_size):
            batch_samples = bin_samples[offset:offset + batch_size]
            images = []
            angles = []
            for batch in batch_samples:
                if zero_thresh < 1.0:  # if rejecting small angles
                    keep_search = 1
                    while keep_search:  # search for sample meeting criteria
                        rand_indx = random.randint(0, len(bin_samples) - 1)
                        line_sample = bin_samples[rand_indx]
                        if abs(float(line_sample[3])) < zero_reject_range:
                            if random.random() < zero_thresh:
                                keep_search = 0
                        else:
                            keep_search = 0
                else:
                    line_sample = batch
                image, angle, im_indx = process_image_pipeline(
                    line_sample, augment_prob, crop_on, newsize,
                    mult_camera=mult_camera)
                images.append(image)
                angles.append(angle)
            X_train = np.array(images)
            y_train = np.array(angles)
            yield skshuffle(X_train, y_train)
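# Hedged usage sketch, not from the original project: pull a single batch from the
# generator. `samples` is assumed to be a list of driving-log rows whose column 3
# holds the steering angle, and distribute_samples / process_image_pipeline are
# assumed to come from the same module. The shapes in the comment are illustrative.
gen = generator(samples, newsize=(64, 64), batch_size=32)
X_batch, y_batch = next(gen)
print(X_batch.shape, y_batch.shape)  # e.g. (32, 64, 64, 3) and (32,)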
def getTrainList(self, batch_size=1000, shuffle=True, aug=False):
    n_batch = self.n_train // batch_size
    batch_list = []
    X_shuffle, Y_shuffle = self.X_train.copy(), self.Y_train.copy()
    if shuffle:
        X_shuffle, Y_shuffle = skshuffle(self.X_train, self.Y_train)
    for i in range(n_batch):
        X, Y = (X_shuffle[batch_size * i:batch_size * (i + 1)],
                Y_shuffle[batch_size * i:batch_size * (i + 1)])
        if aug:
            X = self.random_crop(X)
            X = self.horizontal_flip(X)
        X, Y = self.to_tensor(X, Y, self.device)
        batch_list.append((X, Y))
    return batch_list
def __init__(self, data, labels=None, shuffle=True, batch_size=32):
    if labels is None:
        np.random.shuffle(data)
        self.data = data
        self.labels = None
    else:
        self.labels = labels
        self.data = data
        self.data, self.labels = skshuffle(self.data, self.labels)
    self.data_dim = len(data[0])
    self.lenght_data = len(data)
    self.step_count = 0
    self.batch_size = batch_size
    self.batch_index = 0
    self.shuffle = shuffle
def __call__(self, names_and_labels, shuffle=False):
    # names_and_labels: nparray of (full-file-paths, label) tuples
    batches = []
    ids_and_names = []
    batch_size = self.batch_size

    # rows is a list of serialised jpgs in names_and_labels order?
    rows = Parallel(n_jobs=self.n_jobs)(
        delayed(_process_item)(self, name)
        for name, label in names_and_labels)

    # why do we need this step? to get rid of cases where
    # _process_item did not process a jpg? (could not find one, or
    # chose not to?)
    names_and_labels = [v for (v, row) in zip(names_and_labels, rows)
                        if row is not None]
    for id, (name, label) in enumerate(names_and_labels):
        ids_and_names.append((id, name))

    data = np.vstack([r for r in rows if r is not None])

    if shuffle:
        from sklearn.utils import shuffle as skshuffle
        names_and_labels, ids_and_names, data = skshuffle(
            names_and_labels, ids_and_names, data)

    labels_sorted = sorted(set(p[1] for p in names_and_labels))
    labels = [labels_sorted.index(label)
              for name, label in names_and_labels]
    ids = [id for (id, fname) in ids_and_names]
    data = self.preprocess_data(data)  # does nothing

    for batch_start in range(0, len(names_and_labels), batch_size):
        batch = {'data': None, 'labels': [], 'metadata': []}
        batch_end = batch_start + batch_size
        batch['data'] = data[batch_start:batch_end, :].T
        batch['labels'] = labels[batch_start:batch_end]
        batch['ids'] = ids[batch_start:batch_end]
        batches.append(batch)
        self.dot()

    for i, batch in enumerate(batches):
        path = os.path.join(self.output_path, 'data_batch_%s' % (i + 1))
        with open(path, 'wb') as f:
            cPickle.dump(batch, f, -1)
        self.dot()

    batches_meta = {}
    batches_meta['label_names'] = labels_sorted
    batches_meta['metadata'] = dict(
        (id, {'name': name}) for (id, name) in ids_and_names)
    batches_meta['data_mean'] = data.mean(axis=0)
    batches_meta.update(self.more_meta)

    with open(os.path.join(self.output_path, 'batches.meta'), 'wb') as f:
        cPickle.dump(batches_meta, f, -1)
    self.dot()

    print
    print "Wrote to %s" % self.output_path
def eval_blogcat(embeddings_file, labels_matrix=None, G=None, verbose=1, normalize=1,
                 training_percents=[0.1, 0.6, 0.9]):
    # 0. Files
    # embeddings_file = "/mnt/raid1/deepwalk/blogcatalog.vec"
    if labels_matrix is None and G is None:
        G, labels_matrix = load_blogcat()

    # 1. Load Embeddings
    model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

    labels = np.argwhere(labels_matrix)
    label_cnts = pd.Series(labels[:, 1]).value_counts()
    if verbose > 1:
        print('\nLabel counts:')
        print(label_cnts)

    # delete the least frequent labels, which causes balancing problems
    labels_matrix = labels_matrix[:, :-2]

    # Map nodes to their features (note: assumes nodes are labeled as integers 1:N)
    features_matrix = np.asarray([model[str(node)] for node in range(len(G))])

    if normalize:
        norms = np.linalg.norm(features_matrix, axis=1)
        if verbose:
            print norms
            print norms.shape
        assert norms.shape[0] == features_matrix.shape[0]
        for i in range(features_matrix.shape[0]):
            features_matrix[i, :] /= norms[i]
        norms = np.linalg.norm(features_matrix, axis=1)
        if verbose:
            print norms

    if verbose:
        print('-' * 100)
        print(embeddings_file)
        print('features_matrix.shape = %s' % str(features_matrix.shape))
        print('labels_matrix.shape = %s' % str(labels_matrix.shape))

    # 2. Shuffle, to create train/test groups
    shuffles = []
    number_shuffles = 1
    for x in range(number_shuffles):
        # if we just have one group, make the split the same every time
        if number_shuffles == 1:
            shuffles.append(skshuffle(features_matrix, labels_matrix, random_state=123))
        else:
            shuffles.append(skshuffle(features_matrix, labels_matrix))

    # 3. to score each train/test group
    all_results = defaultdict(list)

    # uncomment for all training percents
    # training_percents = np.asarray(range(1,10))*.1
    for train_percent in training_percents:
        # print('-'*100)
        # print('pct_train: %.2f' % train_percent)
        for shuf in shuffles:
            X, y = shuf
            training_size = int(train_percent * X.shape[0])
            X_train = X[:training_size, :]
            y_train = y[:training_size]
            X_test = X[training_size:, :]
            y_test = y[training_size:]

            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train)

            # find out how many labels should be predicted
            # top_k_list = [len(l) for l in y_test]
            top_k_list = np.array(np.sum(y_test, axis=1).flatten()[0])[0].astype(np.int32)
            preds = clf.predict(X_test, top_k_list)
            if y_test.shape[1] != preds.shape[1]:
                raise Exception("imbalance of class dims")
                # continue

            results = OrderedDict()
            averages = ["micro", "macro", "samples", "weighted"]
            for average in averages:
                results[average] = f1_score(y_test, preds, average=average)
            all_results[train_percent].append(results)
            # break

    if verbose:
        print '-------------------'
        for train_percent in sorted(all_results.keys()):
            print 'Train percent:', train_percent
            for x in all_results[train_percent]:
                print x
            print '-------------------'
    return all_results
                                                  norm_only=False)

# 2. Load labels
mat = loadmat(matfile)
A = mat['network']
graph = sparse2graph(A)
labels_matrix = mat['group']

# Map nodes to their features (note: assumes nodes are labeled as integers 1:N)
features_matrix = numpy.asarray([model[str(node)] for node in range(len(graph))])

# 2. Shuffle, to create train/test groups
shuffles = []
number_shuffles = 2
for x in range(number_shuffles):
    shuffles.append(skshuffle(features_matrix, labels_matrix))

# 3. to score each train/test group
all_results = defaultdict(list)

training_percents = [0.1, 0.5, 0.9]
# uncomment for all training percents
# training_percents = numpy.asarray(range(1,10))*.1
for train_percent in training_percents:
    for shuf in shuffles:
        X, y = shuf
        training_size = int(train_percent * X.shape[0])
        X_train = X[:training_size, :]