def Encoding(data, general_matrix=None):
    """Replace every string-valued column of ``data`` by indicator columns.

    A column is treated as categorical when the value in its first row is a
    plain ``str``. Each such column is binarized with a ``LabelBinarizer``
    and the resulting indicator column(s) are spliced in at the same
    position.

    Parameters
    ----------
    data : 2-D numpy array
        Matrix to encode. It is not modified; a new array is returned.
    general_matrix : 2-D numpy array, optional
        Reference matrix with the same column layout as ``data``. When given,
        the categories of each column are learned from it instead of from
        ``data``, and it is encoded in lock-step so that column indices stay
        aligned.

    Returns
    -------
    The encoded copy of ``data``.
    """
    count = 0
    if data.shape[0] == 0:
        # No first row to inspect, hence nothing can be detected as string.
        print("count : %d" % count)
        return data

    i = 0
    # A while loop (not ``range``) because encoding a column changes the
    # width of the matrix; the cursor must skip the freshly inserted columns.
    while i < data.shape[1]:
        # Exact type check kept on purpose: numpy string scalars are not
        # treated as categorical, matching the historical selection rule.
        if type(data[0, i]) is not str:
            i += 1
            continue

        count += 1
        col = data[:, i]
        reference = col if general_matrix is None else general_matrix[:, i]

        # A fresh encoder per column so a failed fit can never silently
        # reuse the categories learned for a previous column.
        encoder = LabelBinarizer()
        encoder.fit(np.unique(reference))
        new_col = encoder.transform(col)

        data = np.concatenate((data[:, :i], new_col, data[:, i + 1:]), axis=1)
        if general_matrix is not None:
            general_matrix = np.concatenate(
                (general_matrix[:, :i],
                 encoder.transform(reference),
                 general_matrix[:, i + 1:]),
                axis=1)

        i += new_col.shape[1]

    print("count : %d" % count)
    return data
class BusinessCategoriesFeature(BaseEstimator):
    """
    WARNING!!! Works only with a modified version of LabelBinarizer.

    A binarization of the reviews' business categories.
    """

    def __init__(self, data=None):
        # NOTE(review): the assignment target is missing from this statement
        # (presumably an attribute that stores `data`) -- restore it; as
        # written this line is not valid Python.
        = data

    def __create_labels_list(self, review_list):
        # Builds one entry per review, in input order, holding the
        # 'categories' value of the business associated with that review.
        labels = []
        for review in review_list:
            # NOTE(review): the expression that looks up the business for
            # `review` is missing after the `=` -- TODO restore it.
            business =
            labels.append(business['categories'])
        return labels

    def fit(self, X, y):
        # Creates a new binarizer on every call.
        self.binarizer = LabelBinarizer()
        labels = self.__create_labels_list(X)
        # NOTE(review): `labels` is computed but never handed to the
        # binarizer, while transform() below calls self.binarizer.transform;
        # a fit call on `labels` appears to be missing here -- confirm.
        return self

    def transform(self, X):
        # Binarizes the category labels of the reviews in X and returns the
        # result cast to float.
        labels = self.__create_labels_list(X)
        binarized_labels = self.binarizer.transform(labels)
        return binarized_labels.astype(float)
def test_label_binarizer_multilabel():
    """Check LabelBinarizer round-trips for multilabel input.

    Covers sequences-of-tuples input, label-indicator-matrix input, and the
    two-class multilabel regression case.
    """
    lb = LabelBinarizer()

    # test input as lists of tuples
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(indicator_mat, got)
    assert_equal(lb.inverse_transform(got), inp)

    # test input as label indicator matrix: the binarizer has to be re-fitted
    # on the matrix, otherwise inverse_transform still yields tuples and the
    # comparison against the matrix cannot hold.
    lb.fit(indicator_mat)
    assert_array_equal(indicator_mat,
                       lb.inverse_transform(indicator_mat))

    # regression test for the two-class multilabel case
    lb = LabelBinarizer()
    inp = [[1, 0], [0], [1], [0, 1]]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 1],
                         [1, 1]])
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_equal([set(x) for x in lb.inverse_transform(got)],
                 [set(x) for x in inp])
def display_image_predictions(features, labels, predictions):
    """Plot images next to a bar chart of their top predictions.

    Parameters
    ----------
    features : sequence of images passed to ``imshow`` (scaled by 255 here).
    labels : one-hot encoded true labels, one row per image.
    predictions : object exposing ``indices`` and ``values``, each iterable
        per image (the class ids and scores of the top predictions).

    The figure has a fixed 4x2 grid, so at most four images can be shown.
    """
    n_classes = 10
    label_names = _load_label_names()

    # The binarizer must know the class ids before it can map one-hot rows
    # back to ids; without this fit, inverse_transform raises.
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(range(n_classes))
    label_ids = label_binarizer.inverse_transform(np.array(labels))

    fig, axies = plt.subplots(nrows=4, ncols=2)
    fig.tight_layout()
    fig.suptitle('Softmax Predictions', fontsize=20, y=1.1)

    n_predictions = 3
    margin = 0.05
    ind = np.arange(n_predictions)
    width = (1. - 2. * margin) / n_predictions

    rows = zip(features, label_ids, predictions.indices, predictions.values)
    for image_i, (feature, label_id, pred_indicies, pred_values) in enumerate(rows):
        pred_names = [label_names[pred_i] for pred_i in pred_indicies]
        correct_name = label_names[label_id]

        # Left cell: the image, titled with its true class name.
        axies[image_i][0].imshow(feature*255)
        axies[image_i][0].set_title(correct_name)
        axies[image_i][0].set_axis_off()

        # Right cell: horizontal bars, reversed so the best score is on top.
        axies[image_i][1].barh(ind + margin, pred_values[::-1], width)
        axies[image_i][1].set_yticks(ind + margin)
        axies[image_i][1].set_yticklabels(pred_names[::-1])
        axies[image_i][1].set_xticks([0, 0.5, 1.0])
class BaseSGD(object):
    """Helpers shared by SGD estimators: option lookup and label encoding."""

    def _get_loss(self):
        """Return the loss object named by ``self.loss``.

        Raises ``KeyError`` for an unknown loss name. ``self.epsilon`` is
        read for the ``huber`` and ``epsilon_insensitive`` losses.
        """
        losses = {
            "modified_huber": ModifiedHuber(),
            "hinge": Hinge(1.0),
            "perceptron": Hinge(0.0),
            "log": Log(),
            "sparse_log": SparseLog(),
            "squared": SquaredLoss(),
            "huber": Huber(self.epsilon),
            "epsilon_insensitive": EpsilonInsensitive(self.epsilon),
        }
        return losses[self.loss]

    def _get_learning_rate(self):
        """Return the integer code for ``self.learning_rate``.

        Raises ``KeyError`` for an unknown schedule name.
        """
        learning_rates = {"constant": 1, "pegasos": 2, "invscaling": 3}
        return learning_rates[self.learning_rate]

    def _set_label_transformers(self, y):
        """Fit the label encoder/binarizer on ``y`` and record the classes.

        Sets ``label_binarizer_`` and ``classes_`` (and ``label_encoder_``
        when ``self.multiclass == "natural"``).

        Returns
        -------
        (n_classes, n_vectors) where ``n_vectors`` is 1 for at most two
        classes and ``n_classes`` otherwise.
        """
        if self.multiclass == "natural":
            self.label_encoder_ = LabelEncoder()
            y = self.label_encoder_.fit_transform(y).astype(np.float64)

        self.label_binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1)
        # classes_ only exists once the binarizer has seen the targets.
        self.label_binarizer_.fit(y)
        self.classes_ = self.label_binarizer_.classes_.astype(np.int32)

        n_classes = len(self.label_binarizer_.classes_)
        n_vectors = 1 if n_classes <= 2 else n_classes
        return n_classes, n_vectors
def get_abalone19():
    """Load the abalone data as an imbalanced binary classification set.

    The ``gender`` column is replaced by one indicator column per gender
    value (``gender_<value>``), a binary ``label`` column is derived from
    ``rings`` (1 when ``rings > 10``, else 0) and ``rings`` is dropped.
    When the module-level ``STANDARDIZE`` flag is set, the continuous
    feature columns are standardized to zero mean and unit variance.

    Returns
    -------
    pandas.DataFrame with the gender indicators first, then the continuous
    features, then ``label`` as the last column.
    """
    raw_data = pd.read_csv(ABALONE_FILE, sep=',')
    genders = list(raw_data['gender'])
    cts_data = raw_data.drop(labels='gender', axis=1)

    # fit & apply the preprocesser in one step (transform alone would fail
    # on an unfitted binarizer)
    lbz = LabelBinarizer()
    encoded_genders = pd.DataFrame(lbz.fit_transform(genders))
    encoded_genders.columns = ['gender_' + k for k in lbz.classes_]
    n_gender_cols = encoded_genders.shape[1]

    # recombine encoded data
    new_data = pd.concat(objs=[encoded_genders, cts_data], axis=1)
    new_data['label'] = raw_data['rings'].map(
        lambda k: 1 if k > 10 else 0)  # binary clf task
    new_data = new_data.drop('rings', axis=1)

    # standardize cts features: everything between the gender indicators
    # and the trailing label column
    if STANDARDIZE:
        for col in new_data.columns[n_gender_cols:-1]:
            mean = new_data[col].mean()
            std = new_data[col].std()
            new_data[col] = (new_data[col] - mean) / float(std)

    pos_recs = new_data['label'].sum()
    print('total pos class pct = {} %\n'.format(
        round(100 * pos_recs / float(len(new_data)), 3)))
    return new_data
def fit(self, X, y):
    """Estimate class priors and per-class feature probabilities.

    Parameters
    ----------
    X : array-like, 2-D (samples x features).
    y : array-like of discrete class labels, one per row of ``X``.

    Sets ``n_features``, ``classes``, ``n_class``, ``class_prior`` and
    ``feature_proba`` (one entry per class, in ``classes`` order), and
    returns ``self``.
    """
    X = np.array(X)
    y = np.array(y)
    samples, self.n_features = X.shape

    # because our space of targets are discrete; the binarizer has to be
    # fitted before classes_ is available
    lb = LabelBinarizer()
    lb.fit(y)
    self.classes = lb.classes_
    self.n_class = self.classes.size

    self.class_prior = np.zeros(self.n_class, dtype=np.float64)
    self.feature_proba = []

    for i, y_i in enumerate(self.classes):
        # get Xs only for y_i class
        X_yi = X[y == y_i]
        class_count = X_yi[:, 0].size
        self.class_prior[i] = np.float64(class_count) / samples

        count_all_features = 0
        all_features = np.zeros(self.n_features)
        for sample_features in X_yi:
            # accumulate feature according our algorithm
            all_features, count_all_features = self._add_features_dens(
                sample_features, all_features, count_all_features)

        # calculate probabilites according our algorithm
        self.feature_proba.append(
            self._compute_proba(all_features, count_all_features))

    return self
def one_hot_encoding(y_train, y_test):
    """Binarize train and test labels with one shared encoding.

    The class set is learned from ``y_train`` only, so both outputs use the
    same column order.

    Returns
    -------
    (y_train_one_hot, y_test_one_hot)
    """
    labelBinarizer = LabelBinarizer()
    # Fit on the training labels first; transform on an unfitted binarizer
    # raises.
    labelBinarizer.fit(y_train)
    y_train_one_hot = labelBinarizer.transform(y_train)
    y_test_one_hot = labelBinarizer.transform(y_test)
    return y_train_one_hot, y_test_one_hot
def train_logreg(X, y, test_X, test_y, load_vec=True):
    """
    Trains logistic regression on the feature set.

    Train and test inputs are featurized together, split back at the train
    size, and a classification report is printed for both splits.
    ``load_vec`` is accepted for interface compatibility and is not used.
    """
    n_train = len(X)
    print("%d %d" % (n_train, len(test_X)))

    model = LogisticRegression()

    # Featurize train and test in one pass, then split back at the actual
    # train size rather than a hard-coded row count.
    features = featurize(X + test_X)
    X, test_X = features[:n_train], features[n_train:]
    print("%s %s" % (X.shape, test_X.shape))

    model.fit(X, y)

    for split_X, split_y in ((X, y), (test_X, test_y)):
        y_pred = model.predict(split_X)
        print(set(y_pred))
        print(metrics.classification_report(split_y, y_pred, digits=3))
def logloss(act, pred):
    """Return the multi-class logarithmic loss.

    Parameters
    ----------
    act : array-like of true labels, one per row of ``pred``.
    pred : 2-D numpy array of predicted probabilities; columns are expected
        in the binarizer's class order.

    Predictions are clipped to ``[1e-15, 1 - 1e-15]`` to keep the logarithm
    finite.
    """
    epsilon = 10 ** -15
    pred = np.maximum(np.minimum(pred, 1 - epsilon), epsilon)

    # Fit and transform in one step; transform alone fails when unfitted.
    act_binary = LabelBinarizer().fit_transform(act)

    loss = - np.sum(np.multiply(act_binary, np.log(pred))) / pred.shape[0]
    return loss
def fit(self, Xt, yt, Xh, yh, callback=None):
    """Fit the model with hyperparameter optimization via ``hoag_lbfgs``.

    Parameters
    ----------
    Xt, yt : training features and labels (inner objective).
    Xh, yh : held-out features and labels (outer objective).
    callback : optional callable forwarded to ``hoag_lbfgs``.

    Sets ``coef_`` and ``alpha_`` from the optimizer output (and
    ``alpha0`` when it was ``None``) and returns ``self``.
    """
    lbin = LabelBinarizer()
    # The label encoding is learned from the training labels and reused for
    # the held-out labels.
    lbin.fit(yt)
    Yt_multi = lbin.transform(yt)
    Yh_multi = lbin.transform(yh)
    sample_weight_train = np.ones(Xt.shape[0])
    sample_weight_test = np.ones(Xh.shape[0])

    if Yt_multi.shape[1] == 1:
        # A single column means two classes; expand to an explicit
        # two-column indicator matrix.
        Yt_multi = np.hstack([1 - Yt_multi, Yt_multi])
        Yh_multi = np.hstack([1 - Yh_multi, Yh_multi])
        print('warning: only two classes detected')

    n_classes = Yt_multi.shape[1]
    n_features = Xt.shape[1]

    if self.alpha0 is None:
        self.alpha0 = np.zeros(n_classes * n_features)

    x0 = np.zeros(n_features * n_classes)

    def h_func_grad(x, alpha):
        # Inner (training) loss and gradient, regularized by exp(alpha).
        return _multinomial_loss_grad(
            x, Xt, Yt_multi, np.exp(alpha), sample_weight_train)[:2]

    def h_hessian(x, alpha):
        # Second element returned by the grad/hess helper for the inner
        # problem.
        return _multinomial_grad_hess(
            x, Xt, Yt_multi, np.exp(alpha), sample_weight_train)[1]

    def g_func_grad(x, alpha):
        # Outer (held-out) loss and gradient, with zero regularization.
        return _multinomial_loss_grad(
            x, Xh, Yh_multi, np.zeros(alpha.size), sample_weight_test)[:2]

    def h_crossed(x, alpha):
        # Diagonal matrix with exp(alpha) * x on the diagonal.
        tmp = np.exp(alpha) * x
        return sparse.dia_matrix(
            (tmp, 0),
            shape=(n_features * n_classes, n_features * n_classes))

    opt = hoag_lbfgs(
        h_func_grad, h_hessian, h_crossed, g_func_grad, x0,
        callback=callback,
        tolerance_decrease=self.tolerance_decrease,
        lambda0=self.alpha0, maxiter=self.max_iter,
        verbose=self.verbose)

    self.coef_ = opt[0]
    self.alpha_ = opt[1]
    return self
def __init__(
    self,
    train_file,
    test_file,
    batch_size=32,
    embedding_size=20,
    max_norm=40,
    lr=0.01,
    num_hops=3,
    adj_weight_tying=True,
    linear_start=True,
    **kwargs
):
    # Reads the train/test files, builds the vocabulary and the sentence
    # matrix S, stores the hyperparameters on the instance and builds the
    # network.
    #
    # NOTE(review): several statements below have lost the name in front of
    # `=` or `[...]` (it looks like an instance attribute holding the
    # per-split dict, plus two further attributes near the end). They are
    # not valid Python as written -- restore the names from the original
    # source before running.
    train_lines, test_lines = self.get_lines(train_file), self.get_lines(test_file)
    lines = np.concatenate([train_lines, test_lines], axis=0)
    vocab, word_to_idx, idx_to_word, max_seqlen, max_sentlen = self.get_vocab(lines)

    # NOTE(review): missing assignment target (container with "train" and
    # "test" entries).
    = {"train": {}, "test": {}}
    # NOTE(review): the "C"/"Q"/"Y" targets below are missing the container
    # name in front of the subscripts.
    S_train,["train"]["C"],["train"]["Q"],["train"]["Y"] = self.process_dataset(
        train_lines, word_to_idx, max_sentlen, offset=0
    )
    # Test sentences are offset by the number of training sentences.
    S_test,["test"]["C"],["test"]["Q"],["test"]["Y"] = self.process_dataset(
        test_lines, word_to_idx, max_sentlen, offset=len(S_train)
    )
    # Row 0 of S is an all-zero sentence placed before the train and test
    # sentences.
    S = np.concatenate([np.zeros((1, max_sentlen), dtype=np.int32), S_train, S_test], axis=0)

    # Debug output: first 10 test examples (Python 2 print statements).
    for i in range(10):
        for k in ["C", "Q", "Y"]:
            print k,["test"][k][i]
    print "batch_size:", batch_size, "max_seqlen:", max_seqlen, "max_sentlen:", max_sentlen
    print "sentences:", S.shape
    print "vocab:", len(vocab), vocab
    for d in ["train", "test"]:
        print d,
        for k in ["C", "Q", "Y"]:
            print k,[d][k].shape,
        print ""

    lb = LabelBinarizer()
    # NOTE(review): `lb.classes_` is read on the next line but no fit call is
    # visible; a fit on the vocabulary appears to be missing -- confirm.
    vocab = lb.classes_.tolist()

    self.batch_size = batch_size
    self.max_seqlen = max_seqlen
    self.max_sentlen = max_sentlen
    self.embedding_size = embedding_size
    # One more class than there are vocabulary entries.
    self.num_classes = len(vocab) + 1
    self.vocab = vocab
    self.adj_weight_tying = adj_weight_tying
    self.num_hops = num_hops
    # NOTE(review): missing assignment target (attribute storing the
    # binarizer).
    = lb
    self.init_lr = lr
    # NOTE(review): missing assignment target (attribute initialised from
    # self.init_lr).
    = self.init_lr
    self.max_norm = max_norm
    self.S = S
    self.idx_to_word = idx_to_word
    # With linear_start the network is first built without a nonlinearity.
    self.nonlinearity = None if linear_start else lasagne.nonlinearities.softmax
    self.build_network(self.nonlinearity)
def load_dataset2(self):
    """Return the shuffled training inputs with a single binary target.

    The labels are binarized and only the first indicator column is kept.
    Returns ``(X, y_bin)``, or ``None`` if the binarized labels have no
    columns.
    """
    X, y, X_test, y_test = dataset = snippet_reader.toNumpy()
    X, y = shuffle(X, y)

    lb = LabelBinarizer()
    # fit_transform: the binarizer has to learn the classes before it can
    # transform.
    indicator_columns = lb.fit_transform(y).T
    if len(indicator_columns) == 0:
        return None
    return X, indicator_columns[0]
class BaseClassifier(BaseEstimator):
    """Shared prediction and label-encoding logic for linear classifiers."""

    def predict_proba(self, X):
        """Return class probabilities for binary problems.

        Returns an array of shape ``(X.shape[0], 2)`` whose second column is
        the probability derived from the decision function.

        Raises
        ------
        NotImplementedError
            If there are not exactly two classes, or ``self.loss`` is
            neither ``'log'`` nor ``'modified_huber'``.
        """
        if len(self.classes_) != 2:
            raise NotImplementedError("predict_(log_)proba only supported"
                                      " for binary classification")

        if self.loss == "log":
            # Logistic link.
            df = self.decision_function(X).ravel()
            prob = 1.0 / (1.0 + np.exp(-df))
        elif self.loss == "modified_huber":
            # Clip the score to [-1, 1] and rescale it to [0, 1].
            df = self.decision_function(X).ravel()
            prob = np.minimum(1, np.maximum(-1, df))
            prob += 1
            prob /= 2
        else:
            raise NotImplementedError("predict_(log_)proba only supported when"
                                      " loss='log' or loss='modified_huber' "
                                      "(%s given)" % self.loss)

        out = np.zeros((X.shape[0], 2), dtype=np.float64)
        out[:, 1] = prob
        out[:, 0] = 1 - prob
        return out

    def _set_label_transformers(self, y, reencode=False, neg_label=-1):
        """Fit the label encoder/binarizer on ``y`` and record the classes.

        With ``reencode=True`` the labels are first mapped through a
        ``LabelEncoder`` (stored as ``label_encoder_``). Sets
        ``label_binarizer_`` and ``classes_``.

        Returns
        -------
        (y, n_classes, n_vectors) where ``y`` is cast to int32 and
        ``n_vectors`` is 1 for at most two classes, else ``n_classes``.
        """
        if reencode:
            self.label_encoder_ = LabelEncoder()
            y = self.label_encoder_.fit_transform(y).astype(np.int32)
        else:
            y = y.astype(np.int32)

        self.label_binarizer_ = LabelBinarizer(neg_label=neg_label,
                                               pos_label=1)
        # classes_ only exists once the binarizer has seen the targets.
        self.label_binarizer_.fit(y)
        self.classes_ = self.label_binarizer_.classes_.astype(np.int32)

        n_classes = len(self.label_binarizer_.classes_)
        n_vectors = 1 if n_classes <= 2 else n_classes
        return y, n_classes, n_vectors

    def decision_function(self, X):
        """Return ``X . coef_.T`` plus ``intercept_`` when it is set."""
        pred = safe_sparse_dot(X, self.coef_.T)
        if hasattr(self, "intercept_"):
            pred += self.intercept_
        return pred

    def predict(self, X):
        """Return predicted labels, decoded back to the original labels
        when a label encoder was fitted."""
        pred = self.decision_function(X)
        out = self.label_binarizer_.inverse_transform(pred)

        if hasattr(self, "label_encoder_"):
            out = self.label_encoder_.inverse_transform(out)

        return out
def fit(self, X, y=None):
    """Fit one ``LabelBinarizer`` per column of the DataFrame ``X``.

    The fitted binarizers are stored in ``self.binarizers_`` as
    ``(column, binarizer)`` pairs, in column order. ``y`` is ignored.

    Raises
    ------
    RuntimeError
        If ``X`` is not a pandas DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
        raise RuntimeError("Only works with DataFrames. Got {}".format(X.__class__))

    self.binarizers_ = []
    for col in X.columns:
        # Keyword arguments: recent scikit-learn releases reject positional
        # constructor parameters.
        binarizer = LabelBinarizer(neg_label=self.neg_label,
                                   pos_label=self.pos_label)
        binarizer.fit(X[col].values)
        self.binarizers_.append((col, binarizer))

    return self
def encode(self, data, label, value_set=None):
    """Add binarized indicator columns for ``data[label]`` in place.

    Parameters
    ----------
    data : mapping/DataFrame supporting ``data[label]`` and item assignment.
    label : name of the column to encode.
    value_set : optional collection of all possible values. When given, the
        encoding is learned from it instead of from the column, so the
        output columns are stable across datasets.

    New columns are named ``<label>_is_<i>`` where ``i`` is the indicator
    column index. Nothing is returned.
    """
    le = LabelBinarizer()
    if value_set is None:
        encoded = le.fit_transform(data[label])
    else:
        # The binarizer must learn the full value set before transforming.
        le.fit(value_set)
        encoded = le.transform(data[label])

    for i in range(encoded.shape[1]):
        new_label = '{0}_is_{1}'.format(label, i)
        data[new_label] = encoded[:, i]
class _CategoricalEncoder:
    """OneHotEncoder that can handle categorical variables."""

    def __init__(self):
        """Convert labeled categories into one-hot encoded features."""
        self._lb = LabelBinarizer()

    def fit(self, X):
        """Fit a list or array of categories.

        Parameters
        ----------
        * `X` [array-like, shape=(n_categories,)]:
            List of categories.
        """
        # Categories are mapped to integer codes first so the binarizer only
        # ever sees integers, whatever the category type is.
        self.mapping_ = {v: i for i, v in enumerate(X)}
        self.inverse_mapping_ = {i: v for v, i in self.mapping_.items()}
        self._lb.fit([self.mapping_[v] for v in X])
        self.n_classes = len(self._lb.classes_)

        return self

    def transform(self, X):
        """Transform an array of categories to a one-hot encoded representation.

        Parameters
        ----------
        * `X` [array-like, shape=(n_samples,)]:
            List of categories.

        Returns
        -------
        * `Xt` [array-like, shape=(n_samples, n_categories)]:
            The one-hot encoded categories.
        """
        return self._lb.transform([self.mapping_[v] for v in X])

    def inverse_transform(self, Xt):
        """Inverse transform one-hot encoded categories back to their original
           representation.

        Parameters
        ----------
        * `Xt` [array-like, shape=(n_samples, n_categories)]:
            One-hot encoded categories.

        Returns
        -------
        * `X` [array-like, shape=(n_samples,)]:
            The original categories.
        """
        Xt = np.asarray(Xt)
        return [
            self.inverse_mapping_[i] for i in self._lb.inverse_transform(Xt)
        ]
class PipelineLabelBinarizer(TransformerMixin):
    """``LabelBinarizer`` wrapper with pipeline-style ``fit(x, y)`` and
    ``transform(x, y)`` signatures.

    Constructor arguments are forwarded unchanged to ``LabelBinarizer``.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=None):
        """Fit the wrapped binarizer on ``x`` (``y`` is ignored)."""
        # Without this call transform() would run on an unfitted encoder.
        self.encoder.fit(x)
        return self

    def transform(self, x, y=None):
        """Return the binarized form of ``x`` (``y`` is ignored)."""
        return self.encoder.transform(x)
def ndcg_score(ground_truth, predictions, k=5):
    """Return the mean NDCG at rank ``k``.

    Parameters
    ----------
    ground_truth : array-like of integer class ids, one per sample.
    predictions : array-like, 2-D; one row of per-class scores per sample.
    k : int
        Rank cut-off passed to ``dcg_score``.
    """
    n_classes = np.asarray(predictions).shape[1]

    lb = LabelBinarizer()
    # One extra class is registered so that a two-class problem still yields
    # one indicator column per class instead of a single 0/1 column.
    lb.fit(range(n_classes + 1))
    T = lb.transform(ground_truth)

    scores = []
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        scores.append(float(actual) / float(best))

    return np.mean(scores)
def one_hot_encode(x):
    """
    One hot encode a list of sample labels. Return a one-hot encoded vector for each label.
    : x: List of sample Labels (class ids 0-9)
    : return: Numpy array of one-hot encoded labels
    """
    labels = list(range(10))
    lb = LabelBinarizer()
    # Fit on the full, fixed label range so every call produces the same
    # ten columns regardless of which labels appear in x.
    lb.fit(labels)
    return np.array(lb.transform(x))
def partb():
    """Train and evaluate a network on each of the 14 ``pofa{i}.npz`` splits.

    For every split the train/test accuracy is printed; at the end the
    accuracies pooled over all splits and the mean per-split test accuracy
    are printed.
    """
    from sklearn.preprocessing import LabelBinarizer

    def load(file_name):
        # Feature matrices are transposed on load; labels are returned
        # as stored. The archive is closed once the arrays are read.
        with np.load(file_name) as archive:
            X_train = archive['X_train'].T
            y_train = archive['y_train']
            X_test = archive['X_test'].T
            y_test = archive['y_test']
            X_cv = archive['X_cv'].T
            y_cv = archive['y_cv']
        return X_train, y_train, X_cv, y_cv, X_test, y_test

    train_ = [0, 0]  # [correct, total] over all splits
    test_ = [0, 0]
    overall = []

    for i in range(14):
        X_train, y_train, X_cv, y_cv, X_test, y_test = load('pofa{}.npz'.format(i))

        binarizer = LabelBinarizer()
        # Learn the classes from the training labels before transforming.
        binarizer.fit(y_train)
        Y_train = binarizer.transform(y_train).T
        Y_cv = binarizer.transform(y_cv).T

        print(X_train.shape[0], Y_train.shape[0])
        nn = NeuralNetwork([X_train.shape[0], 30, Y_train.shape[0]],
                           functions=[sigmoid, softmax],
                           derivatives=[derivative_sigmoid])
        # NOTE(review): the training call was reconstructed from a damaged
        # line; confirm the method name and first argument against
        # NeuralNetwork.
        nn.fit(X_train, Y_train, eta=0.01, momentum=0.5, minibatch=16,
               regularizer=0.15, max_iter=200, gradient_check=False,
               cv=(X_cv, Y_cv), graphs=False, lbfgs=False)

        output = nn.forward(X_train)
        y_train_output = binarizer.inverse_transform(output.T)
        y_test_output = binarizer.inverse_transform(nn.forward(X_test).T)

        print("Iteration: ", i)
        print((y_train_output == y_train).mean())
        print((y_test_output == y_test).mean())
        overall.append((y_test == y_test_output).mean())

        train_[0] += (y_train_output == y_train).sum()
        train_[1] += y_train.shape[0]
        test_[0] += (y_test_output == y_test).sum()
        test_[1] += y_test.shape[0]

    print("Average train accuracy: ", train_[0]/train_[1],
          "Average test accuracy: ", test_[0]/test_[1])
    print(train_, test_)
    overall = np.array(overall)
    print(overall.mean())
def test_labelbinarizer_vs_sklearn():
    """Compare msmbuilder.preprocessing.LabelBinarizer with
    sklearn.preprocessing.LabelBinarizer on the first label sequence."""
    # Both binarizers need to be fitted on the same label set for the outputs
    # to be comparable: the reference one on the flattened labels, the
    # sequence-aware one on the list of sequences.
    # NOTE(review): fit inputs reconstructed -- confirm against the
    # module-level `labels` fixture.
    labelbinarizerr = LabelBinarizerR()
    labelbinarizerr.fit(np.concatenate(labels))

    labelbinarizer = LabelBinarizer()
    labelbinarizer.fit(labels)

    y_ref1 = labelbinarizerr.transform(labels[0])
    y1 = labelbinarizer.transform(labels)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
class ELM(BaseEstimator):
    # Estimator with a random hidden layer (weights W, biases B) and output
    # weights `beta` obtained from a regularized matrix inverse in fit().
    #
    # NOTE(review): several statements in this class have lost their
    # left-hand side or right-hand side (marked below); the class is not
    # valid Python as written. Restore them from the original source.

    def __init__(self, h=60, activation='linear', random_state=None, C=100):
        # NOTE(review): missing assignment target for the 'elm' string.
        = 'elm'
        self.h = h
        self.activation = activation
        self.random_state = random_state
        self.C = C
        assert self.activation in ['rbf', 'sigmoid', 'linear']

    def fit(self, X, y):
        # Normalise random_state to a RandomState instance: a fresh randomly
        # seeded one when None, or one seeded with the given int.
        if self.random_state is None:
            self.random_state = np.random.RandomState(np.random.randint(0, np.iinfo(np.int32).max))
        elif type(self.random_state) == int:
            self.random_state = np.random.RandomState(self.random_state)

        # NOTE(review): missing assignment target for the binarizer.
        = LabelBinarizer()
        # Random hidden-layer parameters: W has one column per hidden unit.
        self.W = self.random_state.normal(size=(X.shape[1], self.h))
        self.B = self.random_state.normal(size=self.h)
        if self.activation == 'rbf':
            H = _elm_vectorized_rbf(X, self.W, self.B)
        elif self.activation == 'sigmoid':
            H = _elm_sigmoid(X, self.W, self.B)
        else :
            # NOTE(review): the linear-activation expression for H is
            # missing.
            H =
        # Diagonal regularization term scaled by 1/C.
        lam = np.eye(H.shape[1]) * (1./self.C)
        # NOTE(review): the first operand of the sum being inverted is
        # missing.
        H_inv = np.linalg.inv( + lam)

        # NOTE(review): the expression assigned to self.beta is missing.
        self.beta =
        return self

    def decision_function(self, X):
        # Hidden-layer activations multiplied by the fitted output weights.
        if self.activation == 'rbf':
            return _elm_vectorized_rbf(X, self.W, self.B).dot(self.beta)
        elif self.activation == 'sigmoid':
            return _elm_sigmoid(X, self.W, self.B).dot(self.beta)
        else :
            # NOTE(review): returns None for the linear activation as
            # written; the returned expression appears to be missing.
            return

    def predict(self, X):
        # NOTE(review): returns None as written; the returned expression
        # appears to be missing.
        return
def fit(self, train_data):
    # Trains one binary model per indicator column of the binarized labels
    # and collects the results of self.fit_binary in self.learned.
    # `train_data` is unpacked as an (X_data, y_data) pair.
    X_data, y_data = train_data

    self.learned = []
    lb = LabelBinarizer()
    # NOTE(review): the assignment target is missing here (presumably an
    # attribute keeping the binarizer), and no fit call on `lb` is visible
    # before lb.transform below -- restore both from the original source.
    = lb

    # We binarize the label and build a classifier for each case.
    # Thus, the number of iterations will be same as the number of the
    # classes.
    for y_bin_data in lb.transform(y_data).T:
        bin_train_data = [X_data, y_bin_data]
        params = self.fit_binary(bin_train_data)
        self.learned.append(params)
def load_dataset(self):
    """Return train/test data with single-column binary targets.

    The labels are binarized with an encoding learned from the training
    labels, and only the first indicator column of each split is kept. If a
    split produces no indicator columns its labels are returned unchanged.

    Returns
    -------
    (X, y, X_test, y_test)
    """
    X, y, X_test, y_test = dataset = snippet_reader.toNumpy()

    lb = LabelBinarizer()
    # Learn the classes once, from the original training labels, before
    # either split is transformed.
    lb.fit(y)

    train_columns = lb.transform(y).T
    if len(train_columns):
        y = train_columns[0]

    test_columns = lb.transform(y_test).T
    if len(test_columns):
        y_test = test_columns[0]

    return X, y, X_test, y_test
def load_data():
    # Builds the table of training photos with their business labels and
    # returns it together with a LabelBinarizer.

    # Join photo->business ids with the per-business label strings.
    labels=pd.read_csv("train.csv")
    bismatch=pd.read_csv("train_photo_to_biz_ids.csv")
    labels=bismatch.merge(labels,how='left',on='business_id')
    # Drop rows without labels, then parse the space-separated label string
    # into a list of ints.
    labels=labels[pd.isnull(labels['labels'])==False]
    labels['labels']=labels['labels'].map(lambda x:[int(i) for i in x.split(" ")])
    # Photo ids are taken from the file names (text before the first '.')
    # found in the training image directory.
    training_=os.listdir("train_photos/train244")
    train_ids=pd.DataFrame({"photo_id":[int(i.split(".")[0]) for i in training_]})
    # Keep only photos that have label rows.
    train_ids=train_ids.merge(labels,on='photo_id',how='inner')
    # val_ids=val_ids.merge(labels,on='photo_id',how='inner')
    # NOTE(review): this statement is damaged -- the fit call and the frame
    # whose 'business_id' column is used are missing, so the line is not
    # valid Python. Restore it from the original source.
    mlb=LabelBinarizer()['business_id'].tolist())
    # X_train=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_ids['photo_id'].tolist()]).astype(np.float32)
    # X_test=np.array([imread('train_photos/val244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()]).astype(np.float32)
    return train_ids,mlb
def ndcg_score(truth, pred):
    """Return the mean NDCG over all samples.

    Parameters
    ----------
    truth : array-like of integer class ids, one per sample.
    pred : array-like, 2-D; one row of per-class scores per sample.
    """
    n_classes = np.asarray(pred).shape[1]

    lb = LabelBinarizer()
    # One extra class is registered so that a two-class problem still yields
    # one indicator column per class instead of a single 0/1 column.
    lb.fit(range(n_classes + 1))
    T = lb.transform(truth)

    scores = []
    for y_true, y_score in zip(T, pred):
        actual = dcg_score(y_true, y_score)
        best = dcg_score(y_true, y_true)
        score = float(actual) / float(best)
        # Append the per-sample value (appending the list to itself made the
        # final mean meaningless).
        scores.append(score)

    return np.mean(scores)
def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represented as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    # asarray so that plain nested lists (as in the example) are accepted.
    n_classes = np.asarray(predictions).shape[1]

    lb = LabelBinarizer()
    # One extra class is registered so that a two-class problem still yields
    # one indicator column per class instead of a single 0/1 column.
    lb.fit(range(n_classes + 1))
    T = lb.transform(ground_truth)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        # Floor the denominator to avoid dividing by zero.
        score = float(actual) / max(0.000001, float(best))
        scores.append(score)

    return np.mean(scores)
class AlchemyDiscretizer(BaseEstimator, TransformerMixin):
    """Replace the ``alchemy_category`` column by 0/1 indicator columns.

    Parameters
    ----------
    useful_features : list of str, optional
        Category values to keep as indicator columns. When omitted (or
        empty) a built-in list of categories is used.
    """

    def __init__(self, useful_features=None):
        # Fall back to the default list only when the caller gave nothing;
        # previously the argument was ignored unconditionally.
        useful_features = useful_features or [
            'recreation', 'business', 'sports', 'unknown',
            'arts_entertainment', 'computer_internet', 'health',
            'culture_politics', 'science_technology', 'religion',
            'gaming', 'law_crime', 'weather']
        self.useful_features = useful_features
        self.alchemy_category_labeler_ = LabelBinarizer()

    def fit(self, X, y=None):
        """Learn the category values from ``X.alchemy_category``.

        X: pd.DataFrame
        """
        self.alchemy_category_labeler_.fit(X.alchemy_category.tolist())
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one indicator column per useful
        category and without the ``alchemy_category`` column."""
        encoded_X = self.alchemy_category_labeler_.transform(
            X.alchemy_category.tolist())
        result_X = X.copy()
        for i, f in enumerate(self.alchemy_category_labeler_.classes_):
            if self.useful_features is None or f in self.useful_features:
                result_X[f] = encoded_X[:, i]

        # Index.difference is the supported spelling of the former
        # ``columns - [...]`` set difference.
        kept = result_X.columns.difference(['alchemy_category'])
        return result_X[kept]
def _get_child_predict(self, clf, X, index=None):
    """Return the stage-0 output of ``clf`` for ``X`` as a 2-D feature block.

    Resolution order:

    1. If ``self.stack_by_proba`` is set and ``clf`` has ``predict_proba``,
       return its probabilities with the first column dropped. When
       ``self.save_stage0`` is set and ``index`` is given, the
       probabilities come from ``util.saving_predict_proba`` instead.
    2. Otherwise, if ``clf`` has ``predict``: classifiers get their
       predictions label-binarized, everything else gets them reshaped into
       a single column.
    3. Otherwise fall back to ``clf.fit_transform(X)``.
    """
    wants_proba = self.stack_by_proba and hasattr(clf, 'predict_proba')
    if wants_proba:
        through_saver = self.save_stage0 and index is not None
        if through_saver:
            probabilities = util.saving_predict_proba(clf, X, index)
        else:
            probabilities = clf.predict_proba(X)
        return probabilities[:, 1:]

    if not hasattr(clf, 'predict'):
        return clf.fit_transform(X)

    predicted = clf.predict(X)
    if not isinstance(clf, ClassifierMixin):
        return predicted.reshape((predicted.size, 1))
    return LabelBinarizer().fit_transform(predicted)
from sklearn.svm import SVC, LinearSVC
from sklearn import tree
from sklearn import cluster
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix as conf_mat
from viz import *
import config as C
import testing as T

# Sorted list of the class names present in the epoch-19 validation vectors.
classes = list(vh[19])
classes.sort()
X_val, y_val = collect_class_vs(vh[19])

# Both encoders are fitted on the full class list so validation and test
# labels share one encoding; transform on an unfitted encoder raises.
lb = LabelBinarizer()
lb.fit(classes)
le = LabelEncoder()
le.fit(classes)
y_val_enc = le.transform(y_val)
Y_val = lb.transform(y_val)

##load test vectors / generate test vectors using CNN:
cnn_name = 'epoch_19.model'
oname = cnn_name.split('.')[0]+'_test_pred'
ofile = os.path.join(C.obj_dir,oname)
#T.save_model_predictions(os.path.join(C.model_dir,cnn_name), tdir=C.test_dir,
#                         ofile=ofile)
test = T.load_obj(ofile)
X_test, y_test = collect_class_vs(test)
y_test_enc = le.transform(y_test)
class OneHotVector(object):
    """Two-way mapping between characters and one-hot vectors.

    Parameters
    ----------
    chars : list
        Characters that make up the vocabulary.
    added : list, optional
        Extra characters appended to ``chars``.

    Raises
    ------
    Exception
        If ``chars`` is not a list or the combined vocabulary is empty.
    """

    def __init__(self, chars: list, added: list = None):
        if type(chars) is not list:
            raise Exception('values must be list and len(values)>0 %s' % (chars,))
        # Work on a copy so the caller's list is not extended in place.
        chars = chars + list(added or [])
        if not chars:
            raise Exception('values must be list and len(values)>0 %s' % (chars,))

        self.chars = chars
        self.encoder = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
        # The encoder must learn the vocabulary before any transform call.
        self.encoder.fit(chars)

    @property
    def classes(self):
        """Distinct characters known to the encoder."""
        return self.encoder.classes_

    def __len__(self):
        return self.encoder.classes_.shape[0]

    @property
    def size(self):
        """Number of distinct characters (same as ``len(self)``)."""
        return self.encoder.classes_.shape[0]

    def __repr__(self):
        return '%s(len:%s)' % (self.__class__.__name__, self.__len__())

    def to_vector(self, char: str) -> np.ndarray:
        """
        :param char: character. len(c)==1
        :return: encoded vector for ``char``
        """
        return self.encoder.transform([char])[0]

    def to_vectors(self, chars: list) -> np.ndarray:
        """
        :param chars: list of characters (a string is split into its
            characters). len(chars)>0
        :return: one encoded row per character
        """
        if type(chars) is str or type(chars) is np.str_:
            chars = [c for c in chars]
        return self.encoder.transform(chars)

    def to_value(self, vector: np.ndarray) -> np.ndarray:
        """
        :param vector: one hot vector (flattened first if not 1-D)
        :return: the decoded character
        """
        if vector.ndim != 1:
            vector = vector.flatten()
        return self.encoder.inverse_transform(np.array([vector]))[0]

    def to_values(self, vectors: np.ndarray) -> str:
        """
        :param vectors: list of one hot vector; anything that is not 2-D is
            reshaped into rows of ``self.size`` entries
        :return: the decoded characters joined into one string
        """
        if vectors.ndim != 2:
            vectors = vectors.reshape((len(vectors) // self.size, self.size))
        return ''.join(self.encoder.inverse_transform(vectors))

    def to_index(self, c: str) -> int:
        """Return the position of the largest entry in ``c``'s vector."""
        return np.argmax(self.to_vector(c))

    def index2value(self, index):
        """Return the class at ``index``, or ``''`` when out of range."""
        # Bound by the number of classes (what is actually indexed), not by
        # len(self.chars), which can be larger when chars has duplicates.
        # NOTE(review): index 0 is rejected, as before -- confirm whether
        # that exclusion is intentional.
        if 0 < index < len(self.classes):
            return self.classes[index]
        else:
            return ''
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """Return the ROC AUC for multi-class labels via label binarization.

    Both label vectors are binarized with an encoding learned from
    ``y_test`` and passed to ``roc_auc_score`` with the given ``average``.
    """
    lb = LabelBinarizer()
    # Learn the class set from the true labels; transform on an unfitted
    # binarizer raises.
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)
def _load_pickle(path):
    """Return the object unpickled from ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files written
    # by this pipeline.
    with open(path, 'rb') as pickle_in:
        return pickle.load(pickle_in)


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file afterwards."""
    with open(path, 'wb') as pickle_out:
        pickle.dump(obj, pickle_out)


def main():
    """Vectorize the listing data at ``FLAGS.file_path`` into a sparse matrix.

    In training mode the vectorizers are fitted on the data and saved to
    ``FLAGS.save_path`` together with the sparse matrix and the price
    labels. Otherwise the previously saved vectorizers are loaded and only
    applied.
    """
    NAME_MIN_DF = 10
    MAX_FEATURES_ITEM_DESCRIPTION = 39000

    if FLAGS.file_path.endswith('.tsv'):
        dat = pd.read_table(FLAGS.file_path, engine='c')
    else:
        dat = pd.read_table(FLAGS.file_path, sep=',', engine='python')

    start_time = time.time()

    handle_missing_inplace(dat)
    print('[{}] Finished to handle missing'.format(time.time() - start_time))

    cutting(dat)
    print('[{}] Finished to cut'.format(time.time() - start_time))

    to_categorical(dat)
    print('[{}] Finished to convert categorical'.format(time.time() - start_time))

    if not FLAGS.is_training:
        # Load the fitted transformers themselves; the file handles were
        # previously kept instead, which have no transform() method.
        cv_name = _load_pickle(FLAGS.save_path + '/cv_name_save.pkl')
        cv_category = _load_pickle(FLAGS.save_path + '/cv_category_save.pkl')
        tv_desc = _load_pickle(FLAGS.save_path + '/tv_desc_save.pkl')
        lb_brand = _load_pickle(FLAGS.save_path + '/lb_brand_save.pkl')
    else:
        cv_name = CountVectorizer(min_df=NAME_MIN_DF)
        cv_name.fit(dat['name'])

        cv_category = CountVectorizer()
        cv_category.fit(dat['category_name'])

        tv_desc = TfidfVectorizer(
            max_features=MAX_FEATURES_ITEM_DESCRIPTION,
            ngram_range=(1, 3),
            stop_words='english')
        tv_desc.fit(dat['item_description'])

        lb_brand = LabelBinarizer(sparse_output=True)
        lb_brand.fit(dat['brand_name'])

    X_name = cv_name.transform(dat['name'])
    print('[{}] Finished count vectorize `name`'.format(time.time() - start_time))

    X_category = cv_category.transform(dat['category_name'])
    print('[{}] Finished count vectorize `category_name`'.format(time.time() - start_time))

    X_description = tv_desc.transform(dat['item_description'])
    print(
        '[{}] Finished TFIDF vectorize `item_description`'.format(time.time() - start_time))

    X_brand = lb_brand.transform(dat['brand_name'])
    print('[{}] Finished label binarize `brand_name`'.format(time.time() - start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(dat[['item_condition_id', 'shipping']], sparse=True).values)
    print('[{}] Finished to get dummies on `item_condition_id` and `shipping`'.
          format(time.time() - start_time))

    sparse_dat = hstack(
        (X_dummies, X_description, X_brand, X_category, X_name)).tocsr()
    print('[{}] Finished to create sparse dat'.format(time.time() - start_time))

    ## may as well get the pickle for price here
    price_label = dat['price']

    if FLAGS.is_training:
        _dump_pickle(sparse_dat, FLAGS.save_path + '/sparse_mat.pkl')
        _dump_pickle(price_label, FLAGS.save_path + '/label.pkl')

        ## save the trained vectorizes and stuff for future cleaning (like with validation set)
        # NOTE(review): the *_val files are written in training mode with the
        # same content as the files above, exactly as before -- confirm this
        # is intended.
        _dump_pickle(sparse_dat, FLAGS.save_path + '/sparse_mat_val.pkl')
        _dump_pickle(price_label, FLAGS.save_path + '/label_val.pkl')
        _dump_pickle(cv_name, FLAGS.save_path + '/cv_name_save.pkl')
        _dump_pickle(cv_category, FLAGS.save_path + '/cv_category_save.pkl')
        _dump_pickle(tv_desc, FLAGS.save_path + '/tv_desc_save.pkl')
        _dump_pickle(lb_brand, FLAGS.save_path + '/lb_brand_save.pkl')

    print('Done!')
# Prepare data: the first CSV column holds the label, the remaining columns
# the pixel values of each 28x28 image.
df_train = pd.read_csv(data_path + "/emnist-letters-train.csv")
df_test = pd.read_csv(data_path + "/emnist-letters-test.csv")

y_train = df_train.iloc[:, 0].values
y_test = df_test.iloc[:, 0].values

X_train = df_train.iloc[:, 1:].values
X_test = df_test.iloc[:, 1:].values

X_train = X_train.reshape(-1, 28, 28, 1)
X_test = X_test.reshape(-1, 28, 28, 1)

# One-hot encode the y-values: the encoding is learned from the training
# labels and reused for the test labels.
lb = LabelBinarizer()
y_train_enc = lb.fit_transform(y_train)
y_test_enc = lb.transform(y_test)

# Define hyperparameters:
num_epoch = 15
batch_size = 32

# Define model:
model = SimpleConvnet(inp_w = 28, inp_h = 28, inp_d = 1)
# NOTE(review): the training call was reconstructed from a damaged line;
# confirm the method name and first argument against SimpleConvnet.
model.fit(X_train, y_train_enc, num_epoch = num_epoch, batch_size = batch_size,
          weight_save_path = weight_save_path)
y_train = training_data['output2'] if predict_rating else training_data[ 'output1'] y_test = test_data['output2'] if predict_rating else test_data['output1'] else: #doc2vec training_data = pd.read_csv('./split_doc2vec/1/training_data.csv', header=0) test_data = pd.read_csv('./split_doc2vec/1/test_data.csv', header=0) X_train = listify(training_data['Vector']) X_test = listify(test_data['Vector']) y_train = training_data['overall'] if predict_rating else training_data[ 'Category'] y_test = test_data['overall'] if predict_rating else test_data['Category'] #one hot encoding of the ratings encode_label = LabelBinarizer() #calc class weights class_weight = None if weight_class: y_int = np.argmax(encode_label.transform(y_train), axis=1) class_weight = compute_class_weight('balanced', np.unique(y_int), y_int) class_weight = dict(enumerate(class_weight)) #grid search params to test num_hidden_layers = [1, 2, 3] num_nodes = [2, 4, 8, 16, 32] epochs = [50] batch_size = [32] callbacks = None
pred_gnb = gnb_model.predict(np.asarray(q_test)) #evaluate the model, for abstracts use multilabel_evaluation_multilabelbinarizer() for citations #use multilabel_evaluation() gnb_evaluation_scores, gnb_cm = evaluation.multilabel_evaluation_multilabelbinarizer( d_test, label_encoder.inverse_transform(pred_gnb), "Gaussian Naive Bayes") #gnb_evaluation_scores, gnb_cm = evaluation.multilabel_evaluation( # d_test, label_encoder.inverse_transform(pred_gnb), "Gaussian Naive Bayes") documentation_file_modelopt.write(gnb_evaluation_scores) #split data in training and test data d_train_single, d_test_single, q_train_single, q_test_single = train_test_split( datasets_single, q_fasttext, test_size=0.2) #prepare queries and datasets for Neural Network application label_binarizer = LabelBinarizer() d_train_binarized = label_binarizer.transform(d_train_single) pickle.dump(label_binarizer, open("label_binarizer_fasttext.sav", 'wb')) array_q_train = np.array(q_train_single) X = np.expand_dims(array_q_train, axis=2) array_q_test = np.array(q_test_single) x_t = np.expand_dims(array_q_test, axis=2) d_train_array = np.array(d_train_binarized) d_test_array = np.array(d_test_single) num_classes = len(label_binarizer.classes_) #build CNN model and evaluate the model print("CNN model evaluation") def cnn_optimization(x_train, y_train, x_test, y_test, params):
train_tags = data['Status'][:train_size] train_files_names = data['filename'][:train_size] test_posts = data['title'][train_size:] test_tags = data['Status'][train_size:] test_files_names = data['filename'][train_size:] # define Tokenizer with Vocab Size tokenizer = Tokenizer(num_words=vocab_size) tokenizer.fit_on_texts(train_posts) x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf') x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf') encoder = LabelBinarizer() y_train = encoder.transform(train_tags) y_test = encoder.transform(test_tags) model = Sequential() model.add(Dense(512, input_shape=(vocab_size,))) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(512)) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(num_labels)) model.add(Activation('softmax')) model.summary() model.compile(loss='categorical_crossentropy',
@author: mayur """ import pandas as pd from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import LabelBinarizer train = pd.read_csv('/Users/mayur/Documents/GitHub/'+ 'newtrain.csv') trial_data = train.iloc[:10,[0,4,6,8]] lb = LabelBinarizer() # checking number of unique visitor ids unique_visitors = list(trial_data.fullVisitorId.unique()) enc_channelGrouping = OneHotEncoder()['channelGrouping']) """ dict_channelGrouping = dict(enumerate(trial_data['channelGrouping'].astype( 'category').cat.categories)) dict_socialEngagementType = dict(enumerate(
def binarize_tokenized(x, vocab_len):
    """One-hot encode every token id of every tokenized sequence.

    Args:
        x: iterable of sequences of integer token ids.
            NOTE(review): ids are assumed to lie in ``[0, vocab_len)`` and
            all sequences to share one length (otherwise the outer array
            becomes ragged) -- confirm against the caller.
        vocab_len: size of the vocabulary; the binarizer is fitted on the
            ids ``0 .. vocab_len - 1`` so every sequence gets the same
            column layout.

    Returns:
        ``np.ndarray`` stacking one indicator matrix per sequence.
    """
    binarizer = LabelBinarizer()
    # transform() raises NotFittedError on an unfitted binarizer, so fit it
    # once on the whole vocabulary rather than on the data: the column order
    # then does not depend on which ids happen to occur in ``x``.
    # NOTE: for vocab_len == 2 LabelBinarizer emits a single column.
    binarizer.fit(np.arange(vocab_len))
    return np.array([binarizer.transform(sequence) for sequence in x])
labels_flat = labels_flat[ind] ''' # summary of classes class_1 = labels_flat[labels_flat == 0] class_2 = labels_flat[labels_flat == 1] class_3 = labels_flat[labels_flat == 2] print(class_1.shape[0]) print(class_2.shape[0]) print(class_3.shape[0]) # under-sample dataset params, labels_flat = RandomUnderSampler(random_state=0).fit_resample( params, labels_flat) # label binarization # feature selection # params = SelectKBest(chi2, k=10).fit_transform(params, labels_flat) print(params.shape) # create classifier clf = RandomForestClassifier(n_estimators=500, random_state=666, min_samples_split=2, min_samples_leaf=1, n_jobs=-1, criterion='entropy', max_depth=None, max_features='sqrt', class_weight=[{ 0: 1,
class DistOneVsRestClassifier(OneVsRestClassifier): """ Same as sklearn `OneVsRestClassifier` but with distributed training using spark. Additionally implements flexible ``predict_proba`` method with custom `norm` input designating the normalization method used after individual predictions are made. Args: estimator (sklearn estimator): An estimator object implementing fit and one of decision_function or predict_proba. sc (sparkContext): Spark context for spark broadcasting and rdd operations. norm (string): default None, Normalization method for predict_proba. partitions (int or 'auto'): default 'auto' Number of partitions to use for parallelization of parameter search space. Integer values or None will be used directly for `numSlices`, while 'auto' will set `numSlices` to the number required fits. max_negatives (int or float): default None Maximum number of negative records allowed for each binary estimator. Use int for hard maximum, or float for percentage of total negatives. random_state (int): default None Random state for limiting negatives (if max_negatives is not None). method (str): 'ratio' or 'multiplier' Method used to calculate true maximum number of negatives. n_splits (int): default 1 Dials the number of splits for broadcasting X during fitting. Use values higher than 1 for large X. mlb_override (bool): pass over mlb step; this assumes that input `y` to `fit` is already in sparse (one-hot-encoded) format verbose (bool): print status messages **kwargs: Keyword arguments to be passed to `OneVsRestClassifier`. 
""" def __init__(self, estimator, sc=None, norm=None, partitions='auto', max_negatives=None, random_state=None, method="ratio", n_splits=1, mlb_override=False, verbose=False, **kwargs): OneVsRestClassifier.__init__( self, estimator, **kwargs) self.norm = norm = sc self.partitions = partitions self.max_negatives = max_negatives self.random_state = random_state self.method = method self.n_splits = n_splits self.mlb_override = mlb_override self.verbose = verbose def fit(self, X, y, **fit_params): """ Fit underlying estimators. Parallelize fit operation using spark. Args: X (array-like, shape = [n_samples, n_features]): input data y (array-like, shape = [n_samples, ], [n_samples, n_classes]): multi-class targets **fit_params (dict of string -> object): parameters passed to the ``fit`` method of the estimator """ _check_estimator(self, verbose=self.verbose) if (not self.mlb_override and not hasattr(y[0], '__array__') and isinstance(y[0], Sequence) and not isinstance(y[0], str)): = MultiLabelBinarizer() y = if isinstance(X, pd.DataFrame): X.index = list(range(len(X))) self.label_binarizer_ = LabelBinarizer(sparse_output=True) self.classes_ = self.label_binarizer_.classes_ self._fit(X, y, **fit_params) del if hasattr(self.estimator, "sc"): del return self def _fit(self, X, y, **fit_params): Y = self.label_binarizer_.transform(y) Y = Y.tocsc() max_negatives = self.max_negatives random_state = self.random_state n_splits = self.n_splits method = self.method estimator = _clone(self.estimator) if is None: models_ = Parallel(n_jobs=self.n_jobs)( delayed(_fit_binary)( estimator, X, x[1], fit_params, classes=["not %s" % x[0], x[0]], max_negatives=max_negatives, random_state=random_state, method=method) for x in list(zip(self.classes_, list(col.toarray().ravel() for col in Y.T)))) else: X = _split_X(X, n_splits, partitions = _parse_partitions(self.partitions, len(self.classes_)) estimator = columns = list(zip(self.classes_, list(col.toarray().ravel() for col in Y.T))), 
numSlices=partitions) models_ = x: _fit_binary( estimator, X, x[1], fit_params, classes=["not %s" % x[0], x[0]], max_negatives=max_negatives, random_state=random_state, method=method)).collect() estimators_ = [x[0] for x in models_] classes_ = [x[1] for x in models_] self.estimators_ = list([estimators_[classes_.index(x)] for x in self.classes_]) return self def predict_proba(self, X): """ Probability estimates. The returned estimates for all classes are ordered by label of classes. Args: X (array-like, shape = [n_samples, n_features]): input data Returns: T (array-like, shape = [n_samples, n_classes]): returns the probability of the sample for each class in the model, where classes are ordered as they are in self.classes_ """ probs = [] for index in range(len(self.estimators_)): probs.append(self.estimators_[index].predict_proba(X)[:,1]) out = np.array([ [probs[y][index] for y in range(len(self.estimators_))] for index in range(len(probs[0]))]) if self.norm: return normalize(out, norm=self.norm) else: return out
y_train, test_size=0.094, random_state=832289) print("No. of training samples: %d, No. of test samples: %d, No. of validation samples: %d"\ %(len(X_train), len(X_test), len(X_valid)) ) # Data preprocessing: converting to numpy array, normalizing data, and creating # one-hot labels. X_train = np.array(X_train) X_valid = np.array(X_valid) X_test = np.array(X_test) X_train = X_train.astype('float32') X_valid = X_valid.astype('float32') X_test = X_test.astype('float32') encoder = LabelBinarizer() X_train /= 255 X_valid /= 255 X_test /= 255 y_train_onehot = encoder.transform(y_train) y_valid_onehot = encoder.transform(y_valid) y_test_onehot = encoder.transform(y_test) data_train = [X_train, y_train_onehot, X_valid, y_valid_onehot] data_test = [X_test, y_test_onehot] batch_size = 32 # batch size lr = -4 # learning rate epochs = 25 # number of training epochs hyper_params = [pow(10, lr), epochs, batch_size]
def retrain(retrained_model_name, imagenet_model_name):
    """Train and save a small classifier head on pre-computed codes.

    Reads ``labels`` (one label per line) and ``codes`` (raw float32 values)
    from ``retrained_models/<retrained_model_name>/<imagenet_model_name>/``,
    makes a stratified 80/20 split whose 20% part is halved into validation
    and test sets, trains a 256-unit ReLU layer followed by a softmax layer
    for 10 epochs with Adam, and saves the result as ``model.ckpt`` in the
    same directory.

    Args:
        retrained_model_name: name of the sub-directory of
            ``retrained_models`` holding the retraining data.
        imagenet_model_name: name of the nested sub-directory for the
            feature extractor that produced the codes.

    Returns:
        Test-set accuracy as a percentage, rounded to two decimals.
    """
    import csv
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.preprocessing import LabelBinarizer

    model_dir = ('retrained_models/' + retrained_model_name + '/' +
                 imagenet_model_name)

    # read codes and labels from file
    with open(model_dir + '/labels') as f:
        reader = csv.reader(f, delimiter='\n')
        labels = np.array([each for each in reader if len(each) > 0]).squeeze()
    # The codes file is raw binary float32 data, so open it in binary mode.
    with open(model_dir + '/codes', 'rb') as f:
        codes = np.fromfile(f, dtype=np.float32)
        codes = codes.reshape((len(labels), -1))

    # The binarizer has to be fitted before it can transform anything.
    lb = LabelBinarizer()
    labels_vecs = lb.fit_transform(labels)

    # GET VALIDATION
    ss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
    train_idx, val_idx = next(ss.split(codes, labels))

    half_val_len = int(len(val_idx) / 2)
    val_idx, test_idx = val_idx[:half_val_len], val_idx[half_val_len:]

    train_x, train_y = codes[train_idx], labels_vecs[train_idx]
    val_x, val_y = codes[val_idx], labels_vecs[val_idx]
    test_x, test_y = codes[test_idx], labels_vecs[test_idx]

    print("Train shapes (x, y):", train_x.shape, train_y.shape)
    print("Validation shapes (x, y):", val_x.shape, val_y.shape)
    print("Test shapes (x, y):", test_x.shape, test_y.shape)

    # The first weight matrix must match the width of the codes fed through
    # ``inputs_``, so derive it from the data instead of a name that is not
    # defined inside this function.
    n_features = codes.shape[1]
    n_classes = labels_vecs.shape[1]

    inputs_ = tf.placeholder(tf.float32, shape=[None, n_features],
                             name='inputs_clf')
    labels_ = tf.placeholder(tf.int64, shape=[None, n_classes],
                             name='labels_clf')

    with tf.name_scope('fc1'):
        W_fc1 = tf.Variable(
            tf.truncated_normal([n_features, 256], stddev=0.1), name='W')
        b_fc1 = tf.Variable(tf.constant(0.1, shape=[256]), name='b')
        fc1 = tf.add(tf.matmul(inputs_, W_fc1), b_fc1)
        fc1 = tf.nn.relu(fc1)

    with tf.name_scope('fc2'):
        W_fc2 = tf.Variable(
            tf.truncated_normal([256, n_classes], stddev=0.1), name='W')
        b_fc2 = tf.Variable(tf.constant(0.1, shape=[n_classes]), name='b')
        logits = tf.add(tf.matmul(fc1, W_fc2), b_fc2)
        # Named so the tensor can be looked up after the graph is restored.
        probs = tf.nn.softmax(logits, name='probs')

    with tf.name_scope('train_clf'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=labels_, logits=logits)
        cost = tf.reduce_mean(cross_entropy)
        optimizer = tf.train.AdamOptimizer().minimize(cost)

    with tf.name_scope('accuracy_clf'):
        predicted = tf.nn.softmax(logits)
        correct_pred = tf.equal(tf.argmax(predicted, 1),
                                tf.argmax(labels_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    def get_batches(x, y, n_batches=10):
        """ Return a generator that yields batches from arrays x and y. """
        batch_size = len(x) // n_batches

        for ii in range(0, n_batches * batch_size, batch_size):
            # If we're not on the last batch, grab data with size batch_size
            if ii != (n_batches - 1) * batch_size:
                X, Y = x[ii:ii + batch_size], y[ii:ii + batch_size]
            # On the last batch, grab the rest of the data
            else:
                X, Y = x[ii:], y[ii:]
            yield X, Y

    epochs = 10
    iteration = 0
    saver = tf.train.Saver()
    checkpoint_path = './' + model_dir + '/model.ckpt'
    with tf.Session() as sess:
        # Variables must be initialised before the first training step.
        sess.run(tf.global_variables_initializer())
        for e in range(epochs):
            for x, y in get_batches(train_x, train_y):
                loss, _ = sess.run([cost, optimizer],
                                   feed_dict={inputs_: x, labels_: y})
                print("Epoch: {}/{}".format(e + 1, epochs),
                      "Iteration: {}".format(iteration),
                      "Training loss: {:.5f}".format(loss))
                iteration += 1

                # Report validation accuracy every fifth training step.
                if iteration % 5 == 0:
                    val_acc = sess.run(accuracy,
                                       feed_dict={inputs_: val_x,
                                                  labels_: val_y})
                    print("Validation Acc: {:.4f}".format(val_acc))

        saver.save(sess, checkpoint_path)
        print('Model trained and saved in ' + checkpoint_path)

        test_acc = sess.run(accuracy,
                            feed_dict={inputs_: test_x, labels_: test_y})
        print("Test accuracy: {:.4f}".format(test_acc))

    return round(100 * test_acc, 2)
# Cut up to 100 random 224x224 patches out of every image, preprocess each
# patch for the network and record the label of the image it came from.
for i, img in enumerate(imgs_ordered):
    patches = image.extract_patches_2d(img, (224, 224), max_patches=100)
    label = data['primary_microconstituent'][i]
    for patch in patches:
        x = Image.fromarray(patch).convert('RGB')
        x = np.asarray(x)
        #x = np.expand_dims(patch, axis = 2)
        x = preprocess_input(x)
        processed_imgs.append(x)
        labels.append(label)
    progbar(i, (len(imgs_ordered) - 1), 20)

# Fit the binarizer on the label column before transforming: an unfitted
# LabelBinarizer cannot transform.
lb = LabelBinarizer()
lb.fit(list(data['primary_microconstituent']))
y = lb.transform(labels)
print('\nLabels Binarized, converting array')

input = np.asarray(processed_imgs)

# Hold out 10% of the patches for testing; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    input, y, test_size=0.1, random_state=42)

# NOTE(review): classes=7 must equal the number of columns produced by the
# binarizer above -- confirm against the data.
model = DenseNet169(weights=None, classes=7)
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

time_callback = TimeHistory()
class MultiLabelSKFlow(BaseEstimator): """ This is a wrapper class for TensorFlow, so it adheres to the fit/predict naming conventions of sk-learn. This class handles the output layer, mini-batch learning, early stopping, threshold optimization on the validation set, and the neural metalabeler. The concrete TensorFlow model up to the last hidden layer can be specified in terms of the 'get_model' function. This function in turn has to accept the dataset X (np.array, or csr_matrix), and the goldstandard y (csr_matrix). Moreover, get_model() is expected to return the following components: x_tensor: tf.placeholder Used to pass input data to the model at training and test time. y_tensor: tf.placeholder Used by to pass the ground truth to the model during training. last_layer: tf.Tensor The TensorFlow computation graph from input layer to last hidden layer of the implemented neural network. params_fit: dictionary Parameters to be added to the feed dictionary for training (e.g., keep_probability_placeholder -> 0.5) params_predict: dictionary Parameters to be added to the feed dictionary at prediction time (e.g., keep_probability_placeholder -> 1.0) initializer_operations: list of (tf.Tensor, dictionary) A list of pairs consisting of operations for initializing variables (e.g., embedding tables) before training starts, and the feed dictionary with data to execute the initialize operation. Moreover, training can be controlled by the following parameters: Parameters ---------- batch_size: int, default = 5 Batch size to use during training and at prediction time. num_epochs: int, default = 10 Number of iterations over the dataset during training. get_model: function, default = mlp_base() The function that returns the underlying neural network up to the last hidden layer. See above description. threshold: float, default = 0.2 Fixed threshold to use if "optimize_threshold" = False, or starting threshold when "optimize_threshold" = True. 
learning_rate: float, default = 0.1 Initial learning rate to use for Adam. patience, int, default = 5 Number of non-improving evaluations on the validation set before terminating training. validation_metric, function true_values, predicted_values -> float, default = f1_score The metric that is used for evaluating prediction on the validation set. optimize_threshold, boolean, default = True Determines whether the threshold is optimized on a validation set. threshold_window, array-like of float, default = np.linspace(-0.03, 0.03, num=7) An array of floats that are interpreted as offset from the current threshold value. When optimizing the threshold, each of these offsets is added to the current threshold and the validation performance is assessed. Afterwards, the threshold is set to the value that has yielded the best score. tf_model_path, str, default = ".tmp_best_models" A path to the folder where the weights of the best model are saved, so it can be loaded at prediction time. num_steps_before_validation, int, default = None Determines the number of batches between two performance evaluations on the validation set. If set to None, this number is determined from the size of the training set, i.e., it is set to one epoch. hidden_activation_function, TensorFlow operation, default = tf.nn.relu The activation function to apply after the bottleneck layer. bottleneck_layers, list of int, default = None As many layers as there are elements in this list are injected before the output layer. Element i specifies the number of units in bottleneck layer i. hidden_keep_prob, float, default = 0.5 Specifies the keep probability of dropout to apply after each bottleneck layer. gpu_memory_fraction, float, default = 1. Specifies how much of the RAM of each available GPU TensorFlow may reserve. meta_labeler_phi, str, default = None Determines which 'phi' function from the definition of Neural MetaLabeler we use: "content", "score", or None. 
If none is used, MetaLabeler is not used at all. If "content" is used, the prediction is based on the output of the last hidden layer from the underlying neural network (given by get_model). If "score" is used, the prediction is based on the probabilities given by the output layer. meta_labeler_alpha, float, default = 0.1 The label-classification objective is weighted by (1 - alpha), and the objective of predicting the number of labels is weighted by alpha. meta_labeler_min_labels, int, default = 1 Specifies the smallest possible number of labels that can be predicted by Neural MetaLabeler. meta_labeler_max_labels, int, default = None Specifies the largest possible number of labels that can be predicted by Neural MetaLabeler. If set to None, the maximum number of labels is determined from the training set. """ def __init__(self, batch_size=5, num_epochs=10, get_model=mlp_base(), threshold=0.2, learning_rate=0.1, patience=5, validation_metric=lambda y1, y2: f1_score( y1, y2, average="samples"), optimize_threshold=True, threshold_window=np.linspace(-0.03, 0.03, num=7), tf_model_path=".tmp_best_models", num_steps_before_validation=None, hidden_activation_function=tf.nn.relu, bottleneck_layers=None, hidden_keep_prob=0.5, gpu_memory_fraction=1., meta_labeler_phi=None, meta_labeler_alpha=0.1, meta_labeler_min_labels=1, meta_labeler_max_labels=None): """ """ self.get_model = get_model # enable early stopping on validation set self.validation_data_position = None self.num_steps_before_validation = num_steps_before_validation # configurations for bottleneck layers self.hidden_activation_function = hidden_activation_function self.bottleneck_layers = bottleneck_layers self.hidden_keep_prob = hidden_keep_prob # configuration for meta-labeler self.meta_labeler_phi = meta_labeler_phi self.meta_labeler_alpha = meta_labeler_alpha self.num_label_binarizer = None self.meta_labeler_max_labels = meta_labeler_max_labels self.meta_labeler_min_labels = meta_labeler_min_labels # used by 
this class self.validation_metric = validation_metric self.optimize_threshold = optimize_threshold self.threshold_window = threshold_window self.patience = patience self.batch_size = batch_size self.num_epochs = num_epochs self.threshold = threshold if learning_rate is None: self.learning_rate = self.batch_size / 512 * 0.01 else: self.learning_rate = learning_rate # path to save the tensorflow model to self.TF_MODEL_PATH = tf_model_path self._save_model_path = self._get_save_model_path() # determine how much of gpu to use self.gpu_memory_fraction = gpu_memory_fraction def _get_save_model_path(self): TMP_FOLDER = self.TF_MODEL_PATH if not os.path.exists(TMP_FOLDER): os.makedirs(TMP_FOLDER) return TMP_FOLDER + "/best-model-" + self.get_model.__name__ + str( def _calc_num_steps(self, X): return int(np.ceil(X.shape[0] / self.batch_size)) def _predict_batch(self, X_batch): feed_dict = {self.x_tensor: X_batch} feed_dict.update(self.params_predict) if self.meta_labeler_phi is None: predictions =, feed_dict=feed_dict) else: predictions = [self.predictions, self.meta_labeler_prediction], feed_dict=feed_dict) return predictions def _make_binary_decision(self, predictions): if self.meta_labeler_phi is None: y_pred = predictions > self.threshold else: predictions, meta_labeler_predictions = predictions max_probability_cols = np.argmax(meta_labeler_predictions, axis=1) max_probability_indices = tuple( np.indices([meta_labeler_predictions.shape[0] ])) + (max_probability_cols, ) meta_labeler_predictions = np.zeros_like(meta_labeler_predictions) meta_labeler_predictions[max_probability_indices] = 1 meta_labeler_predictions = self.num_label_binarizer.inverse_transform( meta_labeler_predictions, 0) y_pred = np.zeros_like(predictions) for i in range(predictions.shape[0]): num_labels_for_sample = meta_labeler_predictions[i] top_indices = ( -predictions[i, :]).argsort()[:num_labels_for_sample] y_pred[i, top_indices] = 1 return csr_matrix(y_pred) def _compute_validation_score(self, 
session, X_val_batch, y_val_batch): feed_dict = {self.x_tensor: X_val_batch} feed_dict.update(self.params_predict) if self.validation_metric == "val_loss": return, feed_dict=feed_dict) elif callable(self.validation_metric): predictions = self._predict_batch(X_val_batch) y_pred = self._make_binary_decision(predictions) if self.optimize_threshold: return self.validation_metric(y_val_batch, y_pred), predictions else: return self.validation_metric(y_val_batch, y_pred) def _print_progress(self, epoch, batch_i, steps_per_epoch, avg_validation_score, best_validation_score, total_loss, meta_loss, label_loss): progress_string = 'Epoch {:>2}/{:>2}, Batch {:>2}/{:>2}, Loss: {:0.4f}, Validation-Score: {:0.4f}, Best Validation-Score: {:0.4f}' format_parameters = [ epoch + 1, self.num_epochs, batch_i + 1, steps_per_epoch, total_loss, avg_validation_score, best_validation_score ] if self.meta_labeler_phi is None: progress_string += ', Threshold: {:0.2f}' format_parameters.append(self.threshold) else: progress_string += ', Label-Loss: {:0.4f}, Meta-Loss: {:0.4f}' format_parameters.extend([label_loss, meta_loss]) progress_string = progress_string.format(*format_parameters) print(progress_string, end='\r') def _num_labels_discrete(self, y, min_number_labels=1, max_number_labels=None): """ Counts for each row in 'y' how many of the columns are set to 1. Outputs the result in turn as a binary indicator matrix where the columns 0, ..., m correspond to 'min_number_labels', 'min_number_labels' + 1, ..., 'max_number_labels'. Parameters ---------- y: (sparse) numpy array of shape [n_samples, n_classes] An indicator matrix denoting which classes are assigned to a sample (multiple columns per row may be 1) min_number_labels: int, default=1 Minimum number of labels each sample has to have. If a sample has less than 'min_number_labels' assigned, the corresponding output is set to 'min_number_labels'. max_number_labels: int, default=None Maximum number of labels each sample has to have. 
If a sample has more than 'min_number_labels' assigned, the corresponding output is set to 'max_number_labels'. If 'max_number_labels' is None, it is set to the max number found in y. Returns --------- num_samples_y: (sparse) numpy array of shape [n_samples, max_number_samples - min_number_samples + 1] """ num_samples_y = np.array(np.sum(y, axis=1)) num_samples_y = num_samples_y.reshape(-1) num_samples_y[num_samples_y < min_number_labels] = min_number_labels if max_number_labels is None: max_number_labels = np.max(num_samples_y) num_samples_y[num_samples_y > max_number_labels] = max_number_labels # 'fit' method calls this if self.num_label_binarizer is None: self.num_label_binarizer = LabelBinarizer() indicator_matrix_num_labels = self.num_label_binarizer.transform( num_samples_y) return indicator_matrix_num_labels def fit(self, X, y): self.y = y val_pos = self.validation_data_position if val_pos is not None: X_train, y_train, X_val, y_val = X[:val_pos, :], y[:val_pos, :], X[ val_pos:, :], y[val_pos:, :] validation_batch_generator = BatchGenerator( X_val, y_val, self.batch_size, False, False) validation_predictions = self._calc_num_steps(X_val) steps_per_epoch = self._calc_num_steps(X_train) # determine after how many batches to perform validation num_steps_before_validation = self.num_steps_before_validation if self.num_steps_before_validation is None: num_steps_before_validation = steps_per_epoch num_steps_before_validation = int( min(steps_per_epoch, num_steps_before_validation)) else: steps_per_epoch = self._calc_num_steps(X) X_train = X y_train = y # Remove previous weights, bias, inputs, etc.. 
tf.reset_default_graph() tf.set_random_seed(1337) # get_model has to return a self.x_tensor, self.y_tensor, self.last_layer, self.params_fit, self.params_predict, initializer_operations = self.get_model( X, y) # add bottleneck layer if self.bottleneck_layers is not None: bottleneck_dropout_tensor = tf.placeholder( tf.float32, name="bottleneck_dropout") self.params_fit.update( {bottleneck_dropout_tensor: self.hidden_keep_prob}) self.params_predict.update({bottleneck_dropout_tensor: 1}) for units in self.bottleneck_layers: self.last_layer = tf.contrib.layers.fully_connected( self.last_layer, units, activation_fn=self.hidden_activation_function) self.last_layer = tf.nn.dropout(self.last_layer, bottleneck_dropout_tensor) # Name logits Tensor, so that is can be loaded from disk after training #logits = tf.identity(logits, name='logits') logits = tf.contrib.layers.linear(self.last_layer, num_outputs=y.shape[1]) # Loss and Optimizer losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.y_tensor) loss = tf.reduce_sum(losses, axis=1) self.label_loss = tf.reduce_mean(loss, axis=0) # prediction self.predictions = tf.sigmoid(logits) if self.meta_labeler_phi is not None: # compute target of meta labeler y_num_labels = self._num_labels_discrete( y_train, min_number_labels=self.meta_labeler_min_labels, max_number_labels=self.meta_labeler_max_labels) y_num_labels_tensor = tf.placeholder(tf.float32, shape=(None, y_num_labels.shape[1]), name="y_num_labels") # compute logits of meta labeler if self.meta_labeler_phi == "content": meta_logits = tf.contrib.layers.linear( self.last_layer, num_outputs=y_num_labels.shape[1]) elif self.meta_labeler_phi == "score": meta_logits = tf.contrib.layers.linear( self.predictions, num_outputs=y_num_labels.shape[1]) # compute loss of meta labeler meta_labeler_loss = tf.nn.softmax_cross_entropy_with_logits( labels=y_num_labels_tensor, logits=meta_logits) self.meta_labeler_loss = tf.reduce_mean(meta_labeler_loss, axis=0) # compute 
prediction of meta labeler self.meta_labeler_prediction = tf.nn.softmax(meta_logits) # add meta labeler loss to labeling loss self.loss = ( 1 - self.meta_labeler_alpha ) * self.label_loss + self.meta_labeler_alpha * self.meta_labeler_loss else: self.loss = self.label_loss # optimize update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize( self.loss) gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=self.gpu_memory_fraction) session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) self.session = session # Initializing the variables for (init_op, init_op_feed_dict) in initializer_operations:, feed_dict=init_op_feed_dict) batch_generator = BatchGenerator(X_train, y_train, self.batch_size, True, False) # Training cycle objective = 1 if self.validation_metric == "val_loss" else -1 avg_validation_score = math.inf * objective best_validation_score = math.inf * objective epochs_of_no_improvement = 0 most_consecutive_epochs_with_no_improvement = 0 batches_counter = 0 epoch = 0 stop_early = False while epoch < self.num_epochs and not stop_early: if val_pos is not None and epochs_of_no_improvement == self.patience: break # Loop over all batches for batch_i in range(steps_per_epoch): X_batch, y_batch = batch_generator._batch_generator() feed_dict = {self.x_tensor: X_batch, self.y_tensor: y_batch} feed_dict.update(self.params_fit) if self.meta_labeler_phi is not None: feed_dict.update({ y_num_labels_tensor: self._num_labels_discrete(y_batch) }), feed_dict=feed_dict) # overwrite parameter values for prediction step feed_dict.update(self.params_predict) # compute losses to track progress if self.meta_labeler_phi is not None: total_loss, label_loss, meta_loss = [self.loss, self.label_loss, self.meta_labeler_loss], feed_dict=feed_dict) else: total_loss =, feed_dict=feed_dict) label_loss, meta_loss = None, None batches_counter += 1 is_last_epoch = epoch == 
self.num_epochs - 1 is_last_batch_in_epoch = batch_i == steps_per_epoch - 1 # calculate validation loss at end of epoch if early stopping is on if val_pos is not None and ( batches_counter == num_steps_before_validation or (is_last_epoch and is_last_batch_in_epoch)): batches_counter = 0 validation_scores = [] weights = [] # save predictions so we can optimize threshold later val_predictions = np.zeros( (X_val.shape[0], self.y.shape[1])) for i in range(validation_predictions): X_val_batch, y_val_batch = validation_batch_generator._batch_generator( ) weights.append(X_val_batch.shape[0]) if self.optimize_threshold: batch_val_score, val_predictions[ i * self.batch_size:(i + 1) * self. batch_size, :] = self._compute_validation_score( session, X_val_batch, y_val_batch) else: batch_val_score = self._compute_validation_score( session, X_val_batch, y_val_batch) validation_scores.append(batch_val_score) avg_validation_score = np.average( np.array(validation_scores), weights=np.array(weights)) if self.optimize_threshold: best_score = -1 * math.inf best_threshold = self.threshold for t_diff in self.threshold_window: t = self.threshold + t_diff score = self.validation_metric( y_val, csr_matrix(val_predictions > t)) if score > best_score: best_threshold = t best_score = score is_better_score = avg_validation_score < best_validation_score if objective == 1 else avg_validation_score > best_validation_score if is_better_score: # save model # Save model for prediction step best_validation_score = avg_validation_score saver = tf.train.Saver(), self._save_model_path) if most_consecutive_epochs_with_no_improvement < epochs_of_no_improvement: most_consecutive_epochs_with_no_improvement = epochs_of_no_improvement epochs_of_no_improvement = 0 # save the threshold at best model, too. 
if self.optimize_threshold: self.threshold = best_threshold else: epochs_of_no_improvement += 1 if epochs_of_no_improvement > self.patience: print("No improvement in validation loss for", self.patience, "epochs. Stopping early.") stop_early = True break # print progress self._print_progress(epoch, batch_i, steps_per_epoch, avg_validation_score, best_validation_score, total_loss, meta_loss, label_loss) epoch += 1 print('') print("Training of TensorFlow model finished!") print("Longest sequence of epochs of no improvement:", most_consecutive_epochs_with_no_improvement) def predict(self, X): session = self.session #loaded_graph = tf.Graph() if self.validation_data_position: # Load model loader = tf.train.import_meta_graph(self._save_model_path + '.meta') loader.restore(self.session, self._save_model_path) prediction = np.zeros((X.shape[0], self.y.shape[1])) batch_generator = BatchGenerator(X, None, self.batch_size, False, True) prediction_steps = self._calc_num_steps(X) for i in range(prediction_steps): X_batch = batch_generator._batch_generator() preds = self._predict_batch(X_batch) binary_decided_preds = self._make_binary_decision(preds) prediction[i * self.batch_size:(i + 1) * self.batch_size, :] = binary_decided_preds.todense() result = csr_matrix(prediction) # close the session, since no longer needed session.close() return result
def roc_multiclass_cruve_nn(y_test_class, y_pred_class):
    """Plot per-class, micro- and macro-averaged ROC curves and save them.

    Both label vectors are binarized with a ``LabelBinarizer`` fitted on the
    ground truth, one ROC curve is computed per indicator column, and the
    figure is written to ``'ROC For Neural Network'`` via ``plt.savefig``.

    Args:
        y_test_class: array-like of true class labels.
        y_pred_class: array-like of predicted class labels; a label that
            never occurs in ``y_test_class`` becomes an all-zero row.

    Returns:
        None.
    """
    lb = LabelBinarizer()
    # The binarizer must be fitted before transform(); fitting on the ground
    # truth fixes the column order used for both vectors.
    y_test_b = lb.fit_transform(y_test_class)
    y_pred_b = lb.transform(y_pred_class)

    # Number of indicator columns, taken from the data rather than assumed.
    n_classes = y_test_b.shape[1]

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_b[:, i], y_pred_b[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_b.ravel(),
                                              y_pred_b.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    lw = 1

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    # cycle() repeats the palette if there are more than three classes.
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                       ''.format(i, roc_auc[i]))

    # Chance diagonal for reference.
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC For Neural Network')
    plt.legend(loc="lower right")
    plt.savefig('ROC For Neural Network')
    return
def nnCostFunction(nn_params, *args):
    """Regularised cost and gradient of a one-hidden-layer neural network.

    Args:
        nn_params: flat array holding ``Theta1`` (``hid_size`` x
            ``in_size + 1``) followed by ``Theta2`` (``num_labels`` x
            ``hid_size + 1``).
        *args: ``(in_size, hid_size, num_labels, X, y, lam)`` where ``X`` is
            the ``(m, in_size)`` sample matrix, ``y`` the class labels and
            ``lam`` the regularisation strength.

    Returns:
        ``(J, grad)``: the scalar cost and the gradient flattened in the
        same layout as ``nn_params``.
    """
    in_size, hid_size, num_labels, X, y, lam = args

    split = (in_size + 1) * hid_size
    Theta1 = nn_params[:split].reshape((hid_size, in_size + 1))
    Theta2 = nn_params[split:].reshape((num_labels, hid_size + 1))

    Theta1_grad = np.zeros(Theta1.shape)
    Theta2_grad = np.zeros(Theta2.shape)

    m = X.shape[0]
    # Prepend the bias column.
    X = np.hstack((np.ones((m, 1)), X))

    # One-hot encode the labels; the binarizer has to be fitted first.
    # NOTE(review): assumes every class occurs in ``y`` so the encoding has
    # ``num_labels`` columns -- confirm against the caller.
    lb = LabelBinarizer()
    y = lb.fit_transform(y)

    J = 0
    for i in range(m):
        a1 = X[i, :]
        yi = y[i]

        # forward propagation
        z2 = np.dot(Theta1, a1)
        a2 = np.hstack((1, sigmoid(z2)))
        z3 = np.dot(Theta2, a2)
        a3 = sigmoid(z3)

        J += np.sum(-yi * safe_log(a3) - (1 - yi) * safe_log(1 - a3))

        # backpropagation
        delta3 = a3 - yi
        delta2 = np.dot(Theta2.T, delta3) * sigmoidGradient(
            np.hstack((1, z2)))
        # Drop the bias component: it has no incoming weights.
        delta2 = delta2[1:]

        Theta1_grad += np.outer(delta2, a1)
        Theta2_grad += np.outer(delta3, a2)

    J /= m

    # Regularise every weight except the bias column (column 0).
    temp = np.sum(Theta1[:, 1:] ** 2) + np.sum(Theta2[:, 1:] ** 2)
    J += lam / (2.0 * m) * temp

    # The penalty's gradient is (lam / m) * Theta, not a rescaling of the
    # accumulated gradient itself.
    Theta1_grad /= m
    Theta1_grad[:, 1:] += (lam / m) * Theta1[:, 1:]
    Theta2_grad /= m
    Theta2_grad[:, 1:] += (lam / m) * Theta2[:, 1:]

    grad = np.hstack((np.ravel(Theta1_grad), np.ravel(Theta2_grad)))
    # Single-argument form prints the same text on Python 2 and Python 3.
    print("J = %s" % (J,))
    return J, grad
# Script fragment (whitespace-collapsed onto a single line): imports, loading
# of the tab-separated train/test CSVs from "dataSet//", a LabelBinarizer for
# the EVT_LBL column, and small group helpers returning list / set / set size.
# NOTE(review): `model_one_hot = LabelBinarizer()['EVT_LBL'])` is not valid
# Python -- a call (presumably `model_one_hot.fit(train_log['EVT_LBL'])`) looks
# elided; confirm against the upstream source.
# NOTE(review): `calc_continue_day` has no body in view.
from gensim.models import word2vec from scipy.stats import skew from scipy.stats import kurtosis from sklearn.preprocessing import LabelBinarizer path = "dataSet//" '''train''' train_log = pd.read_csv(path + 'train_log.csv', encoding='utf-8', sep='\t') train_agg = pd.read_csv(path + 'train_agg.csv', encoding='utf-8', sep='\t') train_flg = pd.read_csv(path + 'train_flg.csv', encoding='utf-8', sep='\t') '''test''' test_log = pd.read_csv(path + 'test_log.csv', encoding='utf-8', sep='\t') test_agg = pd.read_csv(path + 'test_agg.csv', encoding='utf-8', sep='\t') '''EVT_LBL one-hot feature''' model_one_hot = LabelBinarizer()['EVT_LBL']) def return_list(group): return list(group) def return_set(group): return set(group) def return_set_len(group): return len(set(group)) def calc_continue_day(group):
def plot_mc_roc(y_test, y_score, interpreter=None):
    '''
    Plot per-class, micro-average and macro-average ROC curves.

    :param y_test: numpy array of true class labels
    :param y_score: numpy array of predictions aligned with y_test
    :param interpreter: optional fitted encoder used during preprocessing; its
        ``inverse_transform`` turns an encoded class into the legend name.
        When None the encoded class value itself is shown.
    :return: the matplotlib figure holding the ROC plot

    NOTE(review): the ``label_binarizer.fit(np.concatenate(...))`` call was
    partly elided in the original text and is reconstructed from the
    remaining ``, y_score)))`` residue.
    '''
    lw = 2
    n_classes = len(np.unique(y_test))
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(np.concatenate((y_test, y_score)))

    if n_classes != 2:
        y_test = label_binarizer.transform(y_test)
        y_score = label_binarizer.transform(y_score)
        # The binarizer saw y_score too, so trust its column count.
        n_classes = y_test.shape[1]
    else:
        # Two classes: a single curve over the raw values.
        n_classes = 1
        y_test = y_test.reshape(-1, 1)
        y_score = y_score.reshape(-1, 1)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    img = plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    for i in range(n_classes):
        encoded_class = label_binarizer.classes_[i]
        if interpreter is None:
            # No decoder supplied: fall back to the encoded value.
            class_name = encoded_class
        else:
            class_name = interpreter.inverse_transform([[encoded_class]])[0]
        plt.plot(fpr[i], tpr[i], lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                       ''.format(class_name, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    return img
# a = 0.1
# b = 0.9
# grayscale_min = 0
# grayscale_max = 255
# return a + ( ( (image_data - grayscale_min)*(b - a) )/( grayscale_max - grayscale_min ) )

# Scale pixel values from [0, 255] into [0.01, 1.0]; guarded so that running
# this section again does not normalize twice.
if not is_features_normal:
    train_features = (train_features / 255.0 * 0.99) + 0.01
    test_features = (test_features / 255.0 * 0.99) + 0.01
    is_features_normal = True

print('Tests Passed!')

if not is_labels_encod:
    # Encode the 10 letter labels.
    # Turn labels into numbers and apply One-Hot Encoding.
    # LabelBinarizer is scikit-learn's tool for binarizing labels.
    encoder = LabelBinarizer()
    # NOTE(review): the fit call is missing from the original text; transform
    # cannot run on an unfitted binarizer.
    encoder.fit(train_labels)
    train_labels = encoder.transform(train_labels)
    test_labels = encoder.transform(test_labels)

    # Change to float32, so it can be multiplied against the features in
    # TensorFlow, which are float32
    train_labels = train_labels.astype(np.float32)
    test_labels = test_labels.astype(np.float32)
    is_labels_encod = True

# print('Labels One-Hot Encoded')

# Build the training and validation sets.
train_features, valid_features, train_labels, valid_labels = train_test_split(
    train_features,
    train_labels,
    test_size=0.05,
    random_state=832289)

print('Training features and labels randomized and split.')
print(" X train shape: ", X_train.shape, "\n X test shape:", X_test.shape,
      " \n y train shape:", y_train.shape, "\n y test shape:", y_test.shape)

# Pipeline: it takes a list of tuples as parameter
pipeline_1 = Pipeline([('scaler', StandardScaler()),
                       ('clf', LogisticRegression())])

# use the pipeline object as you would a regular classifier
# NOTE(review): the receiver and first argument of this call were elided in
# the original text (only ", y_train)" survived).
pipeline_1.fit(X_train, y_train)
y_preds = pipeline_1.predict(X_test)
print(y_preds)
ac_score = accuracy_score(y_test, y_preds)
print("Log reg: ", ac_score)

# Another Pipeline: it takes a list of tuples as parameter
pipeline_2 = Pipeline([('scaler', StandardScaler()),
                       ('clf', SVC())])

# use the pipeline object as you would a regular classifier
pipeline_2.fit(X_train, y_train)
y_preds = pipeline_2.predict(X_test)
print(y_preds)
ac_score = accuracy_score(y_test, y_preds)
print("SVM classifier: ", ac_score)

# Another Example
from sklearn.preprocessing import LabelBinarizer

# NOTE: `bin` shadows the builtin; the name is kept because code after this
# fragment may refer to it.
bin = LabelBinarizer()  # first we initialize
vec = ['cat', 'dog', 'dog', 'dog']  # we have our label list we want binarized
# classes_ exists only after fitting, so the binarizer must be fit first.
bin.fit(vec)
print(bin.classes_)
print(bin.transform(vec))
def preprocess(sentences, targets, tokenizer=None, summary=False, labelizer=None,
               statssummary=False, pad=False, replace_dig=True):
    """Tokenize sentences, binarize targets and collect corpus statistics.

    Parameters
    ----------
    sentences : sequence of str
        Raw input sentences.
    targets : array-like
        One class label per sentence.
    tokenizer : optional
        Fitted tokenizer. When None a new ``Tokenizer`` is fitted on
        ``sentences`` and pickled to ``./out/tokenizer.pickle``.
    summary : bool
        Print corpus statistics when True.
    labelizer : optional
        Fitted label binarizer. When None a new ``LabelBinarizer`` is fitted
        on ``targets`` and pickled to ``./out/label_encoder.pickle``.
    statssummary : bool
        Unused; kept for call compatibility.
    pad : bool
        Pad sequences to the longest sentence when True. Otherwise return an
        array of per-sentence arrays.
    replace_dig : bool
        Replace every digit by ``'DIG'`` before counting the vocabulary. This
        affects only the vocabulary size, not the tokenized output.

    Returns
    -------
    (X, y, stats)
        Encoded sentences, binarized targets and a dict with the keys
        ``nb_sentences``, ``nb_words``, ``nb_classes``, ``class_cnt``,
        ``max_len`` and ``weights``.
    """
    out_dir = './out/'

    vocab = set()
    for s in sentences:
        if replace_dig:
            s = re.sub(r'\d', 'DIG', s)
        vocab |= set(s.split())
    lengths = [len(s.split()) for s in sentences]

    # Compute the label set once and reuse it for every statistic.
    classes, counts = np.unique(targets, return_counts=True)
    stats = {
        'nb_sentences': len(sentences),
        'nb_words': len(vocab),
        'nb_classes': len(classes),
        'class_cnt': dict(zip(classes, counts)),
        'max_len': max(lengths),
        # Keyword arguments are required by recent scikit-learn releases and
        # accepted by older ones.
        'weights': compute_class_weight('balanced', classes=classes, y=targets),
    }

    if summary:
        print(f'Total: {len(sentences)} sentences ')
        print(f'Max sentence length {max(lengths)}')
        print(f'Number of words: {len(vocab)}')
        print(f'Number of classes: {len(np.unique(targets))}')

    if tokenizer is None:
        tokenizer = Tokenizer(stats['nb_words'], oov_token='UNK')
        tokenizer.fit_on_texts(sentences)
        os.makedirs(out_dir, exist_ok=True)
        with open('./out/tokenizer.pickle', 'wb') as file:
            pickle.dump(tokenizer, file)

    X = tokenizer.texts_to_sequences(sentences)
    if pad:
        X = pad_sequences(X, maxlen=stats['max_len'])
    else:
        X = np.asarray([np.asarray(xx) for xx in X])

    if labelizer is None:
        labelizer = LabelBinarizer()
        # Fit before persisting, so the pickle holds a usable encoder.
        labelizer.fit(targets)
        # The directory may not exist yet when a tokenizer was passed in.
        os.makedirs(out_dir, exist_ok=True)
        with open('./out/label_encoder.pickle', 'wb') as file:
            pickle.dump(labelizer, file)

    y = labelizer.transform(targets)
    return X, y, stats
# Fragment (whitespace-collapsed): the tail of a function that joins each row
# of `pred` into one space-separated string, followed by the script entry
# point. The entry point validates argv (<dest.csv> then one or more .npz
# files), loads every file's "predict" array into a
# (n_test_files, n_inputs, NUM_CLASSES) buffer and merges along axis 1 with
# "geom_mean".
# NOTE(review): `label_binarizer = LabelBinarizer()["label"])` is not valid
# Python -- a call (presumably `label_binarizer.fit(train_df["label"])`) looks
# elided; confirm against the upstream source.
joined_pred = np.zeros(n_test, dtype=object) for row in range(n_test): joined_pred[row] = " ".join(pred[row, :]) return joined_pred if __name__ == "__main__": if len(sys.argv) < 3 or not sys.argv[1].endswith(".csv"): print("usage: %s <dest.csv> <file1.npz> <file2.npz> ..." % sys.argv[0]) sys.exit() train_df = pd.read_csv("../data/train.csv", index_col="fname") label_binarizer = LabelBinarizer()["label"]) test_files = np.array(find_files("../data/audio_test/")) test_idx = [os.path.basename(f) for f in test_files] dest_filename = sys.argv[1] predict_count = len(sys.argv[2:]) pred = np.zeros((len(test_idx), predict_count, NUM_CLASSES)) for k, pred_name in enumerate(sys.argv[2:]): print("reading", pred_name) pred[:, k, :] = np.load(pred_name)["predict"] pred = merge_predictions(pred, "geom_mean", axis=1) print("predictions after final merge", pred.shape)
def eval_individual_device(train_data_file, dname, specified_models=None):
    """Train, evaluate and persist every not-yet-trained model for one device.

    Assumptions: ``train_data_file`` contains a single device with all of its
    possible states (tags); supported algorithms are ``knn``, ``kmeans``,
    ``spectral``, ``dbscan`` and ``rf`` (random forest classifier).

    Parameters
    ----------
    train_data_file : str
        CSV with feature columns plus ``device`` and ``state`` columns.
    dname : str
        Device name; used in output file names and log lines.
    specified_models : list of str, optional
        Algorithms to consider instead of the module-level ``model_list``.

    Returns
    -------
    list or None
        One ``[output_file, dname, acc, homogeneity, completeness, v_measure,
        ari, noise, silhouette]`` row per trained model. None when every
        model already exists, the data set is empty, or fewer than two
        distinct labels are present.

    NOTE(review): the ``fit`` calls on the binarizer and on the estimators
    were elided in the original text and are reconstructed here -- confirm
    against the upstream source.
    """
    warnings.simplefilter("ignore", category=DeprecationWarning)
    warnings.simplefilter("ignore", category=FutureWarning)

    # Skip trained models, return if there is no model to train.
    list_all_models = model_list if specified_models is None else specified_models
    list_models_todo = []
    for model_alg in list_all_models:
        # Only queue models with no saved .model file and available data.
        model_dir = '%s/%s' % (root_model, model_alg)
        model_file = '%s/%s%s.model' % (model_dir, dname, model_alg)
        if not os.path.exists(model_file) and os.path.exists(train_data_file):
            list_models_todo.append(model_alg)

    if not list_models_todo:
        print('skip %s, all models trained for alg: %s' % (dname, str(list_all_models)))
        return
    print('Training %s using algorithm(s): %s' % (dname, str(list_models_todo)))

    train_data = pd.read_csv(train_data_file)
    num_data_points = len(train_data)
    if num_data_points < 1:
        print(' No enough data points for %s' % dname)
        return
    print('\t#Total data points: %d ' % num_data_points)

    X_feature = train_data.drop(['device', 'state'], axis=1).fillna(-1)
    ss = StandardScaler()
    pca = PCA(n_components=20)
    X_std = ss.fit_transform(X_feature)
    X_std = pca.fit_transform(X_std)
    # Keep only the first four principal components as features.
    X_std = pd.DataFrame(X_std)
    X_feature = X_std.iloc[:, :4]
    y_labels = np.array(train_data.state)  # example: on, off, change_color

    # Split data set into train & test, 30% test.
    X_train, X_test, y_train, y_test = train_test_split(
        X_feature, y_labels, test_size=.3, random_state=42)
    print('Train: %s' % len(X_train))
    print('Test: %s' % len(X_test))

    num_lables = len(set(y_labels))
    if num_lables < 2:
        print('\tNo enough labels for %s' % dname)
        return

    # One hot encoding y labels
    lb = LabelBinarizer()
    lb.fit(y_labels)  # collect all possible labels
    y_train_bin = lb.transform(y_train)
    y_test_bin = lb.transform(y_test)
    # NOTE(review): with exactly two labels LabelBinarizer emits one column,
    # so this argmax is always 0 -- verify the binary case.
    y_test_bin_1d = np.argmax(y_test_bin, axis=1)

    # Train through the list of interested ML algorithms
    ret_results = []
    for model_alg in list_models_todo:
        model_dir = '%s/%s' % (root_model, model_alg)
        # os.makedirs replaces shelling out to `mkdir -pv`.
        os.makedirs(model_dir, exist_ok=True)
        model_file = f'{model_dir}/{dname}{model_alg}.model'
        label_file = '%s/%s.label.txt' % (model_dir, dname)
        single_outfile = '%s/%s.result.csv' % (model_dir, dname)
        output_file = '%s/result_%s.txt' % (root_output, model_alg)
        _acc_score = -1
        _noise = -1
        _silhouette = -1

        # Steps: 1. Train (70%)  2. Test  3. Evaluate
        if model_alg == 'knn':
            print(' knn: n_neighbors=%s' % num_lables)
            trained_model = KNeighborsClassifier(n_neighbors=num_lables)
            trained_model.fit(X_train, y_train_bin)
            y_predicted = trained_model.predict(X_test)
            y_predicted_1d = np.argmax(y_predicted, axis=1)
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)
        elif model_alg == 'kmeans':
            print(' kmeans: n_clusters=%s' % num_lables)
            trained_model = MiniBatchKMeans(n_clusters=num_lables, random_state=0,
                                            batch_size=6)
            trained_model.fit(X_train)
            y_predicted_1d = trained_model.predict(X_test).round()
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)
        elif model_alg == 'spectral':
            print(' Spectral Clustering: n_clusters=%s' % num_lables)
            trained_model = SpectralClustering(n_clusters=num_lables,
                                               affinity='nearest_neighbors',
                                               random_state=0)
            y_predicted_1d = trained_model.fit_predict(X_test).round()
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)
        elif model_alg == 'dbscan':
            eps = 200
            # Report the eps actually used (the message used to say 300).
            print(' eps=%s' % eps)
            trained_model = DBSCAN(eps=eps, min_samples=5)
            y_predicted_1d = trained_model.fit_predict(X_test).round()
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)
            # NOTE(review): noise is divided by the total number of data
            # points although predictions cover only the test split.
            _noise = list(y_predicted_1d).count(-1) * 1. / num_data_points
        elif model_alg == 'rf':
            trained_model = RandomForestClassifier(n_estimators=1000, random_state=42)
            trained_model.fit(X_train, y_train_bin)
            y_predicted = trained_model.predict(X_test).round()
            if y_predicted.ndim == 1:
                y_predicted_1d = y_predicted
            else:
                y_predicted_1d = np.argmax(y_predicted, axis=1)
            _acc_score = accuracy_score(y_test_bin_1d, y_predicted_1d)
        else:
            # Without this guard an unknown name would reuse the previous
            # iteration's model and predictions, or raise NameError.
            print(' skip unsupported algorithm: %s' % model_alg)
            continue

        # Eval clustering based metrics
        _homogeneity = -1
        _complete = -1
        _vmeasure = -1
        _ari = -1
        if model_alg not in ['rf']:
            # Metrics for clustering algorithms
            _homogeneity = homogeneity_score(y_test_bin_1d, y_predicted_1d)
            _complete = completeness_score(y_test_bin_1d, y_predicted_1d)
            _vmeasure = v_measure_score(y_test_bin_1d, y_predicted_1d)
            _ari = adjusted_rand_score(y_test_bin_1d, y_predicted_1d)

        # Plot tSNE graph
        figfile = '%s/%s/%s-%s.png' % (root_model, model_alg, model_alg, dname)
        pp = 30  # perplexity
        if num_data_points > 200:
            pp = 50
        tsne_plot(X_feature, y_labels, figfile, pp)

        # Save the model together with its preprocessing steps
        model_dictionary = {
            'standard_scaler': ss,
            'pca': pca,
            'trained_model': trained_model,
        }
        with open(model_file, 'wb') as model_fh:
            pickle.dump(model_dictionary, model_fh)

        # Save the label order used by the one-hot encoding
        unique_labels = lb.classes_.tolist()
        with open(label_file, 'w') as label_fh:
            label_fh.write('%s\n' % '\n'.join(unique_labels))

        # Save eval results
        # TODO: due to the multi-thread, needs to change the settings
        with open(single_outfile, 'a+') as off:
            off.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                      (dname, _acc_score, _homogeneity, _complete, _vmeasure,
                       _ari, _noise, _silhouette))
            off.write('%s\n' % ','.join(map(str, y_test_bin_1d)))
            off.write('%s\n' % ','.join(map(str, y_predicted_1d)))

        ret_results.append([
            output_file, dname, _acc_score, _homogeneity, _complete,
            _vmeasure, _ari, _noise, _silhouette
        ])

        # Print to Terminal
        print(' model -> %s' % model_file)
        print(' labels -> %s' % label_file)
        print('\t' + '\n\t'.join(unique_labels) + '\n')
        if model_alg not in ['rf']:
            print(' _homogeneity: %.3f' % _homogeneity)
            print(' _completeness: %.3f' % _complete)
            print(' _vmeausre: %.3f' % _vmeasure)
            print(' _ari: %.3f' % _ari)
            print(' _silhouette: %.3f' % _silhouette)
        print(' _acc_score: %.3f' % _acc_score)
        print(' measures saved to: %s' % single_outfile)
    return ret_results
# Fragment (whitespace-collapsed): the closing `return model` of a function
# whose start is not in view, followed by the script entry point. The entry
# point sets the training parameters, loads the feature array and the
# 'Surface' labels, standardizes each slice x[:, i, :] with its own
# StandardScaler, appends a trailing axis to x and prepares stratified k-fold
# score tables.
# NOTE(review): `label_binarizer` is created but no fit call is visible here.
return model if __name__ == '__main__': # Parameters batch_size = 32 epochs = 50 k = 5 n_models = 3 x = np.load('../input/X_train_kaggle.npy') y_df = pd.read_csv('../input/y_train_final_kaggle.csv') y_labels = y_df['Surface'].values label_binarizer = LabelBinarizer() scalers = {} for i in range(x.shape[1]): scalers[i] = StandardScaler() x[:, i, :] = scalers[i].fit_transform(x[:, i, :]) x = np.expand_dims(x, axis=-1) input_shape = x.shape[1::] skf = StratifiedKFold(n_splits=k) cvscores = np.zeros((k, n_models)) train_cvscores = np.zeros((k, n_models)) k_index = 0
def main():
    """Run 5-fold cross-validation of the GMM-initialized MLP and print results.

    Reads ``_data.txt`` and ``_labels.txt`` from ``--data_dir``, scales the
    data to [-1, 1], fits a diagonal Gaussian mixture on mean-imputed data to
    initialize the network's mixture parameters, then trains one network per
    outer fold with validation-based checkpointing, learning-rate halving and
    early stopping.

    Prints one ``;``-separated line: mean and standard deviation of the
    train, validation and test accuracy, mean train time per epoch, mean test
    time, then the learning rate, batch size, epoch and distribution-count
    flags.

    NOTE(review): the ``lb.fit(...)`` and ``sess.run(...)`` calls were elided
    in the original text and are reconstructed here -- confirm against the
    upstream source.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='./', help='Directory for input data')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=50, help='Batch size')
    parser.add_argument('--training_epochs', type=int, default=500, help='Number of epochs')
    parser.add_argument('--n_distribution', type=int, default=3, help='Number of distributions')
    FLAGS, _ = parser.parse_known_args()

    path_dir = FLAGS.data_dir

    # Parameters
    learning_rate = FLAGS.learning_rate
    batch_size = FLAGS.batch_size
    training_epochs = FLAGS.training_epochs

    # Network Parameters
    n_distribution = FLAGS.n_distribution

    data = read_data(os.path.join(path_dir, '_data.txt'))
    data, _, _ = scaler_range(data, feature_range=(-1, 1))

    labels = read_data(os.path.join(path_dir, '_labels.txt'))
    lb = LabelBinarizer()
    lb.fit(labels)
    class_name = lb.classes_
    n_class = class_name.shape[0]
    if n_class == 2:
        # A two-class binarizer emits one column. Refit with a dummy third
        # class so that two columns remain once the dummy column is dropped
        # in the fold loop.
        lb.fit(np.append(class_name, np.max(class_name) + 1))

    n_features = data.shape[1]
    num_hidden_1 = int(0.5 * n_features)
    num_hidden_2 = num_hidden_1
    num_hidden_3 = num_hidden_1
    num_hidden_4 = num_hidden_1
    num_hidden_5 = n_class

    # Fit the mixture on mean-imputed data; only its parameters are kept.
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    complate_data = imp.fit_transform(data)
    gmm = GaussianMixture(n_components=n_distribution,
                          covariance_type='diag').fit(complate_data)
    del complate_data, imp

    gmm_weights = np.log(gmm.weights_.reshape((-1, 1)))
    gmm_means = gmm.means_
    gmm_covariances = gmm.covariances_
    del gmm

    acc = np.zeros((3, 5))  # rows: train / valid / test, columns: folds
    time_train = np.zeros(5)
    time_test = np.zeros(5)

    skf = StratifiedKFold(n_splits=5)
    id_acc = 0
    for trn_index, test_index in skf.split(data, labels):
        X_train = data[trn_index]
        X_lab = labels[trn_index]
        # The first inner split provides the validation set.
        train_index, valid_index = next(StratifiedKFold(n_splits=5).split(X_train, X_lab))

        train_x = X_train[train_index, :]
        valid_x = X_train[valid_index, :]
        test_x = data[test_index, :]

        train_y = lb.transform(X_lab[train_index])
        valid_y = lb.transform(X_lab[valid_index])
        test_y = lb.transform(labels[test_index])
        if n_class == 2:
            # Drop the dummy third-class column added above.
            train_y = train_y[:, :-1]
            valid_y = valid_y[:, :-1]
            test_y = test_y[:, :-1]

        with tf.Graph().as_default() as graph:
            initializer = tf.contrib.layers.variance_scaling_initializer()
            weights = {
                'h1': tf.Variable(initializer([n_features, num_hidden_1])),
                'h2': tf.Variable(initializer([num_hidden_1, num_hidden_2])),
                'h3': tf.Variable(initializer([num_hidden_2, num_hidden_3])),
                'h4': tf.Variable(initializer([num_hidden_3, num_hidden_4])),
                'h5': tf.Variable(initializer([num_hidden_4, num_hidden_5])),
            }
            biases = {
                'b1': tf.Variable(tf.random_normal([num_hidden_1])),
                'b2': tf.Variable(tf.random_normal([num_hidden_2])),
                'b3': tf.Variable(tf.random_normal([num_hidden_3])),
                'b4': tf.Variable(tf.random_normal([num_hidden_4])),
                'b5': tf.Variable(tf.random_normal([num_hidden_5])),
            }

            # Symbols
            z = tf.placeholder(shape=[None, n_features], dtype=tf.float32)
            y = tf.placeholder(shape=[None, n_class], dtype=tf.float32)

            p = tf.Variable(initial_value=gmm_weights, dtype=tf.float32)
            means = tf.Variable(initial_value=gmm_means, dtype=tf.float32)
            covs = tf.Variable(initial_value=gmm_covariances, dtype=tf.float32)
            gamma = tf.Variable(initial_value=tf.random_normal(shape=(1,), mean=2, stddev=1.),
                                dtype=tf.float32)

            # Construct model
            predict = multilayer_perceptron(z, means, covs, p, gamma, n_distribution,
                                            weights, biases)
            y_true = prep_labels(z, y)

            # Sigmoid cross-entropy loss averaged over the batch
            cost = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=predict, labels=y_true))

            l_r = learning_rate
            # Gradient descent
            optimizer = tf.train.GradientDescentOptimizer(l_r).minimize(cost)

            # Initialize the variables (i.e. assign their default value)
            init = tf.global_variables_initializer()

            nr_epoch = 10  # warm-up epochs before rate decay / early stopping

            val_weights = None
            val_biases = None
            val_p = None
            val_means = None
            val_covs = None
            val_gamma = None

            with tf.Session(graph=graph) as sess:
                # Variables must be initialized before the first training step.
                sess.run(init)

                min_cost = np.inf
                n_cost_up = 0

                prev_train_cost = np.inf

                time_train[id_acc] = time()

                epoch = 0
                # Training cycle
                for epoch in range(training_epochs):
                    curr_train_cost = []
                    for batch_idx in range(0, train_y.shape[0], batch_size):
                        x_batch = train_x[batch_idx:batch_idx + batch_size, :]
                        y_batch = train_y[batch_idx:batch_idx + batch_size, :]

                        temp_train_cost, _ = sess.run([cost, optimizer],
                                                      feed_dict={z: x_batch, y: y_batch})
                        curr_train_cost.append(temp_train_cost)

                    curr_train_cost = np.asarray(curr_train_cost).mean()

                    # Halve the rate when the training cost stalls after warm-up.
                    if epoch > nr_epoch and (prev_train_cost - curr_train_cost) < 1e-4 < l_r:
                        l_r = l_r / 2.
                        optimizer = tf.train.GradientDescentOptimizer(l_r).minimize(cost)

                    prev_train_cost = curr_train_cost

                    curr_cost = []
                    for batch_idx in range(0, valid_y.shape[0], batch_size):
                        x_batch = valid_x[batch_idx:batch_idx + batch_size, :]
                        y_batch = valid_y[batch_idx:batch_idx + batch_size, :]
                        curr_cost.append(sess.run(cost, feed_dict={z: x_batch, y: y_batch}))

                    curr_cost = np.asarray(curr_cost).mean()

                    if min_cost > curr_cost:
                        # New best validation cost: snapshot every parameter.
                        min_cost = curr_cost
                        n_cost_up = 0

                        val_weights = {
                            'h1': weights['h1'].eval(),
                            'h2': weights['h2'].eval(),
                            'h3': weights['h3'].eval(),
                            'h4': weights['h4'].eval(),
                            'h5': weights['h5'].eval(),
                        }
                        val_biases = {
                            'b1': biases['b1'].eval(),
                            'b2': biases['b2'].eval(),
                            'b3': biases['b3'].eval(),
                            'b4': biases['b4'].eval(),
                            'b5': biases['b5'].eval(),
                        }
                        val_p = p.eval()
                        val_means = means.eval()
                        val_covs = covs.eval()
                        val_gamma = gamma.eval()
                    elif epoch > nr_epoch:
                        n_cost_up = n_cost_up + 1
                        if n_cost_up == 5 and 1e-4 < l_r:
                            l_r = l_r / 2.
                            optimizer = tf.train.GradientDescentOptimizer(l_r).minimize(cost)
                        elif n_cost_up == 10:
                            # Ten epochs without improvement: stop early.
                            break

                time_train[id_acc] = (time() - time_train[id_acc]) / (epoch + 1)

                # Restore the best-on-validation snapshot before scoring.
                means.load(val_means)
                covs.load(val_covs)
                p.load(val_p)
                gamma.load(val_gamma)
                for key in weights.keys():
                    weights[key].load(val_weights[key])
                for key in biases.keys():
                    biases[key].load(val_biases[key])

                correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(predict, 1))
                accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

                train_accuracy = []
                for batch_idx in range(0, train_y.shape[0], batch_size):
                    x_batch = train_x[batch_idx:batch_idx + batch_size, :]
                    y_batch = train_y[batch_idx:batch_idx + batch_size, :]
                    train_accuracy.append(accuracy.eval({z: x_batch, y: y_batch}))
                train_accuracy = np.mean(train_accuracy)

                valid_accuracy = []
                for batch_idx in range(0, valid_y.shape[0], batch_size):
                    x_batch = valid_x[batch_idx:batch_idx + batch_size, :]
                    y_batch = valid_y[batch_idx:batch_idx + batch_size, :]
                    valid_accuracy.append(accuracy.eval({z: x_batch, y: y_batch}))
                valid_accuracy = np.mean(valid_accuracy)

                time_test[id_acc] = time()
                test_accuracy = []
                for batch_idx in range(0, test_y.shape[0], batch_size):
                    x_batch = test_x[batch_idx:batch_idx + batch_size, :]
                    y_batch = test_y[batch_idx:batch_idx + batch_size, :]
                    test_accuracy.append(accuracy.eval({z: x_batch, y: y_batch}))
                test_accuracy = np.mean(test_accuracy)
                time_test[id_acc] = time() - time_test[id_acc]

                acc[0, id_acc] = train_accuracy
                acc[1, id_acc] = valid_accuracy
                acc[2, id_acc] = test_accuracy
                id_acc = id_acc + 1

    mean_acc = np.average(acc, axis=1)
    std_acc = np.std(acc, axis=1)
    sys.stdout.flush()

    print(
        "{:.4f};{:.4f};{:.4f};{:.4f};{:.4f};{:.4f};{:.4f};{:.4f};{};{};{};{}".format(
            mean_acc[0], std_acc[0], mean_acc[1], std_acc[1], mean_acc[2], std_acc[2],
            np.average(time_train), np.average(time_test),
            FLAGS.learning_rate, FLAGS.batch_size, FLAGS.training_epochs,
            FLAGS.n_distribution))
class ImageSequenceGenerator:
    """Endless batch generator of fixed-length image sequences and labels.

    NOTE(review): the binarizer attribute name and its ``fit`` / ``transform``
    calls were elided in the original text. ``self.lb`` is a reconstruction
    -- confirm the attribute name against the upstream source and its callers.
    """

    def __init__(self):
        self.lb = LabelBinarizer()
        labels = {'BOO', 'BLO', 'BOR', 'BLR', 'OLR', 'OLO', 'OOR', 'OOO'}
        # Fit on the full label set so every batch has the same columns.
        self.lb.fit(sorted(labels))

    # label_type: "binary" or "categorical"
    def png_image_generator(self, path, bs, folder_list, difficulty="All",
                            sequence_limit=16, resize_dimension=128,
                            label_type="categorical", aug=None):
        """Yield ``(X, Y)`` batches of ``bs`` randomly drawn image sequences.

        Each sample picks a random folder from ``folder_list``, takes the
        label from the second-to-last ``_``-separated part of its name, and
        loads a random window of up to ``sequence_limit`` images from the
        folder's ``difference`` subdirectory, resized to
        ``resize_dimension`` x ``resize_dimension``.

        ``label_type`` selects the encoding: ``"categorical"`` uses the
        fitted binarizer, ``"binary"`` uses ``labels_to_binary``. Any other
        value prints a warning and yields the raw labels. ``aug`` is unused.
        """
        # The listing file was opened but never read or closed. Opening it is
        # kept only to fail when it is missing, without leaking the handle.
        with open("{0}/{1}.txt".format(path, difficulty)):
            pass

        while True:
            X_data = []
            Y_data = []
            for _ in range(len(folder_list)):
                folder = random.choice(folder_list)
                label = folder.split("_")[-2]
                folder += '/difference'
                # NOTE(review): os.listdir order is arbitrary, so the window
                # below is not guaranteed to be in frame order -- verify.
                images = [
                    folder + "/" + name for name in os.listdir(folder)
                    if os.path.isfile(os.path.join(folder, name))
                ]

                # Random window start; slicing clamps at the end of the list,
                # so no IndexError handling is needed.
                rnd = random.randint(0, max((len(images) - sequence_limit), 0))
                each = images[rnd:rnd + sequence_limit]

                # Frames of one sequence, read into numpy arrays.
                img_seq_list = []
                for img in each:
                    img_load = load_img(img, target_size=(resize_dimension, resize_dimension))
                    img_seq_list.append(img_to_array(img_load))

                X_data.append(np.asarray(img_seq_list))
                Y_data.append(label)

                if len(X_data) == bs:
                    if label_type == "categorical":
                        Y_data = self.lb.transform(Y_data)
                    elif label_type == "binary":
                        Y_data = labels_to_binary(Y_data)
                    else:
                        print('Invalid label type!')
                    yield np.asarray(X_data), np.asarray(Y_data)
                    X_data = []
                    Y_data = []
# Compare the class sets themselves: equal counts alone would let disjoint
# classes through, and unseen test labels would then encode as all-zero rows.
assert set(train_labels) == set(test_labels), (
    'Something went wrong. Some classes are only in train or test data.'
)  # yapf: disable

# convert the data and labels to NumPy arrays while scaling the pixel
# intensities to the range [0, 255]
# train_data = np.array(train_data) / 255.0
# test_data = np.array(test_data) / 255.0
train_labels_text = np.array(train_labels)
test_labels_text = np.array(test_labels)

num_classes = len(set(train_labels))

# perform one-hot encoding on the labels
lb = LabelBinarizer()
# NOTE(review): the fit call is missing from the original text; transform
# cannot run on an unfitted binarizer.
lb.fit(train_labels_text)
train_labels = lb.transform(train_labels_text)
test_labels = lb.transform(test_labels_text)

if num_classes == 2:
    # Two classes binarize to a single 0/1 column; expand it to two columns.
    train_labels = to_categorical(train_labels, num_classes=num_classes)
    test_labels = to_categorical(test_labels, num_classes=num_classes)

trainX = np.stack(train_data)
trainY = np.stack(train_labels)
testX = np.stack(test_data)
testY = np.stack(test_labels)

print('Class mappings are:', lb.classes_)
print(trainX.shape, trainY.shape, testX.shape, testY.shape)
class SimClassifier(BaseSim, ClassifierMixin):
    """
    Sim classification.

    Parameters
    ----------
    reg_lambda : float, optional. default=0
        Sparsity strength

    reg_gamma : float or list of float, optional. default=1e-5
        Roughness penalty strength of the spline algorithm

    degree : int, optional. default=3
        The order of the spline

    knot_num : int, optional. default=5
        Number of knots

    random_state : int, optional. default=0
        Random seed

    NOTE(review): the binarizer ``fit`` call, the tail of the
    ``SMSplineClassifier(...)`` constructor and its ``fit`` call were elided
    in the original text and are reconstructed here -- confirm against the
    upstream source.
    """

    def __init__(self, reg_lambda=0, reg_gamma=1e-5, knot_num=5, degree=3, random_state=0):
        super(SimClassifier, self).__init__(reg_lambda=reg_lambda,
                                            reg_gamma=reg_gamma,
                                            knot_num=knot_num,
                                            degree=degree,
                                            random_state=random_state)

    def _validate_input(self, x, y):
        """method to validate data

        Fits the internal label binarizer on ``y`` and stores the classes it
        finds in ``self.classes_``.

        Parameters
        ---------
        x : array-like of shape (n_samples, n_features)
            containing the input dataset
        y : array-like of shape (n_samples,)
            containing target values

        Returns
        -------
        (x, y) with ``y`` binarized, cast to float and flattened
        """
        x, y = check_X_y(x, y, accept_sparse=["csr", "csc", "coo"],
                         multi_output=True)
        if y.ndim == 2 and y.shape[1] == 1:
            y = column_or_1d(y, warn=False)

        self._label_binarizer = LabelBinarizer()
        # classes_ exists only after fitting.
        self._label_binarizer.fit(y)
        self.classes_ = self._label_binarizer.classes_

        y = self._label_binarizer.transform(y) * 1.0
        return x, y.ravel()

    def _estimate_shape(self, x, y, xmin, xmax):
        """estimate the ridge function

        Stores the fitted spline classifier in ``self.shape_fit_``.

        Parameters
        ---------
        x : array-like of shape (n_samples, n_features)
            containing the input dataset
        y : array-like of shape (n_samples,)
            containing the output dataset
        xmin : float
            the minimum value of beta ^ x
        xmax : float
            the maximum value of beta ^ x
        """
        # presumably the spline also receives `degree` -- verify upstream
        self.shape_fit_ = SMSplineClassifier(knot_num=self.knot_num,
                                             reg_gamma=self.reg_gamma,
                                             xmin=xmin, xmax=xmax,
                                             degree=self.degree)
        self.shape_fit_.fit(x, y)

    def predict_proba(self, x):
        """output probability prediction for given samples

        Parameters
        ---------
        x : array-like of shape (n_samples, n_features)
            containing the input dataset

        Returns
        -------
        np.array of shape (n_samples, 2)
            containing probability prediction
        """
        pred = self.decision_function(x)
        # softmax over (-pred / 2, pred / 2) equals sigmoid(pred) for column 1
        pred_proba = softmax(np.vstack([-pred, pred]).T / 2, copy=False)
        return pred_proba

    def predict(self, x):
        """output binary prediction for given samples

        Parameters
        ---------
        x : array-like of shape (n_samples, n_features)
            containing the input dataset

        Returns
        -------
        np.array of shape (n_samples,)
            containing binary prediction
        """
        pred_proba = self.predict_proba(x)[:, 1]
        # The binarizer maps positive-class probabilities back to labels.
        return self._label_binarizer.inverse_transform(pred_proba)