def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics. Unlike the notebook it was copied from,
    this variant keeps the "O" labels in the report (the line that discards
    them is commented out below).

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Note: This function was copied from
    http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

    Args:
        y_true: True labels, a list of tag sequences (list of lists of strings)
        y_pred: Predicted labels, a list of tag sequences (list of lists of strings)

    Returns:
        classification report as string
    """
    lbin = LabelBinarizer()
    y_true_combined = lbin.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lbin.transform(list(chain.from_iterable(y_pred)))

    # tagset = set(lbin.classes_) - {NO_NE_LABEL}
    tagset = set(lbin.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lbin.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
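# A minimal, self-contained sketch of the pattern the function above uses:
# flatten the per-sentence BIO tag sequences, binarize them with a shared
# LabelBinarizer, and feed the indicator matrices to classification_report.
# The toy tags are hypothetical and only illustrate the expected input format.
from itertools import chain
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
y_pred = [['B-PER', 'O', 'O'], ['B-LOC', 'O']]

lbin = LabelBinarizer()
y_true_bin = lbin.fit_transform(list(chain.from_iterable(y_true)))
y_pred_bin = lbin.transform(list(chain.from_iterable(y_pred)))
print(classification_report(y_true_bin, y_pred_bin, target_names=lbin.classes_))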
class MLPClassifier(BaseMLP, ClassifierMixin):
    """Multilayer Perceptron Classifier.

    Uses a neural network with one hidden layer.

    Parameters
    ----------
    n_hidden : int, number of units in the hidden layer.
    lr : float, learning rate.
    l2decay : float, L2 weight-decay coefficient.
    loss : str, loss function, e.g. "cross_entropy".
    output_layer : str, output activation, e.g. "softmax".
    batch_size : int, size of the mini-batches.
    verbose : int, verbosity level.
    """

    def __init__(self, n_hidden=200, lr=0.1, l2decay=0, loss="cross_entropy",
                 output_layer="softmax", batch_size=100, verbose=0):
        super(MLPClassifier, self).__init__(n_hidden, lr, l2decay, loss,
                                            output_layer, batch_size, verbose)

    def fit(self, X, y, max_epochs=10, shuffle_data=False):
        # one-hot encode the targets; keep the binarizer for the inverse mapping
        self.lb = LabelBinarizer()
        one_hot_labels = self.lb.fit_transform(y)
        super(MLPClassifier, self).fit(X, one_hot_labels, max_epochs,
                                       shuffle_data)
        return self

    def predict(self, X):
        prediction = super(MLPClassifier, self).predict(X)
        return self.lb.inverse_transform(prediction)
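# The label round trip the classifier above relies on, shown standalone:
# fit_transform turns string labels into a one-hot matrix for training, and
# inverse_transform maps the network's (e.g. softmax) output rows back to the
# original labels via the column with the highest score. The fake scores below
# are hypothetical stand-ins for network outputs.
import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
one_hot = lb.fit_transform(["setosa", "virginica", "setosa", "versicolor"])
print(one_hot.shape)                                     # (4, 3)
scores = np.array([[0.9, 0.05, 0.05], [0.1, 0.2, 0.7]])  # fake network outputs
print(lb.inverse_transform(scores))                      # ['setosa' 'virginica']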
def test_multinomial_loss(): # test if the multinomial loss and gradient computations are consistent X, y = iris.data, iris.target.astype(np.float64) n_samples, n_features = X.shape n_classes = len(np.unique(y)) rng = check_random_state(42) weights = rng.randn(n_features, n_classes) intercept = rng.randn(n_classes) sample_weights = rng.randn(n_samples) np.abs(sample_weights, sample_weights) # compute loss and gradient like in multinomial SAG dataset, _ = make_dataset(X, y, sample_weights, random_state=42) loss_1, grad_1 = _multinomial_grad_loss_all_samples(dataset, weights, intercept, n_samples, n_features, n_classes) # compute loss and gradient like in multinomial LogisticRegression lbin = LabelBinarizer() Y_bin = lbin.fit_transform(y) weights_intercept = np.vstack((weights, intercept)).T.ravel() loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin, 0.0, sample_weights) grad_2 = grad_2.reshape(n_classes, -1) grad_2 = grad_2[:, :-1].T # comparison assert_array_almost_equal(grad_1, grad_2) assert_almost_equal(loss_1, loss_2)
class BinaryRelevanceClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, Y):
        # binarize labels
        self.bl = LabelBinarizer()
        Y = self.bl.fit_transform(Y)
        self.classes_ = self.bl.classes_

        # create an estimator for each label
        self.estimators_ = []
        for i in range(self.bl.classes_.shape[0]):
            estimator = clone(self.estimator)
            estimator.fit(X, Y[:, i])
            self.estimators_.append(estimator)
        return self

    def predict(self, X):
        self._check_is_fitted()
        X = np.atleast_2d(X)
        Y = np.empty((X.shape[0], self.classes_.shape[0]))
        for i, estimator in enumerate(self.estimators_):
            Y[:, i] = estimator.predict(X).T
        return self.bl.inverse_transform(Y)

    def _check_is_fitted(self):
        if not hasattr(self, "estimators_"):
            raise ValueError("The object hasn't been fitted yet!")
def test_multinomial_loss_ground_truth(): # n_samples, n_features, n_classes = 4, 2, 3 n_classes = 3 X = np.array([[1.1, 2.2], [2.2, -4.4], [3.3, -2.2], [1.1, 1.1]]) y = np.array([0, 1, 2, 0]) lbin = LabelBinarizer() Y_bin = lbin.fit_transform(y) weights = np.array([[0.1, 0.2, 0.3], [1.1, 1.2, -1.3]]) intercept = np.array([1., 0, -.2]) sample_weights = np.array([0.8, 1, 1, 0.8]) prediction = np.dot(X, weights) + intercept logsumexp_prediction = logsumexp(prediction, axis=1) p = prediction - logsumexp_prediction[:, np.newaxis] loss_1 = -(sample_weights[:, np.newaxis] * p * Y_bin).sum() diff = sample_weights[:, np.newaxis] * (np.exp(p) - Y_bin) grad_1 = np.dot(X.T, diff) weights_intercept = np.vstack((weights, intercept)).T.ravel() loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin, 0.0, sample_weights) grad_2 = grad_2.reshape(n_classes, -1) grad_2 = grad_2[:, :-1].T assert_almost_equal(loss_1, loss_2) assert_array_almost_equal(grad_1, grad_2) # ground truth loss_gt = 11.680360354325961 grad_gt = np.array([[-0.557487, -1.619151, +2.176638], [-0.903942, +5.258745, -4.354803]]) assert_almost_equal(loss_1, loss_gt) assert_array_almost_equal(grad_1, grad_gt)
def chi2_contingency_matrix(X_train, y_train): X = X_train.copy() X.data = np.ones_like(X.data) X = check_array(X, accept_sparse='csr') if np.any((X.data if issparse(X) else X) < 0): raise ValueError("Input X must be non-negative.") Y = LabelBinarizer().fit_transform(y_train) if Y.shape[1] == 1: Y = np.append(1 - Y, Y, axis=1) observed = safe_sparse_dot(Y.T, X) # n_classes * n_features # feature_count = check_array(X.sum(axis=0)) # class_prob = check_array(Y.mean(axis=0)) feature_count = X.sum(axis=0).reshape(1, -1) class_prob = Y.mean(axis=0).reshape(1, -1) expected = np.dot(class_prob.T, feature_count) observed = np.asarray(observed, dtype=np.float64) k = len(observed) # Reuse observed for chi-squared statistics contingency_matrix = observed contingency_matrix -= expected contingency_matrix **= 2 expected[expected == 0.0] = 1.0 contingency_matrix /= expected # weights = contingency_matrix.max(axis=0) return contingency_matrix
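# A hedged sanity check for the helper above, assuming its own imports
# (numpy, issparse, check_array, safe_sparse_dot, LabelBinarizer) are in
# scope: summing the per-class rows of the returned contingency matrix
# should reproduce, up to the zero-expected guard, the per-feature chi2
# statistic that sklearn.feature_selection.chi2 computes on the same
# binarized counts. The toy counts below are hypothetical.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_selection import chi2

X_train = csr_matrix(np.array([[1, 0, 2], [0, 1, 0], [3, 0, 1], [0, 2, 0]]))
y_train = np.array([0, 1, 0, 1])

cm = chi2_contingency_matrix(X_train, y_train)

X_bin = X_train.copy()
X_bin.data = np.ones_like(X_bin.data)   # same binarization the helper applies
print(cm.sum(axis=0))                   # per-feature statistic from the helper
print(chi2(X_bin, y_train)[0])          # should closely match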
class GBClassifier(_BaseGB, ClassifierMixin): def __init__(self, estimator, n_estimators=100, step_size="line_search", learning_rate=0.1, loss="squared_hinge", subsample=1.0, callback=None, random_state=None): self.estimator = estimator self.n_estimators = n_estimators self.step_size = step_size self.learning_rate = learning_rate self.loss = loss self.subsample = subsample self.callback = callback self.random_state = random_state def _get_loss(self): losses = dict(squared_hinge=_SquaredHingeLoss(), log=_LogLoss()) return losses[self.loss] def fit(self, X, y): self._lb = LabelBinarizer(neg_label=-1) Y = self._lb.fit_transform(y) return super(GBClassifier, self).fit(X, Y) def predict(self, X): pred = self.decision_function(X) return self._lb.inverse_transform(pred)
def bio_classification_report(y_true, y_pred): """ Classification report for a list of BIO-encoded sequences. It computes token-level metrics and discards "O" labels. Note that it requires scikit-learn 0.15+ (or a version from github master) to calculate averages properly! """ lb = LabelBinarizer() y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true))) y_pred_combined = lb.transform(list(chain.from_iterable(y_pred))) tagset = set(lb.classes_) - {'O'} tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1]) class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)} labs = [class_indices[cls] for cls in tagset] return((precision_recall_fscore_support(y_true_combined, y_pred_combined, labels=labs, average=None, sample_weight=None)), (classification_report( y_true_combined, y_pred_combined, labels=[class_indices[cls] for cls in tagset], target_names=tagset, )), labs)
def display_image_predictions(features, labels, predictions): n_classes = 10 label_names = _load_label_names() label_binarizer = LabelBinarizer() label_binarizer.fit(range(n_classes)) label_ids = label_binarizer.inverse_transform(np.array(labels)) fig, axies = plt.subplots(nrows=4, ncols=2) fig.tight_layout() fig.suptitle('Softmax Predictions', fontsize=20, y=1.1) n_predictions = 3 margin = 0.05 ind = np.arange(n_predictions) width = (1. - 2. * margin) / n_predictions for image_i, (feature, label_id, pred_indicies, pred_values) in enumerate(zip(features, label_ids, predictions.indices, predictions.values)): pred_names = [label_names[pred_i] for pred_i in pred_indicies] correct_name = label_names[label_id] axies[image_i][0].imshow(feature*255) axies[image_i][0].set_title(correct_name) axies[image_i][0].set_axis_off() axies[image_i][1].barh(ind + margin, pred_values[::-1], width) axies[image_i][1].set_yticks(ind + margin) axies[image_i][1].set_yticklabels(pred_names[::-1]) axies[image_i][1].set_xticks([0, 0.5, 1.0])
def train_logreg(X, y, test_X, test_y, load_vec=True): """ Trains logistic regression on the feature set. """ full_y = y + test_y lb = LabelBinarizer() lb.fit(full_y) # Convert into 1-D array print len(X), len(test_X) model = LogisticRegression() big_X = X + test_X features = featurize(big_X) X, test_X = features[:4500], features[4500:] print X.shape, X model.fit(X, y) y_pred = model.predict(X) print set(y_pred) print metrics.classification_report(y, y_pred, digits = 3) y_pred = model.predict(test_X) print set(y_pred) print metrics.classification_report(test_y, y_pred, digits = 3)
def Encoding(data, general_matrix=None): encoder = LabelBinarizer() count = 0 # encoding for i in range(data.shape[1]): if type(data[0, i]) == str: count += 1 col = data[:, i] unique = np.unique(col if general_matrix is None else general_matrix[:, i]) try: encoder.fit(unique) except: pass new_col = encoder.transform(col) # split at i and i + 1 before, removed, after = np.hsplit(data, [i, i + 1]) # concatenate data = np.concatenate((before, new_col, after), axis=1) before, removed, after = np.hsplit(general_matrix, [i, i + 1]) general_matrix = np.concatenate((before, encoder.transform(general_matrix[:, i]), after), axis=1) print "count : %d" % count # return data return data
def binarize_seqfeature(X):
    """
    Binarizes the sequence features into 1s and 0s.

    Parameters:
    ===========
    - X: (pandas DataFrame) the sequence feature matrix without drug
         resistance values.

    Returns:
    ========
    - binarized: (pandas DataFrame) a binarized sequence feature matrix with
                 columns corresponding to particular amino acids at each
                 position.
    - binarizers: (dictionary) a dictionary of binarizer objects for each
                  position.
    """
    binarized = pd.DataFrame()
    binarizers = dict()
    for col in X.columns:
        lb = LabelBinarizer()
        binarized_cols = lb.fit_transform(X[col])
        if len(lb.classes_) == 2:
            # two classes collapse to a single 0/1 column
            binarized[col] = pd.Series(binarized_cols[:, 0])
        else:
            # otherwise, one indicator column per class
            for i, c in enumerate(lb.classes_):
                binarized[col + "_" + c] = binarized_cols[:, i]
        binarizers[col] = lb
    return binarized, binarizers
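# A small usage sketch for the helper above, assuming the imports it expects
# (pandas as pd, LabelBinarizer) are available. The two-column amino-acid
# frame is hypothetical: "p1" has two classes and collapses to a single 0/1
# column, while "p2" expands to one indicator column per class.
import pandas as pd

X = pd.DataFrame({"p1": ["A", "C", "A", "C"],
                  "p2": ["G", "T", "A", "G"]})
binarized, binarizers = binarize_seqfeature(X)
print(binarized.columns.tolist())   # ['p1', 'p2_A', 'p2_G', 'p2_T']
print(binarizers["p1"].classes_)    # ['A' 'C']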
def bio_classification_report(y_true, y_pred): lb = LabelBinarizer() y_true_combined = 1 - lb.fit_transform(list(chain.from_iterable(y_true))) y_pred_combined = list(chain.from_iterable(y_pred)) tagset = set(lb.classes_) - {'O'} tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1]) class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)} print 'True sum %d Pred sum %d Len %d' %(sum(y_true_combined), sum(y_pred_combined), len(y_pred_combined)) print "AUC\tP-R: %.4f\tROC: %.4f" % (average_precision_score(y_true_combined, y_pred_combined, average=None), roc_auc_score(y_true_combined, y_pred_combined, average=None)) #plt.figure() #fpr, tpr, thr = roc_curve(y_true_combined, y_pred_combined) #area = auc(fpr, tpr) #plt.plot(fpr, tpr, label='{area:.3f}'.format( area=area)) #plt.legend(loc=4) #plt.savefig('sub3.jpg') return classification_report( 1 - y_true_combined, [0 if v > 0.1 else 1 for v in y_pred_combined], labels=[class_indices[cls] for cls in tagset], target_names=tagset, )
def just_categorical(dropped): # create initial matrix print('starting with m0') lb = LabelBinarizer(sparse_output=True) m = lb.fit_transform(dropped.restaurant_id) print(m.shape) # build matrix # making nan its own category for categorical print("adding categorical to matrix") m = add_categorical_to_matrix(m, dropped, ['review_stars', 'user_name', 'restaurant_stars', 'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol', 'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage', 'restaurant_attributes_noise_level', 'restaurant_attributes_smoking', 'restaurant_attributes_wifi', 'restaurant_city', 'restaurant_hours_friday_close', 'restaurant_hours_friday_open', 'restaurant_hours_monday_close', 'restaurant_hours_monday_open', 'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open', 'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open', 'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open', 'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open', 'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open', 'restaurant_ambience', 'restaurant_music', 'restaurant_parking', 'restaurant_street', 'restaurant_zipcode', 'inspection_year', 'inspection_month', 'inspection_day', 'inspection_dayofweek', 'inspection_quarter',]) print(m.shape) print("adding bool to matrix") m = add_categorical_to_matrix(m, dropped, ['restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob', 'restaurant_attributes_caters', 'restaurant_attributes_coat_check', 'restaurant_attributes_corkage', 'restaurant_attributes_delivery', 'restaurant_attributes_dietary_restrictions_dairy_free', 'restaurant_attributes_dietary_restrictions_gluten_free', 'restaurant_attributes_dietary_restrictions_halal', 'restaurant_attributes_dietary_restrictions_kosher', 'restaurant_attributes_dietary_restrictions_soy_free', 'restaurant_attributes_dietary_restrictions_vegan', 'restaurant_attributes_dietary_restrictions_vegetarian', 'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr', 'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups', 'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch', 'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner', 'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch', 'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour', 'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours', 'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating', 'restaurant_attributes_payment_types_amex', 'restaurant_attributes_payment_types_cash_only', 'restaurant_attributes_payment_types_discover', 'restaurant_attributes_payment_types_mastercard', 'restaurant_attributes_payment_types_visa', 'restaurant_attributes_take_out', 'restaurant_attributes_takes_reservations', 'restaurant_attributes_waiter_service', 'restaurant_attributes_wheelchair_accessible', ]) print(m.shape) print("adding restaurant categories to matrix") cats = ['restaurant_category_1', 'restaurant_category_2', 'restaurant_category_3', 'restaurant_category_4', 'restaurant_category_5', 'restaurant_category_6', 'restaurant_category_7'] m = special_categories_to_matrix(m, dropped, cats) print(m.shape) print("adding restaurant neighborhoods to matrix") cats = ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3'] m = special_categories_to_matrix(m, dropped, cats) 
print(m.shape) print("matrix shape of {}".format(m.shape)) joblib.dump(m, 'pickle_jar/categorical_matrix')
def transform(self, data_dict):
    listOfUnits = ["kilogram", "kg", "gram", "[GMgmkK]?Hz", "liter", "ml",
                   "cup", "cm", "foot", "inch", "meter", "mg", "gallon",
                   "milliliter", "[MGTmgtKk]B"]
    # raw strings so the regex escapes (\d, \b) reach the re module unchanged
    regex = r"[\d]+\.[\d]+(" + r"[\b/,-]|".join(listOfUnits) + r")"
    data = data_dict[self.key].str.extract(regex, flags=re.IGNORECASE,
                                           expand=False).str.lower()
    lb = LabelBinarizer()
    return lb.fit_transform(data.fillna(""))
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None): lb = LabelBinarizer() T = lb.fit_transform(y_true) if T.shape[1] == 1: T = np.append(1 - T, T, axis=1) # Clipping Y = np.clip(y_pred, eps, 1 - eps) # This happens in cases when elements in y_pred have type "str". if not isinstance(Y, np.ndarray): raise ValueError("y_pred should be an array of floats.") # If y_pred is of single dimension, assume y_true to be binary # and then check. if Y.ndim == 1: Y = Y[:, np.newaxis] if Y.shape[1] == 1: Y = np.append(1 - Y, Y, axis=1) # Check if dimensions are consistent. val.check_consistent_length(T, Y) T = val.check_array(T) Y = val.check_array(Y) print(T) print(Y) if T.shape[1] != Y.shape[1]: raise ValueError("y_true and y_pred have different number of classes " "%d, %d" % (T.shape[1], Y.shape[1])) # Renormalize Y /= Y.sum(axis=1)[:, np.newaxis] loss = -(T * np.log(Y)).sum(axis=1) return _weighted_sum(loss, sample_weight, normalize)
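# A quick illustration, under the same assumptions as the function above, of
# why the binary case needs the `np.append(1 - T, T, axis=1)` expansion:
# LabelBinarizer returns a single 0/1 column for two classes, whereas the
# loss needs one probability column per class. The toy labels are hypothetical.
import numpy as np
from sklearn.preprocessing import LabelBinarizer

T = LabelBinarizer().fit_transform(["spam", "ham", "spam"])
print(T.ravel())                  # [1 0 1] -- one column only
T = np.append(1 - T, T, axis=1)
print(T)                          # [[0 1], [1 0], [0 1]] -- one column per class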
def full_matrix(dropped): # create initial matrix print('starting with m0') lb = LabelBinarizer(sparse_output=True) # m = lb.fit_transform(dropped.restaurant_id) m = lb.fit_transform(dropped.user_name) print(m.shape) # build matrix # making nan its own category for categorical print("adding categorical to matrix") m = add_categorical_to_matrix(m, dropped, ['review_stars', 'user_name', 'restaurant_stars', 'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol', 'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage', 'restaurant_attributes_noise_level', 'restaurant_attributes_smoking', 'restaurant_attributes_wifi', 'restaurant_city', 'restaurant_hours_friday_close', 'restaurant_hours_friday_open', 'restaurant_hours_monday_close', 'restaurant_hours_monday_open', 'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open', 'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open', 'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open', 'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open', 'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open', 'restaurant_ambience', 'restaurant_music', 'restaurant_parking', 'restaurant_street', 'restaurant_zipcode', 'inspection_year', 'inspection_month', 'inspection_day', 'inspection_dayofweek', 'inspection_quarter',]) print(m.shape) print("adding bool to matrix") m = add_categorical_to_matrix(m, dropped, ['restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob', 'restaurant_attributes_caters', 'restaurant_attributes_coat_check', 'restaurant_attributes_corkage', 'restaurant_attributes_delivery', 'restaurant_attributes_dietary_restrictions_dairy_free', 'restaurant_attributes_dietary_restrictions_gluten_free', 'restaurant_attributes_dietary_restrictions_halal', 'restaurant_attributes_dietary_restrictions_kosher', 'restaurant_attributes_dietary_restrictions_soy_free', 'restaurant_attributes_dietary_restrictions_vegan', 'restaurant_attributes_dietary_restrictions_vegetarian', 'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr', 'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups', 'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch', 'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner', 'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch', 'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour', 'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours', 'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating', 'restaurant_attributes_payment_types_amex', 'restaurant_attributes_payment_types_cash_only', 'restaurant_attributes_payment_types_discover', 'restaurant_attributes_payment_types_mastercard', 'restaurant_attributes_payment_types_visa', 'restaurant_attributes_take_out', 'restaurant_attributes_takes_reservations', 'restaurant_attributes_waiter_service', 'restaurant_attributes_wheelchair_accessible', ]) print(m.shape) m = add_numerical_to_matrix(m, dropped, ['review_votes_cool', 'review_votes_funny', 'review_votes_useful', 'user_average_stars', 'user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot', 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain', 'user_compliments_profile', 'user_compliments_writer', 'user_fans', 'user_review_count', 
'user_votes_cool', 'user_votes_funny', 'user_votes_useful', 'restaurant_attributes_price_range', 'restaurant_latitude', 'restaurant_longitude', 'restaurant_review_count', 'checkin_counts', 'review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'neu', 'pos', 'compound', 'user_yelping_since_delta','manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold']) print(m.shape) print("adding restaurant categories to matrix") cats = ['restaurant_category_1', 'restaurant_category_2', 'restaurant_category_3', 'restaurant_category_4', 'restaurant_category_5', 'restaurant_category_6', 'restaurant_category_7'] m = special_categories_to_matrix(m, dropped, cats) print(m.shape) print("adding restaurant neighborhoods to matrix") cats = ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3'] m = special_categories_to_matrix(m, dropped, cats) print(m.shape) print("matrix shape of {}".format(m.shape)) joblib.dump(m, 'pickle_jar/full_matrix')
class BusinessCategoriesFeature(BaseEstimator): """ WARNING!!! Works only with a modified version of LabelBinarizer. A binarization of the reviews' business categories. """ def __init__(self, data=None): self.data = data def __create_labels_list(self, review_list): labels = [] for review in review_list: business = self.data.get_business_for_review(review) labels.append(business['categories']) return labels def fit(self, X, y): self.binarizer = LabelBinarizer() labels = self.__create_labels_list(X) self.binarizer.fit(labels) return self def transform(self, X): labels = self.__create_labels_list(X) binarized_labels = self.binarizer.transform(labels) return binarized_labels.astype(float)
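# The "modified version of LabelBinarizer" warned about above is presumably
# one that accepts a *list* of categories per sample; in current scikit-learn,
# sklearn.preprocessing.MultiLabelBinarizer provides exactly that out of the
# box. A minimal sketch with hypothetical business categories:
from sklearn.preprocessing import MultiLabelBinarizer

labels = [["Restaurants", "Pizza"], ["Restaurants", "Bars"]]
mlb = MultiLabelBinarizer()
print(mlb.fit_transform(labels))   # [[0 1 1], [1 0 1]]
print(mlb.classes_)                # ['Bars' 'Pizza' 'Restaurants']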
def fit(self, X, y): """ performs one step of gradient descent """ # get the dimensions of our data n_samples, n_features = X.shape[0], X.shape[1]+1 n_targets = len(np.unique(y)) # add a column to the data matrix to incorporate the bias term X = np.c_[np.ones(n_samples), X] # one-vs-all labeling lb = LabelBinarizer() y = lb.fit_transform(y) # initialize the weights if self.W is None: self.W = np.zeros( (n_features, n_targets) ) # perform the optimization using gradient descent with momentum grad = self.gradient(X,y) self.W = self.W - self.learning_rate*(grad + self.momentum*self.prev_grad) self.prev_grad = grad return self.loss(X,y)
def run(): # Load and preprocess data label_to_unique_instance = load_data() X, Y = preprocess_data(label_to_unique_instance) # Encode labels label_binarizer = LabelBinarizer() transformed_Y = label_binarizer.fit_transform(Y) # Cross validation cross_validation_iterator = StratifiedShuffleSplit(Y, n_iter=1, test_size=0.4, random_state=0) for train_index, test_index in cross_validation_iterator: break # Init model model = init_model(raw_feature_dim=X.shape[-1], unique_lable_num=len(label_binarizer.classes_)) # Training procedure model.fit(X[train_index], transformed_Y[train_index], batch_size=BATCH_SIZE, nb_epoch=MAXIMUM_EPOCH_NUM, validation_data=(X[test_index], transformed_Y[test_index]), callbacks=[TensorBoard(log_dir="/tmp/Sequence Classification")], verbose=2) print("All done!")
def report(test_y, pred_y): lb = LabelBinarizer() test_y_combined = lb.fit_transform(list(chain.from_iterable(test_y))) pred_y_combined = lb.transform(list(chain.from_iterable(pred_y))) tagset = sorted(set(lb.classes_)) class_indices = {cls: idx for idx, cls in enumerate(tagset)} print(classification_report(test_y_combined, pred_y_combined, labels=[class_indices[cls] for cls in tagset], target_names=tagset))
def conv_demo(): # load the digits dataset digits = load_digits() X = digits['data'] y_labels = digits['target'] lb = LabelBinarizer() y = lb.fit_transform(y_labels) # split into training, validation and test datasets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE) X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=RANDOM_STATE) # train the neural net print("Building neural net to classify digits") conv_net = pynn.ConvNet(digits['images'][0].shape, 1, y.shape[1], random_state=RANDOM_STATE) print("Training") conv_net.fit(X_train, y_train, X_valid, y_valid, batch_size=20, n_epochs=20, learning_rate=0.05) y_pred = conv_net.predict(X_test) print("digits accuracy: {}%".format( accuracy_score(y_test.argmax(1), y_pred.argmax(1)) * 100))
class CategoricalToNumerical(object):
    """Takes in a dimensionality reducer in order to convert categorical
    features into numerical.
    """

    def __init__(self, dimensionality_reducer=None, verify=True):
        if dimensionality_reducer is None:
            # RandomizedPCA comes from older scikit-learn releases; current
            # versions use PCA(n_components=1, svd_solver="randomized")
            dimensionality_reducer = RandomizedPCA(1)
        self.dimensionality_reducer = dimensionality_reducer
        self.verify = verify
        self.binarizer = LabelBinarizer()

    def fit(self, X, y=None):
        self._verify(X, self.verify)
        binarized = self.binarizer.fit_transform(X)
        self.dimensionality_reducer.fit(binarized)
        return self

    def transform(self, X):
        self._verify(X, False)
        binarized = self.binarizer.transform(X)
        result = self.dimensionality_reducer.transform(binarized).flatten()
        assert X.shape == result.shape
        return result

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def _verify(self, X, verify):
        if verify:
            assert is_categorical(X)
        else:
            assert isinstance(X, np.ndarray)
            assert len(X.shape) == 1
def get_dataset2(test_fraction): """ @:param: test_fraction used to split train and test Vectorizes the features and labels into categorical values and randomly splits into train and test set :return: X_train, X_test, y_train, y_test """ data = [] with open('labels.csv', 'r') as datafile: csv_reader = csv.reader(datafile, delimiter=',', quotechar='|') for row in csv_reader: data.append(row) data = numpy.asarray(data) X = data[:, 0:data.shape[1]-1] y = data[:, data.shape[1]-1] # X,y = get_tabledata() vec = DictVectorizer() feature_dict = [dict(enumerate(x)) for x in X.tolist()] X = vec.fit_transform(feature_dict).toarray() joblib.dump(vec, 'vectorizer.pkl') lb = LabelBinarizer() y = lb.fit_transform(y) joblib.dump(lb, 'binarizer.pkl') X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction) return X_train, X_test, y_train, y_test
def bio_classification_report(y_true, y_pred): """Evaluates entity extraction accuracy. Classification report for a list of BIO-encoded sequences. It computes token-level metrics and discards "O" labels. Note that it requires scikit-learn 0.15+ (or a version from github master) to calculate averages properly! Taken from https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb """ from sklearn.preprocessing import LabelBinarizer from itertools import chain from sklearn.metrics import classification_report lb = LabelBinarizer() y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true))) y_pred_combined = lb.transform(list(chain.from_iterable(y_pred))) tagset = set(lb.classes_) - {'O'} tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1]) class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)} return classification_report( y_true_combined, y_pred_combined, labels=[class_indices[cls] for cls in tagset], target_names=tagset, )
def binarize_label_columns(df, columns, two_classes_as='single'): ''' Inputs: df: Pandas dataframe object. columns: Columns to binarize. tow_classes_as: How to handle two classes, as 'single' or 'multiple' columns. Returns a tuple with the following items: df: Pandas dataframe object with new columns. binlabel_names: Names of the newly created binary variables. lb_objects: a dictionary with columns as keys and sklear.LabelBinarizer objects as values. ''' binlabel_names = [] lb_objects = {} for col in columns: if len(df[col].unique()) > 1: rows_notnull = df[col].notnull() # Use only valid feature observations lb = LabelBinarizer() binclass = lb.fit_transform(df[col][rows_notnull]) # Fit & transform on valid observations if len(lb.classes_) == 2 and two_classes_as == 'multiple': binclass = np.hstack((1 - binclass, binclass)) lb_objects[col] = lb if len(lb.classes_) > 2 or two_classes_as == 'multiple': col_binlabel_names = [col+'_'+str(c) for c in lb.classes_] binlabel_names += col_binlabel_names # Names for the binarized classes for n in col_binlabel_names: df[n] = np.NaN # Initialize columns df.loc[rows_notnull, col_binlabel_names] = binclass # Merge binarized data elif two_classes_as == 'single': binlabel_names.append(col+'_bin') # Names for the binarized classes df[col+'_bin'] = np.NaN # Initialize columns df.loc[rows_notnull, col+'_bin'] = binclass # Merge binarized data return df, binlabel_names, lb_objects
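# A usage sketch for the helper above, assuming the imports it relies on
# (pandas as pd, numpy as np, LabelBinarizer) are in scope. The single-column
# frame is hypothetical; 'color' has three classes, so one indicator column is
# created per class and the fitted binarizer is returned for reuse.
import pandas as pd

df = pd.DataFrame({"color": ["red", "green", "blue", "red"]})
df, binlabel_names, lb_objects = binarize_label_columns(df, ["color"])
print(binlabel_names)                 # ['color_blue', 'color_green', 'color_red']
print(lb_objects["color"].classes_)   # ['blue' 'green' 'red']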
def scorer_auc(y_true, y_pred):
    """Dedicated to 2-class probabilistic outputs."""
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import LabelBinarizer
    lb = LabelBinarizer()
    y_true = lb.fit_transform(y_true)
    return roc_auc_score(y_true, y_pred)
def iris_demo(): # load the iris dataset iris = load_iris() X = iris['data'] y_labels = iris['target'] lb = LabelBinarizer() y = lb.fit_transform(y_labels) # split into training, validation and test datasets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE) X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=RANDOM_STATE) # train the neural net print("Building logistic regression classifier to classify iris data") nn = pynn.ArtificialNeuralNet([X_train.shape[1], 20, y_train.shape[1]]) print("Training") nn.fit(X_train, y_train, X_valid, y_valid, batch_size=20, n_epochs=20, learning_rate=0.05, random_state=RANDOM_STATE) y_pred = nn.predict(X_test) print("iris accuracy: {}%".format( accuracy_score(y_test.argmax(1), y_pred.argmax(1)) * 100))
def one_hot_encoding(y_train, y_test): labelBinarizer = LabelBinarizer() labelBinarizer.fit(y_train) y_train_one_hot = labelBinarizer.transform(y_train) y_test_one_hot = labelBinarizer.transform(y_test) return y_train_one_hot, y_test_one_hot
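# A small check of the helper above, assuming LabelBinarizer is imported as it
# expects: the binarizer is fit on the training labels only and reused for the
# test labels, so both splits are encoded against the same class set. The toy
# labels are hypothetical.
y_train = ["cat", "dog", "bird", "cat"]
y_test = ["dog", "bird"]
y_train_oh, y_test_oh = one_hot_encoding(y_train, y_test)
print(y_train_oh.shape, y_test_oh.shape)   # (4, 3) (2, 3)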
class BaseSGD(object): def _get_loss(self): losses = { "modified_huber": ModifiedHuber(), "hinge": Hinge(1.0), "perceptron": Hinge(0.0), "log": Log(), "sparse_log": SparseLog(), "squared": SquaredLoss(), "huber": Huber(self.epsilon), "epsilon_insensitive": EpsilonInsensitive(self.epsilon), } return losses[self.loss] def _get_learning_rate(self): learning_rates = {"constant": 1, "pegasos": 2, "invscaling": 3} return learning_rates[self.learning_rate] def _set_label_transformers(self, y): if self.multiclass == "natural": self.label_encoder_ = LabelEncoder() y = self.label_encoder_.fit_transform(y).astype(np.float64) self.label_binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1) self.label_binarizer_.fit(y) self.classes_ = self.label_binarizer_.classes_.astype(np.int32) n_classes = len(self.label_binarizer_.classes_) n_vectors = 1 if n_classes <= 2 else n_classes return n_classes, n_vectors
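# For reference, the -1/+1 encoding that _set_label_transformers above relies
# on: with neg_label=-1 a binary problem stays a single column in {-1, +1},
# while a 3-class problem becomes one column per class with +1 on the
# "own class" position and -1 elsewhere. The toy labels are hypothetical.
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer(neg_label=-1, pos_label=1)
print(lb.fit_transform([0, 1, 1, 0]).ravel())   # [-1  1  1 -1]
print(lb.fit_transform([0, 1, 2]))              # 3x3 matrix, +1 on the diagonal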
import argparse

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-o", "--output", required=True,
                help="path to the output loss/accuracy plot")
args = vars(ap.parse_args())

# load the training and testing data, then scale it into the range [0, 1]
print("[INFO] loading the CIFAR-10 data...")
((trainX, trainY), (testX, testY)) = cifar10.load_data()
trainX = trainX.astype("float") / 255.0
testX = testX.astype("float") / 255.0

# convert the labels from integers to vectors, fitting the binarizer on the
# training labels only and reusing it for the test labels
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

# initialize the label names for the CIFAR-10 dataset
labelNames = ["airplane", "automobile", "bird", "cat", "deer",
              "dog", "frog", "horse", "ship", "truck"]

# initialize the optimizer and model
print("[INFO] compiling the model...")
opt = SGD(lr=0.001, decay=0.01 / 40, momentum=0.9, nesterov=True)
model = MiniVGGNet.build(width=32, height=32, depth=3, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt,
              metrics=["accuracy"])
print(model.summary())
print(data.head()) print ("\nFeatures : \n" ,data.columns.tolist()) print ("\nMissing values : ", data.isnull().sum().values.sum()) print ("\nUnique values : \n",data.nunique()) for i in data.columns: data[i] = data[i].replace("?",np.nan) data = data[data[i].notnull()] data = data.reset_index()[data.columns] data[i] = data[i].astype(float) X = data.drop(['income'], axis=1) y = data['income'] from sklearn.preprocessing import LabelBinarizer encoder = LabelBinarizer() Y = encoder.fit_transform(y) from keras.models import Sequential #Sequential Models from keras.layers import Dense #Dense Fully Connected Layer Type from keras.optimizers import SGD #Stochastic Gradient Descent Optimizer def create_network(): model = Sequential() model.add(Dense(25, input_shape=(13,), activation='relu')) model.add(Dense(9, activation='softmax')) #stochastic gradient descent
image = img_to_array(image) data.append(image) # extract the class label from the image path and update the label list label = imagePath.split(os.path.sep)[-2] #print(str(imagePath)+"STR"+str(label)) labels.append(label) # scale the raw pixel intensities to the range [0, 1] data = np.array(data, dtype="float") / 255.0 labels = np.array(labels) #print(labels) print("[INFO] data matrix: {:.2f}MB".format(data.nbytes / (1024 * 1000.0))) # binarize the labels lb = LabelBinarizer() labels = lb.fit_transform(labels) print("LB.CLASSES=" + str(lb.classes_)) # partition data into train and validation (trainX, testX, trainY, testY) = train_test_split(data, labels, test_size=0.2, random_state=42) # construct image generator for data augmentation aug = ImageDataGenerator(rotation_range=25, width_shift_range=0.1, height_shift_range=0.1, shear_range=0.2, zoom_range=0.2,
dataUni = stretch_interp(data, dur) dataUni = np.array(dataUni, dtype="float") (train_x, test_x, train_y, test_y) = train_test_split(dataUni, labels, test_size=0.1, random_state=14) #train_x = train_x[:, :, np.newaxis] #test_x = test_x[:, :, np.newaxis] print(np.shape(train_x), np.shape(train_y), np.shape(test_x), np.shape(test_y)) print("labels:", train_y) lb = LabelBinarizer() train_y = lb.fit_transform(train_y) test_y = lb.transform(test_y) print("labels after transform:", train_y) print(np.shape(train_x), np.shape(train_y), np.shape(test_x), np.shape(test_y)) batch_size = np.shape(train_x)[0] model = Sequential() #model.add(Conv1D(3012, input_shape=(53, dur*3012), kernel_size=(12), activation='relu')) model.add(Dense(3012, activation="relu", input_shape=(dur * 3012, ))) print("after 1 dense:", model.input_shape, model.output_shape) model.add(Dense(506, activation="sigmoid")) model.add(Dropout(0.25)) model.add(Dense(253, activation="sigmoid"))
def train_on_texts(self, texts, context_labels=None, batch_size=128, num_epochs=50, verbose=1, new_model=False, gen_epochs=1, train_size=1.0, max_gen_length=300, validation=True, dropout=0.0, via_new_model=False, save_epochs=0, multi_gpu=False, **kwargs): if new_model and not via_new_model: self.train_new_model(texts, context_labels=context_labels, num_epochs=num_epochs, gen_epochs=gen_epochs, train_size=train_size, batch_size=batch_size, dropout=dropout, validation=validation, save_epochs=save_epochs, multi_gpu=multi_gpu, **kwargs) return if context_labels: context_labels = LabelBinarizer().fit_transform(context_labels) if 'prop_keep' in kwargs: train_size = prop_keep if self.config['word_level']: # If training word level, must add spaces around each # punctuation. https://stackoverflow.com/a/3645946/9314418 punct = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\\n\\t\'‘’“”’–—…' for i in range(len(texts)): texts[i] = re.sub('([{}])'.format(punct), r' \1 ', texts[i]) texts[i] = re.sub(' {2,}', ' ', texts[i]) texts = [text_to_word_sequence(text, filters='') for text in texts] # calculate all combinations of text indices + token indices indices_list = [np.meshgrid(np.array(i), np.arange( len(text) + 1)) for i, text in enumerate(texts)] # indices_list = np.block(indices_list) # this hangs when indices_list is large enough # FIX BEGIN ------ indices_list_o = np.block(indices_list[0]) for i in range(len(indices_list)-1): tmp = np.block(indices_list[i+1]) indices_list_o = np.concatenate([indices_list_o, tmp]) indices_list = indices_list_o # FIX END ------ # If a single text, there will be 2 extra indices, so remove them # Also remove first sequences which use padding if self.config['single_text']: indices_list = indices_list[self.config['max_length']:-2, :] indices_mask = np.random.rand(indices_list.shape[0]) < train_size if multi_gpu: num_gpus = len(config.get_visible_devices('GPU')) batch_size = batch_size * num_gpus gen_val = None val_steps = None if train_size < 1.0 and validation: indices_list_val = indices_list[~indices_mask, :] gen_val = generate_sequences_from_texts( texts, indices_list_val, self, context_labels, batch_size) val_steps = max( int(np.floor(indices_list_val.shape[0] / batch_size)), 1) indices_list = indices_list[indices_mask, :] num_tokens = indices_list.shape[0] assert num_tokens >= batch_size, "Fewer tokens than batch_size." level = 'word' if self.config['word_level'] else 'character' print("Training on {:,} {} sequences.".format(num_tokens, level)) steps_per_epoch = max(int(np.floor(num_tokens / batch_size)), 1) gen = generate_sequences_from_texts( texts, indices_list, self, context_labels, batch_size) base_lr = 4e-3 # scheduler function must be defined inline. def lr_linear_decay(epoch): return (base_lr * (1 - (epoch / num_epochs))) ''' FIXME This part is a bit messy as we need to initialize the model within strategy.scope() when using multi-GPU. Can probably be cleaned up a bit. 
''' if context_labels is not None: if new_model: weights_path = None else: weights_path = "{}_weights.hdf5".format(self.config['name']) self.save(weights_path) if multi_gpu: from tensorflow import distribute as distribute strategy = distribute.MirroredStrategy() with strategy.scope(): parallel_model = textgenrnn_model(self.num_classes, dropout=dropout, cfg=self.config, context_size=context_labels.shape[1], weights_path=weights_path) parallel_model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=4e-3)) model_t = parallel_model print("Training on {} GPUs.".format(num_gpus)) else: model_t = self.model else: if multi_gpu: from tensorflow import distribute as distribute if new_model: weights_path = None else: weights_path = "{}_weights.hdf5".format(self.config['name']) strategy = distribute.MirroredStrategy() with strategy.scope(): # Do not locate model/merge on CPU since sample sizes are small. parallel_model = textgenrnn_model(self.num_classes, cfg=self.config, weights_path=weights_path) parallel_model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=4e-3)) model_t = parallel_model print("Training on {} GPUs.".format(num_gpus)) else: model_t = self.model model_t.fit(gen, steps_per_epoch=steps_per_epoch, epochs=num_epochs, callbacks=[ LearningRateScheduler( lr_linear_decay), generate_after_epoch( self, gen_epochs, max_gen_length), save_model_weights( self, num_epochs, save_epochs)], verbose=verbose, max_queue_size=10, validation_data=gen_val, validation_steps=val_steps ) # Keep the text-only version of the model if using context labels if context_labels is not None: self.model = Model(inputs=self.model.input[0], outputs=self.model.output[1])
def fit(self, X, y): self.label_binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1) Y = np.asfortranarray(self.label_binarizer_.fit_transform(y), dtype=np.float64) return self._fit(X, Y)
from NeuralNetwork import NeuralNetwork
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import datasets

print("[INFO] loading MNIST (sample) dataset...")
digits = datasets.load_digits()
data = digits.data.astype("float")
data = (data - data.min()) / (data.max() - data.min())
print("[INFO] samples: {}, dim: {}".format(data.shape[0], data.shape[1]))

(trainX, testX, trainY, testY) = train_test_split(data, digits.target,
                                                  test_size=0.25)

# one-hot encode the labels with a single binarizer fit on the training split
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

print("[INFO] training network...")
nn = NeuralNetwork([trainX.shape[1], 32, 16, 10])
print("[INFO] {}".format(nn))
nn.fit(trainX, trainY, epochs=1000)

print("[INFO] evaluating network...")
predictions = nn.predict(testX)
predictions = predictions.argmax(axis=1)
print(classification_report(testY.argmax(axis=1), predictions))
def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) Target vector relative to X. sample_weight : array-like, shape (n_samples,) optional Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. .. versionadded:: 0.17 *sample_weight* support to LogisticRegression. Returns ------- self : object Returns self. """ self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = [self._label_binarizer.fit_transform(yy).ravel() for yy in y] # self.lr_p2 = [SGDClassifier( # loss='log', l1_ratio=self.l1_ratio_lamda, # fit_intercept=self.fit_intercept, shuffle=False, # penalty='elasticnet', alpha=self.lamda, warm_start=True, # max_iter=(self.max_iter // 3 if self.deep else 1) + 0) # for i in range(len(X))] self.lr_p2 = [ LogisticRegression( fit_intercept=self.fit_intercept, penalty='l2', solver='lbfgs', C=1. / (self.lamda * (1 - self.l1_ratio_lamda)), warm_start=True, max_iter=(self.max_iter // 3 if self.deep else 1) + 5) for i in range(len(X)) ] self.alpha_, self.coef_, self.intercept_, self.n_iter_ = \ logistic_alternating( X, Y, lamda=self.lamda, beta=self.beta, gamma=self.gamma, max_iter=self.max_iter, verbose=self.verbose, tol=self.tol, return_n_iter=True, deep=self.deep, lr_p2=self.lr_p2, l1_ratio_beta=self.l1_ratio_beta, l1_ratio_lamda=self.l1_ratio_lamda, # unused fit_intercept=self.fit_intercept # unused ) if self.classes_.shape[0] > 2: # ndim = self.classes_.shape[0] raise ValueError("too many classes") else: ndim = 1 self.coef_ = self.coef_.reshape(ndim, -1) # self.alpha_ = [alpha.reshape(ndim, -1) for alpha in self.alpha_] self.y_train_ = Y return self
class LocalTransformer(BaseEstimator, TransformerMixin): def __init__(self, k=1, fxs=None, hour_mean=True, aux=True, ens_std=False, hour_std=False, stid_enc=False): self.k = k self.hour_mean = hour_mean self.fxs = fxs self.aux = aux self.ens_std = ens_std self.hour_std = hour_std self.stid_enc = stid_enc def fit(self, X, y=None): assert y is not None n_stations = y.shape[1] self.n_stations = n_stations assert X.station_info.shape[0] == self.n_stations stid = np.arange(98) self.stid_lb = LabelBinarizer() self.stid_lb.fit(stid) return self @classmethod def transform_labels(cls, y): y = y.ravel(1) return y # @profile def transform(self, X, y=None): k = self.k n_days = X.shape[0] ll_coord = X.station_info[:, 0:2] lat_idx = np.searchsorted(X.lat, ll_coord[:, 0]) lon_idx = np.searchsorted(X.lon, ll_coord[:, 1] + 360) #IPython.embed() n_fx = 0 for b_name, X_b in X.blocks.iteritems(): old_n_fx = n_fx if self.fxs is not None and b_name not in self.fxs: continue if X_b.ndim == 6: if self.fxs is not None and b_name in self.fxs: n_fxs = len(self.fxs[b_name]) else: n_fxs = X_b.shape[1] shapes = [n_fxs] if not self.hour_mean: shapes.append(X_b.shape[3]) shapes.extend([k * 2, k * 2]) print b_name, shapes n_fx += np.prod(shapes) elif X_b.ndim == 1: n_fx += 1 elif X_b.ndim == 2: n_fx += X_b.shape[1] else: raise ValueError('%s has wrong dim: %d' % (b_name, X_b.ndim)) print 'block: %s as %d n_fx' % (b_name, n_fx - old_n_fx) if self.stid_enc: n_fx += len(self.stid_lb.classes_) # num of features - based on blocks + station info (5 fx) if self.aux: n_fx = n_fx + 3 + 2 + 2 X_p = np.zeros((n_days * self.n_stations, n_fx), dtype=np.float32) offset = 0 for bid, b_name in enumerate(X.blocks): if self.fxs is not None and b_name not in self.fxs: continue print 'localizing block: %s' % b_name X_b = X[b_name] # select fx if fxs given if self.fxs is not None and self.fxs.get(b_name, None): fxs = self.fxs[b_name] idx = [ i for i, name in enumerate(X.fx_name[b_name]) if name in fxs ] X_b = X_b[:, idx] if X_b.ndim == 6: # FIXME over hours if self.hour_mean: X_b = np.mean(X_b, axis=3) elif self.hour_std: X_b = np.std(X_b, axis=3) # over ensembles if self.ens_std: X_b = np.std(X_b, axis=2) else: X_b = np.mean(X_b, axis=2) offset_inc = 0 for i in range(self.n_stations): lai, loi = lat_idx[i], lon_idx[i] if (self.hour_mean or self.hour_std): blk = X_b[:, :, lai - k:lai + k, loi - k:loi + k] else: blk = X_b[:, :, :, lai - k:lai + k, loi - k:loi + k] blk = blk.reshape((blk.shape[0], np.prod(blk.shape[1:]))) X_p[i * n_days:((i + 1) * n_days), offset:(offset + blk.shape[1])] = blk if i == 0: offset_inc = blk.shape[1] del blk gc.collect() offset += offset_inc elif X_b.ndim == 1 or (X_b.ndim == 2 and X_b.shape[1] == 1): X_p[:, offset:offset + 1] = np.tile(X_b.ravel(), self.n_stations)[:, np.newaxis] offset += 1 elif X_b.ndim == 2: # FIXME wrong stitching together stuff print('block: %s will be repeated for each station' % b_name) width = X_b.shape[1] X_p[:, offset:offset + width] = np.tile(X_b, (self.n_stations, 1)) #IPython.embed() offset += width else: raise ValueError('%s has wrong dim: %d' % (b_name, X_b.ndim)) if self.stid_enc: stid = np.repeat(self.stid_lb.classes_, n_days) stid_enc = self.stid_lb.transform(stid) X_p[:, offset:(offset + stid_enc.shape[1])] = stid_enc offset += stid_enc.shape[1] if self.aux: # lat, lon, elev X_p[:, offset:(offset + 3)] = np.repeat(X.station_info, n_days, axis=0) offset += 3 # compute pos of station within grid cell (in degree lat lon) lat_idx = np.repeat(lat_idx, n_days) lon_idx = 
np.repeat(lon_idx, n_days) # offset - 3 is station lat X_p[:, offset] = (X_p[:, offset - 3] - X.lat[lat_idx]) # offset - 2 is station lon X_p[:, offset + 1] = (X_p[:, offset - 2] - (X.lon[lon_idx] - 360.)) # FIXME add lat lon idx offset += 2 X_p[:, offset] = lat_idx X_p[:, offset + 1] = lon_idx print 'X_p.shape: ', X_p.shape return X_p
import numpy as np from NeuralNet import NeuralNet from sklearn.preprocessing import LabelBinarizer train = np.loadtxt('./wine/train_wine.csv', delimiter=',') lb = LabelBinarizer() train_y = lb.fit_transform(train[:, 0]) train_x = train[:, 1:] mean = np.mean(train_x, axis=0) train_x -= mean var = np.var(train_x, axis=0) train_x /= var nn = NeuralNet([13, 13, 3], 0.01) nn.train(train_x, train_y) test = np.loadtxt('./wine/test_wine.csv', delimiter=',') test_y = test[:, 0] test_x = test[:, 1:] test_x -= mean test_x /= var y_pred = np.argmax(nn.predict(test_x), axis=1) accu = np.sum(test_y == y_pred + 1) / len(test_y) print(accu)
dpt = 3

print("[INFO] loading Images")
imagePaths = list(paths.list_images(args["dataset"]))

sp = SimplePreprocessor(size, size)
iap = ImageToArrayPreprocessor()

sdl = SimpleDatasetLoader(preprocessors=[sp, iap])
(data, labels) = sdl.load(imagePaths, verbose=500)
print(labels)
data = data.astype("float") / 255.0

(trainX, testX, trainY, testY) = train_test_split(data, labels,
                                                  test_size=0.25,
                                                  random_state=42)

# one-hot encode with a single binarizer fit on the training labels
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

print("[INFO] compiling model...")
opt = SGD(lr=0.025)
model = IncludeNet.build(width=size, height=size, depth=dpt, classes=4)
model.compile(loss="categorical_crossentropy", optimizer=opt,
              metrics=["accuracy"])

print("[INFO] training network...")
H = model.fit(trainX, trainY, validation_data=(testX, testY),
              batch_size=size, epochs=ep,
class MultipleLogisticRegressionMultipleKernel( LogisticRegressionMultipleKernel, LogisticRegression, LinearClassifierMixin): # Ensure consistent split _pairwise = True def __init__(self, penalty='l2', dual=False, tol=1e-4, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1, l1_ratio_lamda=0.1, l1_ratio_beta=0.1, deep=True, lamda=0.01, gamma=1, rho=1, rtol=1e-4, beta=0.01): super(MultipleLogisticRegressionMultipleKernel, self).__init__(penalty=penalty, dual=dual, tol=tol, fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight, random_state=random_state, solver=solver, max_iter=max_iter, multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs, lamda=lamda, gamma=gamma, rho=rho, rtol=rtol, beta=beta) self.l1_ratio_lamda = l1_ratio_lamda self.l1_ratio_beta = l1_ratio_beta self.deep = deep def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) Target vector relative to X. sample_weight : array-like, shape (n_samples,) optional Array of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. .. versionadded:: 0.17 *sample_weight* support to LogisticRegression. Returns ------- self : object Returns self. """ self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = [self._label_binarizer.fit_transform(yy).ravel() for yy in y] # self.lr_p2 = [SGDClassifier( # loss='log', l1_ratio=self.l1_ratio_lamda, # fit_intercept=self.fit_intercept, shuffle=False, # penalty='elasticnet', alpha=self.lamda, warm_start=True, # max_iter=(self.max_iter // 3 if self.deep else 1) + 0) # for i in range(len(X))] self.lr_p2 = [ LogisticRegression( fit_intercept=self.fit_intercept, penalty='l2', solver='lbfgs', C=1. / (self.lamda * (1 - self.l1_ratio_lamda)), warm_start=True, max_iter=(self.max_iter // 3 if self.deep else 1) + 5) for i in range(len(X)) ] self.alpha_, self.coef_, self.intercept_, self.n_iter_ = \ logistic_alternating( X, Y, lamda=self.lamda, beta=self.beta, gamma=self.gamma, max_iter=self.max_iter, verbose=self.verbose, tol=self.tol, return_n_iter=True, deep=self.deep, lr_p2=self.lr_p2, l1_ratio_beta=self.l1_ratio_beta, l1_ratio_lamda=self.l1_ratio_lamda, # unused fit_intercept=self.fit_intercept # unused ) if self.classes_.shape[0] > 2: # ndim = self.classes_.shape[0] raise ValueError("too many classes") else: ndim = 1 self.coef_ = self.coef_.reshape(ndim, -1) # self.alpha_ = [alpha.reshape(ndim, -1) for alpha in self.alpha_] self.y_train_ = Y return self def predict(self, X): """Predict using the kernel ridge model. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Samples. Returns ------- C : array, shape = [n_samples] or [n_samples, n_targets] Returns predicted values. """ check_is_fitted(self, ["alpha_", "coef_"]) # return [LinearClassifierMixin.predict( # self, np.tensordot(k, a, axes=1)) for a, k in zip( # self.alpha_, X)] return [ self.lr_p2[i].predict( np.tensordot(self.coef_.ravel(), X[i], axes=1)) for i in range(len(X)) ] def score(self, K, y, sample_weight=None): """Returns the coefficient of determination R^2 of the prediction. 
The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0. Parameters ---------- X : array-like, shape = (n_samples, n_features) Test samples. y : array-like, shape = (n_samples) or (n_samples, n_outputs) True values for X. sample_weight : array-like, shape = [n_samples], optional Sample weights. Returns ------- score : float R^2 of self.predict(X) wrt. y. """ y_pred = self.predict(K) if sample_weight is None: return np.mean( [accuracy_score(y[j], y_pred[j]) for j in range(len(K))]) else: return np.mean([ accuracy_score(y[j], y_pred[j], sample_weight=sample_weight[j]) for j in range(len(K)) ]) def predict_proba(self, X): """Probability estimates. The returned estimates for all classes are ordered by the label of classes. For a multi_class problem, if multi_class is set to be "multinomial" the softmax function is used to find the predicted probability of each class. Else use a one-vs-rest approach, i.e calculate the probability of each class assuming it to be positive using the logistic function. and normalize these values across all the classes. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- T : array-like, shape = [n_samples, n_classes] Returns the probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. """ check_is_fitted(self, ["alpha_", "coef_"]) # return [LinearClassifierMixin._predict_proba_lr( # self, np.tensordot(k, a, axes=1)) for a, k in zip( # self.alpha_, X)] return [ self.lr_p2[i].predict_proba( np.tensordot(self.coef_.ravel(), X[i], axes=1)) for i in range(len(X)) ] def predict_log_proba(self, X): """Log of probability estimates. The returned estimates for all classes are ordered by the label of classes. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- T : array-like, shape = [n_samples, n_classes] Returns the log-probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. """ proba = self.predict_proba(X) return [np.log(p) for p in proba]
3.0 >>> round_of_rating(4.1) 4.0""" return np.round(number * 2) / 2 min_val = -40 max_val = 40 y_train = round_of_rating(saturate(y_train, min_val, max_val)) r_int = 0.5 slist = np.arange(min_val, max_val + r_int, r_int) * 2 #multiply by 2 to allow labelbinarizer to work lb = LabelBinarizer() lb.fit(slist) ylabels = lb.transform(y_train * 2) # In[17]: print(x_train.shape) print(xfcss_train.shape) print(ylabels.shape) # In[18]: nsamps = x_train.shape[0] n80p = int(np.floor(nsamps * 0.8)) rannums = np.array(random.sample(range(1, nsamps, 1), n80p)) s_nfiles = np.arange(nsamps)
def fit(self, X, y=None): if self.op == 'month' and hasattr(X, 'date'): month = X.date.map(lambda x: x.month) self.lb = LabelBinarizer() self.lb.fit(month) return self
class Model(ModelBase): def __init__(self): super(Model,self).__init__("LR") self.vectorizer=CountVectorizer(lowercase=False,binary=True, analyzer=analyzer_) self.binarizer=LabelBinarizer() self.algo = LogisticRegression(C=1,fit_intercept=False,class_weight='balanced') self.label_dict={} self.label_index=0 self.label_map={} def process_text(self, text): words = text.lower().split() stopwords = set('for a an the of and to in'.split()) words=[word for word in words if word not in stopwords] return words def hot_encode(self,word): index=self.label_index if(word[0] not in self.label_dict): self.label_dict[word[0]]=self.label_index self.label_map[self.label_index]=word[0] index=self.label_index self.label_index+=1 else: index=self.label_dict[word[0]] return index #@ModelBase.train def train(self,docs): # Create sentences from documents #docs=json.loads(data) inputs=[] labels=[] for doc in docs: inputs.append(self.process_text(doc['rawText'])) labels.append(self.hot_encode(doc['tags'])) self.binarizer.fit_transform(labels) inputs = self.vectorizer.fit_transform(inputs) print(inputs) print(labels) self.vectorizer.stop_words_=None self.algo.fit(inputs, labels) print('Training done') return {'status': 2, 'reason': '', 'numRecords': len(inputs)} def validate(self,docs): #docs = json.loads(data) inputs = [] labels = [] for doc in docs: inputs.append(self.process_text(doc['rawText'])) labels.append(self.hot_encode(doc['tags'])) inputs = self.vectorizer.transform(inputs) predictions = self.algo.predict(inputs) fpr, tpr, threshold = roc_curve(labels, predictions) roc = {"fpr": fpr.tolist(), "tpr": tpr.tolist(), "threshold": threshold.tolist()} report = { "classification_report": classification_report(labels, predictions), "confusion_matrix": confusion_matrix(labels, predictions).tolist(), "roc": roc, "roc_auc": auc(fpr, tpr) } print (report) return {'status': 2, 'reason':'', 'report': report, 'numRecords': len(inputs)} def predict(self,docs): #docs = json.loads(data) inputs = [self.process_text(element['rawText']) for element in docs] inputs = self.vectorizer.transform(inputs) predictions = self.algo.predict(inputs) results=[] index=0 for i in predictions.tolist(): result={} result["predicted_tags"]=self.label_map[i] results.append(result) index+=1 print (results) return results def persist(self, path): package={ 'vectorizer': self.vectorizer, 'binarizer': self.binarizer, 'algo': self.algo, 'label_map': self.label_map, 'label_dict': self.label_dict } with open(path, 'wb') as file: pickle.dump(package,file, -1) def reload(self, path): with open(path, 'rb') as file: package = pickle.load(file) for key,value in package.items(): setattr(self,key,value)
def getFMFTRL(): #os.chdir('/Users/dhanley2/Documents/mercari/data') os.chdir('/home/darragh/mercari/data') train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8') test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8') glove_file = '../feat/glove.6B.50d.txt' threads = 8 save_dir = '../feat' print('[{}] Finished to load data'.format(time.time() - start_time)) print('Train shape: ', train.shape) print('Test shape: ', test.shape) nrow_test = train.shape[0] # -dftt.shape[0] dftt = train[(train.price < 1.0)] train = train.drop(train[(train.price < 1.0)].index) del dftt['price'] nrow_train = train.shape[0] # print(nrow_train, nrow_test) y = np.log1p(train["price"]) merge = pd.concat([train, dftt, test]) merge['target'] = np.log1p(merge["price"]) submission = test[['test_id']] ix = (merge['brand_name'] == merge['brand_name']) & ( ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin( merge['name'].str.lower())) merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix] #EXTRACT DEVELOPTMENT TEST trnidx, validx = train_test_split(range(train.shape[0]), random_state=233, train_size=0.90) del train del test gc.collect() merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \ zip(*merge['category_name'].apply(lambda x: split_cat(x))) #merge.drop('category_name', axis=1, inplace=True) print('[{}] Split categories completed.'.format(time.time() - start_time)) handle_missing_inplace(merge) print('[{}] Handle missing completed.'.format(time.time() - start_time)) cutting(merge) print('[{}] Cut completed.'.format(time.time() - start_time)) to_categorical(merge) print('[{}] Convert categorical completed'.format(time.time() - start_time)) ''' Crossed columns ''' # my understanding on how to replicate what layers.crossed_column does. One # can read here: https://www.tensorflow.org/tutorials/linear. 
def cross_columns(x_cols): """simple helper to build the crossed columns in a pandas dataframe """ crossed_columns = dict() colnames = ['_'.join(x_c) for x_c in x_cols] for cname, x_c in zip(colnames, x_cols): crossed_columns[cname] = x_c return crossed_columns merge['item_condition_id_str'] = merge['item_condition_id'].astype(str) merge['shipping_str'] = merge['shipping'].astype(str) x_cols = ( ['brand_name', 'item_condition_id_str'], ['brand_name', 'subcat_1'], ['brand_name', 'subcat_2'], ['brand_name', 'general_cat'], #['brand_name', 'subcat_1', 'item_condition_id_str'], #['brand_name', 'subcat_2', 'item_condition_id_str'], #['brand_name', 'general_cat', 'item_condition_id_str'], ['brand_name', 'shipping_str'], ['shipping_str', 'item_condition_id_str'], ['shipping_str', 'subcat_2'], ['item_condition_id_str', 'subcat_2']) crossed_columns_d = cross_columns(x_cols) categorical_columns = list(merge.select_dtypes(include=['object']).columns) D = 2**30 for k, v in crossed_columns_d.items(): print('Crossed column ', k) outls_ = [] indicator = 0 for col in v: outls_.append((np.array(merge[col].apply(hash))) % D + indicator) indicator += 10**6 merge[k] = sum(outls_).tolist() ''' Count crossed cols ''' cross_nm = [k for k in crossed_columns_d.keys()] lb = LabelBinarizer(sparse_output=True) x_col = lb.fit_transform(merge[cross_nm[0]]) for i in range(1, len(cross_nm)): x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]]))) del (lb) gc.collect() cpuStats() ''' Hash name ''' wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], "hash_size": 2**29, "norm": None, "tf": 'binary', "idf": None, }), procs=8) wb.dictionary_freeze = True X_name = wb.fit_transform(merge['name']) del (wb) X_name = X_name[:, np. 
array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `name` completed.'.format(time.time() - start_time)) ''' Hash category ''' wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2**20, "norm": None, "tf": 'binary', "idf": None, }), procs=8) wb.dictionary_freeze = True cat = merge["category_name"].str.replace('/', ' ') X_cat = wb.fit_transform(cat) del (wb) X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `category` completed.'.format(time.time() - start_time)) ''' Count category ''' wb = CountVectorizer() X_category1 = wb.fit_transform(merge['general_cat']) X_category2 = wb.fit_transform(merge['subcat_1']) X_category3 = wb.fit_transform(merge['subcat_2']) print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time)) # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5], wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2**28, "norm": "l2", "tf": 1.0, "idf": None }), procs=8) wb.dictionary_freeze = True X_description = wb.fit_transform(merge['item_description']) del (wb) X_description = X_description[:, np.array(np.clip( X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time)) lb = LabelBinarizer(sparse_output=True) X_brand = lb.fit_transform(merge['brand_name']) print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time)) X_dummies = csr_matrix( pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values) print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'. format(time.time() - start_time)) ''' print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape, X_cat.shape, x_col.shape, X_orig.shape) sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat, x_col, X_orig)).tocsr() ''' print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape, X_cat.shape, x_col.shape) sparse_merge = hstack( (X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat, x_col)).tocsr() print('[{}] Create sparse merge completed'.format(time.time() - start_time)) # Remove features with document frequency <=1 print(sparse_merge.shape) mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool) sparse_merge = sparse_merge[:, mask] X = sparse_merge[:nrow_train] X_test = sparse_merge[nrow_test:] print(sparse_merge.shape) gc.collect() if develop: #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233) train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[ trnidx], y.values[validx] model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1, inv_link="identity", threads=threads) #iters=15 baseline = 1. 
for i in range(15): model.fit(train_X, train_y, verbose=1) predsfm = model.predict(X=valid_X) score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm)) print("FM_FTRL dev RMSLE:", score_) if score_ < baseline - 0.0002: baseline = score_ else: break print('[{}] Train ridge v2 completed'.format(time.time() - start_time)) if develop: predsfm = model.predict(X=valid_X) print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm))) # 0.44532 # Full data 0.424681 predsFM = model.predict(X_test) print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time)) from wordbatch.models import nn_relu_h1, nn_relu_h2 modelnn = nn_relu_h1.NN_ReLU_H1(alpha=0.05, L2=0.00001, D_nn=60, D=sparse_merge.shape[1], \ iters=1, inv_link="identity", threads=threads) baseline = 1. print('[{}] Epoch time '.format(time.time() - start_time)) for i in range(3): modelnn.fit(train_X, train_y, verbose=1) predsnn = modelnn.predict(X=valid_X) score_ = rmsle(np.expm1(valid_y), np.expm1(predsnn)) print("FM_FTRL dev RMSLE:", score_) print('[{}] Epoch time '.format(time.time() - start_time)) if score_ < baseline - 0.0002: baseline = score_ else: break pd.Series((np.expm1(predsnn) - np.expm1(predsfm))).hist() print( "FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), 0.1 * (np.expm1(predsnn)) + 0.9 * (np.expm1(predsfm)))) tpoint2 = time.time() print("Time for Training: {}".format(hms_string(tpoint2 - tpoint1))) return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
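# Editor's sketch (toy data, not from the Mercari set): what the crossed-column
# hashing in getFMFTRL computes. Each source column is hashed into [0, D),
# offset by a per-column constant, and the offset bucket ids are summed into a
# single crossed feature column.
import numpy as np
import pandas as pd

D = 2 ** 30
df = pd.DataFrame({'brand_name': ['nike', 'apple', 'nike'],
                   'shipping_str': ['0', '1', '1']})
outls_, indicator = [], 0
for col in ['brand_name', 'shipping_str']:
    outls_.append(np.array(df[col].apply(hash)) % D + indicator)
    indicator += 10 ** 6
df['brand_name_shipping_str'] = sum(outls_).tolist()
print(df)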
# Encoding numeric categories
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit([1, 2, 2, 6])
le.classes_  # in scikit-learn, attributes learned from the data end with '_' by convention
le.transform([1, 1, 2, 6])  # encode the labels

le.fit(["서울", "서울", "대전", "부산"])
le.classes_
le.transform(["서울", "서울", "부산"])

# Binarize labels to 0/1
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
lb.fit([1, 2, 6, 1, 2])  # there are three classes: 1, 2 and 6
lb.classes_
lb.transform([1, 6])  # shows the one-hot rows encoding 1 and 6

# Represent dictionary feature information as a matrix
from sklearn.feature_extraction import DictVectorizer

v = DictVectorizer(sparse=False)
D = [{
    'foo': 1,
    'bar': 2
}, {
    'foo': 3,
    'baz': 1
}]  # one document has foo once and bar twice; the other has foo 3 times and baz once
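# Editor's note: completing the DictVectorizer example above — fit_transform on
# D yields one column per feature name, in alphabetical order (bar, baz, foo).
X = v.fit_transform(D)
print(v.get_feature_names_out())  # ['bar' 'baz' 'foo'] on scikit-learn >= 1.0
print(X)
# [[2. 0. 1.]
#  [0. 1. 3.]]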
def plot_mc_roc(y_test, y_score, interpreter=None): ''' plotting function that generates roc curves for data given to it. :param y_test: is the testing data used :param y_score: is the score when the testing data was called :param interpreter: is what was used to preprocess :return a roc plot ''' lw = 2 n_classes = len(np.unique(y_test)) classes = pd.unique(y_test) label_binarizer = LabelBinarizer() label_binarizer.fit(np.concatenate((y_test, y_score))) if n_classes != 2: y_test = label_binarizer.transform(y_test) y_score = label_binarizer.transform(y_score) else: n_classes = 1 y_test = y_test.reshape(-1, 1) y_score = y_score.reshape(-1, 1) # Compute ROC curve and ROC area for each class fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) # Compute micro-average ROC curve and ROC area fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) # First aggregate all false positive rates all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) # Then interpolate all ROC curves at this points mean_tpr = np.zeros_like(all_fpr) for i in range(n_classes): mean_tpr += interp(all_fpr, fpr[i], tpr[i]) # Finally average it and compute AUC mean_tpr /= n_classes fpr["macro"] = all_fpr tpr["macro"] = mean_tpr roc_auc["macro"] = sklearn.metrics.auc(fpr["macro"], tpr["macro"]) # Plot all ROC curves img = plt.figure() plt.plot(fpr["micro"], tpr["micro"], label='micro-average ROC curve (area = {0:0.2f})' ''.format(roc_auc["micro"]), color='deeppink', linestyle=':', linewidth=4) plt.plot(fpr["macro"], tpr["macro"], label='macro-average ROC curve (area = {0:0.2f})' ''.format(roc_auc["macro"]), color='navy', linestyle=':', linewidth=4) for i in range(n_classes): plt.plot( fpr[i], tpr[i], lw=lw, label='ROC curve of class {0} (area = {1:0.2f})' ''.format( interpreter.inverse_transform( [[label_binarizer.classes_[i]]])[0], roc_auc[i])) plt.plot([0, 1], [0, 1], 'k--', lw=lw) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curves') plt.legend(loc="lower right") return img
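# Editor's sketch: the macro-average branch of plot_mc_roc interpolates every
# per-class TPR onto a shared FPR grid before averaging. The same step in
# isolation, with two synthetic ROC curves and np.interp (equivalent here to
# the scipy `interp` used above):
import numpy as np

fpr = {0: np.array([0.0, 0.2, 1.0]), 1: np.array([0.0, 0.5, 1.0])}
tpr = {0: np.array([0.0, 0.8, 1.0]), 1: np.array([0.0, 0.6, 1.0])}

all_fpr = np.unique(np.concatenate([fpr[i] for i in fpr]))
mean_tpr = np.zeros_like(all_fpr)
for i in fpr:
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= len(fpr)
print(all_fpr)   # shared grid: [0.  0.2 0.5 1. ]
print(mean_tpr)  # averaged TPR at each grid point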
from sklearn.preprocessing import StandardScaler import random import numpy as np n_batch = 100 n_step = 8 n_input = 1 n_output = 10 n_cell = 100 lr = 0.006 n_train = 8000 bear = Bear() data = scio.loadmat('wavedata.mat')['wavedata'] target = bear.target lb = LabelBinarizer() target = lb.fit_transform(target) # print(target, ...) # print(lb.inverse_transform(target)) X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.333) # scaler = StandardScaler() # X_train = scaler.fit_transform(X_train) # X_test = scaler.transform(X_test) total_train_batch, total_test_batch = X_train.shape[0], X_test.shape[0] print(total_train_batch, ...) print(X_train.shape, y_test.shape, ...) lstm = LSTM(n_batch, n_step, n_input, n_output, n_cell) with tf.Session() as sess:
aap = AspectAwarePreprocessor(64, 64)
iap = ImageToArrayPreprocessor()

# load the dataset from disk and then scale the raw pixel intensities
# to the range [0,1]
sdl = SimpleDatasetLoader(preprocessors=[aap, iap])
(data, labels) = sdl.load(imagePaths, verbose=-1)
data = data.astype("float") / 255.0

(trainX, testX, trainY, testY) = train_test_split(data,
                                                  labels,
                                                  test_size=0.25,
                                                  random_state=42)

# convert the labels from integers to vectors, fitting the binarizer on the
# training labels only and reusing it for the test labels so both splits
# share the same class ordering
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

# initialize the optimizer and model
print("[INFO] compiling model...")
opt = SGD(lr=0.01, decay=0.01 / 20, momentum=0.9, nesterov=True)
model = Network.build(width=64, height=64, depth=3, classes=len(classNames))
model.compile(loss="categorical_crossentropy",
              optimizer=opt,
              metrics=["accuracy"])

# train the network
print("[INFO] training network...")
H = model.fit(trainX, trainY, validation_data=(testX, testY),
df_test = create_df(os.path.join(datapath, test_fname), img_path, partial_dataset=part_dat, seed=123) preds = np.load('10-02-2020_cont_colab.npy') predsfl = np.load('22-01-2020_cont_colab.npy') yhat = np.argmax(preds, axis=1) + 1 yhatfl = np.argmax(predsfl, axis=1) + 1 from sklearn.preprocessing import LabelBinarizer y = df_test['label'].to_numpy() lb = LabelBinarizer().fit(range(1, 40)) yhot = lb.transform(y) #%% from sklearn.metrics import roc_curve, auc, roc_auc_score from scipy import interp fpr = dict() tpr = dict() roc_auc = dict() for i in range(39): fpr[i], tpr[i], _ = roc_curve(yhot[:, i], preds[:, i]) if np.isnan(tpr[i]).any() or np.isnan(fpr[i]).any(): fpr[i] = tpr[i] = np.zeros(39) roc_auc[i] = auc(fpr[i], tpr[i])
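# Editor's sketch (assumes yhot, preds, fpr, tpr and roc_auc from above): one
# way to summarise the 39 per-class curves is a micro-average ROC, computed on
# the flattened one-hot targets and prediction scores.
fpr["micro"], tpr["micro"], _ = roc_curve(yhot.ravel(), preds.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print(roc_auc["micro"])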
def load_data(): try: # Reload the data from saved pickle file with open(pickle_file, 'rb') as f: pickle_data = pickle.load(f) train_features_ = pickle_data['train_dataset'] train_labels_ = pickle_data['train_labels'] valid_features_ = pickle_data['valid_dataset'] valid_labels_ = pickle_data['valid_labels'] test_features_ = pickle_data['test_dataset'] test_labels_ = pickle_data['test_labels'] del pickle_data # Free up memory return train_features_, train_labels_, valid_features_, valid_labels_, test_features_, test_labels_ except IOError: print("failed to reload data from %s, load it from begining" % pickle_file) # Download the training and test dataset. download_if_necessary( 'https://s3.amazonaws.com/udacity-sdc/notMNIST_train.zip', 'notMNIST_train.zip', 'c8673b3f28f489e9cdf3a3d74e2ac8fa') download_if_necessary( 'https://s3.amazonaws.com/udacity-sdc/notMNIST_test.zip', 'notMNIST_test.zip', '5d3c7e653e63471c88df796156a9dfa9') # Make sure the files aren't corrupted assert hashlib.md5(open('notMNIST_train.zip', 'rb').read()).hexdigest() == 'c8673b3f28f489e9cdf3a3d74e2ac8fa', \ 'notMNIST_train.zip file is corrupted. Remove the file and try again.' assert hashlib.md5(open('notMNIST_test.zip', 'rb').read()).hexdigest() == '5d3c7e653e63471c88df796156a9dfa9', \ 'notMNIST_test.zip file is corrupted. Remove the file and try again.' # Get the features and labels from the zip files train_features_, train_labels_ = uncompress_features_labels( 'notMNIST_train.zip') test_features_, test_labels_ = uncompress_features_labels( 'notMNIST_test.zip') # Limit the amount of data to work with a docker container docker_size_limit = 150000 train_features_, train_labels_ = resample(train_features_, train_labels_, n_samples=docker_size_limit) # normalize the data train_features_ = normalize_grayscale(train_features_) test_features_ = normalize_grayscale(test_features_) # Turn labels into numbers and apply One-Hot Encoding encoder = LabelBinarizer() encoder.fit(train_labels_) # one-hor encode, and change to float32, # so it can be multiplied against the features in TensorFlow, which are float32 train_labels_ = encoder.transform(train_labels_).astype(np.float32) test_labels_ = encoder.transform(test_labels_).astype(np.float32) # Get randomized datasets for training and validation train_features_, valid_features_, train_labels_, valid_labels_ = train_test_split( train_features_, train_labels_, test_size=0.05, random_state=832289) # Save the data for easy access if not os.path.isfile(pickle_file): print('Saving data to pickle file...') try: with open('notMNIST.pickle', 'wb') as pfile: pickle.dump( { 'train_dataset': train_features_, 'train_labels': train_labels_, 'valid_dataset': valid_features_, 'valid_labels': valid_labels_, 'test_dataset': test_features_, 'test_labels': test_labels_, }, pfile, pickle.HIGHEST_PROTOCOL) print('Data cached in pickle file.') except Exception as e: print('Unable to save data to', pickle_file, ':', e) raise return train_features_, train_labels_, valid_features_, valid_labels_, test_features_, test_labels_
import tensorflow as tf
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# load data
digits = load_digits()
x_set = digits.data
y_set = LabelBinarizer().fit_transform(digits.target)
x_train, x_test, y_train, y_test = train_test_split(x_set,
                                                    y_set,
                                                    test_size=0.3)


def add_layer(inputs, in_size, out_size, activation_function=None):
    Weights = tf.Variable(tf.random_normal([in_size, out_size]))
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)  # initializing biases to exactly zero is not recommended
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    Wx_plus_b = tf.nn.dropout(Wx_plus_b, keep_probability)
    if activation_function is None:
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b)
    tf.summary.histogram('/outputs', outputs)
    return outputs


keep_probability = tf.placeholder(tf.float32)  # dropout keep probability
X = tf.placeholder(tf.float32, shape=[None, 64])
Y = tf.placeholder(tf.float32, shape=[None, 10])
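# Editor's sketch of how training might continue in this TF1-style script (the
# hidden size, learning rate and step count are illustrative assumptions):
# build hidden and softmax output layers with add_layer, use cross-entropy
# loss and plain gradient descent, feeding keep_probability for dropout.
hidden = add_layer(X, 64, 100, activation_function=tf.nn.tanh)
prediction = add_layer(hidden, 100, 10, activation_function=tf.nn.softmax)
cross_entropy = tf.reduce_mean(
    -tf.reduce_sum(Y * tf.log(prediction), reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(500):
        sess.run(train_step,
                 feed_dict={X: x_train, Y: y_train, keep_probability: 0.5})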
def eval_models(model_paths, data_path, save_path=None): ''' Evaluates performance of a model in terms of loss, accuracy, confusion matrix, and mean per-class recall Parameters: model_paths(dict) - dictionary with model names as keys and paths pointing to the .h5 files of the trained models as values data_path(string) - path to the image directory of the target dataset json_path(string) - optional file path to save output to Returns: Dictionary of dictionaries each containing loss, accuracy, confusion matrix, and mean per-class recall for a given model ''' from keras.models import load_model from keras.backend import clear_session from keras.preprocessing.image import ImageDataGenerator from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score from sklearn.preprocessing import LabelBinarizer import gc import json # build generator to feed the model print('Building image generator...') generator = ImageDataGenerator().flow_from_directory(data_path, target_size=(224, 224), batch_size=8, shuffle=False) y_true = generator.classes # evaluate all models model_results = dict() for name, path in model_paths.items(): # load model print('Loading {}'.format(path)) model = load_model(path) # run basic evaluation print('Evaluating {}'.format(path)) metrics = dict() metrics['loss'], metrics['acc'] = model.evaluate_generator(generator) # predict labels y_prob = model.predict_generator(generator) y_pred = y_prob.argmax(axis=1) # calculate confusion matrix cm = confusion_matrix(y_true, y_pred) metrics['cm'] = cm.tolist() # mean per-class recall pcr = cm.diagonal() / cm.sum(axis=1) metrics['pcr'] = list(pcr) metrics['mpcr'] = pcr.mean() # F1 score metrics['class_f1s'] = list(f1_score(y_true, y_pred, average=None)) metrics['macro_f1'] = f1_score(y_true, y_pred, average='macro') metrics['micro_f1'] = f1_score(y_true, y_pred, average='micro') # AUC score y_binary = LabelBinarizer().fit_transform(y_true) metrics['class_aucs'] = list( roc_auc_score(y_binary, y_prob, average=None)) metrics['macro_auc'] = roc_auc_score(y_binary, y_prob, average='macro') metrics['micro_auc'] = roc_auc_score(y_binary, y_prob, average='micro') model_results[name] = metrics # remove clutter from memory del model clear_session() gc.collect() if save_path: print('Saving evaluation to {}'.format(save_path)) with open(save_path, 'w') as f: json.dump(model_results, f) print('Evaluation complete.\n') gc.collect() return model_results
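# Editor's sketch: a hypothetical call to eval_models (the paths and model
# names below are placeholders, not taken from the original project).
model_paths = {
    'baseline_cnn': 'models/baseline.h5',
    'fine_tuned_resnet': 'models/resnet_ft.h5',
}
results = eval_models(model_paths, 'data/val_images', save_path='eval.json')
print(results['baseline_cnn']['acc'], results['baseline_cnn']['macro_f1'])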
class MatheusAlvesMLP(BaseEstimator, ClassifierMixin): # or RegressonMixin? def __init__(self, params=None): if params is None: self.ctor({}) else: self.ctor(params) def ctor(self, params): self.alpha = params.get("alpha", 0.00001) # L2 regularization self.max_iter = params.get( "max_iter", 500) # max iteration to the optimization algorithm self.hidden_layers_size = params.get("hidden_layers_size", (100, 200, 300)) self.shuffle = params.get("shuffle", False) # shuflle samples in interactions? self.random_state = params.get( "random_state", None) # state or seed for generating random number self.tol = params.get("tol", 1e-5) # Loss tolerance for optimization self.layers_coef = None self.layers_intercept = None self.cost = None self.n_iter = 0 self.classes = None self.label_binarizer_ = LabelBinarizer() def _unstack(self, stacked_parameters): for i in range(self.n_layers_ - 1): start, end, shape = self._coef_indptr[i] self.layers_coef[i] = np.reshape(stacked_parameters[start:end], shape) start, end = self._intercept_indptr[i] self.layers_intercept[i] = stacked_parameters[start:end] def _forward_pass(self, activations, with_output_activation=True): for i in range(self.n_layers_ - 1): activations[i + 1] = safe_sparse_dot(activations[i], self.layers_coef[i]) activations[i + 1] += self.layers_intercept[i] # For the hidden layers if i + 1 != self.n_layers_ - 1: activations[i + 1] = rectified_linear_unit(activations[i + 1]) # For the last layer if with_output_activation: activations[i + 1] = rectified_linear_unit(activations[i + 1]) return activations def _compute_cost_grad(self, layer, n_samples, activations, deltas, coef_grads, intercept_grads): coef_grads[layer] = safe_sparse_dot(activations[layer].T, deltas[layer]) coef_grads[layer] += (self.alpha * self.layers_coef[layer]) coef_grads[layer] /= n_samples intercept_grads[layer] = np.mean(deltas[layer], 0) return coef_grads, intercept_grads def _cost_grad_lbfgs(self, stacked_coef_inter, X, y, activations, deltas, coef_grads, intercept_grads): self._unstack(stacked_coef_inter) cost, coef_grads, intercept_grads = self._backprop( X, y, activations, deltas, coef_grads, intercept_grads) self.n_iter += 1 grad = stack(coef_grads, intercept_grads) return cost, grad def _backprop(self, X, y, activations, deltas, coef_grads, intercept_grads): n_samples = X.shape[0] # Forward propagate activations = self._forward_pass(activations) # Get cost using log loss function cost = log_loss(y, activations[-1]) # Add regularization term to the cost values = np.sum(np.array([np.sum(s**2) for s in self.layers_coef])) cost += (0.5 * self.alpha) * values / n_samples # Backward propagate last = self.n_layers_ - 2 diff = y - activations[-1] deltas[last] = -diff # Compute gradient for the last layer coef_grads, intercept_grads = self._compute_cost_grad( last, n_samples, activations, deltas, coef_grads, intercept_grads) # Iterate over the hidden layers for i in range(self.n_layers_ - 2, 0, -1): deltas[i - 1] = safe_sparse_dot(deltas[i], self.layers_coef[i].T) deltas[i - 1] *= rectified_linear_unit_derivative(activations[i]) coef_grads, intercept_grads = self._compute_cost_grad( i - 1, n_samples, activations, deltas, coef_grads, intercept_grads) return cost, coef_grads, intercept_grads def fit(self, X, y): hidden_layers_size = list(self.hidden_layers_size) n_samples, n_features = X.shape self.label_binarizer_.fit(y) if self.classes is None: self.classes = self.label_binarizer_.classes_ else: classes = self.label_binarizer_.classes_ y = self.label_binarizer_.transform(y) 
self.n_outputs = y.shape[1] layer_units = ([n_features] + hidden_layers_size + [self.n_outputs]) # If it is the first time training the model if self.layers_coef is None: # Initialize parameters self.n_outputs = y.shape[1] # Compute the number of layers self.n_layers_ = len(layer_units) # Initialize coefficient and intercept layers self.layers_coef = [] self.layers_intercept = [] for i in range(self.n_layers_ - 1): rng = check_random_state(self.random_state) n_fan_in = layer_units[i] n_fan_out = layer_units[i + 1] # Use the Gorot initialization method weight_init_bound = np.sqrt(6. / (n_fan_in + n_fan_out)) self.layers_coef.append( rng.uniform(-weight_init_bound, weight_init_bound, (n_fan_in, n_fan_out))) self.layers_intercept.append( rng.uniform(-weight_init_bound, weight_init_bound, n_fan_out)) if self.shuffle: X, y = shuffle(X, y, random_state=self.random_state) # Initialize lists activations = [X] activations.extend( np.empty((n_samples, n_fan_out)) for n_fan_out in layer_units[1:]) deltas = [np.empty_like(a_layer) for a_layer in activations] coef_grads = [ np.empty((n_fan_in, n_fan_out)) for n_fan_in, n_fan_out in zip(layer_units[:-1], layer_units[1:]) ] intercept_grads = [ np.empty(n_fan_out) for n_fan_out in layer_units[1:] ] # START LBFGS algorithm # Store meta information for the parameters self._coef_indptr = [] self._intercept_indptr = [] start = 0 # Save sizes and indices of coefficients for faster unstacking for i in range(self.n_layers_ - 1): n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] end = start + (n_fan_in * n_fan_out) self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) start = end # Save sizes and indices of intercepts for faster unstacking for i in range(self.n_layers_ - 1): end = start + layer_units[i + 1] self._intercept_indptr.append((start, end)) start = end # enable pretty output for l_bfgs_b iprint = 1 # Run L-BFGS_B opitimization stacked_coef_inter = stack(self.layers_coef, self.layers_intercept) optimal_parameters, self.cost, d = fmin_l_bfgs_b( x0=stacked_coef_inter, func=self._cost_grad_lbfgs, maxfun=self.max_iter, iprint=iprint, pgtol=self.tol, args=(X, y, activations, deltas, coef_grads, intercept_grads)) self._unstack(optimal_parameters) return self def decision_function(self, X): hidden_layers_size = list(self.hidden_layers_size) layer_units = [X.shape[1]] + hidden_layers_size + [self.n_outputs] # Initialize layers activations = [] activations.append(X) for i in range(self.n_layers_ - 1): activations.append(np.empty((X.shape[0], layer_units[i + 1]))) # forward propagate self._forward_pass(activations, with_output_activation=False) y_pred = activations[-1] if self.n_outputs == 1: return y_pred.ravel() else: return y_pred def predict(self, X): y_scores = self.decision_function(X) y_scores = rectified_linear_unit(y_scores) return self.label_binarizer_.inverse_transform(y_scores)
def trainFMFTRL(moddict): merge = pd.read_csv(trn_file, sep='\t', encoding='utf-8') #test = pd.read_csv(tst_file, sep='\t', encoding='utf-8') print('[{}] Finished to load data'.format(time.time() - start_time)) print('Train shape: ', merge.shape) dftt = merge[(merge.price < 1.0)] merge = merge.drop(merge[(merge.price < 1.0)].index) del dftt['price'] nrow_train = merge.shape[0] # print(nrow_train, nrow_test) y = np.log1p(merge["price"]) merge = pd.concat([merge, dftt]) merge['target'] = np.log1p(merge["price"]) #EXTRACT DEVELOPTMENT TEST trnidx, validx = train_test_split(range(merge[:nrow_train].shape[0]), random_state=233, train_size=0.90) gc.collect() cpuStats() merge = prepFMFeatures(merge) cpuStats() merge.head() ''' Count crossed cols ''' cross_nm = [k for k in crossed_columns_d.keys()] moddict['cross_cols'] = {} for i in range(0, len(cross_nm)): moddict['cross_cols'][cross_nm[i]] = LabelBinarizer(sparse_output=True) moddict['cross_cols'][cross_nm[i]].fit(merge[cross_nm[i]]) if i == 0: x_col = moddict['cross_cols'][cross_nm[i]].transform( merge[cross_nm[i]]) else: x_col = hstack( (x_col, moddict['cross_cols'][cross_nm[i]].fit_transform( merge[cross_nm[i]]))) del merge[cross_nm[i]] gc.collect() cpuStats() ''' Hash name ''' moddict['wb_name'] = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], "hash_size": 2**29, "norm": None, "tf": 'binary', "idf": None, 'verbose': 1, }), procs=8) moddict['wb_name'].dictionary_freeze = True X_name = moddict['wb_name'].fit_transform(merge['name']) moddict['wb_name_mask'] = np.where( X_name[:nrow_train].getnnz(axis=0) > 0)[0] X_name = X_name[:, moddict['wb_name_mask']] print('[{}] Vectorize `name` completed.'.format(time.time() - start_time)) ''' Hash category #2 ''' moddict['wb_cat'] = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2**20, "norm": None, "tf": 'binary', "idf": None, }), procs=4) moddict['wb_cat'].dictionary_freeze = True ### This must be the full dataset #cats = merge["category_name"].str.replace('/', ' ').unique() moddict['wb_cat'].fit(categories) X_cat_tmp = moddict['wb_cat'].transform(categories) moddict['wb_cat_dict'] = dict([ (c, X_cat_tmp.getrow(row)) for (c, row) in zip(categories.tolist(), range(len(categories))) ]) X_cat = vstack(([ moddict['wb_cat_dict'][c] for c in merge["category_name"].str.replace('/', ' ') ])) #moddict['wb_cat_mask'] = np.array(np.clip(X_cat[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool) moddict['wb_cat_mask'] = np.where(X_cat[:nrow_train].getnnz(axis=0) > 0)[0] X_cat = X_cat[:, moddict['wb_cat_mask']] print('[{}] Vectorize `category` completed.'.format(time.time() - start_time)) ''' Count category ''' moddict['wb_cat_ctgc'] = CountVectorizer() moddict['wb_cat_ctgc'].fit(merge['general_cat']) X_category1 = moddict['wb_cat_ctgc'].transform(merge['general_cat']) moddict['wb_cat_ctsc1'] = CountVectorizer() moddict['wb_cat_ctsc1'].fit(merge['subcat_1']) X_category2 = moddict['wb_cat_ctsc1'].transform(merge['subcat_1']) moddict['wb_cat_ctsc2'] = CountVectorizer() moddict['wb_cat_ctsc2'].fit(merge['subcat_2']) X_category3 = moddict['wb_cat_ctsc2'].transform(merge['subcat_2']) print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time)) moddict['wb_dscr'] = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 0.6], "hash_size": 2**28, "norm": None, "tf": 'binary', "idf": None }), procs=8) 
moddict['wb_dscr'].dictionary_freeze = True X_description = moddict['wb_dscr'].fit_transform(merge['name'] + ' ' + merge['item_description']) #moddict['wb_dscr_mask'] = np.array(np.clip(X_description[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool) moddict['wb_dscr_mask'] = np.where( X_description[:nrow_train].getnnz(axis=0) > 1)[0] X_description = X_description[:, moddict['wb_dscr_mask']] print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time)) moddict['wb_brandname'] = LabelBinarizer(sparse_output=True) moddict['wb_brandname'].fit(merge['brand_name'][:nrow_train]) X_brand = moddict['wb_brandname'].transform(merge['brand_name']) print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time)) moddict['wb_itemcond'] = LabelBinarizer(sparse_output=True) moddict['wb_itemcond'].fit(merge['item_condition_id'][:nrow_train]) X_itemcond = moddict['wb_itemcond'].transform(merge['item_condition_id']) print('[{}] Label binarize `item_condition_id` completed.'.format( time.time() - start_time)) moddict['wb_shipping'] = LabelBinarizer(sparse_output=True) moddict['wb_shipping'].fit(merge['shipping'][:nrow_train]) X_shipping = moddict['wb_shipping'].transform(merge['shipping']) print('[{}] Label binarize `shipping` completed.'.format(time.time() - start_time)) print( X_itemcond.shape, X_shipping.shape, #X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape, X_cat.shape, x_col.shape) sparse_merge = hstack(( X_itemcond, X_shipping, #X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat, x_col)).tocsr() print('[{}] Create sparse merge completed'.format(time.time() - start_time)) print(50 * '-') cpuStats() print(50 * '-') # Remove features with document frequency <=1 print(sparse_merge.shape) gc.collect() sparse_merge, y = sparse_merge[:nrow_train], y[:nrow_train] if develop: train_X, valid_X, train_y, valid_y = sparse_merge[trnidx], \ sparse_merge[validx], \ y.values[trnidx], y.values[validx] del sparse_merge gc.collect() print(50 * '*') cpuStats() print(50 * '*') print(train_X.shape[1]) model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=train_X.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1, inv_link="identity", threads=4) #iters=15 print(50 * '|') cpuStats() print(50 * '|') baseline = 1. for i in range(15): print(50 * '-') cpuStats() print(50 * '-') model.fit(train_X, train_y, verbose=1) predsfm = model.predict(X=valid_X) score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm)) print("FM_FTRL dev RMSLE:", score_) if score_ < baseline - 0.0004: baseline = score_ else: break moddict['FMmodel'] = model print('[{}] Train ridge v2 completed'.format(time.time() - start_time)) if develop: predsfm = moddict['FMmodel'].predict(X=valid_X) print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm))) gc.collect() return merge, moddict, trnidx, validx, nrow_train, predsfm
image_path.split(os.path.sep)[LABEL_PATH_INDEX] for image_path in image_paths ] class_names = [str(x) for x in np.unique(class_names)] aap = AspectAwarePreprocessor(WIDTH, HEIGHT) iap = ImageToArrayPreprocessor() sdl = SimpleDatasetLoader([aap, iap]) data, labels = sdl.load(image_paths, verbose=500) data = data.astype('float') / 255.0 train_X, test_X, train_y, test_y = train_test_split(data, labels, test_size=0.25) label_binarizer = LabelBinarizer() train_y = label_binarizer.fit_transform(train_y) test_y = label_binarizer.transform(test_y) aug = ImageDataGenerator(rotation_range=30, width_shift_range=0.1, height_shift_range=0.1, shear_range=0.2, zoom_range=0.2, horizontal_flip=True, fill_mode='nearest') print('[INFO] compiling model') model = MiniVGGNet.build(width=WIDTH, height=HEIGHT, depth=3,
train_posts = data['news'][:train_size] train_tags = data['category'][:train_size] train_files_names = data['filename'][:train_size] test_posts = data['news'][train_size:] test_tags = data['category'][train_size:] test_files_names = data['filename'][train_size:] # define Tokenizer with Vocab Size tokenizer = Tokenizer(num_words=vocab_size) tokenizer.fit_on_texts(train_posts) x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf') x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf') encoder = LabelBinarizer() encoder.fit(train_tags) y_train = encoder.transform(train_tags) y_test = encoder.transform(test_tags) model = Sequential() model.add(Dense(512, input_shape=(vocab_size,))) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(512)) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(num_labels)) model.add(Activation('softmax')) model.summary()
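# Editor's sketch of a possible continuation (hyperparameters are illustrative,
# not taken from the source): compile with categorical cross-entropy, train
# with a validation split, then evaluate on the held-out test matrix.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    batch_size=32,
                    epochs=10,
                    validation_split=0.1,
                    verbose=1)
score = model.evaluate(x_test, y_test, batch_size=32, verbose=1)
print('Test accuracy:', score[1])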
def test_gradient(): # Test gradient. # This makes sure that the activation functions and their derivatives # are correct. The numerical and analytical computation of the gradient # should be close. for n_labels in [2, 3]: n_samples = 5 n_features = 10 random_state = np.random.RandomState(seed=42) X = random_state.rand(n_samples, n_features) y = 1 + np.mod(np.arange(n_samples) + 1, n_labels) Y = LabelBinarizer().fit_transform(y) for activation in ACTIVATION_TYPES: mlp = MLPClassifier(activation=activation, hidden_layer_sizes=10, solver='lbfgs', alpha=1e-5, learning_rate_init=0.2, max_iter=1, random_state=1) mlp.fit(X, y) theta = np.hstack( [l.ravel() for l in mlp.coefs_ + mlp.intercepts_]) layer_units = ([X.shape[1]] + [mlp.hidden_layer_sizes] + [mlp.n_outputs_]) activations = [] deltas = [] coef_grads = [] intercept_grads = [] activations.append(X) for i in range(mlp.n_layers_ - 1): activations.append(np.empty((X.shape[0], layer_units[i + 1]))) deltas.append(np.empty((X.shape[0], layer_units[i + 1]))) fan_in = layer_units[i] fan_out = layer_units[i + 1] coef_grads.append(np.empty((fan_in, fan_out))) intercept_grads.append(np.empty(fan_out)) # analytically compute the gradients def loss_grad_fun(t): return mlp._loss_grad_lbfgs(t, X, Y, activations, deltas, coef_grads, intercept_grads) [value, grad] = loss_grad_fun(theta) numgrad = np.zeros(np.size(theta)) n = np.size(theta, 0) E = np.eye(n) epsilon = 1e-5 # numerically compute the gradients for i in range(n): dtheta = E[:, i] * epsilon numgrad[i] = ((loss_grad_fun(theta + dtheta)[0] - loss_grad_fun(theta - dtheta)[0]) / (epsilon * 2.0)) assert_almost_equal(numgrad, grad)
def training_summary(H): N = EPOCHS plt.style.use("ggplot") plt.figure() plt.plot(np.arange(0, N), H.history["loss"], label="train_loss") plt.plot(np.arange(0, N), H.history["val_loss"], label="val_loss") plt.plot(np.arange(0, N), H.history["accuracy"], label="train_acc") plt.plot(np.arange(0, N), H.history["val_accuracy"], label="val_acc") plt.title("Training Loss and Accuracy") plt.xlabel("Epoch #") plt.ylabel("Loss/Accuracy") plt.legend(loc="lower left") plt.savefig(args["plot"]) if __name__ == '__main__': INIT_LR = 1e-4 EPOCHS = 20 BS = 32 args = argument_parser() lb = LabelBinarizer() path = list(paths.list_images(args["dataset"])) data, labels = get_images(path) labels = one_hot_encoder(lb, labels) aug = augmetation_image() xtrain, xtest, ytrain, ytest = data_splitting(data, labels) model, H = model_construction(aug, xtrain, xtest, ytrain, ytest) training_summary(H)