Example #1
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics; in this variant the line that discards
    "O" labels is commented out, so all classes are reported.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Note: This function was copied from
    http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

    Args:
        y_true: True labels, list of strings
        y_pred: Predicted labels, list of strings
    Returns:
        classification report as string
    """
    lbin = LabelBinarizer()
    y_true_combined = lbin.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lbin.transform(list(chain.from_iterable(y_pred)))

    #tagset = set(lbin.classes_) - {NO_NE_LABEL}
    tagset = set(lbin.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lbin.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
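A minimal usage sketch of the function above, assuming it is defined at module level together with the imports it relies on (chain from itertools, LabelBinarizer and classification_report from scikit-learn):

from itertools import chain
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

# Toy BIO-encoded sequences: two sentences of gold vs. predicted tags.
y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
y_pred = [['B-PER', 'O', 'O'], ['B-LOC', 'O']]

print(bio_classification_report(y_true, y_pred))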
Example #2
class MLPClassifier(BaseMLP, ClassifierMixin):
    """ Multilayer Perceptron Classifier.

    Uses a neural network with one hidden layer.


    Parameters
    ----------


    Attributes
    ----------

    Notes
    -----


    References
    ----------"""

    def __init__(
        self, n_hidden=200, lr=0.1, l2decay=0, loss="cross_entropy", output_layer="softmax", batch_size=100, verbose=0
    ):
        super(MLPClassifier, self).__init__(n_hidden, lr, l2decay, loss, output_layer, batch_size, verbose)

    def fit(self, X, y, max_epochs=10, shuffle_data=False):
        self.lb = LabelBinarizer()
        one_hot_labels = self.lb.fit_transform(y)
        super(MLPClassifier, self).fit(X, one_hot_labels, max_epochs, shuffle_data)
        return self

    def predict(self, X):
        prediction = super(MLPClassifier, self).predict(X)
        return self.lb.inverse_transform(prediction)
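A hypothetical usage sketch of the classifier above; it assumes a working BaseMLP implementation (not shown in this snippet) providing the underlying fit/predict machinery, and the parameter values are arbitrary small settings for illustration:

import numpy as np

X = np.random.rand(100, 20)
y = np.random.randint(0, 3, size=100)

clf = MLPClassifier(n_hidden=50, lr=0.05)
clf.fit(X, y, max_epochs=5)
print(clf.predict(X[:5]))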
def test_multinomial_loss():
    # test if the multinomial loss and gradient computations are consistent
    X, y = iris.data, iris.target.astype(np.float64)
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))

    rng = check_random_state(42)
    weights = rng.randn(n_features, n_classes)
    intercept = rng.randn(n_classes)
    sample_weights = rng.randn(n_samples)
    np.abs(sample_weights, sample_weights)

    # compute loss and gradient like in multinomial SAG
    dataset, _ = make_dataset(X, y, sample_weights, random_state=42)
    loss_1, grad_1 = _multinomial_grad_loss_all_samples(dataset, weights,
                                                        intercept, n_samples,
                                                        n_features, n_classes)
    # compute loss and gradient like in multinomial LogisticRegression
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)
    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    # comparison
    assert_array_almost_equal(grad_1, grad_2)
    assert_almost_equal(loss_1, loss_2)
class BinaryRelevanceClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, Y):
        # binarize labels
        self.bl = LabelBinarizer()
        Y = self.bl.fit_transform(Y)
        self.classes_ = self.bl.classes_

        # create an estimator for each label
        self.estimators_ = []
        for i in range(self.bl.classes_.shape[0]):
            estimator = clone(self.estimator)
            estimator.fit(X, Y[:, i])
            self.estimators_.append(estimator)

    def predict(self, X):
        self._check_is_fitted()

        X = np.atleast_2d(X)
        Y = np.empty((X.shape[0], self.classes_.shape[0]))
        for i, estimator in enumerate(self.estimators_):
            Y[:, i] = estimator.predict(X).T

        return self.bl.inverse_transform(Y)

    def _check_is_fitted(self):
        if not hasattr(self, "estimators_"):
            raise ValueError("The object hasn't been fitted yet!")
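A minimal usage sketch for the binary-relevance wrapper above, assuming scikit-learn's LogisticRegression as the base estimator and the module-level imports the class itself relies on (numpy as np, clone, BaseEstimator, ClassifierMixin):

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.rand(60, 4)
y = np.random.choice(['cat', 'dog', 'bird'], size=60)

clf = BinaryRelevanceClassifier(LogisticRegression())
clf.fit(X, y)
print(clf.predict(X[:5]))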
def test_multinomial_loss_ground_truth():
    # n_samples, n_features, n_classes = 4, 2, 3
    n_classes = 3
    X = np.array([[1.1, 2.2], [2.2, -4.4], [3.3, -2.2], [1.1, 1.1]])
    y = np.array([0, 1, 2, 0])
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)

    weights = np.array([[0.1, 0.2, 0.3], [1.1, 1.2, -1.3]])
    intercept = np.array([1., 0, -.2])
    sample_weights = np.array([0.8, 1, 1, 0.8])

    prediction = np.dot(X, weights) + intercept
    logsumexp_prediction = logsumexp(prediction, axis=1)
    p = prediction - logsumexp_prediction[:, np.newaxis]
    loss_1 = -(sample_weights[:, np.newaxis] * p * Y_bin).sum()
    diff = sample_weights[:, np.newaxis] * (np.exp(p) - Y_bin)
    grad_1 = np.dot(X.T, diff)

    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    assert_almost_equal(loss_1, loss_2)
    assert_array_almost_equal(grad_1, grad_2)

    # ground truth
    loss_gt = 11.680360354325961
    grad_gt = np.array([[-0.557487, -1.619151, +2.176638],
                        [-0.903942, +5.258745, -4.354803]])
    assert_almost_equal(loss_1, loss_gt)
    assert_array_almost_equal(grad_1, grad_gt)
def chi2_contingency_matrix(X_train, y_train):
    X = X_train.copy()
    X.data = np.ones_like(X.data)

    X = check_array(X, accept_sparse='csr')
    if np.any((X.data if issparse(X) else X) < 0):
        raise ValueError("Input X must be non-negative.")

    Y = LabelBinarizer().fit_transform(y_train)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    observed = safe_sparse_dot(Y.T, X)  # n_classes * n_features

    # feature_count = check_array(X.sum(axis=0))
    # class_prob = check_array(Y.mean(axis=0))
    feature_count = X.sum(axis=0).reshape(1, -1)
    class_prob = Y.mean(axis=0).reshape(1, -1)
    expected = np.dot(class_prob.T, feature_count)

    observed = np.asarray(observed, dtype=np.float64)

    k = len(observed)
    # Reuse observed for chi-squared statistics
    contingency_matrix = observed
    contingency_matrix -= expected
    contingency_matrix **= 2

    expected[expected == 0.0] = 1.0

    contingency_matrix /= expected

    # weights = contingency_matrix.max(axis=0)

    return contingency_matrix
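A usage sketch for the function above on a tiny text corpus; it assumes the helper imports the snippet relies on (numpy as np, check_array, issparse, safe_sparse_dot, LabelBinarizer) are present at module level:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

docs = ["spam spam ham", "ham eggs", "spam eggs eggs"]
y = np.array([1, 0, 1])

X_counts = CountVectorizer().fit_transform(docs)  # sparse document-term matrix
cm = chi2_contingency_matrix(X_counts, y)
print(cm.shape)  # (n_classes, n_features) -> (2, 3) here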
Example #7
class GBClassifier(_BaseGB, ClassifierMixin):

    def __init__(self, estimator, n_estimators=100,
                 step_size="line_search", learning_rate=0.1,
                 loss="squared_hinge", subsample=1.0,
                 callback=None, random_state=None):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.step_size = step_size
        self.learning_rate = learning_rate
        self.loss = loss
        self.subsample = subsample
        self.callback = callback
        self.random_state = random_state

    def _get_loss(self):
        losses = dict(squared_hinge=_SquaredHingeLoss(),
                      log=_LogLoss())
        return losses[self.loss]

    def fit(self, X, y):
        self._lb = LabelBinarizer(neg_label=-1)
        Y = self._lb.fit_transform(y)
        return super(GBClassifier, self).fit(X, Y)

    def predict(self, X):
        pred = self.decision_function(X)
        return self._lb.inverse_transform(pred)
Example #8
    def bio_classification_report(y_true, y_pred):
        """
        Classification report for a list of BIO-encoded sequences.
        It computes token-level metrics and discards "O" labels.

        Note that it requires scikit-learn 0.15+ (or a version from
        github master) to calculate averages properly!
        """
        lb = LabelBinarizer()
        y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
        y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = set(lb.classes_) - {'O'}
        tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

        labs = [class_indices[cls] for cls in tagset]

        return (precision_recall_fscore_support(y_true_combined,
                                                y_pred_combined,
                                                labels=labs,
                                                average=None,
                                                sample_weight=None),
                classification_report(y_true_combined,
                                      y_pred_combined,
                                      labels=labs,
                                      target_names=tagset),
                labs)
Example #9
def display_image_predictions(features, labels, predictions):
    n_classes = 10
    label_names = _load_label_names()
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(range(n_classes))
    label_ids = label_binarizer.inverse_transform(np.array(labels))

    fig, axies = plt.subplots(nrows=4, ncols=2)
    fig.tight_layout()
    fig.suptitle('Softmax Predictions', fontsize=20, y=1.1)

    n_predictions = 3
    margin = 0.05
    ind = np.arange(n_predictions)
    width = (1. - 2. * margin) / n_predictions

    for image_i, (feature, label_id, pred_indicies, pred_values) in enumerate(zip(features, label_ids, predictions.indices, predictions.values)):
        pred_names = [label_names[pred_i] for pred_i in pred_indicies]
        correct_name = label_names[label_id]

        axies[image_i][0].imshow(feature*255)
        axies[image_i][0].set_title(correct_name)
        axies[image_i][0].set_axis_off()

        axies[image_i][1].barh(ind + margin, pred_values[::-1], width)
        axies[image_i][1].set_yticks(ind + margin)
        axies[image_i][1].set_yticklabels(pred_names[::-1])
        axies[image_i][1].set_xticks([0, 0.5, 1.0])
Example #10
def train_logreg(X, y, test_X, test_y, load_vec=True):
    """
    Trains logistic regression on the feature set.
    """
    full_y = y + test_y

    lb = LabelBinarizer()
    lb.fit(full_y)
    # Convert into 1-D array
    print(len(X), len(test_X))
    model = LogisticRegression()
    big_X = X + test_X

    features = featurize(big_X)
    X, test_X = features[:4500], features[4500:]
    print(X.shape, X)

    model.fit(X, y)

    y_pred = model.predict(X)
    print(set(y_pred))
    print(metrics.classification_report(y, y_pred, digits=3))
    y_pred = model.predict(test_X)
    print(set(y_pred))
    print(metrics.classification_report(test_y, y_pred, digits=3))
def Encoding(data, general_matrix=None):
    encoder = LabelBinarizer()
    count = 0
    # encoding
    for i in range(data.shape[1]):
        if type(data[0, i]) == str:
            count += 1
            col = data[:, i]
            unique = np.unique(col if general_matrix is None else general_matrix[:, i])

            try:
                encoder.fit(unique)
            except:
                pass

            new_col = encoder.transform(col)

            # split at i and i + 1
            before, removed, after = np.hsplit(data, [i, i + 1])
            # concatenate
            data = np.concatenate((before, new_col, after), axis=1)
            before, removed, after = np.hsplit(general_matrix, [i, i + 1])
            general_matrix = np.concatenate((before, encoder.transform(general_matrix[:, i]), after), axis=1)

    print "count : %d" % count
    # return data
    return data
def binarize_seqfeature(X):
    """
    Binarizes the sequence features into 1s and 0s.
    
    Parameters:
    ===========
    - X: (pandas DataFrame) the sequence feature matrix without drug resistance values.
    
    Returns:
    ========
    - binarized:     (pandas DataFrame) a binarized sequence feature matrix with columns corresponding to particular amino acids at each position.
    - binarizers:    (dictionary) a dictionary of binarizer objects for each position.
    """
    binarized = pd.DataFrame()
    binarizers = dict()
    for col in X.columns:
        lb = LabelBinarizer()
        binarized_cols = lb.fit_transform(X[col])
        if len(lb.classes_) == 2:
            binarized[col] = pd.Series(binarized_cols[:, 0])
        else:
            for i, c in enumerate(lb.classes_):
                binarized[col + "_" + c] = binarized_cols[:, i]
        binarizers[col] = lb

    return binarized, binarizers
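A short usage sketch for binarize_seqfeature above on a toy sequence matrix (assuming pandas and LabelBinarizer are imported as in the snippet):

import pandas as pd

seqs = pd.DataFrame({'pos1': ['A', 'G', 'A', 'T'],
                     'pos2': ['C', 'C', 'T', 'C']})
binarized, binarizers = binarize_seqfeature(seqs)
print(binarized)                      # pos2 stays one column, pos1 expands to pos1_A/pos1_G/pos1_T
print(binarizers['pos1'].classes_)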
Example #13
def bio_classification_report(y_true, y_pred):

    lb = LabelBinarizer()
    y_true_combined = 1 - lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = list(chain.from_iterable(y_pred))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    print('True sum %d Pred sum %d Len %d' % (sum(y_true_combined), sum(y_pred_combined), len(y_pred_combined)))
    print("AUC\tP-R: %.4f\tROC: %.4f" % (average_precision_score(y_true_combined, y_pred_combined, average=None),
                                         roc_auc_score(y_true_combined, y_pred_combined, average=None)))
    #plt.figure()
    #fpr, tpr, thr = roc_curve(y_true_combined, y_pred_combined)
    #area = auc(fpr, tpr)
    #plt.plot(fpr, tpr, label='{area:.3f}'.format( area=area))
    #plt.legend(loc=4)
    #plt.savefig('sub3.jpg')

    return classification_report(
        1 - y_true_combined,
        [0 if v > 0.1 else 1 for v in y_pred_combined],
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
Example #14
def just_categorical(dropped):

    # create initial matrix
    print('starting with m0')
    lb = LabelBinarizer(sparse_output=True)
    m = lb.fit_transform(dropped.restaurant_id)
    print(m.shape)

    # build matrix
    # making nan its own category for categorical
    print("adding categorical to matrix")
    m = add_categorical_to_matrix(m, dropped, ['review_stars', 'user_name', 'restaurant_stars', 'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol', 'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage', 'restaurant_attributes_noise_level', 'restaurant_attributes_smoking', 'restaurant_attributes_wifi', 'restaurant_city',  'restaurant_hours_friday_close', 'restaurant_hours_friday_open', 'restaurant_hours_monday_close', 'restaurant_hours_monday_open', 'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open', 'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open', 'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open', 'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open', 'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open', 'restaurant_ambience', 'restaurant_music', 'restaurant_parking', 'restaurant_street', 'restaurant_zipcode',  'inspection_year', 'inspection_month', 'inspection_day', 'inspection_dayofweek', 'inspection_quarter',])
    print(m.shape)

    print("adding bool to matrix")
    m = add_categorical_to_matrix(m, dropped, ['restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob', 'restaurant_attributes_caters', 'restaurant_attributes_coat_check', 'restaurant_attributes_corkage', 'restaurant_attributes_delivery', 'restaurant_attributes_dietary_restrictions_dairy_free', 'restaurant_attributes_dietary_restrictions_gluten_free', 'restaurant_attributes_dietary_restrictions_halal', 'restaurant_attributes_dietary_restrictions_kosher', 'restaurant_attributes_dietary_restrictions_soy_free', 'restaurant_attributes_dietary_restrictions_vegan', 'restaurant_attributes_dietary_restrictions_vegetarian', 'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr', 'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups', 'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch', 'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner', 'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch', 'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour', 'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours', 'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating',  'restaurant_attributes_payment_types_amex', 'restaurant_attributes_payment_types_cash_only', 'restaurant_attributes_payment_types_discover', 'restaurant_attributes_payment_types_mastercard', 'restaurant_attributes_payment_types_visa', 'restaurant_attributes_take_out',  'restaurant_attributes_takes_reservations', 'restaurant_attributes_waiter_service', 'restaurant_attributes_wheelchair_accessible', ])
    print(m.shape)

    print("adding restaurant categories to matrix")
    cats = ['restaurant_category_1', 'restaurant_category_2', 'restaurant_category_3', 'restaurant_category_4', 'restaurant_category_5', 'restaurant_category_6', 'restaurant_category_7']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("adding restaurant neighborhoods to matrix")
    cats = ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("matrix shape of {}".format(m.shape))
    joblib.dump(m, 'pickle_jar/categorical_matrix')
    def transform(self, data_dict):
        listOfUnits = ["kilogram", "kg", "gram", "[GMgmkK]?Hz", "liter", "ml",
                       "cup", "cm", "foot", "inch", "meter", "mg", "gallon",
                       "milliliter", "[MGTmgtKk]B"]
        regex = r"[\d]+\.[\d]+(" + r"[\b/,-]|".join(listOfUnits) + ")"
        data = data_dict[self.key].str.extract(regex, flags=re.IGNORECASE, expand=False).str.lower()
        lb = LabelBinarizer()
        return lb.fit_transform(data.fillna(""))
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    # Clipping
    Y = np.clip(y_pred, eps, 1 - eps)

    # This happens in cases when elements in y_pred have type "str".
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # If y_pred is of single dimension, assume y_true to be binary
    # and then check.
    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Check if dimensions are consistent.
    val.check_consistent_length(T, Y)
    T = val.check_array(T)
    Y = val.check_array(Y)
    print(T)
    print(Y)
    if T.shape[1] != Y.shape[1]:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (T.shape[1], Y.shape[1]))

    # Renormalize
    Y /= Y.sum(axis=1)[:, np.newaxis]
    loss = -(T * np.log(Y)).sum(axis=1)

    return _weighted_sum(loss, sample_weight, normalize)
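A usage sketch for the log_loss function above; it assumes the module-level helpers the snippet relies on (numpy as np, LabelBinarizer, a validation module aliased as val, and a _weighted_sum helper that averages when normalize=True) behave like their scikit-learn counterparts:

import numpy as np

y_true = np.array([0, 1, 1, 0])
y_prob = np.array([[0.9, 0.1],
                   [0.2, 0.8],
                   [0.3, 0.7],
                   [0.6, 0.4]])
print(log_loss(y_true, y_prob))  # ~0.299 if _weighted_sum averages over samples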
Example #17
def full_matrix(dropped):
    # create initial matrix
    print('starting with m0')
    lb = LabelBinarizer(sparse_output=True)
    # m = lb.fit_transform(dropped.restaurant_id)
    m = lb.fit_transform(dropped.user_name)
    print(m.shape)
    # build matrix
    # making nan its own category for categorical
    print("adding categorical to matrix")
    m = add_categorical_to_matrix(m, dropped, ['review_stars', 'user_name', 'restaurant_stars', 'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol', 'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage', 'restaurant_attributes_noise_level', 'restaurant_attributes_smoking', 'restaurant_attributes_wifi', 'restaurant_city',  'restaurant_hours_friday_close', 'restaurant_hours_friday_open', 'restaurant_hours_monday_close', 'restaurant_hours_monday_open', 'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open', 'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open', 'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open', 'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open', 'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open', 'restaurant_ambience', 'restaurant_music', 'restaurant_parking', 'restaurant_street', 'restaurant_zipcode',  'inspection_year', 'inspection_month', 'inspection_day', 'inspection_dayofweek', 'inspection_quarter',])
    print(m.shape)

    print("adding bool to matrix")
    m = add_categorical_to_matrix(m, dropped, ['restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob', 'restaurant_attributes_caters', 'restaurant_attributes_coat_check', 'restaurant_attributes_corkage', 'restaurant_attributes_delivery', 'restaurant_attributes_dietary_restrictions_dairy_free', 'restaurant_attributes_dietary_restrictions_gluten_free', 'restaurant_attributes_dietary_restrictions_halal', 'restaurant_attributes_dietary_restrictions_kosher', 'restaurant_attributes_dietary_restrictions_soy_free', 'restaurant_attributes_dietary_restrictions_vegan', 'restaurant_attributes_dietary_restrictions_vegetarian', 'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr', 'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups', 'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch', 'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner', 'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch', 'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour', 'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours', 'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating',  'restaurant_attributes_payment_types_amex', 'restaurant_attributes_payment_types_cash_only', 'restaurant_attributes_payment_types_discover', 'restaurant_attributes_payment_types_mastercard', 'restaurant_attributes_payment_types_visa', 'restaurant_attributes_take_out',  'restaurant_attributes_takes_reservations', 'restaurant_attributes_waiter_service', 'restaurant_attributes_wheelchair_accessible', ])
    print(m.shape)

    m = add_numerical_to_matrix(m, dropped, ['review_votes_cool', 'review_votes_funny', 'review_votes_useful', 'user_average_stars', 'user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot', 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain', 'user_compliments_profile', 'user_compliments_writer', 'user_fans', 'user_review_count', 'user_votes_cool', 'user_votes_funny', 'user_votes_useful', 'restaurant_attributes_price_range', 'restaurant_latitude', 'restaurant_longitude', 'restaurant_review_count', 'checkin_counts', 'review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'neu', 'pos', 'compound', 'user_yelping_since_delta','manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold'])
    print(m.shape)

    print("adding restaurant categories to matrix")
    cats = ['restaurant_category_1', 'restaurant_category_2', 'restaurant_category_3', 'restaurant_category_4', 'restaurant_category_5', 'restaurant_category_6', 'restaurant_category_7']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("adding restaurant neighborhoods to matrix")
    cats = ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("matrix shape of {}".format(m.shape))
    joblib.dump(m, 'pickle_jar/full_matrix')
Example #18
class BusinessCategoriesFeature(BaseEstimator):
	"""
	WARNING!!!
	Works only with a modified version of LabelBinarizer.

	A binarization of the reviews' business categories.
	"""

	def __init__(self, data=None):
		self.data = data

	def __create_labels_list(self, review_list):
		labels = []
		for review in review_list:
			business = self.data.get_business_for_review(review)
			labels.append(business['categories'])
		return labels

	def fit(self, X, y):
		self.binarizer = LabelBinarizer()
		labels = self.__create_labels_list(X)
		self.binarizer.fit(labels)
		return self

	def transform(self, X):
		labels = self.__create_labels_list(X)
		binarized_labels = self.binarizer.transform(labels)
		return binarized_labels.astype(float)
Example #19
    def fit(self, X, y):
        """
        performs one step of gradient descent
        """
        # get the dimensions of our data
        n_samples, n_features = X.shape[0], X.shape[1]+1
        n_targets = len(np.unique(y))

        # add a column to the data matrix to incorporate the bias term
        X = np.c_[np.ones(n_samples), X]
        
        # one-vs-all labeling
        lb = LabelBinarizer()
        y = lb.fit_transform(y)
        
        # initialize the weights 
        if self.W is None:
            self.W = np.zeros( (n_features, n_targets) )
       
        # perform the optimization using gradient descent with momentum
        grad = self.gradient(X,y)
        self.W = self.W - self.learning_rate*(grad + self.momentum*self.prev_grad)
        self.prev_grad = grad

        return self.loss(X,y)
def run():
    # Load and preprocess data
    label_to_unique_instance = load_data()
    X, Y = preprocess_data(label_to_unique_instance)

    # Encode labels
    label_binarizer = LabelBinarizer()
    transformed_Y = label_binarizer.fit_transform(Y)

    # Cross validation
    cross_validation_iterator = StratifiedShuffleSplit(Y, n_iter=1, test_size=0.4, random_state=0)
    for train_index, test_index in cross_validation_iterator:
        break

    # Init model
    model = init_model(raw_feature_dim=X.shape[-1], unique_lable_num=len(label_binarizer.classes_))

    # Training procedure
    model.fit(X[train_index], transformed_Y[train_index],
              batch_size=BATCH_SIZE, nb_epoch=MAXIMUM_EPOCH_NUM,
              validation_data=(X[test_index], transformed_Y[test_index]),
              callbacks=[TensorBoard(log_dir="/tmp/Sequence Classification")],
              verbose=2)

    print("All done!")
Example #21
def report(test_y, pred_y):
    lb = LabelBinarizer()
    test_y_combined = lb.fit_transform(list(chain.from_iterable(test_y)))
    pred_y_combined = lb.transform(list(chain.from_iterable(pred_y)))
    tagset = sorted(set(lb.classes_))
    class_indices = {cls: idx for idx, cls in enumerate(tagset)}
    print(classification_report(test_y_combined, pred_y_combined, labels=[class_indices[cls] for cls in tagset], target_names=tagset))
Example #22
def conv_demo():
    # load the digits dataset
    digits = load_digits()
    X = digits['data']
    y_labels = digits['target']

    lb = LabelBinarizer()
    y = lb.fit_transform(y_labels)

    # split into training, validation and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.25,
                                                        random_state=RANDOM_STATE)

    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                          test_size=0.25,
                                                          random_state=RANDOM_STATE)

    # train the neural net
    print("Building neural net to classify digits")
    conv_net = pynn.ConvNet(digits['images'][0].shape, 1, y.shape[1],
                            random_state=RANDOM_STATE)
    print("Training")
    conv_net.fit(X_train, y_train, X_valid, y_valid,
                 batch_size=20, n_epochs=20, learning_rate=0.05)

    y_pred = conv_net.predict(X_test)

    print("digits accuracy: {}%".format(
        accuracy_score(y_test.argmax(1), y_pred.argmax(1)) * 100))
class CategoricalToNumerical(object):

    def __init__(self, dimensionality_reducer=None, verify=True):
        """Takes in a dimensionality reducer used to convert categorical
        features into numerical ones."""
        if dimensionality_reducer is None:
            dimensionality_reducer = RandomizedPCA(1)
        self.dimensionality_reducer = dimensionality_reducer
        self.verify = verify
        self.binarizer = LabelBinarizer()

    def fit(self, X, y=None):
        self._verify(X, self.verify)
        binarized = self.binarizer.fit_transform(X)
        self.dimensionality_reducer.fit(binarized)

    def transform(self, X):
        self._verify(X, False)
        binarized = self.binarizer.transform(X)
        result = self.dimensionality_reducer.transform(binarized).flatten()
        assert X.shape == result.shape
        return result

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def _verify(self, X, verify):
        if verify:
            assert is_categorical(X)
        else:
            assert isinstance(X, np.ndarray)
            assert len(X.shape) == 1
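A usage sketch for the transformer above; note that RandomizedPCA (its default reducer) has been removed from recent scikit-learn releases, so this sketch passes a PCA reducer explicitly and skips the is_categorical check by setting verify=False:

import numpy as np
from sklearn.decomposition import PCA

colors = np.array(['red', 'green', 'blue', 'red', 'green'])

# One-dimensional projection of the one-hot-encoded categories.
enc = CategoricalToNumerical(PCA(n_components=1), verify=False)
print(enc.fit_transform(colors))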
def get_dataset2(test_fraction):
    """
   @:param: test_fraction used to split train and test
   Vectorizes the features and labels into categorical values and randomly splits into train and test set
   :return: X_train, X_test, y_train, y_test
   """
    data = []
    with open('labels.csv', 'r') as datafile:
        csv_reader = csv.reader(datafile, delimiter=',', quotechar='|')
        for row in csv_reader:
            data.append(row)

    data = numpy.asarray(data)
    X = data[:, 0:data.shape[1]-1]
    y = data[:, data.shape[1]-1]

    # X,y = get_tabledata()

    vec = DictVectorizer()
    feature_dict = [dict(enumerate(x)) for x in X.tolist()]
    X = vec.fit_transform(feature_dict).toarray()
    joblib.dump(vec, 'vectorizer.pkl')

    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    joblib.dump(lb, 'binarizer.pkl')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)
    return X_train, X_test, y_train, y_test
def bio_classification_report(y_true, y_pred):
    """Evaluates entity extraction accuracy.

    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    Taken from https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
    """
    from sklearn.preprocessing import LabelBinarizer
    from itertools import chain
    from sklearn.metrics import classification_report

    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
            y_true_combined,
            y_pred_combined,
            labels=[class_indices[cls] for cls in tagset],
            target_names=tagset,
    )
def binarize_label_columns(df, columns, two_classes_as='single'):
    '''
    Inputs:
        df: Pandas dataframe object.
        columns: Columns to binarize.
        two_classes_as: How to handle two classes, as 'single' or 'multiple' columns.
    Returns a tuple with the following items:
        df: Pandas dataframe object with new columns.
        binlabel_names: Names of the newly created binary variables.
        lb_objects: a dictionary with columns as keys and sklearn.LabelBinarizer
        objects as values.
    '''
    binlabel_names = []
    lb_objects = {}
    for col in columns:
        if len(df[col].unique()) > 1: 
            rows_notnull = df[col].notnull() # Use only valid feature observations
            lb = LabelBinarizer()
            binclass = lb.fit_transform(df[col][rows_notnull]) # Fit & transform on valid observations
            if len(lb.classes_) == 2 and two_classes_as == 'multiple':
                binclass = np.hstack((1 - binclass, binclass))
            lb_objects[col] = lb
            if len(lb.classes_) > 2 or two_classes_as == 'multiple':
                col_binlabel_names = [col+'_'+str(c) for c in lb.classes_]
                binlabel_names += col_binlabel_names # Names for the binarized classes
                for n in col_binlabel_names: df[n] = np.nan  # Initialize columns
                df.loc[rows_notnull, col_binlabel_names] = binclass # Merge binarized data
            elif two_classes_as == 'single': 
                binlabel_names.append(col+'_bin') # Names for the binarized classes
                df[col+'_bin'] = np.nan  # Initialize columns
                df.loc[rows_notnull, col+'_bin'] = binclass # Merge binarized data
    return df, binlabel_names, lb_objects
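A usage sketch for binarize_label_columns above on a small frame with two three-class columns (assuming pandas, numpy as np and LabelBinarizer are imported as in the snippet):

import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', 'green', 'red'],
                   'size':  ['S', 'M', 'L', 'S']})
df, binlabel_names, lb_objects = binarize_label_columns(df, ['color', 'size'])
print(binlabel_names)                 # e.g. ['color_blue', 'color_green', 'color_red', 'size_L', ...]
print(df.filter(like='color_'))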
def scorer_auc(y_true, y_pred):
    """Dedicated to 2-class probabilistic outputs."""
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import LabelBinarizer

    lb = LabelBinarizer()
    y_true = lb.fit_transform(y_true)
    return roc_auc_score(y_true, y_pred)
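A quick usage sketch for the scorer above, with hand-checkable toy scores (3 of the 4 positive/negative pairs are ranked correctly):

import numpy as np

y_true = np.array([0, 0, 1, 1])
y_prob = np.array([0.1, 0.4, 0.35, 0.8])
print(scorer_auc(y_true, y_prob))  # 0.75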
Example #28
def iris_demo():
    # load the iris dataset
    iris = load_iris()
    X = iris['data']
    y_labels = iris['target']

    lb = LabelBinarizer()
    y = lb.fit_transform(y_labels)

    # split into training, validation and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.25,
                                                        random_state=RANDOM_STATE)

    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                          test_size=0.25,
                                                          random_state=RANDOM_STATE)

    # train the neural net
    print("Building logistic regression classifier to classify iris data")
    nn = pynn.ArtificialNeuralNet([X_train.shape[1], 20, y_train.shape[1]])
    print("Training")
    nn.fit(X_train, y_train, X_valid, y_valid,
           batch_size=20, n_epochs=20, learning_rate=0.05,
           random_state=RANDOM_STATE)

    y_pred = nn.predict(X_test)

    print("iris accuracy: {}%".format(
        accuracy_score(y_test.argmax(1), y_pred.argmax(1)) * 100))
Example #29
def one_hot_encoding(y_train, y_test):
    labelBinarizer = LabelBinarizer()
    labelBinarizer.fit(y_train)

    y_train_one_hot = labelBinarizer.transform(y_train)
    y_test_one_hot = labelBinarizer.transform(y_test)
    return y_train_one_hot, y_test_one_hot
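A short usage sketch for one_hot_encoding above; fitting only on the training labels means the same class order is reused for the test labels:

import numpy as np

y_train = np.array([0, 2, 1, 2, 0])
y_test = np.array([1, 0, 2])
y_train_oh, y_test_oh = one_hot_encoding(y_train, y_test)
print(y_train_oh.shape, y_test_oh.shape)  # (5, 3) (3, 3)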
Example #30
class BaseSGD(object):
    def _get_loss(self):
        losses = {
            "modified_huber": ModifiedHuber(),
            "hinge": Hinge(1.0),
            "perceptron": Hinge(0.0),
            "log": Log(),
            "sparse_log": SparseLog(),
            "squared": SquaredLoss(),
            "huber": Huber(self.epsilon),
            "epsilon_insensitive": EpsilonInsensitive(self.epsilon),
        }
        return losses[self.loss]

    def _get_learning_rate(self):
        learning_rates = {"constant": 1, "pegasos": 2, "invscaling": 3}
        return learning_rates[self.learning_rate]

    def _set_label_transformers(self, y):
        if self.multiclass == "natural":
            self.label_encoder_ = LabelEncoder()
            y = self.label_encoder_.fit_transform(y).astype(np.float64)

        self.label_binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1)
        self.label_binarizer_.fit(y)
        self.classes_ = self.label_binarizer_.classes_.astype(np.int32)
        n_classes = len(self.label_binarizer_.classes_)
        n_vectors = 1 if n_classes <= 2 else n_classes
        return n_classes, n_vectors
import argparse

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-o", "--output", required=True,
                help="path to the output loss/accuracy plot")
args = vars(ap.parse_args())

# load the training and testing data, then scale it into the range [0, 1]
print("[INFO] loading the CIFAR-10 data...")
((trainX, trainY), (testX, testY)) = cifar10.load_data()
trainX = trainX.astype("float")/255.0
testX = testX.astype("float")/255.0

# convert the labels from integers to vectors
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

# initialize the label names for CIFAR-10 dataset
labelNames=["airplane", "automobile", "bird", "cat", "deer",
            "dog", "frog", "horse", "ship", "truck"]

# initialize the optimizer and model
print("[INFO] compiling the model...")
opt = SGD(lr=0.001, decay=0.01/40, momentum=0.9, nesterov=True)
model = MiniVGGNet.build(width=32, height=32, depth=3, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt,
              metrics=["accuracy"])
print(model.summary())
print(data.head())
print ("\nFeatures : \n" ,data.columns.tolist())
print ("\nMissing values :  ", data.isnull().sum().values.sum())
print ("\nUnique values :  \n",data.nunique())

for i in data.columns:

	data[i] = data[i].replace("?",np.nan)
	data = data[data[i].notnull()]
	data = data.reset_index()[data.columns]
	data[i] = data[i].astype(float)

X = data.drop(['income'], axis=1)
y = data['income']
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
Y = encoder.fit_transform(y)



from keras.models import Sequential #Sequential Models
from keras.layers import Dense #Dense Fully Connected Layer Type
from keras.optimizers import SGD #Stochastic Gradient Descent Optimizer


def create_network():
    model = Sequential()
    model.add(Dense(25, input_shape=(13,), activation='relu'))
    model.add(Dense(9, activation='softmax'))
        
    # stochastic gradient descent (compile step assumed; the original snippet is truncated here)
    model.compile(loss='categorical_crossentropy', optimizer=SGD(), metrics=['accuracy'])
    return model
    image = img_to_array(image)
    data.append(image)

    # extract the class label from the image path and update the label list
    label = imagePath.split(os.path.sep)[-2]
    #print(str(imagePath)+"STR"+str(label))
    labels.append(label)

# scale the raw pixel intensities to the range [0, 1]
data = np.array(data, dtype="float") / 255.0
labels = np.array(labels)
#print(labels)
print("[INFO] data matrix: {:.2f}MB".format(data.nbytes / (1024 * 1000.0)))

# binarize the labels
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
print("LB.CLASSES=" + str(lb.classes_))

# partition data into train and validation
(trainX, testX, trainY, testY) = train_test_split(data,
                                                  labels,
                                                  test_size=0.2,
                                                  random_state=42)

# construct image generator for data augmentation
aug = ImageDataGenerator(rotation_range=25,
                         width_shift_range=0.1,
                         height_shift_range=0.1,
                         shear_range=0.2,
                         zoom_range=0.2,
Example #34
dataUni = stretch_interp(data, dur)
dataUni = np.array(dataUni, dtype="float")

(train_x, test_x, train_y, test_y) = train_test_split(dataUni,
                                                      labels,
                                                      test_size=0.1,
                                                      random_state=14)

#train_x = train_x[:, :, np.newaxis]
#test_x = test_x[:, :, np.newaxis]

print(np.shape(train_x), np.shape(train_y), np.shape(test_x), np.shape(test_y))

print("labels:", train_y)

lb = LabelBinarizer()
train_y = lb.fit_transform(train_y)
test_y = lb.transform(test_y)

print("labels after transform:", train_y)

print(np.shape(train_x), np.shape(train_y), np.shape(test_x), np.shape(test_y))
batch_size = np.shape(train_x)[0]

model = Sequential()
#model.add(Conv1D(3012, input_shape=(53, dur*3012), kernel_size=(12), activation='relu'))
model.add(Dense(3012, activation="relu", input_shape=(dur * 3012, )))
print("after 1 dense:", model.input_shape, model.output_shape)
model.add(Dense(506, activation="sigmoid"))
model.add(Dropout(0.25))
model.add(Dense(253, activation="sigmoid"))
Example #35
    def train_on_texts(self, texts, context_labels=None,
                       batch_size=128,
                       num_epochs=50,
                       verbose=1,
                       new_model=False,
                       gen_epochs=1,
                       train_size=1.0,
                       max_gen_length=300,
                       validation=True,
                       dropout=0.0,
                       via_new_model=False,
                       save_epochs=0,
                       multi_gpu=False,
                       **kwargs):

        if new_model and not via_new_model:
            self.train_new_model(texts,
                                 context_labels=context_labels,
                                 num_epochs=num_epochs,
                                 gen_epochs=gen_epochs,
                                 train_size=train_size,
                                 batch_size=batch_size,
                                 dropout=dropout,
                                 validation=validation,
                                 save_epochs=save_epochs,
                                 multi_gpu=multi_gpu,
                                 **kwargs)
            return

        if context_labels:
            context_labels = LabelBinarizer().fit_transform(context_labels)

        if 'prop_keep' in kwargs:
            train_size = kwargs['prop_keep']

        if self.config['word_level']:
            # If training word level, must add spaces around each
            # punctuation. https://stackoverflow.com/a/3645946/9314418
            punct = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\\n\\t\'‘’“”’–—…'
            for i in range(len(texts)):
                texts[i] = re.sub('([{}])'.format(punct), r' \1 ', texts[i])
                texts[i] = re.sub(' {2,}', ' ', texts[i])
            texts = [text_to_word_sequence(text, filters='') for text in texts]

        # calculate all combinations of text indices + token indices
        indices_list = [np.meshgrid(np.array(i), np.arange(
            len(text) + 1)) for i, text in enumerate(texts)]
        # indices_list = np.block(indices_list) # this hangs when indices_list is large enough
        # FIX BEGIN ------
        indices_list_o = np.block(indices_list[0])
        for i in range(len(indices_list)-1):
            tmp = np.block(indices_list[i+1])
            indices_list_o = np.concatenate([indices_list_o, tmp])
        indices_list = indices_list_o
        # FIX END ------

        # If a single text, there will be 2 extra indices, so remove them
        # Also remove first sequences which use padding
        if self.config['single_text']:
            indices_list = indices_list[self.config['max_length']:-2, :]

        indices_mask = np.random.rand(indices_list.shape[0]) < train_size

        if multi_gpu:
            num_gpus = len(config.get_visible_devices('GPU'))
            batch_size = batch_size * num_gpus

        gen_val = None
        val_steps = None
        if train_size < 1.0 and validation:
            indices_list_val = indices_list[~indices_mask, :]
            gen_val = generate_sequences_from_texts(
                texts, indices_list_val, self, context_labels, batch_size)
            val_steps = max(
                int(np.floor(indices_list_val.shape[0] / batch_size)), 1)

        indices_list = indices_list[indices_mask, :]

        num_tokens = indices_list.shape[0]
        assert num_tokens >= batch_size, "Fewer tokens than batch_size."

        level = 'word' if self.config['word_level'] else 'character'
        print("Training on {:,} {} sequences.".format(num_tokens, level))

        steps_per_epoch = max(int(np.floor(num_tokens / batch_size)), 1)

        gen = generate_sequences_from_texts(
            texts, indices_list, self, context_labels, batch_size)

        base_lr = 4e-3

        # scheduler function must be defined inline.
        def lr_linear_decay(epoch):
            return (base_lr * (1 - (epoch / num_epochs)))

        '''
        FIXME
        This part is a bit messy as we need to initialize the model within
        strategy.scope() when using multi-GPU. Can probably be cleaned up a bit.
        '''

        if context_labels is not None:
            if new_model:
                weights_path = None
            else:
                weights_path = "{}_weights.hdf5".format(self.config['name'])
                self.save(weights_path)


            if multi_gpu:
                from tensorflow import distribute as distribute
                strategy = distribute.MirroredStrategy()
                with strategy.scope():
                    parallel_model = textgenrnn_model(self.num_classes,
                                                      dropout=dropout,
                                                      cfg=self.config,
                                                      context_size=context_labels.shape[1],
                                                      weights_path=weights_path)
                    parallel_model.compile(loss='categorical_crossentropy',
                                           optimizer=Adam(lr=4e-3))
                model_t = parallel_model
                print("Training on {} GPUs.".format(num_gpus))
            else:
                model_t = self.model
        else:
            if multi_gpu:
                from tensorflow import distribute as distribute
                if new_model:
                    weights_path = None
                else:
                    weights_path = "{}_weights.hdf5".format(self.config['name'])

                strategy = distribute.MirroredStrategy()
                with strategy.scope():
                # Do not locate model/merge on CPU since sample sizes are small.
                    parallel_model = textgenrnn_model(self.num_classes,
                                                      cfg=self.config,
                                                      weights_path=weights_path)
                    parallel_model.compile(loss='categorical_crossentropy',
                                           optimizer=Adam(lr=4e-3))

                model_t = parallel_model
                print("Training on {} GPUs.".format(num_gpus))
            else:
                model_t = self.model

        model_t.fit(gen, steps_per_epoch=steps_per_epoch,
                              epochs=num_epochs,
                              callbacks=[
                                  LearningRateScheduler(
                                      lr_linear_decay),
                                  generate_after_epoch(
                                      self, gen_epochs,
                                      max_gen_length),
                                  save_model_weights(
                                      self, num_epochs,
                                      save_epochs)],
                              verbose=verbose,
                              max_queue_size=10,
                              validation_data=gen_val,
                              validation_steps=val_steps
                              )

        # Keep the text-only version of the model if using context labels
        if context_labels is not None:
            self.model = Model(inputs=self.model.input[0],
                               outputs=self.model.output[1])
Example #36
    def fit(self, X, y):
        self.label_binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1)
        Y = np.asfortranarray(self.label_binarizer_.fit_transform(y),
                              dtype=np.float64)
        return self._fit(X, Y)
Example #37
from NeuralNetwork import NeuralNetwork
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import datasets

print("[INFO] loading MNIST (sample) dataset...")
digits = datasets.load_digits()
data = digits.data.astype("float")
data = (data - data.min()) / (data.max() - data.min())
print("[INFO] samples: {}, dim: {}".format(data.shape[0], data.shape[1]))
(trainX, testX, trainY, testY) = train_test_split(data,
                                                  digits.target,
                                                  test_size=0.25)
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

print("[INFO] trainig network...")
nn = NeuralNetwork([trainX.shape[1], 32, 16, 10])
print("[INFO] {}".format(nn))
nn.fit(trainX, trainY, epochs=1000)
print("[INFO] evaluating network...")
predictions = nn.predict(testX)
predictions = predictions.argmax(axis=1)
print(classification_report(testY.argmax(axis=1), predictions))
Example #38
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X.

        sample_weight : array-like, shape (n_samples,) optional
            Array of weights that are assigned to individual samples.
            If not provided, then each sample is given unit weight.

            .. versionadded:: 0.17
               *sample_weight* support to LogisticRegression.

        Returns
        -------
        self : object
            Returns self.
        """
        self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        Y = [self._label_binarizer.fit_transform(yy).ravel() for yy in y]

        # self.lr_p2 = [SGDClassifier(
        #     loss='log', l1_ratio=self.l1_ratio_lamda,
        #     fit_intercept=self.fit_intercept, shuffle=False,
        #     penalty='elasticnet', alpha=self.lamda, warm_start=True,
        #     max_iter=(self.max_iter // 3 if self.deep else 1) + 0)
        #     for i in range(len(X))]
        self.lr_p2 = [
            LogisticRegression(
                fit_intercept=self.fit_intercept,
                penalty='l2',
                solver='lbfgs',
                C=1. / (self.lamda * (1 - self.l1_ratio_lamda)),
                warm_start=True,
                max_iter=(self.max_iter // 3 if self.deep else 1) + 5)
            for i in range(len(X))
        ]

        self.alpha_, self.coef_, self.intercept_, self.n_iter_ = \
            logistic_alternating(
                X, Y, lamda=self.lamda, beta=self.beta, gamma=self.gamma,
                max_iter=self.max_iter, verbose=self.verbose, tol=self.tol,
                return_n_iter=True, deep=self.deep,
                lr_p2=self.lr_p2, l1_ratio_beta=self.l1_ratio_beta,
                l1_ratio_lamda=self.l1_ratio_lamda,  # unused
                fit_intercept=self.fit_intercept  # unused
            )

        if self.classes_.shape[0] > 2:
            # ndim = self.classes_.shape[0]
            raise ValueError("too many classes")
        else:
            ndim = 1

        self.coef_ = self.coef_.reshape(ndim, -1)
        # self.alpha_ = [alpha.reshape(ndim, -1) for alpha in self.alpha_]

        self.y_train_ = Y

        return self
Example #39
class LocalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,
                 k=1,
                 fxs=None,
                 hour_mean=True,
                 aux=True,
                 ens_std=False,
                 hour_std=False,
                 stid_enc=False):
        self.k = k
        self.hour_mean = hour_mean
        self.fxs = fxs
        self.aux = aux
        self.ens_std = ens_std
        self.hour_std = hour_std
        self.stid_enc = stid_enc

    def fit(self, X, y=None):
        assert y is not None
        n_stations = y.shape[1]
        self.n_stations = n_stations
        assert X.station_info.shape[0] == self.n_stations

        stid = np.arange(98)
        self.stid_lb = LabelBinarizer()
        self.stid_lb.fit(stid)
        return self

    @classmethod
    def transform_labels(cls, y):
        y = y.ravel(1)
        return y

    # @profile
    def transform(self, X, y=None):
        k = self.k
        n_days = X.shape[0]

        ll_coord = X.station_info[:, 0:2]
        lat_idx = np.searchsorted(X.lat, ll_coord[:, 0])
        lon_idx = np.searchsorted(X.lon, ll_coord[:, 1] + 360)

        #IPython.embed()

        n_fx = 0
        for b_name, X_b in X.blocks.items():
            old_n_fx = n_fx
            if self.fxs is not None and b_name not in self.fxs:
                continue
            if X_b.ndim == 6:
                if self.fxs is not None and b_name in self.fxs:
                    n_fxs = len(self.fxs[b_name])
                else:
                    n_fxs = X_b.shape[1]
                shapes = [n_fxs]
                if not self.hour_mean:
                    shapes.append(X_b.shape[3])
                shapes.extend([k * 2, k * 2])
                print(b_name, shapes)
                n_fx += np.prod(shapes)
            elif X_b.ndim == 1:
                n_fx += 1
            elif X_b.ndim == 2:
                n_fx += X_b.shape[1]
            else:
                raise ValueError('%s has wrong dim: %d' % (b_name, X_b.ndim))
            print('block: %s as %d n_fx' % (b_name, n_fx - old_n_fx))

        if self.stid_enc:
            n_fx += len(self.stid_lb.classes_)

        # num of features - based on blocks + station info (5 fx)
        if self.aux:
            n_fx = n_fx + 3 + 2 + 2

        X_p = np.zeros((n_days * self.n_stations, n_fx), dtype=np.float32)
        offset = 0

        for bid, b_name in enumerate(X.blocks):
            if self.fxs is not None and b_name not in self.fxs:
                continue
            print('localizing block: %s' % b_name)
            X_b = X[b_name]

            # select fx if fxs given
            if self.fxs is not None and self.fxs.get(b_name, None):
                fxs = self.fxs[b_name]
                idx = [
                    i for i, name in enumerate(X.fx_name[b_name])
                    if name in fxs
                ]
                X_b = X_b[:, idx]

            if X_b.ndim == 6:
                # FIXME over hours
                if self.hour_mean:
                    X_b = np.mean(X_b, axis=3)
                elif self.hour_std:
                    X_b = np.std(X_b, axis=3)

                # over ensembles
                if self.ens_std:
                    X_b = np.std(X_b, axis=2)
                else:
                    X_b = np.mean(X_b, axis=2)

                offset_inc = 0
                for i in range(self.n_stations):
                    lai, loi = lat_idx[i], lon_idx[i]
                    if (self.hour_mean or self.hour_std):
                        blk = X_b[:, :, lai - k:lai + k, loi - k:loi + k]
                    else:
                        blk = X_b[:, :, :, lai - k:lai + k, loi - k:loi + k]
                    blk = blk.reshape((blk.shape[0], np.prod(blk.shape[1:])))
                    X_p[i * n_days:((i + 1) * n_days),
                        offset:(offset + blk.shape[1])] = blk
                    if i == 0:
                        offset_inc = blk.shape[1]
                    del blk
                    gc.collect()

                offset += offset_inc

            elif X_b.ndim == 1 or (X_b.ndim == 2 and X_b.shape[1] == 1):
                X_p[:,
                    offset:offset + 1] = np.tile(X_b.ravel(),
                                                 self.n_stations)[:,
                                                                  np.newaxis]
                offset += 1
            elif X_b.ndim == 2:
                # FIXME wrong stitching together stuff
                print('block: %s will be repeated for each station' % b_name)
                width = X_b.shape[1]
                X_p[:,
                    offset:offset + width] = np.tile(X_b, (self.n_stations, 1))
                #IPython.embed()
                offset += width
            else:
                raise ValueError('%s has wrong dim: %d' % (b_name, X_b.ndim))

        if self.stid_enc:
            stid = np.repeat(self.stid_lb.classes_, n_days)
            stid_enc = self.stid_lb.transform(stid)
            X_p[:, offset:(offset + stid_enc.shape[1])] = stid_enc
            offset += stid_enc.shape[1]

        if self.aux:
            # lat, lon, elev
            X_p[:, offset:(offset + 3)] = np.repeat(X.station_info,
                                                    n_days,
                                                    axis=0)
            offset += 3

            # compute pos of station within grid cell (in degree lat lon)
            lat_idx = np.repeat(lat_idx, n_days)
            lon_idx = np.repeat(lon_idx, n_days)
            # offset - 3 is station lat
            X_p[:, offset] = (X_p[:, offset - 3] - X.lat[lat_idx])
            # offset - 2 is station lon
            X_p[:, offset + 1] = (X_p[:, offset - 2] - (X.lon[lon_idx] - 360.))

            # FIXME add lat lon idx
            offset += 2
            X_p[:, offset] = lat_idx
            X_p[:, offset + 1] = lon_idx

        print('X_p.shape: ', X_p.shape)
        return X_p
import numpy as np
from NeuralNet import NeuralNet
from sklearn.preprocessing import LabelBinarizer

train = np.loadtxt('./wine/train_wine.csv', delimiter=',')
lb = LabelBinarizer()
train_y = lb.fit_transform(train[:, 0])
train_x = train[:, 1:]
mean = np.mean(train_x, axis=0)
train_x -= mean
std = np.std(train_x, axis=0)  # scale by the standard deviation (not the variance) to standardize
train_x /= std

nn = NeuralNet([13, 13, 3], 0.01)

nn.train(train_x, train_y)

test = np.loadtxt('./wine/test_wine.csv', delimiter=',')
test_y = test[:, 0]
test_x = test[:, 1:]
test_x -= mean
test_x /= std

y_pred = np.argmax(nn.predict(test_x), axis=1)
accu = np.sum(test_y == y_pred + 1) / len(test_y)
print(accu)
Exemple #41
0
dpt = 3

print("[INFO] loading Images")
imagePaths = list(paths.list_images(args["dataset"]))
sp = SimplePreprocessor(size, size)
iap = ImageToArrayPreprocessor()
sdl = SimpleDatasetLoader(preprocessors=[sp, iap])
(data, labels) = sdl.load(imagePaths, verbose=500)
print(labels)
data = data.astype("float") / 255.0
(trainX, testX, trainY, testY) = train_test_split(data,
                                                  labels,
                                                  test_size=0.25,
                                                  random_state=42)

trainY = LabelBinarizer().fit_transform(trainY)
testY = LabelBinarizer().fit_transform(testY)

print("[INFO] compiling model...")
opt = SGD(lr=0.025)
model = IncludeNet.build(width=size, height=size, depth=dpt, classes=4)
model.compile(loss="categorical_crossentropy",
              optimizer=opt,
              metrics=["accuracy"])

print("[INFO] training network...")
H = model.fit(trainX,
              trainY,
              validation_data=(testX, testY),
              batch_size=size,
              epochs=ep,
Exemple #42
0
class MultipleLogisticRegressionMultipleKernel(
        LogisticRegressionMultipleKernel, LogisticRegression,
        LinearClassifierMixin):
    # _pairwise tells scikit-learn that X is a kernel, so CV splits index rows and columns consistently
    _pairwise = True

    def __init__(self,
                 penalty='l2',
                 dual=False,
                 tol=1e-4,
                 fit_intercept=True,
                 intercept_scaling=1,
                 class_weight=None,
                 random_state=None,
                 solver='liblinear',
                 max_iter=100,
                 multi_class='ovr',
                 verbose=0,
                 warm_start=False,
                 n_jobs=1,
                 l1_ratio_lamda=0.1,
                 l1_ratio_beta=0.1,
                 deep=True,
                 lamda=0.01,
                 gamma=1,
                 rho=1,
                 rtol=1e-4,
                 beta=0.01):

        super(MultipleLogisticRegressionMultipleKernel,
              self).__init__(penalty=penalty,
                             dual=dual,
                             tol=tol,
                             fit_intercept=fit_intercept,
                             intercept_scaling=intercept_scaling,
                             class_weight=class_weight,
                             random_state=random_state,
                             solver=solver,
                             max_iter=max_iter,
                             multi_class=multi_class,
                             verbose=verbose,
                             warm_start=warm_start,
                             n_jobs=n_jobs,
                             lamda=lamda,
                             gamma=gamma,
                             rho=rho,
                             rtol=rtol,
                             beta=beta)
        self.l1_ratio_lamda = l1_ratio_lamda
        self.l1_ratio_beta = l1_ratio_beta
        self.deep = deep

    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : list of {array-like, sparse matrix}
            Training data, one kernel/feature tensor per task.

        y : list of array-like
            Target vectors, one per entry in X.

        sample_weight : array-like, shape (n_samples,) optional
            Array of weights that are assigned to individual samples.
            If not provided, then each sample is given unit weight.

            .. versionadded:: 0.17
               *sample_weight* support to LogisticRegression.

        Returns
        -------
        self : object
            Returns self.
        """
        self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        Y = [self._label_binarizer.fit_transform(yy).ravel() for yy in y]

        # self.lr_p2 = [SGDClassifier(
        #     loss='log', l1_ratio=self.l1_ratio_lamda,
        #     fit_intercept=self.fit_intercept, shuffle=False,
        #     penalty='elasticnet', alpha=self.lamda, warm_start=True,
        #     max_iter=(self.max_iter // 3 if self.deep else 1) + 0)
        #     for i in range(len(X))]
        self.lr_p2 = [
            LogisticRegression(
                fit_intercept=self.fit_intercept,
                penalty='l2',
                solver='lbfgs',
                C=1. / (self.lamda * (1 - self.l1_ratio_lamda)),
                warm_start=True,
                max_iter=(self.max_iter // 3 if self.deep else 1) + 5)
            for i in range(len(X))
        ]

        self.alpha_, self.coef_, self.intercept_, self.n_iter_ = \
            logistic_alternating(
                X, Y, lamda=self.lamda, beta=self.beta, gamma=self.gamma,
                max_iter=self.max_iter, verbose=self.verbose, tol=self.tol,
                return_n_iter=True, deep=self.deep,
                lr_p2=self.lr_p2, l1_ratio_beta=self.l1_ratio_beta,
                l1_ratio_lamda=self.l1_ratio_lamda,  # unused
                fit_intercept=self.fit_intercept  # unused
            )

        if self.classes_.shape[0] > 2:
            # ndim = self.classes_.shape[0]
            raise ValueError("too many classes")
        else:
            ndim = 1

        self.coef_ = self.coef_.reshape(ndim, -1)
        # self.alpha_ = [alpha.reshape(ndim, -1) for alpha in self.alpha_]

        self.y_train_ = Y

        return self

    def predict(self, X):
        """Predict using the kernel ridge model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        C : array, shape = [n_samples] or [n_samples, n_targets]
            Returns predicted values.
        """
        check_is_fitted(self, ["alpha_", "coef_"])
        # return [LinearClassifierMixin.predict(
        #     self, np.tensordot(k, a, axes=1)) for a, k in zip(
        #         self.alpha_, X)]

        return [
            self.lr_p2[i].predict(
                np.tensordot(self.coef_.ravel(), X[i], axes=1))
            for i in range(len(X))
        ]

    def score(self, K, y, sample_weight=None):
        """Returns the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
        sum of squares ((y_true - y_true.mean()) ** 2).sum().
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always
        predicts the expected value of y, disregarding the input features,
        would get a R^2 score of 0.0.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            R^2 of self.predict(X) wrt. y.
        """
        y_pred = self.predict(K)
        if sample_weight is None:
            return np.mean(
                [accuracy_score(y[j], y_pred[j]) for j in range(len(K))])
        else:
            return np.mean([
                accuracy_score(y[j], y_pred[j], sample_weight=sample_weight[j])
                for j in range(len(K))
            ])

    def predict_proba(self, X):
        """Probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        For a multi_class problem, if multi_class is set to "multinomial",
        the softmax function is used to find the predicted probability of
        each class. Otherwise a one-vs-rest approach is used, i.e. the
        probability of each class is computed with the logistic function,
        assuming that class to be positive, and the values are then
        normalized across all classes.

        Parameters
        ----------
        X : list of {array-like, sparse matrix}
            One kernel/feature tensor per task, as used in ``fit``.

        Returns
        -------
        T : list of array-like, each of shape = [n_samples, n_classes]
            Probability of each sample for each class, ordered as the
            classes are in ``self.classes_``; one array per entry in X.
        """
        check_is_fitted(self, ["alpha_", "coef_"])
        # return [LinearClassifierMixin._predict_proba_lr(
        #     self, np.tensordot(k, a, axes=1)) for a, k in zip(
        #         self.alpha_, X)]

        return [
            self.lr_p2[i].predict_proba(
                np.tensordot(self.coef_.ravel(), X[i], axes=1))
            for i in range(len(X))
        ]

    def predict_log_proba(self, X):
        """Log of probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        Parameters
        ----------
        X : list of array-like, one kernel/feature tensor per task

        Returns
        -------
        T : array-like, shape = [n_samples, n_classes]
            Returns the log-probability of the sample for each class in the
            model, where classes are ordered as they are in ``self.classes_``.
        """
        proba = self.predict_proba(X)
        return [np.log(p) for p in proba]
Exemple #43
0
    3.0
    >>> round_of_rating(4.1)
    4.0"""

    return np.round(number * 2) / 2


min_val = -40
max_val = 40

y_train = round_of_rating(saturate(y_train, min_val, max_val))

r_int = 0.5
slist = np.arange(min_val, max_val + r_int,
                  r_int) * 2  #multiply by 2 to allow labelbinarizer to work
lb = LabelBinarizer()
lb.fit(slist)
ylabels = lb.transform(y_train * 2)

# In[17]:

print(x_train.shape)
print(xfcss_train.shape)
print(ylabels.shape)

# In[18]:

nsamps = x_train.shape[0]
n80p = int(np.floor(nsamps * 0.8))
rannums = np.array(random.sample(range(1, nsamps, 1), n80p))
s_nfiles = np.arange(nsamps)
Exemple #44
0
 def fit(self, X, y=None):
     if self.op == 'month' and hasattr(X, 'date'):
         month = X.date.map(lambda x: x.month)
         self.lb = LabelBinarizer()
         self.lb.fit(month)
     return self
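
 def transform(self, X):
     # Hypothetical companion transform (not part of the original snippet):
     # assumes the LabelBinarizer fitted above is meant to one-hot encode
     # the month extracted from the date column.
     if self.op == 'month' and hasattr(X, 'date'):
         month = X.date.map(lambda x: x.month)
         return self.lb.transform(month)
     return X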
Exemple #45
0
class Model(ModelBase):
    def __init__(self):
        super(Model,self).__init__("LR")
        self.vectorizer=CountVectorizer(lowercase=False,binary=True, analyzer=analyzer_)
        self.binarizer=LabelBinarizer()
        self.algo = LogisticRegression(C=1,fit_intercept=False,class_weight='balanced')
        self.label_dict={}
        self.label_index=0
        self.label_map={}



    def process_text(self, text):
        words = text.lower().split()
        stopwords = set('for a an the of and to in'.split())
        words=[word for word in words if word not in stopwords]
        return words

    def hot_encode(self,word):
        index=self.label_index
        if(word[0] not in self.label_dict):
            self.label_dict[word[0]]=self.label_index
            self.label_map[self.label_index]=word[0]
            index=self.label_index
            self.label_index+=1
        else:
            index=self.label_dict[word[0]]
        return index

    #@ModelBase.train
    def train(self,docs):
        # Create sentences from documents
        #docs=json.loads(data)

        inputs=[]
        labels=[]

        for doc in docs:
            inputs.append(self.process_text(doc['rawText']))
            labels.append(self.hot_encode(doc['tags']))

        self.binarizer.fit_transform(labels)
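        # NOTE: the binarized labels produced above are not used below;
        # LogisticRegression is trained on the integer labels directly.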


        inputs = self.vectorizer.fit_transform(inputs)
        print(inputs)
        print(labels)
        self.vectorizer.stop_words_=None

        self.algo.fit(inputs, labels)

        print('Training done')
        return {'status': 2, 'reason': '', 'numRecords': inputs.shape[0]}

    def validate(self,docs):
        #docs = json.loads(data)
        inputs = []
        labels = []

        for doc in docs:
            inputs.append(self.process_text(doc['rawText']))
            labels.append(self.hot_encode(doc['tags']))

        inputs = self.vectorizer.transform(inputs)
        predictions = self.algo.predict(inputs)
        fpr, tpr, threshold = roc_curve(labels, predictions)

        roc = {"fpr": fpr.tolist(), "tpr": tpr.tolist(), "threshold": threshold.tolist()}

        report = {
            "classification_report": classification_report(labels, predictions),
            "confusion_matrix": confusion_matrix(labels, predictions).tolist(),
            "roc": roc,
            "roc_auc": auc(fpr, tpr)

        }
        print(report)
        return {'status': 2, 'reason': '', 'report': report, 'numRecords': inputs.shape[0]}

    def predict(self,docs):
        #docs = json.loads(data)

        inputs = [self.process_text(element['rawText']) for element in docs]
        inputs = self.vectorizer.transform(inputs)

        predictions = self.algo.predict(inputs)

        results=[]
        index=0
        for i in predictions.tolist():
            result={}
            result["predicted_tags"]=self.label_map[i]
            results.append(result)
            index+=1

        print(results)

        return results

    def persist(self, path):
        package={
            'vectorizer': self.vectorizer,
            'binarizer': self.binarizer,
            'algo': self.algo,
            'label_map': self.label_map,
            'label_dict': self.label_dict
        }
        with open(path, 'wb') as file:
            pickle.dump(package,file, -1)

    def reload(self, path):

        with open(path, 'rb') as file:
            package = pickle.load(file)

        for key,value in package.items():
            setattr(self,key,value)
Exemple #46
0
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')

    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]

    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))
    '''
    Crossed columns
    '''

    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe
        """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name',  'subcat_1',  'item_condition_id_str'],
        #['brand_name',  'subcat_2',  'item_condition_id_str'],
        #['brand_name',  'general_cat',  'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)

    gc.collect()
    cpuStats()
    '''
    Hash name
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:,
                    np.
                    array(np.clip(X_name.getnnz(axis=0) -
                                  1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:,
                  np.array(np.clip(X_cat.getnnz(axis=0) -
                                   1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:,
                                  np.array(np.clip(
                                      X_description.getnnz(axis=0) - 1, 0, 1),
                                           dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)

    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_orig.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat,
                           x_col, X_orig)).tocsr()
    '''

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=threads)  #iters=15

    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline - 0.0002:
            baseline = score_
        else:
            break

    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y),
                                          np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    from wordbatch.models import nn_relu_h1, nn_relu_h2

    modelnn = nn_relu_h1.NN_ReLU_H1(alpha=0.05, L2=0.00001, D_nn=60, D=sparse_merge.shape[1], \
                                  iters=1, inv_link="identity", threads=threads)

    baseline = 1.
    print('[{}] Epoch time '.format(time.time() - start_time))
    for i in range(3):
        modelnn.fit(train_X, train_y, verbose=1)
        predsnn = modelnn.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsnn))
        print("FM_FTRL dev RMSLE:", score_)
        print('[{}] Epoch time '.format(time.time() - start_time))
        if score_ < baseline - 0.0002:
            baseline = score_
        else:
            break

    pd.Series((np.expm1(predsnn) - np.expm1(predsfm))).hist()

    print(
        "FM_FTRL dev RMSLE:",
        rmsle(np.expm1(valid_y),
              0.1 * (np.expm1(predsnn)) + 0.9 * (np.expm1(predsfm))))

    tpoint2 = time.time()
    print("Time for Training: {}".format(hms_string(tpoint2 - tpoint1)))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
Exemple #47
0
# Categorical encoding of numbers
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit([1, 2, 2, 6])
le.classes_  # scikit-learn convention: attributes learned during fit end with '_'.
le.transform([1, 1, 2, 6])  # map the values to their integer labels

le.fit(["서울", "서울", "대전", "부산"])
le.classes_
le.transform(["서울", "서울", "부산"])

# Label with 0s and 1s (one-hot encoding).
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
lb.fit([1, 2, 6, 1, 2])  # three distinct classes: 1, 2, 6
lb.classes_
lb.transform([1, 6])  # shows the one-hot rows representing 1 and 6

# Represent dictionary feature information as a matrix.
from sklearn.feature_extraction import DictVectorizer

v = DictVectorizer(sparse=False)
D = [{
    'foo': 1,
    'bar': 2
}, {
    'foo': 3,
    'baz': 1
}]  # one document has foo 1 time and bar 2 times; the other has foo 3 times and baz 1 time.
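
# Hypothetical completion of the truncated demo above (not in the original):
# DictVectorizer turns the list of dicts into a dense feature matrix with one
# column per feature name, sorted alphabetically.
features = v.fit_transform(D)
print(v.feature_names_)  # ['bar', 'baz', 'foo']
print(features)          # [[2. 0. 1.]
                         #  [0. 1. 3.]]
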
def plot_mc_roc(y_test, y_score, interpreter=None):
    '''
    Plotting function that generates ROC curves for the data given to it.

    :param y_test: true labels of the testing data
    :param y_score: predicted labels (or scores, in the binary case) for the testing data
    :param interpreter: the encoder used to preprocess the labels; used to
        recover the original class names for the plot legend
    :return: a matplotlib figure containing the ROC plot
    '''
    lw = 2
    n_classes = len(np.unique(y_test))
    classes = pd.unique(y_test)
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(np.concatenate((y_test, y_score)))
    if n_classes != 2:
        y_test = label_binarizer.transform(y_test)
        y_score = label_binarizer.transform(y_score)
    else:
        n_classes = 1
        y_test = y_test.reshape(-1, 1)
        y_score = y_score.reshape(-1, 1)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = sklearn.metrics.auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    img = plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    for i in range(n_classes):
        plt.plot(
            fpr[i],
            tpr[i],
            lw=lw,
            label='ROC curve of class {0} (area = {1:0.2f})'
                ''.format(
                interpreter.inverse_transform(
                [[label_binarizer.classes_[i]]])[0],
                roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    return img
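
# Minimal usage sketch (not part of the original): exercises plot_mc_roc on a
# small synthetic multi-class problem, assuming the imports used by the
# function above are already in place. A LabelEncoder is passed as the
# `interpreter` so the legend can show the original class names.
import numpy as np
from sklearn.preprocessing import LabelEncoder

names = np.array(['bird', 'cat', 'dog'])
rng = np.random.RandomState(0)
y_true_names = rng.choice(names, size=300)
y_pred_names = rng.choice(names, size=300)
le = LabelEncoder().fit(names)
fig = plot_mc_roc(le.transform(y_true_names),
                  le.transform(y_pred_names),
                  interpreter=le)
fig.savefig('roc_curves.png')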
Exemple #49
0
from sklearn.preprocessing import StandardScaler
import random
import numpy as np

n_batch = 100
n_step = 8
n_input = 1
n_output = 10
n_cell = 100
lr = 0.006
n_train = 8000

bear = Bear()
data = scio.loadmat('wavedata.mat')['wavedata']
target = bear.target
lb = LabelBinarizer()
target = lb.fit_transform(target)
# print(target, ...)
# print(lb.inverse_transform(target))
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    target,
                                                    test_size=0.333)
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)
total_train_batch, total_test_batch = X_train.shape[0], X_test.shape[0]
print(total_train_batch, ...)
print(X_train.shape, y_test.shape, ...)

lstm = LSTM(n_batch, n_step, n_input, n_output, n_cell)
with tf.Session() as sess:
Exemple #50
0
aap = AspectAwarePreprocessor(64, 64)
iap = ImageToArrayPreprocessor()

#load the dataset from disk and then scale the raw pixel intensities
#to the range [0,1]
sdl = SimpleDatasetLoader(preprocessors=[aap, iap])
(data, labels) = sdl.load(imagePaths, verbose=-1)
data = data.astype("float") / 255.0

(trainX, testX, trainY, testY) = train_test_split(data,
                                                  labels,
                                                  test_size=0.25,
                                                  random_state=42)

# convert the labels from integers to vectors
trainY = LabelBinarizer().fit_transform(trainY)
testY = LabelBinarizer().fit_transform(testY)

# initialize the optimizer and model
print("[INFO] compiling model...")
opt = SGD(lr=0.01, decay=0.01 / 20, momentum=0.9, nesterov=True)
model = Network.build(width=64, height=64, depth=3, classes=len(classNames))
model.compile(loss="categorical_crossentropy",
              optimizer=opt,
              metrics=["accuracy"])

# train the network
print("[INFO] training network...")
H = model.fit(trainX,
              trainY,
              validation_data=(testX, testY),
df_test = create_df(os.path.join(datapath, test_fname),
                    img_path,
                    partial_dataset=part_dat,
                    seed=123)

preds = np.load('10-02-2020_cont_colab.npy')
predsfl = np.load('22-01-2020_cont_colab.npy')

yhat = np.argmax(preds, axis=1) + 1
yhatfl = np.argmax(predsfl, axis=1) + 1

from sklearn.preprocessing import LabelBinarizer

y = df_test['label'].to_numpy()

lb = LabelBinarizer().fit(range(1, 40))

yhot = lb.transform(y)
#%%
from sklearn.metrics import roc_curve, auc, roc_auc_score
from scipy import interp

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(39):
    fpr[i], tpr[i], _ = roc_curve(yhot[:, i], preds[:, i])
    if np.isnan(tpr[i]).any() or np.isnan(fpr[i]).any():
        fpr[i] = tpr[i] = np.zeros(39)

    roc_auc[i] = auc(fpr[i], tpr[i])
Exemple #52
0
def load_data():
    try:
        # Reload the data from saved pickle file
        with open(pickle_file, 'rb') as f:
            pickle_data = pickle.load(f)
            train_features_ = pickle_data['train_dataset']
            train_labels_ = pickle_data['train_labels']
            valid_features_ = pickle_data['valid_dataset']
            valid_labels_ = pickle_data['valid_labels']
            test_features_ = pickle_data['test_dataset']
            test_labels_ = pickle_data['test_labels']
            del pickle_data  # Free up memory
            return train_features_, train_labels_, valid_features_, valid_labels_, test_features_, test_labels_
    except IOError:
        print("failed to reload data from %s, load it from begining" %
              pickle_file)

    # Download the training and test dataset.
    download_if_necessary(
        'https://s3.amazonaws.com/udacity-sdc/notMNIST_train.zip',
        'notMNIST_train.zip', 'c8673b3f28f489e9cdf3a3d74e2ac8fa')
    download_if_necessary(
        'https://s3.amazonaws.com/udacity-sdc/notMNIST_test.zip',
        'notMNIST_test.zip', '5d3c7e653e63471c88df796156a9dfa9')
    # Make sure the files aren't corrupted
    assert hashlib.md5(open('notMNIST_train.zip', 'rb').read()).hexdigest() == 'c8673b3f28f489e9cdf3a3d74e2ac8fa', \
        'notMNIST_train.zip file is corrupted.  Remove the file and try again.'
    assert hashlib.md5(open('notMNIST_test.zip', 'rb').read()).hexdigest() == '5d3c7e653e63471c88df796156a9dfa9', \
        'notMNIST_test.zip file is corrupted.  Remove the file and try again.'

    # Get the features and labels from the zip files
    train_features_, train_labels_ = uncompress_features_labels(
        'notMNIST_train.zip')
    test_features_, test_labels_ = uncompress_features_labels(
        'notMNIST_test.zip')

    # Limit the amount of data to work with a docker container
    docker_size_limit = 150000
    train_features_, train_labels_ = resample(train_features_,
                                              train_labels_,
                                              n_samples=docker_size_limit)

    # normalize the data
    train_features_ = normalize_grayscale(train_features_)
    test_features_ = normalize_grayscale(test_features_)

    # Turn labels into numbers and apply One-Hot Encoding
    encoder = LabelBinarizer()
    encoder.fit(train_labels_)
    # one-hot encode and convert to float32,
    # so it can be multiplied against the features in TensorFlow, which are float32
    train_labels_ = encoder.transform(train_labels_).astype(np.float32)
    test_labels_ = encoder.transform(test_labels_).astype(np.float32)

    # Get randomized datasets for training and validation
    train_features_, valid_features_, train_labels_, valid_labels_ = train_test_split(
        train_features_, train_labels_, test_size=0.05, random_state=832289)

    # Save the data for easy access
    if not os.path.isfile(pickle_file):
        print('Saving data to pickle file...')
        try:
            with open('notMNIST.pickle', 'wb') as pfile:
                pickle.dump(
                    {
                        'train_dataset': train_features_,
                        'train_labels': train_labels_,
                        'valid_dataset': valid_features_,
                        'valid_labels': valid_labels_,
                        'test_dataset': test_features_,
                        'test_labels': test_labels_,
                    }, pfile, pickle.HIGHEST_PROTOCOL)
                print('Data cached in pickle file.')
        except Exception as e:
            print('Unable to save data to', pickle_file, ':', e)
            raise

        return train_features_, train_labels_, valid_features_, valid_labels_, test_features_, test_labels_
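
# Hypothetical usage (not in the original): load_data returns the six arrays
# in train / validation / test order, features first, then one-hot labels.
train_features, train_labels, valid_features, valid_labels, test_features, test_labels = load_data()
print(train_features.shape, train_labels.shape)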
Exemple #53
0
import tensorflow as tf
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# load data
digits = load_digits()
x_set = digits.data
y_set = LabelBinarizer().fit_transform(digits.target)
x_train, x_test, y_train, y_test = train_test_split(x_set,
                                                    y_set,
                                                    test_size=0.3)


def add_layer(inputs, in_size, out_size, activition_function=None):
    Weights = tf.Variable(tf.random_normal([in_size, out_size]))
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)  # initializing biases at exactly 0 is not recommended
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    Wx_plus_b = tf.nn.dropout(Wx_plus_b, keep_probability)
    if activition_function is None:
        outputs = Wx_plus_b
    else:
        outputs = activition_function(Wx_plus_b)
    tf.summary.histogram('/outputs', outputs)
    return outputs


keep_probability = tf.placeholder(tf.float32)  # dropout keep probability
X = tf.placeholder(tf.float32, shape=[None, 64])
Y = tf.placeholder(tf.float32, shape=[None, 10])
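
# Hypothetical continuation (the original snippet is truncated here): build a
# tanh hidden layer and a softmax output layer with the add_layer helper above,
# then minimize cross-entropy, staying in TF1 graph mode as in the snippet.
l1 = add_layer(X, 64, 50, activition_function=tf.nn.tanh)
prediction = add_layer(l1, 50, 10, activition_function=tf.nn.softmax)
cross_entropy = tf.reduce_mean(
    -tf.reduce_sum(Y * tf.log(prediction), reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)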
Exemple #54
0
def eval_models(model_paths, data_path, save_path=None):
    '''
    Evaluates performance of a model in terms of loss, accuracy, confusion
    matrix, and mean per-class recall
    
    Parameters:
    model_paths(dict) - dictionary with model names as keys and paths pointing 
        to the .h5 files of the trained models as values
    data_path(string) - path to the image directory of the target dataset
    save_path(string) - optional file path to save output to
    
    Returns:
    Dictionary of dictionaries each containing loss, accuracy, confusion matrix, 
    and mean per-class recall for a given model
    '''

    from keras.models import load_model
    from keras.backend import clear_session
    from keras.preprocessing.image import ImageDataGenerator
    from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
    from sklearn.preprocessing import LabelBinarizer
    import gc
    import json

    # build generator to feed the model
    print('Building image generator...')
    generator = ImageDataGenerator().flow_from_directory(data_path,
                                                         target_size=(224,
                                                                      224),
                                                         batch_size=8,
                                                         shuffle=False)
    y_true = generator.classes

    # evaluate all models
    model_results = dict()
    for name, path in model_paths.items():
        # load model
        print('Loading {}'.format(path))
        model = load_model(path)

        # run basic evaluation
        print('Evaluating {}'.format(path))
        metrics = dict()
        metrics['loss'], metrics['acc'] = model.evaluate_generator(generator)

        # predict labels
        y_prob = model.predict_generator(generator)
        y_pred = y_prob.argmax(axis=1)

        # calculate confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        metrics['cm'] = cm.tolist()

        # mean per-class recall
        pcr = cm.diagonal() / cm.sum(axis=1)
        metrics['pcr'] = list(pcr)
        metrics['mpcr'] = pcr.mean()

        # F1 score
        metrics['class_f1s'] = list(f1_score(y_true, y_pred, average=None))
        metrics['macro_f1'] = f1_score(y_true, y_pred, average='macro')
        metrics['micro_f1'] = f1_score(y_true, y_pred, average='micro')

        # AUC score
        y_binary = LabelBinarizer().fit_transform(y_true)
        metrics['class_aucs'] = list(
            roc_auc_score(y_binary, y_prob, average=None))
        metrics['macro_auc'] = roc_auc_score(y_binary, y_prob, average='macro')
        metrics['micro_auc'] = roc_auc_score(y_binary, y_prob, average='micro')

        model_results[name] = metrics

        # remove clutter from memory
        del model
        clear_session()
        gc.collect()

    if save_path:
        print('Saving evaluation to {}'.format(save_path))
        with open(save_path, 'w') as f:
            json.dump(model_results, f)

    print('Evaluation complete.\n')
    gc.collect()

    return model_results
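
# Hypothetical usage sketch (not part of the original): the model names and
# file paths below are placeholders for trained Keras .h5 files and an image
# directory laid out one sub-folder per class.
example_model_paths = {
    'resnet50': 'models/resnet50_finetuned.h5',
    'mobilenet': 'models/mobilenet_finetuned.h5',
}
results = eval_models(example_model_paths, 'data/validation', save_path='eval_results.json')
for model_name, metrics in results.items():
    print(model_name, 'accuracy:', metrics['acc'], 'mean per-class recall:', metrics['mpcr'])
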
class MatheusAlvesMLP(BaseEstimator, ClassifierMixin):  # or RegressorMixin?
    def __init__(self, params=None):
        if params is None:
            self.ctor({})
        else:
            self.ctor(params)

    def ctor(self, params):
        self.alpha = params.get("alpha", 0.00001)  # L2 regularization
        self.max_iter = params.get(
            "max_iter", 500)  # max iteration to the optimization algorithm
        self.hidden_layers_size = params.get("hidden_layers_size",
                                             (100, 200, 300))
        self.shuffle = params.get("shuffle",
                                  False)  # shuflle samples in interactions?
        self.random_state = params.get(
            "random_state", None)  # state or seed for generating random number
        self.tol = params.get("tol", 1e-5)  # Loss tolerance for optimization

        self.layers_coef = None
        self.layers_intercept = None
        self.cost = None
        self.n_iter = 0
        self.classes = None
        self.label_binarizer_ = LabelBinarizer()

    def _unstack(self, stacked_parameters):
        for i in range(self.n_layers_ - 1):
            start, end, shape = self._coef_indptr[i]
            self.layers_coef[i] = np.reshape(stacked_parameters[start:end],
                                             shape)
            start, end = self._intercept_indptr[i]
            self.layers_intercept[i] = stacked_parameters[start:end]

    def _forward_pass(self, activations, with_output_activation=True):
        for i in range(self.n_layers_ - 1):
            activations[i + 1] = safe_sparse_dot(activations[i],
                                                 self.layers_coef[i])
            activations[i + 1] += self.layers_intercept[i]

            # For the hidden layers
            if i + 1 != self.n_layers_ - 1:
                activations[i + 1] = rectified_linear_unit(activations[i + 1])

        # For the last layer
        if with_output_activation:
            activations[i + 1] = rectified_linear_unit(activations[i + 1])
        return activations

    def _compute_cost_grad(self, layer, n_samples, activations, deltas,
                           coef_grads, intercept_grads):
        coef_grads[layer] = safe_sparse_dot(activations[layer].T,
                                            deltas[layer])
        coef_grads[layer] += (self.alpha * self.layers_coef[layer])
        coef_grads[layer] /= n_samples

        intercept_grads[layer] = np.mean(deltas[layer], 0)

        return coef_grads, intercept_grads

    def _cost_grad_lbfgs(self, stacked_coef_inter, X, y, activations, deltas,
                         coef_grads, intercept_grads):
        self._unstack(stacked_coef_inter)
        cost, coef_grads, intercept_grads = self._backprop(
            X, y, activations, deltas, coef_grads, intercept_grads)
        self.n_iter += 1
        grad = stack(coef_grads, intercept_grads)
        return cost, grad

    def _backprop(self, X, y, activations, deltas, coef_grads,
                  intercept_grads):
        n_samples = X.shape[0]

        # Forward propagate
        activations = self._forward_pass(activations)

        # Get cost using log loss function
        cost = log_loss(y, activations[-1])

        # Add regularization term to the cost
        values = np.sum(np.array([np.sum(s**2) for s in self.layers_coef]))
        cost += (0.5 * self.alpha) * values / n_samples

        # Backward propagate
        last = self.n_layers_ - 2
        diff = y - activations[-1]
        deltas[last] = -diff

        # Compute gradient for the last layer
        coef_grads, intercept_grads = self._compute_cost_grad(
            last, n_samples, activations, deltas, coef_grads, intercept_grads)

        # Iterate over the hidden layers
        for i in range(self.n_layers_ - 2, 0, -1):
            deltas[i - 1] = safe_sparse_dot(deltas[i], self.layers_coef[i].T)
            deltas[i - 1] *= rectified_linear_unit_derivative(activations[i])
            coef_grads, intercept_grads = self._compute_cost_grad(
                i - 1, n_samples, activations, deltas, coef_grads,
                intercept_grads)
        return cost, coef_grads, intercept_grads

    def fit(self, X, y):
        hidden_layers_size = list(self.hidden_layers_size)
        n_samples, n_features = X.shape
        self.label_binarizer_.fit(y)

        if self.classes is None:
            self.classes = self.label_binarizer_.classes_
        else:
            classes = self.label_binarizer_.classes_

        y = self.label_binarizer_.transform(y)
        self.n_outputs = y.shape[1]
        layer_units = ([n_features] + hidden_layers_size + [self.n_outputs])

        # If it is the first time training the model
        if self.layers_coef is None:
            # Initialize parameters
            self.n_outputs = y.shape[1]

            # Compute the number of layers
            self.n_layers_ = len(layer_units)

            # Initialize coefficient and intercept layers
            self.layers_coef = []
            self.layers_intercept = []

            for i in range(self.n_layers_ - 1):
                rng = check_random_state(self.random_state)
                n_fan_in = layer_units[i]
                n_fan_out = layer_units[i + 1]

                # Use the Glorot (Xavier) initialization method
                weight_init_bound = np.sqrt(6. / (n_fan_in + n_fan_out))
                self.layers_coef.append(
                    rng.uniform(-weight_init_bound, weight_init_bound,
                                (n_fan_in, n_fan_out)))
                self.layers_intercept.append(
                    rng.uniform(-weight_init_bound, weight_init_bound,
                                n_fan_out))
        if self.shuffle:
            X, y = shuffle(X, y, random_state=self.random_state)

        # Initialize lists
        activations = [X]
        activations.extend(
            np.empty((n_samples, n_fan_out)) for n_fan_out in layer_units[1:])

        deltas = [np.empty_like(a_layer) for a_layer in activations]
        coef_grads = [
            np.empty((n_fan_in, n_fan_out))
            for n_fan_in, n_fan_out in zip(layer_units[:-1], layer_units[1:])
        ]

        intercept_grads = [
            np.empty(n_fan_out) for n_fan_out in layer_units[1:]
        ]

        # START LBFGS algorithm
        # Store meta information for the parameters
        self._coef_indptr = []
        self._intercept_indptr = []
        start = 0

        # Save sizes and indices of coefficients for faster unstacking
        for i in range(self.n_layers_ - 1):
            n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1]

            end = start + (n_fan_in * n_fan_out)
            self._coef_indptr.append((start, end, (n_fan_in, n_fan_out)))
            start = end

        # Save sizes and indices of intercepts for faster unstacking
        for i in range(self.n_layers_ - 1):
            end = start + layer_units[i + 1]
            self._intercept_indptr.append((start, end))
            start = end

        # enable pretty output for l_bfgs_b
        iprint = 1

        # Run L-BFGS-B optimization
        stacked_coef_inter = stack(self.layers_coef, self.layers_intercept)

        optimal_parameters, self.cost, d = fmin_l_bfgs_b(
            x0=stacked_coef_inter,
            func=self._cost_grad_lbfgs,
            maxfun=self.max_iter,
            iprint=iprint,
            pgtol=self.tol,
            args=(X, y, activations, deltas, coef_grads, intercept_grads))

        self._unstack(optimal_parameters)

        return self

    def decision_function(self, X):
        hidden_layers_size = list(self.hidden_layers_size)

        layer_units = [X.shape[1]] + hidden_layers_size + [self.n_outputs]

        # Initialize layers
        activations = []
        activations.append(X)

        for i in range(self.n_layers_ - 1):
            activations.append(np.empty((X.shape[0], layer_units[i + 1])))
        # forward propagate
        self._forward_pass(activations, with_output_activation=False)
        y_pred = activations[-1]

        if self.n_outputs == 1:
            return y_pred.ravel()
        else:
            return y_pred

    def predict(self, X):
        y_scores = self.decision_function(X)
        y_scores = rectified_linear_unit(y_scores)

        return self.label_binarizer_.inverse_transform(y_scores)
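
# Hypothetical usage sketch (not part of the original): assumes the helper
# functions referenced above (stack, rectified_linear_unit,
# rectified_linear_unit_derivative, safe_sparse_dot, log_loss, shuffle,
# check_random_state, fmin_l_bfgs_b) are available in this module.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X_digits, y_digits = load_digits(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X_digits, y_digits, random_state=0)
clf = MatheusAlvesMLP({"hidden_layers_size": (64,), "max_iter": 300})
clf.fit(X_tr, y_tr)
print("test accuracy:", np.mean(clf.predict(X_te) == y_te))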
Exemple #56
0
def trainFMFTRL(moddict):

    merge = pd.read_csv(trn_file, sep='\t', encoding='utf-8')
    #test = pd.read_csv(tst_file, sep='\t', encoding='utf-8')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', merge.shape)

    dftt = merge[(merge.price < 1.0)]
    merge = merge.drop(merge[(merge.price < 1.0)].index)
    del dftt['price']
    nrow_train = merge.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(merge["price"])
    merge = pd.concat([merge, dftt])
    merge['target'] = np.log1p(merge["price"])
    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(merge[:nrow_train].shape[0]),
                                      random_state=233,
                                      train_size=0.90)
    gc.collect()
    cpuStats()

    merge = prepFMFeatures(merge)
    cpuStats()
    merge.head()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    moddict['cross_cols'] = {}
    for i in range(0, len(cross_nm)):
        moddict['cross_cols'][cross_nm[i]] = LabelBinarizer(sparse_output=True)
        moddict['cross_cols'][cross_nm[i]].fit(merge[cross_nm[i]])
        if i == 0:
            x_col = moddict['cross_cols'][cross_nm[i]].transform(
                merge[cross_nm[i]])
        else:
            x_col = hstack(
                (x_col, moddict['cross_cols'][cross_nm[i]].fit_transform(
                    merge[cross_nm[i]])))
        del merge[cross_nm[i]]
    gc.collect()
    cpuStats()
    '''
    Hash name
    '''
    moddict['wb_name'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams":
                                                 2,
                                                 "hash_ngrams_weights":
                                                 [1.5, 1.0],
                                                 "hash_size":
                                                 2**29,
                                                 "norm":
                                                 None,
                                                 "tf":
                                                 'binary',
                                                 "idf":
                                                 None,
                                                 'verbose':
                                                 1,
                                             }),
                                             procs=8)
    moddict['wb_name'].dictionary_freeze = True
    X_name = moddict['wb_name'].fit_transform(merge['name'])
    moddict['wb_name_mask'] = np.where(
        X_name[:nrow_train].getnnz(axis=0) > 0)[0]
    X_name = X_name[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category #2
    '''
    moddict['wb_cat'] = wordbatch.WordBatch(normalize_text,
                                            extractor=(WordBag, {
                                                "hash_ngrams":
                                                2,
                                                "hash_ngrams_weights":
                                                [1.0, 1.0],
                                                "hash_size":
                                                2**20,
                                                "norm":
                                                None,
                                                "tf":
                                                'binary',
                                                "idf":
                                                None,
                                            }),
                                            procs=4)
    moddict['wb_cat'].dictionary_freeze = True
    ### This must be the full dataset
    #cats = merge["category_name"].str.replace('/', ' ').unique()
    moddict['wb_cat'].fit(categories)
    X_cat_tmp = moddict['wb_cat'].transform(categories)
    moddict['wb_cat_dict'] = dict([
        (c, X_cat_tmp.getrow(row))
        for (c, row) in zip(categories.tolist(), range(len(categories)))
    ])
    X_cat = vstack(([
        moddict['wb_cat_dict'][c]
        for c in merge["category_name"].str.replace('/', ' ')
    ]))
    #moddict['wb_cat_mask'] = np.array(np.clip(X_cat[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool)
    moddict['wb_cat_mask'] = np.where(X_cat[:nrow_train].getnnz(axis=0) > 0)[0]
    X_cat = X_cat[:, moddict['wb_cat_mask']]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''

    moddict['wb_cat_ctgc'] = CountVectorizer()
    moddict['wb_cat_ctgc'].fit(merge['general_cat'])
    X_category1 = moddict['wb_cat_ctgc'].transform(merge['general_cat'])
    moddict['wb_cat_ctsc1'] = CountVectorizer()
    moddict['wb_cat_ctsc1'].fit(merge['subcat_1'])
    X_category2 = moddict['wb_cat_ctsc1'].transform(merge['subcat_1'])
    moddict['wb_cat_ctsc2'] = CountVectorizer()
    moddict['wb_cat_ctsc2'].fit(merge['subcat_2'])
    X_category3 = moddict['wb_cat_ctsc2'].transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    moddict['wb_dscr'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams":
                                                 2,
                                                 "hash_ngrams_weights":
                                                 [1.0, 0.6],
                                                 "hash_size":
                                                 2**28,
                                                 "norm":
                                                 None,
                                                 "tf":
                                                 'binary',
                                                 "idf":
                                                 None
                                             }),
                                             procs=8)
    moddict['wb_dscr'].dictionary_freeze = True
    X_description = moddict['wb_dscr'].fit_transform(merge['name'] + ' ' +
                                                     merge['item_description'])
    #moddict['wb_dscr_mask'] = np.array(np.clip(X_description[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool)
    moddict['wb_dscr_mask'] = np.where(
        X_description[:nrow_train].getnnz(axis=0) > 1)[0]
    X_description = X_description[:, moddict['wb_dscr_mask']]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

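    # Sparse one-hot encoding of `brand_name`, fit on the training rows only;
    # brands that only occur in the test rows are transformed to all-zero vectors.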
    moddict['wb_brandname'] = LabelBinarizer(sparse_output=True)
    moddict['wb_brandname'].fit(merge['brand_name'][:nrow_train])
    X_brand = moddict['wb_brandname'].transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    moddict['wb_itemcond'] = LabelBinarizer(sparse_output=True)
    moddict['wb_itemcond'].fit(merge['item_condition_id'][:nrow_train])
    X_itemcond = moddict['wb_itemcond'].transform(merge['item_condition_id'])
    print('[{}] Label binarize `item_condition_id` completed.'.format(
        time.time() - start_time))

    moddict['wb_shipping'] = LabelBinarizer(sparse_output=True)
    moddict['wb_shipping'].fit(merge['shipping'][:nrow_train])
    X_shipping = moddict['wb_shipping'].transform(merge['shipping'])
    print('[{}] Label binarize `shipping` completed.'.format(time.time() -
                                                             start_time))

    print(
        X_itemcond.shape,
        X_shipping.shape,  #X_dummies.shape, 
        X_description.shape,
        X_brand.shape,
        X_category1.shape,
        X_category2.shape,
        X_category3.shape,
        X_name.shape,
        X_cat.shape,
        x_col.shape)
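    # Stack all feature blocks side by side into one sparse matrix (CSR).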
    sparse_merge = hstack((
        X_itemcond,
        X_shipping,  #X_dummies, 
        X_description,
        X_brand,
        X_category1,
        X_category2,
        X_category3,
        X_name,
        X_cat,
        x_col)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    print(50 * '-')
    cpuStats()
    print(50 * '-')
    # Keep only the training rows for fitting the model
    print(sparse_merge.shape)
    gc.collect()
    sparse_merge, y = sparse_merge[:nrow_train], y[:nrow_train]
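    # NOTE: train_X / valid_X are only defined inside the develop branch below,
    # so the FM_FTRL training loop further down assumes develop=True.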
    if develop:
        train_X, valid_X, train_y, valid_y = (sparse_merge[trnidx],
                                              sparse_merge[validx],
                                              y.values[trnidx],
                                              y.values[validx])
        del sparse_merge
        gc.collect()
    print(50 * '*')
    cpuStats()
    print(50 * '*')
    print(train_X.shape[1])
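    # Factorization machine trained with FTRL (wordbatch's FM_FTRL): sparse linear
    # term plus D_fm=200 latent factors; identity link because the target is
    # already on the log scale (scores below are computed with expm1 + RMSLE).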
    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=train_X.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=4)  #iters=15

    print(50 * '|')
    cpuStats()
    print(50 * '|')
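    # Manual early stopping: fit one pass per round (iters=1) and keep training
    # while the dev RMSLE improves by at least 0.0004, for at most 15 rounds.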
    baseline = 1.
    for i in range(15):
        print(50 * '-')
        cpuStats()
        print(50 * '-')
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline - 0.0004:
            baseline = score_
        else:
            break

    moddict['FMmodel'] = model

    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = moddict['FMmodel'].predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y),
                                          np.expm1(predsfm)))
    gc.collect()

    return merge, moddict, trnidx, validx, nrow_train, predsfm
Exemple #57
0
class_names = [
    image_path.split(os.path.sep)[LABEL_PATH_INDEX]
    for image_path in image_paths
]
class_names = [str(x) for x in np.unique(class_names)]

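# AspectAwarePreprocessor resizes each image to WIDTH x HEIGHT while preserving
# the aspect ratio; ImageToArrayPreprocessor converts images to Keras-ordered arrays.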
aap = AspectAwarePreprocessor(WIDTH, HEIGHT)
iap = ImageToArrayPreprocessor()

sdl = SimpleDatasetLoader([aap, iap])
data, labels = sdl.load(image_paths, verbose=500)
data = data.astype('float') / 255.0

train_X, test_X, train_y, test_y = train_test_split(data,
                                                    labels,
                                                    test_size=0.25)
label_binarizer = LabelBinarizer()
train_y = label_binarizer.fit_transform(train_y)
test_y = label_binarizer.transform(test_y)

aug = ImageDataGenerator(rotation_range=30,
                         width_shift_range=0.1,
                         height_shift_range=0.1,
                         shear_range=0.2,
                         zoom_range=0.2,
                         horizontal_flip=True,
                         fill_mode='nearest')

print('[INFO] compiling model')
model = MiniVGGNet.build(width=WIDTH,
                         height=HEIGHT,
                         depth=3,
                         classes=len(class_names))
Exemple #58
0
train_posts = data['news'][:train_size]
train_tags = data['category'][:train_size]
train_files_names = data['filename'][:train_size]

test_posts = data['news'][train_size:]
test_tags = data['category'][train_size:]
test_files_names = data['filename'][train_size:]

# define Tokenizer with Vocab Size
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)

x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')

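# LabelBinarizer one-hot encodes the category labels; fitting on the training
# tags only keeps the same label-to-column mapping for the test split.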
encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.summary()
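
# The original example stops after model.summary(); below is a minimal,
# illustrative compile/fit sketch (optimizer, batch size, epoch count and
# validation split are assumptions, not taken from the source).
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    batch_size=32,
                    epochs=5,
                    validation_split=0.1)
print(model.evaluate(x_test, y_test, batch_size=32))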
Exemple #59
0
def test_gradient():
    # Test gradient.

    # This makes sure that the activation functions and their derivatives
    # are correct. The numerical and analytical computation of the gradient
    # should be close.
    for n_labels in [2, 3]:
        n_samples = 5
        n_features = 10
        random_state = np.random.RandomState(seed=42)
        X = random_state.rand(n_samples, n_features)
        y = 1 + np.mod(np.arange(n_samples) + 1, n_labels)
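        # LabelBinarizer maps the integer labels {1, ..., n_labels} to a one-hot
        # target matrix (a single 0/1 column when n_labels == 2).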
        Y = LabelBinarizer().fit_transform(y)

        for activation in ACTIVATION_TYPES:
            mlp = MLPClassifier(activation=activation,
                                hidden_layer_sizes=10,
                                solver='lbfgs',
                                alpha=1e-5,
                                learning_rate_init=0.2,
                                max_iter=1,
                                random_state=1)
            mlp.fit(X, y)

            theta = np.hstack(
                [l.ravel() for l in mlp.coefs_ + mlp.intercepts_])

            layer_units = ([X.shape[1]] + [mlp.hidden_layer_sizes] +
                           [mlp.n_outputs_])

            activations = []
            deltas = []
            coef_grads = []
            intercept_grads = []

            activations.append(X)
            for i in range(mlp.n_layers_ - 1):
                activations.append(np.empty((X.shape[0], layer_units[i + 1])))
                deltas.append(np.empty((X.shape[0], layer_units[i + 1])))

                fan_in = layer_units[i]
                fan_out = layer_units[i + 1]
                coef_grads.append(np.empty((fan_in, fan_out)))
                intercept_grads.append(np.empty(fan_out))

            # analytically compute the gradients
            def loss_grad_fun(t):
                return mlp._loss_grad_lbfgs(t, X, Y, activations, deltas,
                                            coef_grads, intercept_grads)

            [value, grad] = loss_grad_fun(theta)
            numgrad = np.zeros(np.size(theta))
            n = np.size(theta, 0)
            E = np.eye(n)
            epsilon = 1e-5
            # numerically compute the gradients
            for i in range(n):
                dtheta = E[:, i] * epsilon
                numgrad[i] = ((loss_grad_fun(theta + dtheta)[0] -
                               loss_grad_fun(theta - dtheta)[0]) /
                              (epsilon * 2.0))
            assert_almost_equal(numgrad, grad)

def training_summary(H):
    N = EPOCHS
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(np.arange(0, N), H.history["loss"], label="train_loss")
    plt.plot(np.arange(0, N), H.history["val_loss"], label="val_loss")
    plt.plot(np.arange(0, N), H.history["accuracy"], label="train_acc")
    plt.plot(np.arange(0, N), H.history["val_accuracy"], label="val_acc")
    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend(loc="lower left")
    plt.savefig(args["plot"])


if __name__ == '__main__':
    INIT_LR = 1e-4
    EPOCHS = 20
    BS = 32
    args = argument_parser()
    lb = LabelBinarizer()
    path = list(paths.list_images(args["dataset"]))
    data, labels = get_images(path)
    labels = one_hot_encoder(lb, labels)
    aug = augmetation_image()
    xtrain, xtest, ytrain, ytest = data_splitting(data, labels)
    model, H = model_construction(aug, xtrain, xtest, ytrain, ytest)
    training_summary(H)