def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    os.chdir('C:\\Users\\Dan\\1) Python Notebooks\\Datasets')
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
    lb = LabelBinarizer()
    y_train = np.array([number[0] for number in lb.fit_transform(y_train)])    
    grid_search.fit(X_train, y_train)
    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    predictions = grid_search.predict(X_test)
    # binarize the held-out labels with the already-fitted binarizer so they
    # are comparable with the binary predictions
    y_test = np.array([number[0] for number in lb.transform(y_test)])
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Precision:', precision_score(y_test, predictions))
    print('Recall:', recall_score(y_test, predictions))
def test_label_binarizer_multilabel():
    lb = LabelBinarizer()

    # test input as lists of tuples
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(indicator_mat, got)
    assert_equal(lb.inverse_transform(got), inp)

    # test input as label indicator matrix
    lb.fit(indicator_mat)
    assert_array_equal(indicator_mat,
                       lb.inverse_transform(indicator_mat))

    # regression test for the two-class multilabel case
    lb = LabelBinarizer()

    inp = [[1, 0], [0], [1], [0, 1]]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 1],
                         [1, 1]])
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_equal([set(x) for x in lb.inverse_transform(got)],
                 [set(x) for x in inp])
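Note that recent scikit-learn releases no longer accept sequence-of-sequences input in LabelBinarizer; the multilabel behaviour exercised above now lives in MultiLabelBinarizer. A minimal sketch of the same data with that class (assuming scikit-learn 0.17 or later):

from sklearn.preprocessing import MultiLabelBinarizer

inp = [(2, 3), (1,), (1, 2)]
mlb = MultiLabelBinarizer()
indicator = mlb.fit_transform(inp)        # same indicator matrix as in the test above
print(indicator)                          # [[0 1 1] [1 0 0] [1 1 0]]
print(mlb.inverse_transform(indicator))   # [(2, 3), (1,), (1, 2)]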
Example #3
def initData(filename):
    if not os.path.exists(filename):
        print "I can't find this file: %s"%filename
        sys.exit(1)

    datareader = csv.reader(open(filename, 'r'))
    ct = 0
    for row in datareader:
        ct = ct + 1

    datareader = csv.reader(open(filename, 'r'))
    data = np.array(-1 * np.ones((ct, 7), float), object)
    k = 0

    for row in datareader:
        data[k, :] = np.array(row)
        k = k + 1

    #To modify
    featnames = np.array(ATTRIBUTES,str)

    keys = [[]] * np.size(data, 1)
    numdata = -1 * np.ones_like(data)
    nfeatures = [0]
    featIndex = []

    # convert string objects to integer values for modeling:
    for k in range(np.size(data, 1)):
        keys[k], garbage, numdata[:, k] = np.unique(data[:, k], True, True)

    numrows = np.size(numdata, 0)  # number of instances in car data set
    numcols = np.size(numdata, 1)  # number of columns in car data set
    numdata = np.array(numdata, int)

    xdata = numdata[:, :-1]  # x-data is all data BUT the last column, which holds the class labels
    ydata = numdata[:, -1]   # y-data is the class labels in the final column, selected with -1

    # ------------------ numdata multilabel -> binary conversion for NB-Model ---------------------
    lbin = LabelBinarizer()
    for k in range(np.size(xdata, 1)):  # loop through the columns of xdata
        if k == 0:
            xdata_ml = lbin.fit_transform(xdata[:, k])
            featIndex = lbin.classes_
            nfeatures.append(len(lbin.classes_))
        else:
            xdata_ml = np.hstack((xdata_ml, lbin.fit_transform(xdata[:, k])))
            featIndex = np.hstack((featIndex, lbin.classes_))
            nfeatures.append(nfeatures[-1] + len(lbin.classes_))

    if _VERBOSE:
        print("nfeatures:")
        print(nfeatures)
        print("featIndex:")
        print(featIndex)

    return xdata_ml,xdata,ydata,data,nfeatures,keys,featIndex
def encode_categorical(cat, missing_value = False, option = "binary"):
    # Encodes the categorical features. For N unique categories:
    # cat : the column of categorical values
    # option = 'binary'    : binary (one-hot, orthogonal, thermometer) encoding - N features
    #          'freq'      : occurring frequency (percentage) - 1 feature
    #          'mis_float' : all binary encoded except missing values (vector of floats corresponding to occurrence frequencies) - (N-1) features
    #          'mis_unif'  : all binary encoded except missing values (vector of floats of uniform values) - (N-1) features
    #          'dummy'     : just like binary but one column is removed - (N-1) features
    #          'sum'       : sum (deviation) coding. just like dummy but the all-zeros row becomes all -1 - (N-1) features
    ########## TO DO: Encoding w.r.t. targets
    
    if option == "binary":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)
    elif option == "freq":
        freq_count = itemfreq(cat)
        encoded = np.zeros(len(cat))
        for i in range(freq_count.shape[0]):
            encoded[cat == freq_count[i][0]] = float(freq_count[i][1])/len(encoded)
    elif option == "mis_float":
        if missing_value is False:
            raise ValueError("Provide a missing value for the option 'mis_float'.")
        else:
            lb = LabelBinarizer()
            encoded = lb.fit_transform(cat).astype(float)
            missing_bool = cat == missing_value
            if np.sum(missing_bool) == 0:
                raise ValueError("No such missing value!")
            encoded = np.delete(encoded, np.argmax(encoded[np.argmax(missing_bool),:]), axis = 1)
            encoded[missing_bool,:] = np.sum(encoded[~missing_bool], axis = 0)/float(encoded[~missing_bool].shape[0])
    elif option == "mis_unif":
        if missing_value is False:
            raise ValueError("Provide a missing value for the option 'mis_unif'.")
        else:
            lb = LabelBinarizer()
            encoded = lb.fit_transform(cat).astype(float)
            missing_bool = cat == missing_value
            if np.sum(missing_bool) == 0:
                raise ValueError("No such missing value!")
            encoded = np.delete(encoded, np.argmax(encoded[np.argmax(missing_bool),:]), axis = 1)
            encoded[missing_bool,:] = np.ones(encoded.shape[1]) * 1.0 / encoded.shape[1]
    elif option == "dummy":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)[:,0:-1]
    elif option == "sum":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)
        last_col = encoded[:,-1].astype(bool)
        encoded = encoded[:,0:-1]
        encoded[last_col,:] = -1
    else:
        raise ValueError("No such option!")
    print("Number of unique categorical values : %s" % encoded.shape[1])
        
    return encoded
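A short usage sketch for the encoder above; the input array and option values are illustrative and rely on the same NumPy import the function uses:

cat = np.array(["red", "blue", "red", "green", "blue"])
onehot = encode_categorical(cat, option="binary")   # one 0/1 column per category
freqs = encode_categorical(cat, option="freq")      # one column of relative frequencies
dummy = encode_categorical(cat, option="dummy")     # one-hot with the last column dropped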
Example #5
class MLPClassifier(BaseMLP, ClassifierMixin):
    """ Multilayer Perceptron Classifier.

    Uses a neural network with one hidden layer.


    Parameters
    ----------


    Attributes
    ----------

    Notes
    -----


    References
    ----------"""

    def __init__(
        self, n_hidden=200, lr=0.1, l2decay=0, loss="cross_entropy", output_layer="softmax", batch_size=100, verbose=0
    ):
        super(MLPClassifier, self).__init__(n_hidden, lr, l2decay, loss, output_layer, batch_size, verbose)

    def fit(self, X, y, max_epochs=10, shuffle_data=False):
        self.lb = LabelBinarizer()
        one_hot_labels = self.lb.fit_transform(y)
        super(MLPClassifier, self).fit(X, one_hot_labels, max_epochs, shuffle_data)
        return self

    def predict(self, X):
        prediction = super(MLPClassifier, self).predict(X)
        return self.lb.inverse_transform(prediction)
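A hypothetical usage sketch, assuming BaseMLP implements the underlying optimisation; the dataset and hyperparameter values are only illustrative:

from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)
clf = MLPClassifier(n_hidden=64, lr=0.05, batch_size=50)
clf.fit(X, y, max_epochs=20)   # labels are one-hot encoded internally via LabelBinarizer
print(clf.predict(X[:10]))     # predictions are inverse-transformed back to the original labels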
Example #6
def test_multinomial_loss_ground_truth():
    # n_samples, n_features, n_classes = 4, 2, 3
    n_classes = 3
    X = np.array([[1.1, 2.2], [2.2, -4.4], [3.3, -2.2], [1.1, 1.1]])
    y = np.array([0, 1, 2, 0])
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)

    weights = np.array([[0.1, 0.2, 0.3], [1.1, 1.2, -1.3]])
    intercept = np.array([1., 0, -.2])
    sample_weights = np.array([0.8, 1, 1, 0.8])

    prediction = np.dot(X, weights) + intercept
    logsumexp_prediction = logsumexp(prediction, axis=1)
    p = prediction - logsumexp_prediction[:, np.newaxis]
    loss_1 = -(sample_weights[:, np.newaxis] * p * Y_bin).sum()
    diff = sample_weights[:, np.newaxis] * (np.exp(p) - Y_bin)
    grad_1 = np.dot(X.T, diff)

    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    assert_almost_equal(loss_1, loss_2)
    assert_array_almost_equal(grad_1, grad_2)

    # ground truth
    loss_gt = 11.680360354325961
    grad_gt = np.array([[-0.557487, -1.619151, +2.176638],
                        [-0.903942, +5.258745, -4.354803]])
    assert_almost_equal(loss_1, loss_gt)
    assert_array_almost_equal(grad_1, grad_gt)
Example #7
def run_sr():
    dim = (X_train.shape[1], n_classes)
    lb = LabelBinarizer()
    y_true = lb.fit_transform(y_train)

    sr = SoftmaxRegression(dim)
    sr.fit(X_train, y_true, verbose=1)
Example #8
    def bio_classification_report(y_true, y_pred):
        """
        Classification report for a list of BIO-encoded sequences.
        It computes token-level metrics and discards "O" labels.

        Note that it requires scikit-learn 0.15+ (or a version from
        github master) to calculate averages properly!
        """
        lb = LabelBinarizer()
        y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
        y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = set(lb.classes_) - {'O'}
        tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

        labs = [class_indices[cls] for cls in tagset]

        return((precision_recall_fscore_support(y_true_combined,
                                                y_pred_combined,
                                                labels=labs,
                                                average=None,
                                                sample_weight=None)),
               (classification_report(
                   y_true_combined,
                   y_pred_combined,
                   labels=[class_indices[cls] for cls in tagset],
                   target_names=tagset,
               )), labs)
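A hypothetical call with two short BIO-tagged sentences (assuming itertools.chain and the scikit-learn metrics used above are in scope):

y_true = [["B-PER", "I-PER", "O"], ["B-LOC", "O"]]
y_pred = [["B-PER", "O", "O"], ["B-LOC", "O"]]
scores, report, label_ids = bio_classification_report(y_true, y_pred)
print(report)   # per-tag precision/recall/F1, with "O" excluded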
Example #9
class GBClassifier(_BaseGB, ClassifierMixin):

    def __init__(self, estimator, n_estimators=100,
                 step_size="line_search", learning_rate=0.1,
                 loss="squared_hinge", subsample=1.0,
                 callback=None, random_state=None):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.step_size = step_size
        self.learning_rate = learning_rate
        self.loss = loss
        self.subsample = subsample
        self.callback = callback
        self.random_state = random_state

    def _get_loss(self):
        losses = dict(squared_hinge=_SquaredHingeLoss(),
                      log=_LogLoss())
        return losses[self.loss]

    def fit(self, X, y):
        self._lb = LabelBinarizer(neg_label=-1)
        Y = self._lb.fit_transform(y)
        return super(GBClassifier, self).fit(X, Y)

    def predict(self, X):
        pred = self.decision_function(X)
        return self._lb.inverse_transform(pred)
Example #10
class ElasticNetClassifier(LinearClassifierMixin, ElasticNet):
    """Class to extend elastic-net in case of classification."""

    def fit(self, X, y, check_input=True):
        self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        Y = self._label_binarizer.fit_transform(y)
        if self._label_binarizer.y_type_.startswith('multilabel'):
            # we don't (yet) support multi-label classification in ENet
            raise ValueError(
                "%s doesn't support multi-label classification" % (
                    self.__class__.__name__))

        # Y = column_or_1d(Y, warn=True)
        super(ElasticNetClassifier, self).fit(X, Y)
        if self.classes_.shape[0] > 2:
            ndim = self.classes_.shape[0]
        else:
            ndim = 1
        self.coef_ = self.coef_.reshape(ndim, -1)

        return self

    @property
    def classes_(self):
        return self._label_binarizer.classes_
def binarize_seqfeature(X):
    """
    Binarizes the sequence features into 1s and 0s.
    
    Parameters:
    ===========
    - X: (pandas DataFrame) the sequence feature matrix without drug resistance values.
    
    Returns:
    ========
    - binarized:     (pandas DataFrame) a binarized sequence feature matrix with columns corresponding to particular amino acids at each position.
    - binarizers:    (dictionary) a dictionary of binarizer objects for each position.
    """
    binarized = pd.DataFrame()
    binarizers = dict()
    for col in X.columns:
        lb = LabelBinarizer()
        binarized_cols = lb.fit_transform(X[col])
        if len(lb.classes_) == 2:
            binarized[col] = pd.Series(binarized_cols[:, 0])
        else:
            for i, c in enumerate(lb.classes_):
                binarized[col + "_" + c] = binarized_cols[:, i]
        binarizers[col] = lb

    return binarized, binarizers
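An illustrative call with a tiny sequence matrix, assuming the pandas import used above (single-letter amino acid codes per position):

seqs = pd.DataFrame({"pos1": ["A", "G", "A", "C"], "pos2": ["T", "T", "G", "G"]})
binarized, binarizers = binarize_seqfeature(seqs)
print(binarized.head())   # pos1_A, pos1_C, pos1_G columns plus a single 0/1 column for pos2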
Example #12
def bio_classification_report(y_true, y_pred):

    lb = LabelBinarizer()
    y_true_combined = 1 - lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = list(chain.from_iterable(y_pred))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    print('True sum %d Pred sum %d Len %d' % (sum(y_true_combined), sum(y_pred_combined), len(y_pred_combined)))
    print("AUC\tP-R: %.4f\tROC: %.4f" % (average_precision_score(y_true_combined, y_pred_combined, average=None),
                                         roc_auc_score(y_true_combined, y_pred_combined, average=None)))
    #plt.figure()
    #fpr, tpr, thr = roc_curve(y_true_combined, y_pred_combined)
    #area = auc(fpr, tpr)
    #plt.plot(fpr, tpr, label='{area:.3f}'.format( area=area))
    #plt.legend(loc=4)
    #plt.savefig('sub3.jpg')

    return classification_report(
        1 - y_true_combined,
        [0 if v > 0.1 else 1 for v in y_pred_combined],
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
Example #13
def just_categorical(dropped):

    # create initial matrix
    print('starting with m0')
    lb = LabelBinarizer(sparse_output=True)
    m = lb.fit_transform(dropped.restaurant_id)
    print(m.shape)

    # build matrix
    # making nan its own category for categorical
    print("adding categorical to matrix")
    m = add_categorical_to_matrix(m, dropped, ['review_stars', 'user_name', 'restaurant_stars', 'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol', 'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage', 'restaurant_attributes_noise_level', 'restaurant_attributes_smoking', 'restaurant_attributes_wifi', 'restaurant_city',  'restaurant_hours_friday_close', 'restaurant_hours_friday_open', 'restaurant_hours_monday_close', 'restaurant_hours_monday_open', 'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open', 'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open', 'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open', 'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open', 'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open', 'restaurant_ambience', 'restaurant_music', 'restaurant_parking', 'restaurant_street', 'restaurant_zipcode',  'inspection_year', 'inspection_month', 'inspection_day', 'inspection_dayofweek', 'inspection_quarter',])
    print(m.shape)

    print("adding bool to matrix")
    m = add_categorical_to_matrix(m, dropped, ['restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob', 'restaurant_attributes_caters', 'restaurant_attributes_coat_check', 'restaurant_attributes_corkage', 'restaurant_attributes_delivery', 'restaurant_attributes_dietary_restrictions_dairy_free', 'restaurant_attributes_dietary_restrictions_gluten_free', 'restaurant_attributes_dietary_restrictions_halal', 'restaurant_attributes_dietary_restrictions_kosher', 'restaurant_attributes_dietary_restrictions_soy_free', 'restaurant_attributes_dietary_restrictions_vegan', 'restaurant_attributes_dietary_restrictions_vegetarian', 'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr', 'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups', 'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch', 'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner', 'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch', 'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour', 'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours', 'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating',  'restaurant_attributes_payment_types_amex', 'restaurant_attributes_payment_types_cash_only', 'restaurant_attributes_payment_types_discover', 'restaurant_attributes_payment_types_mastercard', 'restaurant_attributes_payment_types_visa', 'restaurant_attributes_take_out',  'restaurant_attributes_takes_reservations', 'restaurant_attributes_waiter_service', 'restaurant_attributes_wheelchair_accessible', ])
    print(m.shape)

    print("adding restaurant categories to matrix")
    cats = ['restaurant_category_1', 'restaurant_category_2', 'restaurant_category_3', 'restaurant_category_4', 'restaurant_category_5', 'restaurant_category_6', 'restaurant_category_7']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("adding restaurant neighborhoods to matrix")
    cats = ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("matrix shape of {}".format(m.shape))
    joblib.dump(m, 'pickle_jar/categorical_matrix')
    def transform(self, data_dict):
        listOfUnits = ["kilogram", "kg", "gram", "[GMgmkK]?Hz", "liter", "ml",
                       "cup", "cm", "foot", "inch", "meter", "mg", "gallon", "milliliter", "[MGTmgtKk]B"]
        regex = r"[\d]+\.[\d]+(" + r"[\b/,-]|".join(listOfUnits) + r")"
        data = data_dict[self.key].str.extract(regex, flags=re.IGNORECASE, expand=False).str.lower()
        lb = LabelBinarizer()
        return lb.fit_transform(data.fillna(""))
Example #15
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    # Clipping
    Y = np.clip(y_pred, eps, 1 - eps)

    # This happens in cases when elements in y_pred have type "str".
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # If y_pred is of single dimension, assume y_true to be binary
    # and then check.
    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Check if dimensions are consistent.
    val.check_consistent_length(T, Y)
    T = val.check_array(T)
    Y = val.check_array(Y)
    print(T)
    print(Y)
    if T.shape[1] != Y.shape[1]:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (T.shape[1], Y.shape[1]))

    # Renormalize
    Y /= Y.sum(axis=1)[:, np.newaxis]
    loss = -(T * np.log(Y)).sum(axis=1)

    return _weighted_sum(loss, sample_weight, normalize)
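A quick sanity check of the metric above, assuming the validation helpers it relies on (val, _weighted_sum) are available; the labels and probabilities are made up:

y_true = ["spam", "ham", "spam"]
y_pred = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])  # columns ordered alphabetically: ham, spam
print(log_loss(y_true, y_pred))   # roughly 0.23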
Example #16
def full_matrix(dropped):
    # create initial matrix
    print('starting with m0')
    lb = LabelBinarizer(sparse_output=True)
    # m = lb.fit_transform(dropped.restaurant_id)
    m = lb.fit_transform(dropped.user_name)
    print(m.shape)
    # build matrix
    # making nan its own category for categorical
    print("adding categorical to matrix")
    m = add_categorical_to_matrix(m, dropped, ['review_stars', 'user_name', 'restaurant_stars', 'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol', 'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage', 'restaurant_attributes_noise_level', 'restaurant_attributes_smoking', 'restaurant_attributes_wifi', 'restaurant_city',  'restaurant_hours_friday_close', 'restaurant_hours_friday_open', 'restaurant_hours_monday_close', 'restaurant_hours_monday_open', 'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open', 'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open', 'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open', 'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open', 'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open', 'restaurant_ambience', 'restaurant_music', 'restaurant_parking', 'restaurant_street', 'restaurant_zipcode',  'inspection_year', 'inspection_month', 'inspection_day', 'inspection_dayofweek', 'inspection_quarter',])
    print(m.shape)

    print("adding bool to matrix")
    m = add_categorical_to_matrix(m, dropped, ['restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob', 'restaurant_attributes_caters', 'restaurant_attributes_coat_check', 'restaurant_attributes_corkage', 'restaurant_attributes_delivery', 'restaurant_attributes_dietary_restrictions_dairy_free', 'restaurant_attributes_dietary_restrictions_gluten_free', 'restaurant_attributes_dietary_restrictions_halal', 'restaurant_attributes_dietary_restrictions_kosher', 'restaurant_attributes_dietary_restrictions_soy_free', 'restaurant_attributes_dietary_restrictions_vegan', 'restaurant_attributes_dietary_restrictions_vegetarian', 'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr', 'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups', 'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch', 'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner', 'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch', 'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour', 'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours', 'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating',  'restaurant_attributes_payment_types_amex', 'restaurant_attributes_payment_types_cash_only', 'restaurant_attributes_payment_types_discover', 'restaurant_attributes_payment_types_mastercard', 'restaurant_attributes_payment_types_visa', 'restaurant_attributes_take_out',  'restaurant_attributes_takes_reservations', 'restaurant_attributes_waiter_service', 'restaurant_attributes_wheelchair_accessible', ])
    print(m.shape)

    m = add_numerical_to_matrix(m, dropped, ['review_votes_cool', 'review_votes_funny', 'review_votes_useful', 'user_average_stars', 'user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot', 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain', 'user_compliments_profile', 'user_compliments_writer', 'user_fans', 'user_review_count', 'user_votes_cool', 'user_votes_funny', 'user_votes_useful', 'restaurant_attributes_price_range', 'restaurant_latitude', 'restaurant_longitude', 'restaurant_review_count', 'checkin_counts', 'review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'neu', 'pos', 'compound', 'user_yelping_since_delta','manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold'])
    print(m.shape)

    print("adding restaurant categories to matrix")
    cats = ['restaurant_category_1', 'restaurant_category_2', 'restaurant_category_3', 'restaurant_category_4', 'restaurant_category_5', 'restaurant_category_6', 'restaurant_category_7']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("adding restaurant neighborhoods to matrix")
    cats = ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("matrix shape of {}".format(m.shape))
    joblib.dump(m, 'pickle_jar/full_matrix')
Example #17
    def fit(self, X, y):
        """
        performs one step of gradient descent
        """
        # get the dimensions of our data
        n_samples, n_features = X.shape[0], X.shape[1]+1
        n_targets = len(np.unique(y))

        # add a column to the data matrix to incorporate the bias term
        X = np.c_[np.ones(n_samples), X]
        
        # one-vs-all labeling
        lb = LabelBinarizer()
        y = lb.fit_transform(y)
        
        # initialize the weights 
        if self.W is None:
            self.W = np.zeros( (n_features, n_targets) )
       
        # perform the optimization using gradient descent with momentum
        grad = self.gradient(X,y)
        self.W = self.W - self.learning_rate*(grad + self.momentum*self.prev_grad)
        self.prev_grad = grad

        return self.loss(X,y)
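Because each call to fit performs a single update, training is just a loop over it; a hypothetical sketch, assuming model is an instance of the class this method belongs to, initialised with W = None, prev_grad = 0 and the usual learning_rate/momentum attributes:

for epoch in range(100):
    loss = model.fit(X, y)   # one momentum-accelerated gradient step per call
    if epoch % 10 == 0:
        print(epoch, loss)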
def run():
    # Load and preprocess data
    label_to_unique_instance = load_data()
    X, Y = preprocess_data(label_to_unique_instance)

    # Encode labels
    label_binarizer = LabelBinarizer()
    transformed_Y = label_binarizer.fit_transform(Y)

    # Cross validation
    cross_validation_iterator = StratifiedShuffleSplit(Y, n_iter=1, test_size=0.4, random_state=0)
    for train_index, test_index in cross_validation_iterator:
        break

    # Init model
    model = init_model(raw_feature_dim=X.shape[-1], unique_lable_num=len(label_binarizer.classes_))

    # Training procedure
    model.fit(X[train_index], transformed_Y[train_index],
              batch_size=BATCH_SIZE, nb_epoch=MAXIMUM_EPOCH_NUM,
              validation_data=(X[test_index], transformed_Y[test_index]),
              callbacks=[TensorBoard(log_dir="/tmp/Sequence Classification")],
              verbose=2)

    print("All done!")
Example #19
def test_multinomial_loss():
    # test if the multinomial loss and gradient computations are consistent
    X, y = iris.data, iris.target.astype(np.float64)
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))

    rng = check_random_state(42)
    weights = rng.randn(n_features, n_classes)
    intercept = rng.randn(n_classes)
    sample_weights = rng.randn(n_samples)
    np.abs(sample_weights, sample_weights)

    # compute loss and gradient like in multinomial SAG
    dataset, _ = make_dataset(X, y, sample_weights, random_state=42)
    loss_1, grad_1 = _multinomial_grad_loss_all_samples(dataset, weights,
                                                        intercept, n_samples,
                                                        n_features, n_classes)
    # compute loss and gradient like in multinomial LogisticRegression
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)
    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    # comparison
    assert_array_almost_equal(grad_1, grad_2)
    assert_almost_equal(loss_1, loss_2)
Example #20
def report(test_y, pred_y):
    lb = LabelBinarizer()
    test_y_combined = lb.fit_transform(list(chain.from_iterable(test_y)))
    pred_y_combined = lb.transform(list(chain.from_iterable(pred_y)))
    tagset = sorted(set(lb.classes_))
    class_indices = {cls: idx for idx, cls in enumerate(tagset)}
    print(classification_report(test_y_combined, pred_y_combined, labels=[class_indices[cls] for cls in tagset], target_names=tagset))
Example #21
def iris_demo():
    # load the iris dataset
    iris = load_iris()
    X = iris['data']
    y_labels = iris['target']

    lb = LabelBinarizer()
    y = lb.fit_transform(y_labels)

    # split into training, validation and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.25,
                                                        random_state=RANDOM_STATE)

    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                          test_size=0.25,
                                                          random_state=RANDOM_STATE)

    # train the neural net
    print("Building logistic regression classifier to classify iris data")
    nn = pynn.ArtificialNeuralNet([X_train.shape[1], 20, y_train.shape[1]])
    print("Training")
    nn.fit(X_train, y_train, X_valid, y_valid,
           batch_size=20, n_epochs=20, learning_rate=0.05,
           random_state=RANDOM_STATE)

    y_pred = nn.predict(X_test)

    print("iris accuracy: {}%".format(
        accuracy_score(y_test.argmax(1), y_pred.argmax(1)) * 100))
Example #22
def conv_demo():
    # load the digits dataset
    digits = load_digits()
    X = digits['data']
    y_labels = digits['target']

    lb = LabelBinarizer()
    y = lb.fit_transform(y_labels)

    # split into training, validation and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.25,
                                                        random_state=RANDOM_STATE)

    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                          test_size=0.25,
                                                          random_state=RANDOM_STATE)

    # train the neural net
    print("Building neural net to classify digits")
    conv_net = pynn.ConvNet(digits['images'][0].shape, 1, y.shape[1],
                            random_state=RANDOM_STATE)
    print("Training")
    conv_net.fit(X_train, y_train, X_valid, y_valid,
                 batch_size=20, n_epochs=20, learning_rate=0.05)

    y_pred = conv_net.predict(X_test)

    print("digits accuracy: {}%".format(
        accuracy_score(y_test.argmax(1), y_pred.argmax(1)) * 100))
def bio_classification_report(y_true, y_pred):
    """Evaluates entity extraction accuracy.

    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    Taken from https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
    """
    from sklearn.preprocessing import LabelBinarizer
    from itertools import chain
    from sklearn.metrics import classification_report

    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
            y_true_combined,
            y_pred_combined,
            labels=[class_indices[cls] for cls in tagset],
            target_names=tagset,
    )
def binarize_label_columns(df, columns, two_classes_as='single'):
    '''
    Inputs:
        df: Pandas dataframe object.
        columns: Columns to binarize.
        two_classes_as: How to handle two classes, as 'single' or 'multiple' columns.
    Returns a tuple with the following items:
        df: Pandas dataframe object with new columns.
        binlabel_names: Names of the newly created binary variables.
        lb_objects: a dictionary with columns as keys and sklearn.preprocessing.LabelBinarizer 
        objects as values.
    '''
    binlabel_names = []
    lb_objects = {}
    for col in columns:
        if len(df[col].unique()) > 1: 
            rows_notnull = df[col].notnull() # Use only valid feature observations
            lb = LabelBinarizer()
            binclass = lb.fit_transform(df[col][rows_notnull]) # Fit & transform on valid observations
            if len(lb.classes_) == 2 and two_classes_as == 'multiple':
                binclass = np.hstack((1 - binclass, binclass))
            lb_objects[col] = lb
            if len(lb.classes_) > 2 or two_classes_as == 'multiple':
                col_binlabel_names = [col+'_'+str(c) for c in lb.classes_]
                binlabel_names += col_binlabel_names # Names for the binarized classes
                for n in col_binlabel_names: df[n] = np.NaN # Initialize columns
                df.loc[rows_notnull, col_binlabel_names] = binclass # Merge binarized data
            elif two_classes_as == 'single': 
                binlabel_names.append(col+'_bin') # Names for the binarized classes
                df[col+'_bin'] = np.NaN # Initialize columns
                df.loc[rows_notnull, col+'_bin'] = binclass # Merge binarized data
    return df, binlabel_names, lb_objects
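An illustrative call, assuming a pandas DataFrame with one two-class column and one multi-class column:

df = pd.DataFrame({"sex": ["M", "F", "M", None], "blood": ["A", "B", "O", "A"]})
df, names, lbs = binarize_label_columns(df, ["sex", "blood"], two_classes_as="multiple")
print(names)   # ['sex_F', 'sex_M', 'blood_A', 'blood_B', 'blood_O']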
def scorer_auc(y_true, y_pred):
    """Dedicated to 2-class probabilistic outputs."""
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import LabelBinarizer
    le = LabelBinarizer()
    y_true = le.fit_transform(y_true)
    return roc_auc_score(y_true, y_pred)
Example #26
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Note: This function was copied from
    http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

    Args:
        y_true: True labels, list of strings
        y_pred: Predicted labels, list of strings
    Returns:
        classification report as string
    """
    lbin = LabelBinarizer()
    y_true_combined = lbin.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lbin.transform(list(chain.from_iterable(y_pred)))

    #tagset = set(lbin.classes_) - {NO_NE_LABEL}
    tagset = set(lbin.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lbin.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
class BinaryRelevanceClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, Y):
        # binarize labels
        self.bl = LabelBinarizer()
        Y = self.bl.fit_transform(Y)
        self.classes_ = self.bl.classes_

        # create an estimator for each label
        self.estimators_ = []
        for i in range(self.bl.classes_.shape[0]):
            estimator = clone(self.estimator)
            estimator.fit(X, Y[:, i])
            self.estimators_.append(estimator)

    def predict(self, X):
        self._check_is_fitted()

        X = np.atleast_2d(X)
        Y = np.empty((X.shape[0], self.classes_.shape[0]))
        for i, estimator in enumerate(self.estimators_):
            Y[:, i] = estimator.predict(X).T

        return self.bl.inverse_transform(Y)

    def _check_is_fitted(self):
        if not hasattr(self, "estimators_"):
            raise ValueError("The object hasn't been fitted yet!")
class CategoricalToNumerical(object):

    def __init__(self, dimensionality_reducer=None, verify=True):
        """Takes in a dimensionality reducer in order to convert categorical features into numerical."""
        if dimensionality_reducer is None:
            dimensionality_reducer = RandomizedPCA(1)
        self.dimensionality_reducer = dimensionality_reducer
        self.verify = verify
        self.binarizer = LabelBinarizer()

    def fit(self, X, y=None):
        self._verify(X, self.verify)
        binarized = self.binarizer.fit_transform(X)
        self.dimensionality_reducer.fit(binarized)

    def transform(self, X):
        self._verify(X, False)
        binarized = self.binarizer.transform(X)
        result = self.dimensionality_reducer.transform(binarized).flatten()
        assert X.shape == result.shape
        return result

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def _verify(self, X, verify):
        if verify:
            assert is_categorical(X)
        else:
            assert isinstance(X, np.ndarray)
            assert len(X.shape) == 1
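A hypothetical usage sketch; PCA(1) stands in for the RandomizedPCA default (which only older scikit-learn provides), and verify=False skips the is_categorical check so the example stays self-contained:

from sklearn.decomposition import PCA

colors = np.array(["red", "green", "blue", "red", "green"], dtype=object)
converter = CategoricalToNumerical(dimensionality_reducer=PCA(1), verify=False)
converter.fit(colors)
print(converter.transform(colors))   # one float per original categorical value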
Example #29
	def train(self, X, y):
		n_features = X.shape[1]

		# class_prior = self.class_prior

		# Binarize Y
		labelbin = LabelBinarizer()
		Y = labelbin.fit_transform(y)
		self.classes = labelbin.classes_
		if Y.shape[1] == 1:
			Y = np.concatenate((1 - Y, Y), axis=1)

		n_effective_classes = Y.shape[1]
		self.class_count = np.zeros(n_effective_classes)
		self.feature_count = np.zeros((n_effective_classes, n_features))

		print "Start counting..."
		self.class_count = Y.sum(axis=0)
		print "Finished class counting!"
		print "Start feature counting..."
		self.feature_count = np.dot(Y.T, X)
		print "Finished feature counting!"

		# Apply add-k-smoothing
		print "Start smoothing..."
		self.class_count_smooth = self.class_count + self.k * len(self.classes)
		self.feature_count_smooth = self.feature_count + self.k
		print "Finished smooting!"

		# Convert to log probabilities
		self.feature_log_prob = (np.log(self.feature_count_smooth) - np.log(self.class_count_smooth.reshape(-1,1)))
		self.class_log_prior = np.zeros(len(self.classes)) - np.log(len(self.classes))

		return self
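The snippet stops after computing the log-probabilities; a sketch of the matching prediction step (not part of the original code) would score each class by its joint log-likelihood:

	def predict(self, X):
		# log P(c) + sum over features of count * log P(f|c), one column per class
		jll = np.dot(X, self.feature_log_prob.T) + self.class_log_prior
		return self.classes[np.argmax(jll, axis=1)]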
def get_dataset2(test_fraction):
    """
   @:param: test_fraction used to split train and test
   Vectorizes the features and labels into categorical values and randomly splits into train and test set
   :return: X_train, X_test, y_train, y_test
   """
    data = []
    with open('labels.csv', 'r') as datafile:
        csv_reader = csv.reader(datafile, delimiter=',', quotechar='|')
        for row in csv_reader:
            data.append(row)

    data = numpy.asarray(data)
    X = data[:, 0:data.shape[1]-1]
    y = data[:, data.shape[1]-1]

    # X,y = get_tabledata()

    vec = DictVectorizer()
    feature_dict = [dict(enumerate(x)) for x in X.tolist()]
    X = vec.fit_transform(feature_dict).toarray()
    joblib.dump(vec, 'vectorizer.pkl')

    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    joblib.dump(lb, 'binarizer.pkl')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)
    return X_train, X_test, y_train, y_test
Example #31
def main():
    feature_vectorized_file_name = 'Data/feature_vectorized2'
    if os.path.exists(feature_vectorized_file_name):
        sparse_merge, price = _load(feature_vectorized_file_name)
        print(sparse_merge.shape)
    else:
        ########################################################################
        start_time = time.time()
        merge, submission, price = get_extract_feature()
        merge = merge[:TRAIN_SIZE]

        merge['item_condition_id'] = merge['item_condition_id'].astype(
            'category')
        print('[{}] Convert categorical completed'.format(time.time() -
                                                          start_time))

        # vectorize features
        wb = CountVectorizer()
        X_category2 = wb.fit_transform(merge['category_2'])
        X_category3 = wb.fit_transform(merge['category_name'])
        X_brand2 = wb.fit_transform(merge['brand_name'])
        print(
            '[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                  start_time))

        lb = LabelBinarizer(sparse_output=True)
        X_brand = lb.fit_transform(merge['brand_name'])
        X_category1 = lb.fit_transform(merge['category_1'])
        X_category4 = lb.fit_transform(merge['category_name'])
        print(
            '[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                                 start_time))

        X_dummies = csr_matrix(
            pd.get_dummies(merge[['item_condition_id', 'shipping']],
                           sparse=True).values)

        # hand feature
        for col in merge.columns:
            if ('Len' in col) or ('Frec' in col):
                merge[col] = np.log1p(merge[col])
                merge[col] = merge[col] / merge[col].max()

        hand_feature = [
            'brand_name_Frec', 'item_description_wordLen',
            'brand_name_name_Intsct', 'brand_name_item_description_Intsct'
        ]
        X_hand_feature = merge[hand_feature].values

        name_w1 = param_space_best_WordBatch['name_w1']
        name_w2 = param_space_best_WordBatch['name_w2']
        desc_w1 = param_space_best_WordBatch['desc_w1']
        desc_w2 = param_space_best_WordBatch['desc_w2']

        wb = wordbatch.WordBatch(normalize_text=None,
                                 extractor=(WordBag, {
                                     "hash_ngrams": 2,
                                     "hash_ngrams_weights": [name_w1, name_w2],
                                     "hash_size": 2 ** 28,
                                     "norm": None,
                                     "tf": 'binary',
                                     "idf": None,
                                 }),
                                 procs=8)
        wb.dictionary_freeze = True
        X_name = wb.fit_transform(merge['name'])
        del (wb)
        X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
        print('[{}] Vectorize `name` completed.'.format(time.time() -
                                                        start_time))

        merge['item_description'] = merge['category_2'].map(str)+' .#d3 .#d3 '+\
                                    merge['name'].map(str)+' .#d3 .#d3 '+\
                                    merge['item_description'].map(str)

        wb = wordbatch.WordBatch(normalize_text=None,
                                 extractor=(WordBag, {
                                     "hash_ngrams": 3,
                                     "hash_ngrams_weights": [desc_w1, desc_w2, 0.7],
                                     "hash_size": 2 ** 28,
                                     "norm": "l2",
                                     "tf": 1.0,
                                     "idf": None
                                 }),
                                 procs=8)
        wb.dictionary_freeze = True
        X_description = wb.fit_transform(merge['item_description'])
        del (wb)
        X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 6, 0, 1), dtype=bool)]
        print(
            '[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                  start_time))

        sparse_merge = hstack((X_dummies, X_brand, X_brand2, X_category1,
                               X_category2, X_category3, X_category4,
                               X_hand_feature, X_name, X_description)).tocsr()

        print(X_dummies.shape, X_brand.shape, X_brand2.shape,
              X_category1.shape, X_category2.shape, X_category3.shape,
              X_category4.shape, X_hand_feature.shape, X_name.shape,
              X_description.shape, sparse_merge.shape)

        _save(feature_vectorized_file_name, [sparse_merge, price])
        print('[{}] data saved.'.format(time.time() - start_time))

    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 3 fold cross validation

    # learner_name='best_FTRL'
    # learner_name='FTRL'
    learner_name = 'best_FM_FTRL'
    #learner_name='FM_FTRL'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')

    optimizer = TaskOptimizer(learner_name, sparse_merge, price, logger)
    optimizer.run()

Example #32
INIT_LR = 1e-3
EPOCHS = 10
BS = 128
# grab the MNIST dataset
print("[INFO] accessing MNIST...")
((trainData, trainLabels), (testData, testLabels)) = mnist.load_data()
# add a channel (i.e., grayscale) dimension to the digits
trainData = trainData.reshape((trainData.shape[0], 28, 28, 1))
testData = testData.reshape((testData.shape[0], 28, 28, 1))
# scale data to the range of [0, 1]
trainData = trainData.astype("float32") / 255.0
testData = testData.astype("float32") / 255.0
# convert the labels from integers to vectors
le = LabelBinarizer()
trainLabels = le.fit_transform(trainLabels)
testLabels = le.transform(testLabels)
print("[INFO] compiling model...")
opt = Adam(lr=INIT_LR)
model = SudokuNet.build(width=28, height=28, depth=1, classes=10)
model.compile(loss="categorical_crossentropy",
              optimizer=opt,
              metrics=["accuracy"])
# train the network
print("[INFO] training network...")
H = model.fit(trainData,
              trainLabels,
              validation_data=(testData, testLabels),
              batch_size=BS,
              epochs=EPOCHS,
              verbose=1)
    # load the input image (224x224) and preprocess it
    image = load_img(imagePath, target_size=(224, 224))
    image = img_to_array(image)
    image = preprocess_input(image)

    # update the data and labels lists, respectively
    data.append(image)
    labels.append(label)

# convert the data and labels to NumPy arrays
data = np.array(data, dtype="float32")
labels = np.array(labels)

# perform one-hot encoding on the labels
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
labels = to_categorical(labels)

# partition the data into training and testing splits using 80% of
# the data for training and the remaining 20% for testing
(trainX, testX, trainY, testY) = train_test_split(data,
                                                  labels,
                                                  test_size=0.20,
                                                  stratify=labels,
                                                  random_state=42)

# construct the training image generator for data augmentation
aug = ImageDataGenerator(rotation_range=20,
                         zoom_range=0.15,
                         width_shift_range=0.2,
                         height_shift_range=0.2,
Example #34
import numpy as np

print("[INFO] accessing MNIST...")
dataset = datasets.fetch_mldata("MNIST Original")
data = dataset.data

if K.image_data_format() == "channels_first":
    data = data.reshape(data.shape[0], 1, 28, 28)
else:
    data = data.reshape(data.shape[0], 28, 28, 1)

(trainX, testX, trainY, testY) = train_test_split(data/255, dataset.target.astype("int"), test_size=0.25,
                                                  random_state=42)

lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

print("[INFO] compiling model...")

optimizer = SGD(lr=0.01)
model = LeNet.build(width=28, height=28, depth=1, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])

print("[INFO] training network...")

H = model.fit(trainX, trainY, validation_data=(testX, testY), batch_size=128,
              epochs=20, verbose=1)

print("[INFO] evaluating network...")
Example #35
num_of_class = len(set(labels))
epochs = 100
channel_num = 1

# input image dimensions
img_height, img_width = 64, 64

if K.image_data_format() == 'channels_first':
    input_shape = (channel_num, img_height, img_width)
else:
    input_shape = (img_height, img_width, channel_num)

_print('label binarize.')
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
labels_one_hot = lb.fit_transform(labels)
with open('./label_binarizer.model', 'wb') as fp:
    pickle.dump(lb, fp)

from sklearn.model_selection import train_test_split

valid_size = 0.1
valid_idx = int(len(data_paths) * (1 - valid_size))

#_print('split train and test.')
#X_train, X_valid, y_train, y_valid = train_test_split(
#    data_paths, labels_one_hot, test_size=0.1)

_print('train gen')
train_gen = MyGenerator(data_paths[:valid_idx],
                        labels_one_hot[:valid_idx],
Example #36
    def train(self, train, test, save_file=None, is_logging=True):
        if is_logging:
            old_stdout = sys.stdout
            log_file = open(save_file + '.log', 'w')
            sys.stdout = log_file

        # initialize batch loader
        batch_loader = BatchLoader(train[0], train[1], self.seq_len)

        # # prepare test data
        reshaped_test = test[0].reshape(
            (test[0].shape[0], self.seq_len, self.input_dim))
        lb = LabelBinarizer()
        lb.fit(batch_loader.get_classes())
        binarized_test_labels = lb.fit_transform(test[1])

        # initialize the variables
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            # run the initializer
            sess.run(init)

            # for e in range(1, self.num_epochs+1):
            #     # iteration_count = int(np.ceil(train[0].shape[0] / self.batch_size))
            #     # for idx in range(0, iteration_count):
            #     batch_x, batch_y = batch_loader.next_batch(self.batch_size)
            #     batch_x = np.expand_dims(batch_x, axis=1)
            #     # batch_x = batch_x.reshape((self.batch_size, self.seq_len, self.input_dim))
            #
            #     sess.run(self.train_op, feed_dict={self.X: batch_x, self.Y: batch_y})
            #     if e % self.display_step == 0 or e == 1:
            #         # Calculate batch loss and accuracy
            #         loss, acc = sess.run([self.loss_op, self.accuracy], feed_dict={self.X: batch_x, self.Y: batch_y})
            #         print("Minibatch Step " + str(e) + ", Loss= {:.4f}".format(loss) + ", Training Accuracy= {:.3f}".format(acc))
            #
            #     if e % 100 == 0:
            #         predictions = []
            #         for smpl_idx in range(0, test[0].shape[0]):
            #             acc = []
            #             for desc in test[0][smpl_idx]:
            #                 acc.append(sess.run(self.prediction, feed_dict={self.X: desc.reshape((1, 1, self.input_dim)), self.Y: binarized_test_labels[smpl_idx, :].reshape((1, self.num_classes))}))
            #             sum_acc = np.sum(acc, axis=0)
            #             predictions.append(np.argmax(sum_acc))
            #
            #         # binarized_predictions = lb.fit_transform(predictions)
            #         print("Epoch #" + str(e) + ", Test Accuracy:", (100 * np.sum(test[1] == predictions)) / test[1].shape[0])
            #
            #     # if e % 100 == 0:
            #     #     # evaluate model every 100 iterations
            #     #     print("Epoch #" + str(e) + ", Test Accuracy:",
            #     #           sess.run(self.accuracy, feed_dict={self.X: reshaped_test, self.Y: binarized_test_labels}), flush=True)

            for step in range(1, self.num_epochs + 1):
                batch_x, batch_y = batch_loader.next_batch(self.batch_size)
                batch_x = batch_x.reshape(
                    (self.batch_size, self.seq_len, self.input_dim))

                sess.run(self.train_op,
                         feed_dict={
                             self.X: batch_x,
                             self.Y: batch_y
                         })
                if step % self.display_step == 0 or step == 1:
                    # Calculate batch loss and accuracy
                    loss, acc = sess.run([self.loss_op, self.accuracy],
                                         feed_dict={
                                             self.X: batch_x,
                                             self.Y: batch_y
                                         })
                    print("Step " + str(step) + ", Minibatch Loss= " + \
                          "{:.4f}".format(loss) + ", Training Accuracy= " + \
                          "{:.3f}".format(acc))

                if step % 100 == 0:
                    # evaluate model every 100 iterations
                    print(
                        "Testing Accuracy:",
                        sess.run(self.accuracy,
                                 feed_dict={
                                     self.X: reshaped_test,
                                     self.Y: binarized_test_labels
                                 }))

            print("Optimization Finished!")

        if is_logging:
            sys.stdout = old_stdout
            log_file.close()
answer = ROWS[:, 12]  # The answer
# Provide list of datetime categories
yearlist = np.array(range(1970, 2050)).astype(str)
monthlist = np.array(range(1, 13)).astype(str)
daylist = np.array(range(1, 32)).astype(str)
hourlist = np.array(range(8, 17)).astype(str)
# Minutes with step size of 5, like 15, 20, 30, 45, etc...
minutelist = np.array(range(0, 61, 5)).astype(str)
weeklist = np.array(range(1, 8)).astype(str)
# ------------------------End Prepare Data-----------------------------

# ------------------------Vectorize-------------------------------
vectorizer = LabelBinarizer()  # Vectorize by going through the SQL result

procedure_name_vec = vectorizer.fit_transform(procedure_name)
print("Procedure Name")
print(procedure_name_vec[0])
provider_name_vec = vectorizer.fit_transform(provider_name)
print("Provider Name")
print(provider_name_vec[0])
answer_vec = vectorizer.fit_transform(answer)  # This is the output
print("Answer shape and %s = %s " % (answer[3], answer_vec[3]))
print(answer_vec.shape)

appt_duration_vec = appt_duration[:, None]
complete_vec = complete[:, None]
cancel_vec = cancel[:, None]
noshow_vec = noshow[:, None]

lb_year = LabelBinarizer().fit(yearlist)
Example #38



#Step 3: scale the raw pixel intensities to the
#range [0, 1.0], then construct the training (75%) and
#testing splits (25%) using the train_test_split() function
data = dataset.data.astype("float") / 255.0
(trainX, testX, trainY, testY) = train_test_split(data, dataset.target.astype("int"), test_size=0.25, random_state=42)


# initialize the optimizer and model
print("[INFO] compiling model...")

lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

#Step 4: Define the 784-256-128-10 architecture using
#sequential model in KerasCheckthis tutorial
#(https://keras.io/getting-started/sequential-model-guide/)



#inputShape = (height,width,depth)

model = Sequential()
model.add(Dense(256, input_shape=(784,), activation="sigmoid"))
model.add(Dense(128, activation="sigmoid"))
model.add(Dense(10, activation="softmax"))
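A hypothetical continuation for training the network defined above, assuming the SGD optimizer import from Keras and the trainX/trainY/testX/testY arrays prepared earlier:

sgd = SGD(0.01)
model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])
H = model.fit(trainX, trainY, validation_data=(testX, testY), epochs=100, batch_size=128)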
                help="path to models directory")
args = vars(ap.parse_args())

# load the testing data, then scale it into the range [0, 1]
(testX, testY) = cifar10.load_data()[1]
testX = testX.astype("float") / 255.0

# initialize the label names for the CIFAR-10 dataset
labelNames = [
    "airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse",
    "ship", "truck"
]

# convert the labels from integers to vectors
lb = LabelBinarizer()
testY = lb.fit_transform(testY)

# construct the path used to collect the models then initialize the
# model list
modelPaths = os.path.sep.join([args["models"], "*.model"])
modelPaths = list(glob.glob(modelPaths))
models = []

# loop over the model paths, loading the model, and adding it to
# the list of models
for (i, modelPath) in enumerate(modelPaths):
    print("[INFO] loading model {}/{}".format(i + 1, len(modelPaths)))
    models.append(load_model(modelPath))

# initialize the list of predictions
print("[INFO] evaluating ensemble...")
output = Feature_set.S_TREE_CROP_NAME
Feature_set = Feature_set.drop(columns=['S_TREE_CROP_NAME'])
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
#print(output)
#print(Feature_set)
clf = RandomForestClassifier()

print(Feature_set)
#Encoding to convert String attributes into numbers
from sklearn.preprocessing import LabelBinarizer
croptype_lb = LabelBinarizer()
soiltype_lb = LabelBinarizer()
irrigation_lb = LabelBinarizer()
croptype = croptype_lb.fit_transform(Feature_set.S_TREE_CROP_TYPE.values)
soiltype = soiltype_lb.fit_transform(Feature_set.S_SOIL_TYPE.values)
irrigation = irrigation_lb.fit_transform(Feature_set.S_IRRIGATION.values)

temp = Feature_set.copy()
temp = temp.drop(columns=['S_IRRIGATION', 'S_SOIL_TYPE', 'S_TREE_CROP_TYPE'])
temp['Crop Type'] = pd.DataFrame(croptype)
temp['Irrigation'] = pd.DataFrame(irrigation)
stype = pd.DataFrame(soiltype)
temp["Soil Type"] = (stype[0].astype(str) + stype[1].astype(str) +
                     stype[2].astype(str) + stype[3].astype(str) +
                     stype[4].astype(str) + stype[5].astype(str)).astype(int)
#Final Feature set is in temp
Final_Feature_Set = temp.copy()
#print(Final_Feature_Set)
#print("Correlations")
# min-max normalization
# standardization to [upper bound, lower bound]

# 3D tensor
# neural network input is always a tensor
# (a higher-dimensional matrix)
set1 = np.reshape(set1, (len(set1), 32, len(np.transpose(set1)), 1))
set2 = np.reshape(set2, (len(set2), 32, len(np.transpose(set2)), 1))
set3 = np.reshape(set3, (len(set3), 32, len(np.transpose(set3)), 1))

# Encoding Data Label
# -- 1. binarization
# -- 2. One hot encoding

label_encoder = LabelBinarizer()
U_L = label_encoder.fit_transform(U_L)
T_L = label_encoder.transform(T_L)  # reuse the fitted binarizer so both label sets share the same encoding

# task finished
# 1. Data read
# 2. data Normalize
# 3. label binarize

# Set the random seed
random_seed = 2

# DATA SPLITTING
# " Change Here"
# Exp1
# Split the train and the validation set for the fitting
X_train, X_val, Y_train, Y_val = train_test_split(set3,
Example #42
import numpy as np
from scipy.spatial.distance import pdist
from sklearn import decomposition
from sklearn.preprocessing import LabelBinarizer

# Generate data
# fetch_mldata was removed from scikit-learn; fetch_openml serves the same MNIST data
from sklearn.datasets import fetch_openml
np.random.seed(0)
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

X, y = mnist.data, mnist.target
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

lb = LabelBinarizer(neg_label=0, pos_label=1)
y_train = lb.fit_transform(y_train).astype(np.float64)
y_test = lb.transform(y_test).astype(np.float64)

ppross = decomposition.PCA(n_components=50, whiten=True, svd_solver='randomized')  # RandomizedPCA was removed; PCA with the randomized solver replaces it
ppross.fit(X_train)
X_train = ppross.transform(X_train)
X_test = ppross.transform(X_test)

n = X_train.shape[0]
p = y_train.shape[1]
d = X_train.shape[1]

# sigma = 1113.379020273871
sigma = np.median(pdist(X_train[np.random.choice(X_train.shape[0], 1000), :]))
# sigma = 1. / np.sqrt(2 * 0.00728932)
print('sigma:', sigma)
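# A common follow-up (an assumption, not part of the original excerpt): turn the
# median-heuristic bandwidth sigma into the gamma of an RBF kernel,
# k(x, z) = exp(-gamma * ||x - z||^2), via gamma = 1 / (2 * sigma**2).
gamma = 1.0 / (2.0 * sigma ** 2)
print('gamma:', gamma)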
Example #43
class RCNN():
    def __init__(self):
        self.init_lr = 1e-4
        self.epochs = 5
        self.bs = 2
        self.baseModel = MobileNetV2(weights="imagenet",
                                     include_top=False,
                                     input_tensor=Input(shape=(224, 224, 3)))
        self.model = None
        self.aug = ImageDataGenerator(rotation_range=20,
                                      zoom_range=0.15,
                                      width_shift_range=0.2,
                                      height_shift_range=0.2,
                                      shear_range=0.15,
                                      horizontal_flip=True,
                                      fill_mode="nearest")
        self.trainX, self.trainY = None, None
        self.testX, self.testY = None, None
        self.H = None
        self.lb = None
        self.build_model()

    def load_dataset(self):
        imagePaths = list(paths.list_images(config.BASE_PATH))
        data = []
        labels = []

        for imagePath in imagePaths:
            label = imagePath.split(os.path.sep)[-2]
            image = load_img(imagePath, target_size=config.INPUT_DIMS)
            image = img_to_array(image)
            image = preprocess_input(image)

            data.append(image)
            labels.append(label)
        data = np.array(data, dtype="float32")
        labels = np.array(labels)
        self.lb = LabelBinarizer()
        labels = self.lb.fit_transform(labels)
        labels = to_categorical(labels)
        (self.trainX, self.testX, self.trainY,
         self.testY) = train_test_split(data,
                                        labels,
                                        test_size=0.20,
                                        stratify=labels,
                                        random_state=42)
        return self

    def build_model(self):
        headModel = self.baseModel.output
        headModel = AveragePooling2D(pool_size=(7, 7))(headModel)
        headModel = Flatten(name="flatten")(headModel)
        headModel = Dense(128, activation="relu")(headModel)
        headModel = Dropout(0.5)(headModel)
        headModel = Dense(len(config.LABELS), activation="softmax")(headModel)

        self.model = Model(inputs=self.baseModel.input, outputs=headModel)
        for layer in self.baseModel.layers:
            layer.trainable = False
        return self

    def summary(self):
        self.model.summary()

    def compile(self):
        print("[+] Model is compiling...")
        opt = Adam(lr=self.init_lr)
        self.model.compile(loss="binary_crossentropy",
                           optimizer=opt,
                           metrics=["accuracy"])
        return self

    def train(self):
        print("[+] Model is training...")
        self.H = self.model.fit(self.aug.flow(self.trainX,
                                              self.trainY,
                                              batch_size=self.bs),
                                steps_per_epoch=len(self.trainX) // self.bs,
                                validation_data=(self.testX, self.testY),
                                validation_steps=len(self.testX) // self.bs,
                                epochs=self.epochs)
        return self

    def evaluate(self):
        print("[INFO] evaluating network...")
        predIdxs = self.model.predict(self.testX, batch_size=self.bs)

        # for each image in the testing set we need to find the index of the
        # label with corresponding largest predicted probability
        predIdxs = np.argmax(predIdxs, axis=1)

        # show a nicely formatted classification report
        print(
            classification_report(self.testY.argmax(axis=1),
                                  predIdxs,
                                  target_names=self.lb.classes_))

        # serialize the model to disk
        print("[+] saving mask detector model...")
        self.model.save(config.MODEL_PATH, save_format="h5")

        # serialize the label encoder to disk
        print("[+] saving label encoder...")
        f = open(config.ENCODER_PATH, "wb")
        f.write(pickle.dumps(self.lb))
        f.close()

        # plot the training loss and accuracy
        N = self.epochs
        plt.style.use("ggplot")
        plt.figure()
        plt.plot(np.arange(0, N), self.H.history["loss"], label="train_loss")
        plt.plot(np.arange(0, N), self.H.history["val_loss"], label="val_loss")
        plt.plot(np.arange(0, N),
                 self.H.history["accuracy"],
                 label="train_acc")
        plt.plot(np.arange(0, N),
                 self.H.history["val_accuracy"],
                 label="val_acc")
        plt.title("Training Loss and Accuracy")
        plt.xlabel("Epoch #")
        plt.ylabel("Loss/Accuracy")
        plt.legend(loc="lower left")
        plt.savefig('test.png')
Example #44
def LabelBinarize(df):
    lb = LabelBinarizer(sparse_output=True)
    return lb.fit_transform(df).astype(np.int32)
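# A small usage sketch (the data below is made up for illustration):
# LabelBinarize(pd.Series(["red", "green", "blue", "green"])) returns a 4x3
# scipy CSR indicator matrix with int32 entries.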
Example #45
class KernelSVC(BaseClassifier):
    """Estimator for learning kernel SVMs by Newton's method.

    Parameters
    ----------

    alpha : float
        Weight of the penalty term.

    solver : str, 'cg', 'dense'

    max_iter : int
        Maximum number of iterations to perform.

    tol : float
        Tolerance of the stopping criterion.

    kernel: "linear" | "poly" | "rbf" | "sigmoid" | "cosine" | "precomputed"
        Kernel to use.  Default: "linear"

    degree : int, default=3
        Degree for poly, rbf and sigmoid kernels. Ignored by other kernels.

    gamma : float, optional
        Kernel coefficient for rbf and poly kernels. Default: 1/n_features.
        Ignored by other kernels.

    coef0 : float, optional
        Independent term in poly and sigmoid kernels.
        Ignored by other kernels.

    random_state : RandomState or int
        The seed of the pseudo random number generator to use.

    verbose : int
        Verbosity level.

    n_jobs : int
        Number of jobs to use to compute the kernel matrix.

    Example
    -------

    >>> from sklearn.datasets import make_classification
    >>> from lightning.classification import KernelSVC
    >>> X, y = make_classification()
    >>> clf = KernelSVC().fit(X, y)
    >>> accuracy = clf.score(X, y)
    """
    def __init__(self,
                 alpha=1.0,
                 solver="cg",
                 max_iter=50,
                 tol=1e-3,
                 kernel="linear",
                 gamma=0.1,
                 coef0=1,
                 degree=4,
                 random_state=None,
                 verbose=0,
                 n_jobs=1):
        self.alpha = alpha
        self.solver = solver
        self.max_iter = max_iter
        self.tol = tol
        self.kernel = kernel
        self.gamma = gamma
        self.coef0 = coef0
        self.degree = degree
        self.random_state = random_state
        self.verbose = verbose
        self.n_jobs = n_jobs

    def _kernel_params(self):
        return {
            "gamma": self.gamma,
            "degree": self.degree,
            "coef0": self.coef0
        }

    def _solve(self, A, b):
        if self.solver == "cg":
            x, info = cg(A, b, tol=self.tol)
        elif self.solver == "dense":
            x = solve(A, b, sym_pos=True)
        return x

    def _fit_binary(self, K, y, rs):
        n_samples = K.shape[0]
        coef = np.zeros(n_samples)
        if n_samples < 1000:
            sv = np.ones(n_samples, dtype=bool)
        else:
            sv = np.zeros(n_samples, dtype=bool)
            sv[:1000] = True
            rs.shuffle(sv)

        for t in range(1, self.max_iter + 1):
            if self.verbose:
                print("Iteration", t, "#SV=", np.sum(sv))

            K_sv = K[sv][:, sv]
            I = np.diag(self.alpha * np.ones(K_sv.shape[0]))

            coef_sv = self._solve(K_sv + I, y[sv])

            coef *= 0
            coef[sv] = coef_sv
            pred = np.dot(K, coef)
            errors = 1 - y * pred
            last_sv = sv
            sv = errors > 0

            if np.array_equal(last_sv, sv):
                if self.verbose:
                    print("Converged at iteration", t)
                break

        return coef

    def _post_process(self, X):
        # We can't know the support vectors when using precomputed kernels.
        if self.kernel != "precomputed":
            sv = np.sum(self.coef_ != 0, axis=0, dtype=bool)
            if np.sum(sv) > 0:
                self.coef_ = np.ascontiguousarray(self.coef_[:, sv])
                mask = safe_mask(X, sv)
                self.support_vectors_ = np.ascontiguousarray(X[mask])
                self.support_indices_ = np.arange(X.shape[0],
                                                  dtype=np.int32)[sv]
                self.n_samples_ = X.shape[0]

            if self.verbose >= 1:
                print("Number of support vectors:", np.sum(sv))

    def fit(self, X, y):
        """Fit model according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : classifier
            Returns self.
        """
        n_samples, n_features = X.shape
        rs = check_random_state(self.random_state)

        self.label_binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1)
        Y = self.label_binarizer_.fit_transform(y)
        self.classes_ = self.label_binarizer_.classes_.astype(np.int32)
        n_vectors = Y.shape[1]

        if self.verbose:
            print("Pre-computing kernel matrix...")

        K = pairwise_kernels(X,
                             filter_params=True,
                             n_jobs=self.n_jobs,
                             metric=self.kernel,
                             **self._kernel_params())

        coef = [self._fit_binary(K, Y[:, i], rs) for i in range(n_vectors)]
        self.coef_ = np.array(coef)
        self.intercept_ = np.zeros(n_vectors, dtype=np.float64)

        self._post_process(X)

        return self

    def decision_function(self, X):
        """
        Return the decision function for test vectors X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        P : array, shape = [n_samples, n_classes]
            Decision function for X
        """
        K = pairwise_kernels(X,
                             self.support_vectors_,
                             filter_params=True,
                             n_jobs=self.n_jobs,
                             metric=self.kernel,
                             **self._kernel_params())
        return np.dot(K, self.coef_.T)
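    # In the multiclass case, predictions are typically recovered from these
    # decision values by an argmax over the per-class columns (a sketch, not the
    # library's own predict implementation):
    #   scores = clf.decision_function(X)          # shape (n_samples, n_vectors)
    #   y_pred = clf.classes_[scores.argmax(axis=1)]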
Example #46
# In[34]:

X_train_counts = count_vect.fit_transform(X_train)

# In[35]:

list(count_vect.vocabulary_.items())[0:3]

# In[36]:

len(count_vect.vocabulary_)

# In[56]:

lab_bin = LabelBinarizer()
y_train_bin = lab_bin.fit_transform(y_train)
y_test_bin = lab_bin.transform(y_test)

# ## Train

# In[58]:

from sklearn.naive_bayes import MultinomialNB

# In[59]:

clf = MultinomialNB().fit(X_train_counts, y_train_bin)

# In[60]:

len(clf.coef_[0])
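# A hedged evaluation sketch (not one of the original cells): vectorize the
# held-out documents with the already-fitted CountVectorizer and score the model,
# assuming X_test comes from the same earlier split as y_test.
X_test_counts = count_vect.transform(X_test)
print(clf.score(X_test_counts, y_test_bin))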
Example #47
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pickle

df = pd.read_csv('../data/iowa_recidivism_2019.csv')
y = df['Return to Prison']
# transform target to binary with 1 equal to recidivism
lb_target = LabelBinarizer()
y = pd.Series(lb_target.fit_transform(y).reshape(-1,))

y.index = df.index
X = df.drop('Return to Prison', axis=1)

# train: 19515
# test: 6505 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.25)

X_train.dropna(axis=0, subset=['Sex'], inplace=True)
y_train = y_train.loc[X_train.index]  # keep the labels aligned with the remaining rows

class encoded_X_train():

    def __init__(self, X_train, y_train):

        self.X_train = X_train
        self.y_train = y_train
        
    def oh_encode(self, feature):
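        # The method body is cut off in this excerpt; a minimal sketch of a
        # one-hot encoding step (the exact behaviour of the original is unknown):
        ohe = OneHotEncoder(handle_unknown='ignore')
        encoded = ohe.fit_transform(self.X_train[[feature]]).toarray()
        return pd.DataFrame(encoded, index=self.X_train.index)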
Example #48
                required=True,
                help="path to the trained model directory")
args = vars(ap.parse_args())

# show information on the process ID
print("[INFO process ID: {}]".format(os.getpid()))

# load the training and testing set and then scale it to
# the range [0,1]
((X_train, y_train), (X_test, y_test)) = cifar10.load_data()
X_train = X_train.astype("float") / 255.0
X_test = X_test.astype("float") / 255.0

# convert the labels from integers to vectors
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

# initialize the label names for the CIFAR-10 dataset
labelNames = [
    "airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse",
    "ship", "truck"
]

# initialize the optimizer and model
print("[INFO] compiling model...")
opt = SGD(lr=0.01, decay=0.01 / 50, momentum=0.9, nesterov=True)
model = MiniVGGNetwork.build(width=32, height=32, depth=3, classes=10)
model.compile(loss="categorical_crossentropy",
              optimizer=opt,
              metrics=["accuracy"])
Example #49
    data = data.reshape(data.shape[0], 1, 28, 28)

# otherwise, we are using "channels last" ordering, so the design
# matrix shape should be: num_samples x rows x columns x depth
else:

    data = data.reshape(data.shape[0], 28, 28, 1)

(trainX, testX, trainY, testY) = train_test_split(data / 255.0,
                                                  dataset.target.astype("int"),
                                                  test_size=0.25,
                                                  random_state=42)

# convert the labels from integers to vectors
le = LabelBinarizer()
trainY = le.fit_transform(trainY)
testY = le.transform(testY)

print("[INFO] compiling model...")
opt = SGD(lr=0.01)
model = LeNet.build(width=28, height=28, depth=1, classes=10)
model.compile(loss="categorical_crossentropy",
              optimizer=opt,
              metrics=["accuracy"])

# train the network
print("[INFO] training network...")
H = model.fit(trainX,
              trainY,
              validation_data=(testX, testY),
              batch_size=128,
def main():
    start_time = time.time()

    train = pd.read_table('../train.tsv', engine='c')
    test = pd.read_table('../test.tsv', engine='c')
    train = train.sample(frac=1).reset_index(drop=True)
    test = train.loc[100000:120000]
    train = train.loc[0:99999]  # .loc slicing is inclusive, so stop at 99999 to keep train and test disjoint
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    test_labels = np.log1p(test['price'])
    nrow_train = train.shape[0]
#    y = train["price"]
    y = np.log1p(train["price"])
    merge = pd.concat([train, test])
    submission = test[['train_id']]

    del train
    del test
    gc.collect()

    handle_missing_inplace(merge)
    print('[{}] Finished handling missing values'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Finished cutting'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Finished converting to categorical'.format(time.time() - start_time))

    cv = CountVectorizer(min_df=NAME_MIN_DF)
    X_name = cv.fit_transform(merge['name'])
    print('[{}] Finished count vectorizing `name`'.format(time.time() - start_time))

    cv = CountVectorizer()
    X_category = cv.fit_transform(merge['category_name'])
    print('[{}] Finished count vectorizing `category_name`'.format(time.time() - start_time))

    tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                         ngram_range=(1, 3),
                         stop_words='english')
    X_description = tv.fit_transform(merge['item_description'])
    print('[{}] Finished TFIDF vectorizing `item_description`'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Finished label binarizing `brand_name`'.format(time.time() - start_time))

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    print('[{}] Finished getting dummies on `item_condition_id` and `shipping`'.format(time.time() - start_time))

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category, X_name)).tocsr()
    print('[{}] Finished creating sparse merge'.format(time.time() - start_time))

    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]
    print("TRAINING SHAPE")
    print(X.shape)
    print("Test SHAPE")
    print(X_test.shape)
    model = Ridge(solver="sag", fit_intercept=True, random_state=205, alpha=3)
    model.fit(X, y)
    print('[{}] Finished to train ridge sag'.format(time.time() - start_time))
    predsR = model.predict(X=X_test)
    print('[{}] Finished to predict ridge sag'.format(time.time() - start_time))

    model = Ridge(solver="lsqr", fit_intercept=True, random_state=145, alpha = 3)
    model.fit(X, y)
    print('[{}] Finished to train ridge lsqrt'.format(time.time() - start_time))
    predsR2 = model.predict(X=X_test)
    print('[{}] Finished to predict ridge lsqrt'.format(time.time() - start_time))

    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state = 144) 
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(valid_X, label=valid_y)
    watchlist = [d_train, d_valid]
    
    params = {
        'learning_rate': 0.76,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 99,
        'verbosity': -1,
        'metric': 'RMSE',
        'nthread': 4
    }

    params2 = {
        'learning_rate': 0.85,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 110,
        'verbosity': -1,
        'metric': 'RMSE',
        'nthread': 4
    }

    model = lgb.train(params, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \
    early_stopping_rounds=500, verbose_eval=500) 
    predsL = model.predict(X_test)
    
    print('[{}] Finished predicting with lgb 1'.format(time.time() - start_time))
    
    train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X, y, test_size = 0.1, random_state = 101) 
    d_train2 = lgb.Dataset(train_X2, label=train_y2)
    d_valid2 = lgb.Dataset(valid_X2, label=valid_y2)
    watchlist2 = [d_train2, d_valid2]

    model = lgb.train(params2, train_set=d_train2, num_boost_round=3000, valid_sets=watchlist2, \
    early_stopping_rounds=50, verbose_eval=500) 
    predsL2 = model.predict(X_test)

    print('[{}] Finished predicting with lgb 2'.format(time.time() - start_time))

    preds = predsR2*0.15 + predsR*0.15 + predsL*0.5 + predsL2*0.2

    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_lgbm_ridge_11.csv", index=False)
    print("ERROR")
    print(rmsle(preds, test_labels))
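
# The rmsle() helper called above is not shown in this excerpt. Because both
# `preds` and `test_labels` are already log1p-transformed prices, a standard
# definition (an assumption about the original) reduces to a plain RMSE in log space:
# def rmsle(y_pred, y_true):
#     return np.sqrt(np.mean(np.square(y_pred - y_true)))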
ROWS = 10
fig, axes = plt.subplots(ROWS, ROWS, figsize=(10, 10))
for i in range(ROWS):
    for j in range(ROWS):
        k = np.random.choice(range(X_train_orig.shape[0]))
        axes[i][j].set_axis_off()
        axes[i][j].imshow(X_train_orig[k].reshape((28, 28)))
#plt.show()

# Normalize image vectors
X_train = X_train_orig / 255.
X_test = X_test_orig / 255.

# Convert training and test labels to one hot matrices
label_binarizer = LabelBinarizer()
Y_train = label_binarizer.fit_transform(Y_train_orig)
Y_test = label_binarizer.transform(Y_test_orig)

print("number of training examples = " + str(X_train.shape[0]))
print("number of test examples = " + str(X_test.shape[0]))
print("X_train shape: " + str(X_train.shape))
print("Y_train shape: " + str(Y_train.shape))
print("X_test shape: " + str(X_test.shape))
print("Y_test shape: " + str(Y_test.shape))

# train the neural network
model = ResNet18(input_shape=(28, 28, 1), classes=24)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=2, batch_size=32)
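# Evaluation is not part of the original snippet; a minimal sketch, assuming the
# test arrays are shaped the same way as the training arrays fed to fit():
loss, acc = model.evaluate(X_test, Y_test, batch_size=32)
print("Test accuracy:", acc)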
Example #52
def do_vgg_train(path_input,
                 width,
                 height,
                 basename,
                 vgg_size,
                 fc_size,
                 logLevel="WARN"):
    """Train a VGG-like convolutional network
    """
    logvgg = logging.getLogger(f"{__name__}.console.trainvgg")
    logvgg.setLevel(logLevel)

    model_file = f"{basename}.model"
    label_bin_file = f"{basename}.pickle"
    plot_file = f"{basename}.png"
    logvgg.debug(f"mf {model_file} lbf {label_bin_file} pf {plot_file}")

    data, labels = load_dataset(path_input, width, height, "INFO")

    # partition the data into training and testing splits using 75% of
    # the data for training and the remaining 25% for testing
    (trainX, testX, trainY, testY) = train_test_split(data,
                                                      labels,
                                                      test_size=0.25)

    # convert the labels from integers to vectors (for 2-class, binary
    # classification you should use Keras' to_categorical function instead,
    # since scikit-learn's LabelBinarizer returns a single column rather
    # than a two-column one-hot matrix)
    lb = LabelBinarizer()
    trainY = lb.fit_transform(trainY)
    testY = lb.transform(testY)

    # construct the image generator for data augmentation
    # rotation is ok, shear/shift/flip reduced
    aug = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.01,
        height_shift_range=0.01,
        shear_range=0.002,
        zoom_range=0.02,
        horizontal_flip=False,
        fill_mode="nearest",
    )

    if vgg_size == "small":
        # TODO fc_size set from here
        model = SmallVGGNet.build(width=width,
                                  height=height,
                                  depth=3,
                                  classes=len(lb.classes_))
    elif vgg_size == "middle":
        # default value of fc_size
        if fc_size == -1:
            fc_size = 512
        model = MiddleVGGNet.build(
            width=width,
            height=height,
            depth=3,
            classes=len(lb.classes_),
            fully_connected_size=fc_size,
        )
    else:
        logvgg.critical(f"Unrecognized dimension {vgg_size}, stopping.")
        return -1

    # initialize our initial learning rate, # of epochs to train for, and batch size
    INIT_LR = 0.01
    EPOCHS = 75
    #  EPOCHS = 3
    BS = 32
    # TODO fiddle with this

    # initialize the model and optimizer (you'll want to use
    # binary_crossentropy for 2-class classification)
    logvgg.info("Training network...")
    opt = SGD(lr=INIT_LR, decay=INIT_LR / EPOCHS)
    model.compile(loss="categorical_crossentropy",
                  optimizer=opt,
                  metrics=["accuracy"])
    # TODO fiddle with this

    # save model summary
    summary_file = f"{basename}_summary.txt"
    with open(summary_file, "w") as sf:
        model.summary(line_length=100, print_fn=lambda x: sf.write(f"{x}\n"))
        # using an actual logger: print_fn=logger.info

    # save the model structure in JSON format
    config = model.get_config()
    config_json_file = f"{basename}_structure.json"
    with open(config_json_file, "w") as jf:
        json.dump(config, jf)

    # train the network
    H = model.fit_generator(
        aug.flow(trainX, trainY, batch_size=BS),
        validation_data=(testX, testY),
        steps_per_epoch=len(trainX) // BS,
        epochs=EPOCHS,
    )

    # save the model and label binarizer to disk
    logvgg.info("Serializing network and label binarizer...")
    model.save(model_file)
    with open(label_bin_file, "wb") as f:
        f.write(pickle.dumps(lb))

    # evaluate the network
    logvgg.info("Evaluating network...")
    predictions = model.predict(testX, batch_size=32)
    report = classification_report(testY.argmax(axis=1),
                                   predictions.argmax(axis=1),
                                   target_names=lb.classes_)
    logvgg.info(f"\n{report}")
    report_file = f"{basename}_report.txt"
    with open(report_file, "w") as rf:
        rf.write(report)

    # plot the training loss and accuracy
    N = np.arange(0, EPOCHS)
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(N, H.history["loss"], label="train_loss")
    plt.plot(N, H.history["val_loss"], label="val_loss")
    plt.plot(N, H.history["acc"], label="train_acc")
    plt.plot(N, H.history["val_acc"], label="val_acc")
    plt.title("Training Loss and Accuracy (SmallVGGNet)")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend()
    plt.savefig(plot_file)
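
# A hedged usage sketch (the path, sizes and basename below are made-up placeholders):
# do_vgg_train("./dataset", width=64, height=64, basename="out/small_run",
#              vgg_size="small", fc_size=-1)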
Example #53
    picture = load_img(path, target_size=(224, 224))

    picture = img_to_array(picture)

    picture = preprocess_input(picture)

    data.append(picture)
    labels.append(label)

data = np.array(data, dtype='float32')
labels = np.array(labels)

le = LabelBinarizer()

labels = le.fit_transform(labels)
labels = to_categorical(labels)

(x_train, x_test, y_train, y_test) = train_test_split(data,
                                                      labels,
                                                      test_size=0.2,
                                                      stratify=labels,
                                                      random_state=42)

print(x_train.shape)
print(y_train.shape)

augmentation = ImageDataGenerator(rotation_range=20,
                                  zoom_range=0.15,
                                  width_shift_range=0.2,
                                  height_shift_range=0.2,
Example #54
import numpy as np
import os

images = list(paths.list_images("dataset"))
x = []
Y = []

for image in tqdm(images, desc="Processing images"):
    label = image.split(os.path.sep)[-2]

    img = load_img(image, target_size=(100, 100))
    img = img_to_array(img)
    img = preprocess_input(img)

    x.append(img)
    Y.append(label)

x_arr = np.array(x, dtype='float32')
Y_arr = np.array(Y)
print(x_arr.shape)
print(Y_arr.shape)

binarizer = LabelBinarizer()
y_arr = binarizer.fit_transform(Y_arr)
y_arr = to_categorical(y_arr)
print(y_arr.shape)

np.save('X_RAW.npy', x_arr)
np.save('Y_RAW.npy', y_arr)
print("Raw Data Saved to disc")
Example #55
    'latitude', 'longitude', 'review_count', 'is_open', 'Monday', 'Tuesday',
    'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Caters',
    'WheelchairAccessible', 'BikeParking', 'AcceptsInsurance',
    'BusinessAcceptsCreditCards', 'CoatCheck', 'HappyHour', 'GoodForKids',
    'Open24Hours', 'OutdoorSeating', 'HasTV', 'BusinessAcceptsBitcoin',
    'ByAppointmentOnly', 'DogsAllowed', 'DriveThru', 'Smoking', 'NoiseLevel',
    'AgesAllowed', 'Alcohol', 'WiFi', 'Music', 'Ambience', 'BusinessParking',
    'pos_count', 'neg_count', 'checkin_count'
]]
y = np.array(dataset[['stars']])

lab_enc = preprocessing.LabelEncoder()
label_dataset_encoded = lab_enc.fit_transform(y)

encoder = LabelBinarizer()
y = encoder.fit_transform(label_dataset_encoded)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.fit_transform(X_test)
'''
#Initializing Neural Network
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'relu', input_dim = 8))
# Adding the second hidden layer
classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'relu'))
# Adding the output layer
classifier.add(Dense(output_dim = 9, init = 'uniform', activation = 'sigmoid'))
Example #56
def one_hot_encode(labels):
    enc = LabelBinarizer()
    return enc.fit_transform(labels)
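# Quick usage sketch with made-up labels:
# one_hot_encode(["cat", "dog", "bird", "cat"])  # -> 4 x 3 indicator matrix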
Example #57
#    if len(d.split()) == 3:
#        new_data.append(d)
#
#selected = new_data
all_words = []

for s in data:
    for se in s.split():
        all_words.append(se)

words = np.unique(all_words)
vocabulary_size = len(words)

enc = LabelBinarizer()

ohe = enc.fit_transform(words)

input_dic = {}
for w in range(0, len(words)):
    input_dic[words[w]] = np.reshape(ohe[w], (1, vocabulary_size))

context_words = []

for s in data:
    se = s.split()
    context_words.append([se[2], [se[1], se[0]]])
    #context_words.append([se[1], [se[0], se[2]]])
    #context_words.append([se[2], [se[0], se[1]]])

#for s in selected:
#    se = s.split()
unique_val = np.array(labels)
np.unique(unique_val)

plt.figure(figsize=(18, 8))
sns.countplot(x=labels)

train.drop('label', axis = 1, inplace = True)

images = train.values
images = np.array([np.reshape(i, (28, 28)) for i in images])
images = np.array([i.flatten() for i in images])

from sklearn.preprocessing import LabelBinarizer
label_binarizer = LabelBinarizer()
labels = label_binarizer.fit_transform(labels)

plt.imshow(images[0].reshape(28,28))

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(images, labels, test_size = 0.3, random_state = 101)
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout

batch_size = 128
num_classes = 24
epochs = 50

x_train = x_train / 255
x_test = x_test / 255
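# The network definition is not included in this excerpt; a minimal sketch that
# is consistent with the imports and constants above (all layer sizes are
# assumptions, and the flattened images are reshaped back to 28x28x1 first):
x_train = x_train.reshape(-1, 28, 28, 1)
x_test = x_test.reshape(-1, 28, 28, 1)
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])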
class OneAgainstRest(MulticlassExtension):
    """
      the multiclass extension based on the one-against-rest algorithm.
    """
    def __init__(self,
                 estimator_cls: Callable[[List], Estimator],
                 params: Optional[List] = None) -> None:
        super().__init__()
        self.estimator_cls = estimator_cls
        self.params = params if params is not None else []
        self.label_binarizer_ = None
        self.classes = None
        self.estimators = None

    def train(self, x, y):
        """
        Train one binary estimator per class, each distinguishing that class
        from all of the remaining classes.
        Args:
            x (numpy.ndarray): input points
            y (numpy.ndarray): input labels
        Raises:
            Exception: if all data points are assigned to the same class,
                       the prediction would be trivial
        """
        self.label_binarizer_ = LabelBinarizer(neg_label=0)
        Y = self.label_binarizer_.fit_transform(y)
        self.classes = self.label_binarizer_.classes_
        columns = (np.ravel(col) for col in Y.T)
        self.estimators = []
        for _, column in enumerate(columns):
            unique_y = np.unique(column)
            if len(unique_y) == 1:
                raise Exception(
                    "given all data points are assigned to the same class, "
                    "the prediction would be boring.")
            estimator = self.estimator_cls(*self.params)
            estimator.fit(x, column)
            self.estimators.append(estimator)

    def test(self, x, y):
        """
        Test the trained one-against-rest estimators on labelled data.
        Args:
            x (numpy.ndarray): input points
            y (numpy.ndarray): input labels
        Returns:
            float: accuracy
        """
        A = self.predict(x)
        B = y
        _l = len(A)
        diff = np.sum(A != B)
        logger.debug("%d out of %d are wrong", diff, _l)
        return 1 - (diff * 1.0 / _l)

    def predict(self, x):
        """
        applying multiple estimators for prediction
        Args:
            x (numpy.ndarray): NxD array
        Returns:
            numpy.ndarray: predicted labels, Nx1 array
        """
        n_samples = _num_samples(x)
        maxima = np.empty(n_samples, dtype=float)
        maxima.fill(-np.inf)
        argmaxima = np.zeros(n_samples, dtype=int)
        for i, e in enumerate(self.estimators):
            pred = np.ravel(e.decision_function(x))
            np.maximum(maxima, pred, out=maxima)
            argmaxima[maxima == pred] = i
        return self.classes[np.array(argmaxima.T)]
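# A hedged usage sketch. SVC stands in for the estimator class here purely for
# illustration; the real estimator_cls is whatever Estimator the surrounding
# framework supplies (it only needs fit() and decision_function()):
#   from sklearn.svm import SVC
#   ovr = OneAgainstRest(estimator_cls=SVC)
#   ovr.train(x_train, y_train)
#   print("accuracy:", ovr.test(x_test, y_test))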
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True, help="path to output model")
ap.add_argument("-o", "--output", required=True, help="path to output directory (logs, plots, etc.)")
args = vars(ap.parse_args())

print("[INFO] loading CIFAR-10 data ...")
((train_x, train_y), (test_x, test_y)) = cifar10.load_data()
train_x = train_x.astype("float")
test_x = test_x.astype("float")

mean = np.mean(train_x, axis=0)
train_x -= mean
test_x -= mean

lb = LabelBinarizer()
train_y = lb.fit_transform(train_y)
test_y = lb.transform(test_y)

aug = ImageDataGenerator(width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True, fill_mode="nearest")

fig_path = os.path.sep.join([args["output"], "{}.png".format(os.getpid())])
json_path = os.path.sep.join([args["output"], "{}.json".format(os.getpid())])
# callbacks = [TrainingMonitor(fig_path, json_path=json_path), LearningRateScheduler(poly_decay)]
callbacks = [LearningRateScheduler(poly_decay)]

print("[INFO] compiling model ...")
opt = SGD(lr=INIT_LR, momentum=0.9)
model = MiniGoogleNet.build(width=32, height=32, depth=3, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

print("[INFO] training model ...")