def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    os.chdir('C:\\Users\\Dan\\1) Python Notebooks\\Datasets')
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].values  # .as_matrix() is deprecated; .values is equivalent
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1,
                               scoring='accuracy')
    lb = LabelBinarizer()
    y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
    grid_search.fit(X_train, y_train)

    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    predictions = grid_search.predict(X_test)
    # binarize the test labels with the already-fitted binarizer
    y_test = np.array([number[0] for number in lb.transform(y_test)])
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Precision:', precision_score(y_test, predictions))
    print('Recall:', recall_score(y_test, predictions))
def test_label_binarizer_multilabel():
    lb = LabelBinarizer()

    # test input as lists of tuples
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(indicator_mat, got)
    assert_equal(lb.inverse_transform(got), inp)

    # test input as label indicator matrix
    lb.fit(indicator_mat)
    assert_array_equal(indicator_mat, lb.inverse_transform(indicator_mat))

    # regression test for the two-class multilabel case
    lb = LabelBinarizer()
    inp = [[1, 0], [0], [1], [0, 1]]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 1],
                         [1, 1]])
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_equal([set(x) for x in lb.inverse_transform(got)],
                 [set(x) for x in inp])
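# Recent scikit-learn releases reject sequence-of-sequences input to
# LabelBinarizer; MultiLabelBinarizer is the supported equivalent. A minimal
# sketch reproducing the indicator matrix from the test above:
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
print(mlb.fit_transform([(2, 3), (1,), (1, 2)]))
# [[0 1 1]
#  [1 0 0]
#  [1 1 0]]
print(mlb.inverse_transform(np.array([[0, 1, 1]])))  # [(2, 3)]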
def initData(filename):
    if not os.path.exists(filename):
        print("I can't find this file: %s" % filename)
        sys.exit(1)
    datareader = csv.reader(open(filename, 'r'))
    ct = 0
    for row in datareader:
        ct = ct + 1
    datareader = csv.reader(open(filename, 'r'))
    data = np.array(-1 * np.ones((ct, 7), float), object)
    k = 0
    for row in datareader:
        data[k, :] = np.array(row)
        k = k + 1

    # To modify
    featnames = np.array(ATTRIBUTES, str)
    keys = [[]] * np.size(data, 1)
    numdata = -1 * np.ones_like(data)
    nfeatures = [0]
    featIndex = []

    # convert string objects to integer values for modeling:
    for k in range(np.size(data, 1)):
        keys[k], garbage, numdata[:, k] = np.unique(data[:, k], True, True)

    numrows = np.size(numdata, 0)  # number of instances in car data set
    numcols = np.size(numdata, 1)  # number of columns in car data set
    numdata = np.array(numdata, int)
    xdata = numdata[:, :-1]  # x-data is all data BUT the last column, which holds the class labels
    ydata = numdata[:, -1]   # y-data is the class labels in the final column, signified by -1

    # ---------- numdata multilabel -> binary conversion for NB-Model ----------
    lbin = LabelBinarizer()
    for k in range(np.size(xdata, 1)):  # loop through the columns of xdata
        if k == 0:
            xdata_ml = lbin.fit_transform(xdata[:, k])
            featIndex = lbin.classes_
            nfeatures.append(len(lbin.classes_))
        else:
            xdata_ml = np.hstack((xdata_ml, lbin.fit_transform(xdata[:, k])))
            featIndex = np.hstack((featIndex, lbin.classes_))
            nfeatures.append(nfeatures[-1] + len(lbin.classes_))
    if _VERBOSE:
        print("nfeatures:")
        print(nfeatures)
        print("featIndex")
        print(featIndex)
    return xdata_ml, xdata, ydata, data, nfeatures, keys, featIndex
def encode_categorical(cat, missing_value=False, option="binary"):
    # Encodes the categorical features. For N unique categories:
    # cat : the column of categorical values
    # option = 'binary'    : binary (one-hot, orthogonal, thermometer) encoding - N features
    #          'freq'      : occurring frequency (percentage) - 1 feature
    #          'mis_float' : all binary encoded except missing values (vector of floats
    #                        corresponding to occurrence frequencies) - (N-1) features
    #          'mis_unif'  : all binary encoded except missing values (vector of floats
    #                        of uniform values) - (N-1) features
    #          'dummy'     : just like binary but one column is removed - (N-1) features
    #          'sum'       : sum (deviation) coding; just like dummy but the zeros row is all -1 - (N-1) features
    # TODO: encoding w.r.t. targets
    if option == "binary":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)
    elif option == "freq":
        freq_count = itemfreq(cat)
        encoded = np.zeros(len(cat))
        for i in range(freq_count.shape[0]):
            encoded[cat == freq_count[i][0]] = float(freq_count[i][1]) / len(encoded)
    elif option == "mis_float":
        if missing_value == False:
            raise ValueError("Provide a missing value for the option 'mis_float'.")
        else:
            lb = LabelBinarizer()
            encoded = lb.fit_transform(cat).astype(float)
            missing_bool = cat == missing_value
            if np.sum(missing_bool) == 0:
                raise ValueError("No such missing value!")
            encoded = np.delete(encoded, np.argmax(encoded[np.argmax(missing_bool), :]), axis=1)
            encoded[missing_bool, :] = np.sum(encoded[~missing_bool], axis=0) / float(encoded[~missing_bool].shape[0])
    elif option == "mis_unif":
        if missing_value == False:
            raise ValueError("Provide a missing value for the option 'mis_unif'.")
        else:
            lb = LabelBinarizer()
            encoded = lb.fit_transform(cat).astype(float)
            missing_bool = cat == missing_value
            if np.sum(missing_bool) == 0:
                raise ValueError("No such missing value!")
            encoded = np.delete(encoded, np.argmax(encoded[np.argmax(missing_bool), :]), axis=1)
            encoded[missing_bool, :] = np.ones(encoded.shape[1]) * 1.0 / encoded.shape[1]
    elif option == "dummy":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)[:, 0:-1]
    elif option == "sum":
        lb = LabelBinarizer()
        encoded = lb.fit_transform(cat)
        last_col = encoded[:, -1].astype(bool)
        encoded = encoded[:, 0:-1]
        encoded[last_col, :] = -1
    else:
        raise ValueError("No such option!")
    # 'freq' returns a 1-D array, so guard the column count
    n_cols = encoded.shape[1] if encoded.ndim > 1 else 1
    print("Number of unique categorical values : %s" % n_cols)
    return encoded
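# A minimal usage sketch of encode_categorical on toy data; only the
# LabelBinarizer-backed options are exercised here:
import numpy as np

cat = np.array(["a", "b", "a", "c"])
print(encode_categorical(cat, option="binary").shape)  # (4, 3): one column per category
print(encode_categorical(cat, option="dummy").shape)   # (4, 2): one reference column dropped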
class MLPClassifier(BaseMLP, ClassifierMixin):
    """Multilayer Perceptron Classifier.

    Uses a neural network with one hidden layer.

    Parameters
    ----------

    Attributes
    ----------

    Notes
    -----

    References
    ----------
    """

    def __init__(self, n_hidden=200, lr=0.1, l2decay=0, loss="cross_entropy",
                 output_layer="softmax", batch_size=100, verbose=0):
        super(MLPClassifier, self).__init__(n_hidden, lr, l2decay, loss,
                                            output_layer, batch_size, verbose)

    def fit(self, X, y, max_epochs=10, shuffle_data=False):
        self.lb = LabelBinarizer()
        one_hot_labels = self.lb.fit_transform(y)
        super(MLPClassifier, self).fit(X, one_hot_labels, max_epochs,
                                       shuffle_data)
        return self

    def predict(self, X):
        prediction = super(MLPClassifier, self).predict(X)
        return self.lb.inverse_transform(prediction)
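# A minimal usage sketch (assumes the BaseMLP backend above is importable;
# this is the snippet's custom class, not sklearn.neural_network.MLPClassifier):
from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)
clf = MLPClassifier(n_hidden=64).fit(X, y, max_epochs=5)
print(clf.predict(X[:3]))  # original labels, recovered via lb.inverse_transform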
def test_multinomial_loss_ground_truth():
    # n_samples, n_features, n_classes = 4, 2, 3
    n_classes = 3
    X = np.array([[1.1, 2.2], [2.2, -4.4], [3.3, -2.2], [1.1, 1.1]])
    y = np.array([0, 1, 2, 0])
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)

    weights = np.array([[0.1, 0.2, 0.3], [1.1, 1.2, -1.3]])
    intercept = np.array([1., 0, -.2])
    sample_weights = np.array([0.8, 1, 1, 0.8])

    prediction = np.dot(X, weights) + intercept
    logsumexp_prediction = logsumexp(prediction, axis=1)
    p = prediction - logsumexp_prediction[:, np.newaxis]
    loss_1 = -(sample_weights[:, np.newaxis] * p * Y_bin).sum()
    diff = sample_weights[:, np.newaxis] * (np.exp(p) - Y_bin)
    grad_1 = np.dot(X.T, diff)

    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    assert_almost_equal(loss_1, loss_2)
    assert_array_almost_equal(grad_1, grad_2)

    # ground truth
    loss_gt = 11.680360354325961
    grad_gt = np.array([[-0.557487, -1.619151, +2.176638],
                        [-0.903942, +5.258745, -4.354803]])
    assert_almost_equal(loss_1, loss_gt)
    assert_array_almost_equal(grad_1, grad_gt)
def run_sr():
    dim = (X_train.shape[1], n_classes)
    lb = LabelBinarizer()
    y_true = lb.fit_transform(y_train)

    sr = SoftmaxRegression(dim)
    sr.fit(X_train, y_true, verbose=1)
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github
    master) to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    labs = [class_indices[cls] for cls in tagset]

    return ((precision_recall_fscore_support(y_true_combined,
                                             y_pred_combined,
                                             labels=labs,
                                             average=None,
                                             sample_weight=None)),
            (classification_report(
                y_true_combined,
                y_pred_combined,
                labels=[class_indices[cls] for cls in tagset],
                target_names=tagset,
            )),
            labs)
class GBClassifier(_BaseGB, ClassifierMixin):

    def __init__(self, estimator, n_estimators=100,
                 step_size="line_search", learning_rate=0.1,
                 loss="squared_hinge", subsample=1.0,
                 callback=None, random_state=None):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.step_size = step_size
        self.learning_rate = learning_rate
        self.loss = loss
        self.subsample = subsample
        self.callback = callback
        self.random_state = random_state

    def _get_loss(self):
        losses = dict(squared_hinge=_SquaredHingeLoss(), log=_LogLoss())
        return losses[self.loss]

    def fit(self, X, y):
        self._lb = LabelBinarizer(neg_label=-1)
        Y = self._lb.fit_transform(y)
        return super(GBClassifier, self).fit(X, Y)

    def predict(self, X):
        pred = self.decision_function(X)
        return self._lb.inverse_transform(pred)
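# A minimal usage sketch (assumes the _BaseGB backend above; the base
# estimator is a regressor fitted to pseudo-residuals, as in gradient boosting):
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeRegressor

X, y = load_iris(return_X_y=True)
gb = GBClassifier(estimator=DecisionTreeRegressor(max_depth=3), n_estimators=20)
print(gb.fit(X, y).predict(X)[:5])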
class ElasticNetClassifier(LinearClassifierMixin, ElasticNet):
    """Class to extend elastic-net in case of classification."""

    def fit(self, X, y, check_input=True):
        self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        Y = self._label_binarizer.fit_transform(y)
        if self._label_binarizer.y_type_.startswith('multilabel'):
            # we don't (yet) support multi-label classification in ENet
            raise ValueError(
                "%s doesn't support multi-label classification" % (
                    self.__class__.__name__))

        # Y = column_or_1d(Y, warn=True)
        super(ElasticNetClassifier, self).fit(X, Y)
        if self.classes_.shape[0] > 2:
            ndim = self.classes_.shape[0]
        else:
            ndim = 1
        self.coef_ = self.coef_.reshape(ndim, -1)

        return self

    @property
    def classes_(self):
        return self._label_binarizer.classes_
def binarize_seqfeature(X):
    """
    Binarizes the sequence features into 1s and 0s.

    Parameters:
    ===========
    - X: (pandas DataFrame) the sequence feature matrix without drug
         resistance values.

    Returns:
    ========
    - binarized: (pandas DataFrame) a binarized sequence feature matrix
                 with columns corresponding to particular amino acids at
                 each position.
    - binarizers: (dictionary) a dictionary of binarizer objects for each
                  position.
    """
    binarized = pd.DataFrame()
    binarizers = dict()
    for col in X.columns:
        lb = LabelBinarizer()
        binarized_cols = lb.fit_transform(X[col])
        if len(lb.classes_) == 2:
            binarized[col] = pd.Series(binarized_cols[:, 0])
        else:
            for i, c in enumerate(lb.classes_):
                binarized[col + "_" + c] = binarized_cols[:, i]
        binarizers[col] = lb

    return binarized, binarizers
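# A minimal usage sketch on a toy alignment with one two-state position
# (kept as a single 0/1 column) and one three-state position (expanded):
import pandas as pd

X = pd.DataFrame({"pos1": list("AACA"), "pos2": list("GTCG")})
binarized, binarizers = binarize_seqfeature(X)
print(binarized.columns.tolist())  # ['pos1', 'pos2_C', 'pos2_G', 'pos2_T']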
def bio_classification_report(y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = 1 - lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = list(chain.from_iterable(y_pred))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    print('True sum %d Pred sum %d Len %d' % (sum(y_true_combined),
                                              sum(y_pred_combined),
                                              len(y_pred_combined)))
    print("AUC\tP-R: %.4f\tROC: %.4f" % (
        average_precision_score(y_true_combined, y_pred_combined, average=None),
        roc_auc_score(y_true_combined, y_pred_combined, average=None)))
    # plt.figure()
    # fpr, tpr, thr = roc_curve(y_true_combined, y_pred_combined)
    # area = auc(fpr, tpr)
    # plt.plot(fpr, tpr, label='{area:.3f}'.format(area=area))
    # plt.legend(loc=4)
    # plt.savefig('sub3.jpg')

    return classification_report(
        1 - y_true_combined,
        [0 if v > 0.1 else 1 for v in y_pred_combined],
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
def just_categorical(dropped):
    # create initial matrix
    print('starting with m0')
    lb = LabelBinarizer(sparse_output=True)
    m = lb.fit_transform(dropped.restaurant_id)
    print(m.shape)

    # build matrix
    # making nan its own category for categorical
    print("adding categorical to matrix")
    m = add_categorical_to_matrix(m, dropped, [
        'review_stars', 'user_name', 'restaurant_stars',
        'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol',
        'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage',
        'restaurant_attributes_noise_level', 'restaurant_attributes_smoking',
        'restaurant_attributes_wifi', 'restaurant_city',
        'restaurant_hours_friday_close', 'restaurant_hours_friday_open',
        'restaurant_hours_monday_close', 'restaurant_hours_monday_open',
        'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open',
        'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open',
        'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open',
        'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open',
        'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open',
        'restaurant_ambience', 'restaurant_music', 'restaurant_parking',
        'restaurant_street', 'restaurant_zipcode', 'inspection_year',
        'inspection_month', 'inspection_day', 'inspection_dayofweek',
        'inspection_quarter',
    ])
    print(m.shape)

    print("adding bool to matrix")
    m = add_categorical_to_matrix(m, dropped, [
        'restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob',
        'restaurant_attributes_caters', 'restaurant_attributes_coat_check',
        'restaurant_attributes_corkage', 'restaurant_attributes_delivery',
        'restaurant_attributes_dietary_restrictions_dairy_free',
        'restaurant_attributes_dietary_restrictions_gluten_free',
        'restaurant_attributes_dietary_restrictions_halal',
        'restaurant_attributes_dietary_restrictions_kosher',
        'restaurant_attributes_dietary_restrictions_soy_free',
        'restaurant_attributes_dietary_restrictions_vegan',
        'restaurant_attributes_dietary_restrictions_vegetarian',
        'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr',
        'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups',
        'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch',
        'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner',
        'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch',
        'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour',
        'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours',
        'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating',
        'restaurant_attributes_payment_types_amex',
        'restaurant_attributes_payment_types_cash_only',
        'restaurant_attributes_payment_types_discover',
        'restaurant_attributes_payment_types_mastercard',
        'restaurant_attributes_payment_types_visa',
        'restaurant_attributes_take_out', 'restaurant_attributes_takes_reservations',
        'restaurant_attributes_waiter_service',
        'restaurant_attributes_wheelchair_accessible',
    ])
    print(m.shape)

    print("adding restaurant categories to matrix")
    cats = ['restaurant_category_1', 'restaurant_category_2',
            'restaurant_category_3', 'restaurant_category_4',
            'restaurant_category_5', 'restaurant_category_6',
            'restaurant_category_7']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("adding restaurant neighborhoods to matrix")
    cats = ['restaurant_neighborhood_1', 'restaurant_neighborhood_2',
            'restaurant_neighborhood_3']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("matrix shape of {}".format(m.shape))
    joblib.dump(m, 'pickle_jar/categorical_matrix')
def transform(self, data_dict):
    listOfUnits = ["kilogram", "kg", "gram", "[GMgmkK]?Hz", "liter", "ml",
                   "cup", "cm", "foot", "inch", "meter", "mg", "gallon",
                   "milliliter", "[MGTmgtKk]B"]
    regex = r"[\d]+\.[\d]+(" + r"[\b/,-]|".join(listOfUnits) + r")"
    data = data_dict[self.key].str.extract(regex, flags=re.IGNORECASE,
                                           expand=False).str.lower()
    lb = LabelBinarizer()
    return lb.fit_transform(data.fillna(""))
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    # Clipping
    Y = np.clip(y_pred, eps, 1 - eps)

    # This happens in cases when elements in y_pred have type "str".
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # If y_pred is of single dimension, assume y_true to be binary
    # and then check.
    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Check if dimensions are consistent.
    val.check_consistent_length(T, Y)
    T = val.check_array(T)
    Y = val.check_array(Y)
    if T.shape[1] != Y.shape[1]:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (T.shape[1], Y.shape[1]))

    # Renormalize
    Y /= Y.sum(axis=1)[:, np.newaxis]
    loss = -(T * np.log(Y)).sum(axis=1)

    return _weighted_sum(loss, sample_weight, normalize)
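# A minimal sanity check against scikit-learn's reference implementation
# (assumes the `val` and `_weighted_sum` helpers above come from
# sklearn.utils, as in sklearn's own metrics module):
from sklearn.metrics import log_loss as sk_log_loss

y_true = ["spam", "ham", "ham", "spam"]
y_pred = [[0.1, 0.9], [0.8, 0.2], [0.7, 0.3], [0.01, 0.99]]
print(sk_log_loss(y_true, y_pred))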
def full_matrix(dropped):
    # create initial matrix
    print('starting with m0')
    lb = LabelBinarizer(sparse_output=True)
    # m = lb.fit_transform(dropped.restaurant_id)
    m = lb.fit_transform(dropped.user_name)
    print(m.shape)

    # build matrix
    # making nan its own category for categorical
    print("adding categorical to matrix")
    m = add_categorical_to_matrix(m, dropped, [
        'review_stars', 'user_name', 'restaurant_stars',
        'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol',
        'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage',
        'restaurant_attributes_noise_level', 'restaurant_attributes_smoking',
        'restaurant_attributes_wifi', 'restaurant_city',
        'restaurant_hours_friday_close', 'restaurant_hours_friday_open',
        'restaurant_hours_monday_close', 'restaurant_hours_monday_open',
        'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open',
        'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open',
        'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open',
        'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open',
        'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open',
        'restaurant_ambience', 'restaurant_music', 'restaurant_parking',
        'restaurant_street', 'restaurant_zipcode', 'inspection_year',
        'inspection_month', 'inspection_day', 'inspection_dayofweek',
        'inspection_quarter',
    ])
    print(m.shape)

    print("adding bool to matrix")
    m = add_categorical_to_matrix(m, dropped, [
        'restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob',
        'restaurant_attributes_caters', 'restaurant_attributes_coat_check',
        'restaurant_attributes_corkage', 'restaurant_attributes_delivery',
        'restaurant_attributes_dietary_restrictions_dairy_free',
        'restaurant_attributes_dietary_restrictions_gluten_free',
        'restaurant_attributes_dietary_restrictions_halal',
        'restaurant_attributes_dietary_restrictions_kosher',
        'restaurant_attributes_dietary_restrictions_soy_free',
        'restaurant_attributes_dietary_restrictions_vegan',
        'restaurant_attributes_dietary_restrictions_vegetarian',
        'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr',
        'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups',
        'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch',
        'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner',
        'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch',
        'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour',
        'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours',
        'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating',
        'restaurant_attributes_payment_types_amex',
        'restaurant_attributes_payment_types_cash_only',
        'restaurant_attributes_payment_types_discover',
        'restaurant_attributes_payment_types_mastercard',
        'restaurant_attributes_payment_types_visa',
        'restaurant_attributes_take_out', 'restaurant_attributes_takes_reservations',
        'restaurant_attributes_waiter_service',
        'restaurant_attributes_wheelchair_accessible',
    ])
    print(m.shape)

    m = add_numerical_to_matrix(m, dropped, [
        'review_votes_cool', 'review_votes_funny', 'review_votes_useful',
        'user_average_stars', 'user_compliments_cool', 'user_compliments_cute',
        'user_compliments_funny', 'user_compliments_hot', 'user_compliments_list',
        'user_compliments_more', 'user_compliments_note', 'user_compliments_photos',
        'user_compliments_plain', 'user_compliments_profile', 'user_compliments_writer',
        'user_fans', 'user_review_count', 'user_votes_cool', 'user_votes_funny',
        'user_votes_useful', 'restaurant_attributes_price_range',
        'restaurant_latitude', 'restaurant_longitude', 'restaurant_review_count',
        'checkin_counts', 'review_delta', 'previous_inspection_delta',
        'polarity', 'subjectivity', 'neg', 'neu', 'pos', 'compound',
        'user_yelping_since_delta', 'manager', 'supervisor', 'training',
        'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw',
        'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice',
        'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet',
        'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten',
        'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting',
        'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip',
        'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust',
        'puddle', 'pesticide', 'bugs', 'mold'])
    print(m.shape)

    print("adding restaurant categories to matrix")
    cats = ['restaurant_category_1', 'restaurant_category_2',
            'restaurant_category_3', 'restaurant_category_4',
            'restaurant_category_5', 'restaurant_category_6',
            'restaurant_category_7']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("adding restaurant neighborhoods to matrix")
    cats = ['restaurant_neighborhood_1', 'restaurant_neighborhood_2',
            'restaurant_neighborhood_3']
    m = special_categories_to_matrix(m, dropped, cats)
    print(m.shape)

    print("matrix shape of {}".format(m.shape))
    joblib.dump(m, 'pickle_jar/full_matrix')
def fit(self, X, y):
    """Performs one step of gradient descent."""
    # get the dimensions of our data
    n_samples, n_features = X.shape[0], X.shape[1] + 1
    n_targets = len(np.unique(y))

    # add a column to the data matrix to incorporate the bias term
    X = np.c_[np.ones(n_samples), X]

    # one-vs-all labeling
    lb = LabelBinarizer()
    y = lb.fit_transform(y)

    # initialize the weights
    if self.W is None:
        self.W = np.zeros((n_features, n_targets))

    # perform the optimization using gradient descent with momentum
    grad = self.gradient(X, y)
    self.W = self.W - self.learning_rate * (grad + self.momentum * self.prev_grad)
    self.prev_grad = grad

    return self.loss(X, y)
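# A minimal, self-contained illustration of the two preprocessing steps used
# in the fit() above: prepending a bias column and one-vs-all binarizing the
# targets (toy data, independent of the surrounding class):
import numpy as np
from sklearn.preprocessing import LabelBinarizer

X = np.arange(6.0).reshape(3, 2)
y = np.array(["a", "b", "c"])
Xb = np.c_[np.ones(X.shape[0]), X]      # bias column of ones
Y = LabelBinarizer().fit_transform(y)   # one indicator column per class
print(Xb.shape, Y.shape)                # (3, 3) (3, 3)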
def run():
    # Load and preprocess data
    label_to_unique_instance = load_data()
    X, Y = preprocess_data(label_to_unique_instance)

    # Encode labels
    label_binarizer = LabelBinarizer()
    transformed_Y = label_binarizer.fit_transform(Y)

    # Cross validation
    cross_validation_iterator = StratifiedShuffleSplit(Y, n_iter=1,
                                                       test_size=0.4,
                                                       random_state=0)
    for train_index, test_index in cross_validation_iterator:
        break

    # Init model
    model = init_model(raw_feature_dim=X.shape[-1],
                       unique_lable_num=len(label_binarizer.classes_))

    # Training procedure
    model.fit(X[train_index], transformed_Y[train_index],
              batch_size=BATCH_SIZE, nb_epoch=MAXIMUM_EPOCH_NUM,
              validation_data=(X[test_index], transformed_Y[test_index]),
              callbacks=[TensorBoard(log_dir="/tmp/Sequence Classification")],
              verbose=2)

    print("All done!")
def test_multinomial_loss():
    # test if the multinomial loss and gradient computations are consistent
    X, y = iris.data, iris.target.astype(np.float64)
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))

    rng = check_random_state(42)
    weights = rng.randn(n_features, n_classes)
    intercept = rng.randn(n_classes)
    sample_weights = rng.randn(n_samples)
    np.abs(sample_weights, sample_weights)

    # compute loss and gradient like in multinomial SAG
    dataset, _ = make_dataset(X, y, sample_weights, random_state=42)
    loss_1, grad_1 = _multinomial_grad_loss_all_samples(dataset, weights,
                                                        intercept, n_samples,
                                                        n_features, n_classes)

    # compute loss and gradient like in multinomial LogisticRegression
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)
    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    # comparison
    assert_array_almost_equal(grad_1, grad_2)
    assert_almost_equal(loss_1, loss_2)
def report(test_y, pred_y):
    lb = LabelBinarizer()
    test_y_combined = lb.fit_transform(list(chain.from_iterable(test_y)))
    pred_y_combined = lb.transform(list(chain.from_iterable(pred_y)))
    tagset = sorted(set(lb.classes_))
    class_indices = {cls: idx for idx, cls in enumerate(tagset)}
    print(classification_report(test_y_combined, pred_y_combined,
                                labels=[class_indices[cls] for cls in tagset],
                                target_names=tagset))
def iris_demo():
    # load the iris dataset
    iris = load_iris()
    X = iris['data']
    y_labels = iris['target']

    lb = LabelBinarizer()
    y = lb.fit_transform(y_labels)

    # split into training, validation and test datasets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=RANDOM_STATE)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=0.25, random_state=RANDOM_STATE)

    # train the neural net
    print("Building logistic regression classifier to classify iris data")
    nn = pynn.ArtificialNeuralNet([X_train.shape[1], 20, y_train.shape[1]])
    print("Training")
    nn.fit(X_train, y_train, X_valid, y_valid,
           batch_size=20, n_epochs=20, learning_rate=0.05,
           random_state=RANDOM_STATE)

    y_pred = nn.predict(X_test)

    print("iris accuracy: {}%".format(
        accuracy_score(y_test.argmax(1), y_pred.argmax(1)) * 100))
def conv_demo():
    # load the digits dataset
    digits = load_digits()
    X = digits['data']
    y_labels = digits['target']

    lb = LabelBinarizer()
    y = lb.fit_transform(y_labels)

    # split into training, validation and test datasets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=RANDOM_STATE)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=0.25, random_state=RANDOM_STATE)

    # train the neural net
    print("Building neural net to classify digits")
    conv_net = pynn.ConvNet(digits['images'][0].shape, 1, y.shape[1],
                            random_state=RANDOM_STATE)
    print("Training")
    conv_net.fit(X_train, y_train, X_valid, y_valid,
                 batch_size=20, n_epochs=20, learning_rate=0.05)

    y_pred = conv_net.predict(X_test)

    print("digits accuracy: {}%".format(
        accuracy_score(y_test.argmax(1), y_pred.argmax(1)) * 100))
def bio_classification_report(y_true, y_pred):
    """Evaluates entity extraction accuracy.

    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github
    master) to calculate averages properly!

    Taken from
    https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
    """
    from sklearn.preprocessing import LabelBinarizer
    from itertools import chain
    from sklearn.metrics import classification_report

    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
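# A minimal usage sketch on two toy BIO-tagged sentences:
y_true = [["B-PER", "I-PER", "O"], ["B-LOC", "O"]]
y_pred = [["B-PER", "O", "O"], ["B-LOC", "O"]]
print(bio_classification_report(y_true, y_pred))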
def binarize_label_columns(df, columns, two_classes_as='single'):
    '''
    Inputs:
        df: Pandas dataframe object.
        columns: Columns to binarize.
        two_classes_as: How to handle two classes, as 'single' or 'multiple' columns.
    Returns a tuple with the following items:
        df: Pandas dataframe object with new columns.
        binlabel_names: Names of the newly created binary variables.
        lb_objects: a dictionary with columns as keys and
            sklearn.LabelBinarizer objects as values.
    '''
    binlabel_names = []
    lb_objects = {}
    for col in columns:
        if len(df[col].unique()) > 1:
            rows_notnull = df[col].notnull()  # Use only valid feature observations
            lb = LabelBinarizer()
            binclass = lb.fit_transform(df[col][rows_notnull])  # Fit & transform on valid observations
            if len(lb.classes_) == 2 and two_classes_as == 'multiple':
                binclass = np.hstack((1 - binclass, binclass))
            lb_objects[col] = lb
            if len(lb.classes_) > 2 or two_classes_as == 'multiple':
                col_binlabel_names = [col + '_' + str(c) for c in lb.classes_]
                binlabel_names += col_binlabel_names  # Names for the binarized classes
                for n in col_binlabel_names:
                    df[n] = np.NaN  # Initialize columns
                df.loc[rows_notnull, col_binlabel_names] = binclass  # Merge binarized data
            elif two_classes_as == 'single':
                binlabel_names.append(col + '_bin')  # Names for the binarized classes
                df[col + '_bin'] = np.NaN  # Initialize columns
                df.loc[rows_notnull, col + '_bin'] = binclass  # Merge binarized data
    return df, binlabel_names, lb_objects
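# A minimal usage sketch: a toy frame with a NaN, which stays unencoded
# because the binarizer is fitted on the non-null rows only:
import numpy as np
import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', np.nan, 'red', 'green']})
df, binlabel_names, lb_objects = binarize_label_columns(df, ['color'])
print(binlabel_names)  # ['color_blue', 'color_green', 'color_red']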
def scorer_auc(y_true, y_pred):
    """Dedicated to 2-class probabilistic outputs."""
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import LabelBinarizer
    le = LabelBinarizer()
    y_true = le.fit_transform(y_true)
    return roc_auc_score(y_true, y_pred)
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github
    master) to calculate averages properly!

    Note: This function was copied from
    http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

    Args:
        y_true: True labels, list of strings
        y_pred: Predicted labels, list of strings

    Returns:
        classification report as string
    """
    lbin = LabelBinarizer()
    y_true_combined = lbin.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lbin.transform(list(chain.from_iterable(y_pred)))

    # tagset = set(lbin.classes_) - {NO_NE_LABEL}
    tagset = set(lbin.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lbin.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
class BinaryRelevanceClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, Y):
        # binarize labels
        self.bl = LabelBinarizer()
        Y = self.bl.fit_transform(Y)
        self.classes_ = self.bl.classes_

        # create an estimator for each label
        self.estimators_ = []
        for i in range(self.bl.classes_.shape[0]):
            estimator = clone(self.estimator)
            estimator.fit(X, Y[:, i])
            self.estimators_.append(estimator)
        # follow the scikit-learn convention of returning self
        return self

    def predict(self, X):
        self._check_is_fitted()

        X = np.atleast_2d(X)
        Y = np.empty((X.shape[0], self.classes_.shape[0]))
        for i, estimator in enumerate(self.estimators_):
            Y[:, i] = estimator.predict(X).T

        return self.bl.inverse_transform(Y)

    def _check_is_fitted(self):
        if not hasattr(self, "estimators_"):
            raise ValueError("The object hasn't been fitted yet!")
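# A minimal usage sketch; with 1-D multiclass y, binary relevance reduces to
# one-vs-rest (scikit-learn's LogisticRegression as the base estimator):
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
br = BinaryRelevanceClassifier(LogisticRegression(max_iter=1000))
br.fit(X, y)
print(br.predict(X[:5]))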
class CategoricalToNumerical(object):
    """Takes in a dimensionality reducer in order to convert
    categorical features into numerical.
    """

    def __init__(self, dimensionality_reducer=None, verify=True):
        if dimensionality_reducer is None:
            # RandomizedPCA was removed in later scikit-learn releases;
            # PCA(n_components=1, svd_solver='randomized') is the equivalent.
            dimensionality_reducer = RandomizedPCA(1)
        self.dimensionality_reducer = dimensionality_reducer
        self.verify = verify
        self.binarizer = LabelBinarizer()

    def fit(self, X, y=None):
        self._verify(X, self.verify)
        binarized = self.binarizer.fit_transform(X)
        self.dimensionality_reducer.fit(binarized)

    def transform(self, X):
        self._verify(X, False)
        binarized = self.binarizer.transform(X)
        result = self.dimensionality_reducer.transform(binarized).flatten()
        assert X.shape == result.shape
        return result

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def _verify(self, X, verify):
        if verify:
            assert is_categorical(X)
        else:
            assert isinstance(X, np.ndarray)
            assert len(X.shape) == 1
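# A minimal usage sketch (verify=False sidesteps the external is_categorical
# helper; PCA with one component stands in for the removed RandomizedPCA):
import numpy as np
from sklearn.decomposition import PCA

enc = CategoricalToNumerical(PCA(n_components=1), verify=False)
codes = enc.fit_transform(np.array(["a", "b", "a", "c"]))
print(codes.shape)  # (4,): one numeric value per categorical input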
def train(self, X, y):
    n_features = X.shape[1]
    # class_prior = self.class_prior

    # Binarize Y
    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes = labelbin.classes_
    if Y.shape[1] == 1:
        Y = np.concatenate((1 - Y, Y), axis=1)
    n_effective_classes = Y.shape[1]
    self.class_count = np.zeros(n_effective_classes)
    self.feature_count = np.zeros((n_effective_classes, n_features))

    print("Start counting...")
    self.class_count = Y.sum(axis=0)
    print("Finished class counting!")

    print("Start feature counting...")
    self.feature_count = np.dot(Y.T, X)
    print("Finished feature counting!")

    # Apply add-k smoothing
    print("Start smoothing...")
    self.class_count_smooth = self.class_count + self.k * len(self.classes)
    self.feature_count_smooth = self.feature_count + self.k
    print("Finished smoothing!")

    # Convert to log probabilities
    self.feature_log_prob = (np.log(self.feature_count_smooth) -
                             np.log(self.class_count_smooth.reshape(-1, 1)))
    self.class_log_prior = np.zeros(len(self.classes)) - np.log(len(self.classes))
    return self
def get_dataset2(test_fraction):
    """Vectorizes the features and labels into categorical values and
    randomly splits them into train and test sets.

    :param test_fraction: fraction of the data used for the test split
    :return: X_train, X_test, y_train, y_test
    """
    data = []
    with open('labels.csv', 'r') as datafile:
        csv_reader = csv.reader(datafile, delimiter=',', quotechar='|')
        for row in csv_reader:
            data.append(row)

    data = numpy.asarray(data)
    X = data[:, 0:data.shape[1] - 1]
    y = data[:, data.shape[1] - 1]
    # X, y = get_tabledata()

    vec = DictVectorizer()
    feature_dict = [dict(enumerate(x)) for x in X.tolist()]
    X = vec.fit_transform(feature_dict).toarray()
    joblib.dump(vec, 'vectorizer.pkl')

    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    joblib.dump(lb, 'binarizer.pkl')

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_fraction)
    return X_train, X_test, y_train, y_test
def main():
    feature_vectorized_file_name = 'Data/feature_vectorized2'
    # load the cached features when they exist; otherwise build and save them
    if os.path.exists(feature_vectorized_file_name):
        sparse_merge, price = _load(feature_vectorized_file_name)
        print(sparse_merge.shape)
    else:
        ####################################################################
        start_time = time.time()
        merge, submission, price = get_extract_feature()
        merge = merge[:TRAIN_SIZE]
        merge['item_condition_id'] = merge['item_condition_id'].astype('category')
        print('[{}] Convert categorical completed'.format(time.time() - start_time))

        # vectorize features
        wb = CountVectorizer()
        X_category2 = wb.fit_transform(merge['category_2'])
        X_category3 = wb.fit_transform(merge['category_name'])
        X_brand2 = wb.fit_transform(merge['brand_name'])
        print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

        lb = LabelBinarizer(sparse_output=True)
        X_brand = lb.fit_transform(merge['brand_name'])
        X_category1 = lb.fit_transform(merge['category_1'])
        X_category4 = lb.fit_transform(merge['category_name'])
        print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

        X_dummies = csr_matrix(
            pd.get_dummies(merge[['item_condition_id', 'shipping']],
                           sparse=True).values)

        # hand feature
        for col in merge.columns:
            if ('Len' in col) or ('Frec' in col):
                merge[col] = np.log1p(merge[col])
                merge[col] = merge[col] / merge[col].max()
        hand_feature = ['brand_name_Frec', 'item_description_wordLen',
                        'brand_name_name_Intsct',
                        'brand_name_item_description_Intsct']
        X_hand_feature = merge[hand_feature].values

        name_w1 = param_space_best_WordBatch['name_w1']
        name_w2 = param_space_best_WordBatch['name_w2']
        desc_w1 = param_space_best_WordBatch['desc_w1']
        desc_w2 = param_space_best_WordBatch['desc_w2']

        wb = wordbatch.WordBatch(normalize_text=None,
                                 extractor=(WordBag, {
                                     "hash_ngrams": 2,
                                     "hash_ngrams_weights": [name_w1, name_w2],
                                     "hash_size": 2 ** 28,
                                     "norm": None,
                                     "tf": 'binary',
                                     "idf": None,
                                 }),
                                 procs=8)
        wb.dictionary_freeze = True
        X_name = wb.fit_transform(merge['name'])
        del wb
        X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 2, 0, 1),
                                    dtype=bool)]
        print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

        merge['item_description'] = merge['category_2'].map(str) + ' .#d3 .#d3 ' + \
                                    merge['name'].map(str) + ' .#d3 .#d3 ' + \
                                    merge['item_description'].map(str)
        wb = wordbatch.WordBatch(normalize_text=None,
                                 extractor=(WordBag, {
                                     "hash_ngrams": 3,
                                     "hash_ngrams_weights": [desc_w1, desc_w2, 0.7],
                                     "hash_size": 2 ** 28,
                                     "norm": "l2",
                                     "tf": 1.0,
                                     "idf": None
                                 }),
                                 procs=8)
        wb.dictionary_freeze = True
        X_description = wb.fit_transform(merge['item_description'])
        del wb
        X_description = X_description[:, np.array(np.clip(
            X_description.getnnz(axis=0) - 6, 0, 1), dtype=bool)]
        print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

        sparse_merge = hstack((X_dummies, X_brand, X_brand2, X_category1,
                               X_category2, X_category3, X_category4,
                               X_hand_feature, X_name, X_description)).tocsr()
        print(X_dummies.shape, X_brand.shape, X_brand2.shape,
              X_category1.shape, X_category2.shape, X_category3.shape,
              X_category4.shape, X_hand_feature.shape, X_name.shape,
              X_description.shape, sparse_merge.shape)

        _save(feature_vectorized_file_name, [sparse_merge, price])
        print('[{}] data saved.'.format(time.time() - start_time))

    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 3 fold cross validation
    # learner_name = 'best_FTRL'
    # learner_name = 'FTRL'
    learner_name = 'best_FM_FTRL'
    # learner_name = 'FM_FTRL'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')

    optimizer = TaskOptimizer(learner_name, sparse_merge, price, logger)
    optimizer.run()
    a = 12
INIT_LR = 1e-3
EPOCHS = 10
BS = 128

# grab the MNIST dataset
print("[INFO] accessing MNIST...")
((trainData, trainLabels), (testData, testLabels)) = mnist.load_data()

# add a channel (i.e., grayscale) dimension to the digits
trainData = trainData.reshape((trainData.shape[0], 28, 28, 1))
testData = testData.reshape((testData.shape[0], 28, 28, 1))

# scale data to the range of [0, 1]
trainData = trainData.astype("float32") / 255.0
testData = testData.astype("float32") / 255.0

# convert the labels from integers to vectors
le = LabelBinarizer()
trainLabels = le.fit_transform(trainLabels)
testLabels = le.transform(testLabels)

print("[INFO] compiling model...")
opt = Adam(lr=INIT_LR)
model = SudokuNet.build(width=28, height=28, depth=1, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt,
              metrics=["accuracy"])

# train the network
print("[INFO] training network...")
H = model.fit(trainData, trainLabels,
              validation_data=(testData, testLabels),
              batch_size=BS,
              epochs=EPOCHS,
              verbose=1)
# load the input image (224x224) and preprocess it
image = load_img(imagePath, target_size=(224, 224))
image = img_to_array(image)
image = preprocess_input(image)

# update the data and labels lists, respectively
data.append(image)
labels.append(label)

# convert the data and labels to NumPy arrays
data = np.array(data, dtype="float32")
labels = np.array(labels)

# perform one-hot encoding on the labels
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
labels = to_categorical(labels)

# partition the data into training and testing splits using 80% of
# the data for training and the remaining 20% for testing
(trainX, testX, trainY, testY) = train_test_split(data, labels,
                                                  test_size=0.20,
                                                  stratify=labels,
                                                  random_state=42)

# construct the training image generator for data augmentation
aug = ImageDataGenerator(rotation_range=20,
                         zoom_range=0.15,
                         width_shift_range=0.2,
                         height_shift_range=0.2,
import numpy as np

print("[INFO] accessing MNIST...")
# fetch_mldata was removed from scikit-learn; fetch_openml("mnist_784")
# is the replacement for the original MNIST download
dataset = datasets.fetch_openml("mnist_784", as_frame=False)
data = dataset.data

if K.image_data_format() == "channels_first":
    data = data.reshape(data.shape[0], 1, 28, 28)
else:
    data = data.reshape(data.shape[0], 28, 28, 1)

(trainX, testX, trainY, testY) = train_test_split(data / 255,
                                                  dataset.target.astype("int"),
                                                  test_size=0.25,
                                                  random_state=42)

lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

print("[INFO] compiling model...")
optimizer = SGD(lr=0.01)
model = LeNet.build(width=28, height=28, depth=1, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])

print("[INFO] training network...")
H = model.fit(trainX, trainY, validation_data=(testX, testY),
              batch_size=128, epochs=20, verbose=1)

print("[INFO] evaluating network...")
num_of_class = len(set(labels))
epochs = 100
channel_num = 1

# input image dimensions
img_height, img_width = 64, 64

if K.image_data_format() == 'channels_first':
    input_shape = (channel_num, img_height, img_width)
else:
    input_shape = (img_height, img_width, channel_num)

_print('label binarize.')
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
labels_one_hot = lb.fit_transform(labels)
with open('./label_binarizer.model', 'wb') as fp:
    pickle.dump(lb, fp)

from sklearn.model_selection import train_test_split
valid_size = 0.1
valid_idx = int(len(data_paths) * (1 - valid_size))
# _print('split train and test.')
# X_train, X_valid, y_train, y_valid = train_test_split(
#     data_paths, labels_one_hot, test_size=0.1)

_print('train gen')
train_gen = MyGenerator(data_paths[:valid_idx], labels_one_hot[:valid_idx],
def train(self, train, test, save_file=None, is_logging=True):
    if is_logging:
        old_stdout = sys.stdout
        log_file = open(save_file + '.log', 'w')
        sys.stdout = log_file

    # initialize batch loader
    batch_loader = BatchLoader(train[0], train[1], self.seq_len)

    # prepare test data
    reshaped_test = test[0].reshape((test[0].shape[0], self.seq_len,
                                     self.input_dim))
    lb = LabelBinarizer()
    lb.fit(batch_loader.get_classes())
    # the binarizer is already fitted on the training classes
    binarized_test_labels = lb.transform(test[1])

    # initialize the variables
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        # run the initializer
        sess.run(init)

        # for e in range(1, self.num_epochs + 1):
        #     iteration_count = int(np.ceil(train[0].shape[0] / self.batch_size))
        #     for idx in range(0, iteration_count):
        #         batch_x, batch_y = batch_loader.next_batch(self.batch_size)
        #         batch_x = np.expand_dims(batch_x, axis=1)
        #         # batch_x = batch_x.reshape((self.batch_size, self.seq_len, self.input_dim))
        #         sess.run(self.train_op, feed_dict={self.X: batch_x, self.Y: batch_y})
        #     if e % self.display_step == 0 or e == 1:
        #         # Calculate batch loss and accuracy
        #         loss, acc = sess.run([self.loss_op, self.accuracy],
        #                              feed_dict={self.X: batch_x, self.Y: batch_y})
        #         print("Minibatch Step " + str(e) + ", Loss= {:.4f}".format(loss) +
        #               ", Training Accuracy= {:.3f}".format(acc))
        #     if e % 100 == 0:
        #         predictions = []
        #         for smpl_idx in range(0, test[0].shape[0]):
        #             acc = []
        #             for desc in test[0][smpl_idx]:
        #                 acc.append(sess.run(self.prediction,
        #                                     feed_dict={self.X: desc.reshape((1, 1, self.input_dim)),
        #                                                self.Y: binarized_test_labels[smpl_idx, :].reshape((1, self.num_classes))}))
        #             sum_acc = np.sum(acc, axis=0)
        #             predictions.append(np.argmax(sum_acc))
        #         # binarized_predictions = lb.fit_transform(predictions)
        #         print("Epoch #" + str(e) + ", Test Accuracy:",
        #               (100 * np.sum(test[1] == predictions)) / test[1].shape[0])
        #     # if e % 100 == 0:
        #     #     # evaluate model every 100 iterations
        #     #     print("Epoch #" + str(e) + ", Test Accuracy:",
        #     #           sess.run(self.accuracy, feed_dict={self.X: reshaped_test,
        #     #                                              self.Y: binarized_test_labels}), flush=True)

        for step in range(1, self.num_epochs + 1):
            batch_x, batch_y = batch_loader.next_batch(self.batch_size)
            batch_x = batch_x.reshape((self.batch_size, self.seq_len,
                                       self.input_dim))
            sess.run(self.train_op, feed_dict={self.X: batch_x,
                                               self.Y: batch_y})
            if step % self.display_step == 0 or step == 1:
                # Calculate batch loss and accuracy
                loss, acc = sess.run([self.loss_op, self.accuracy],
                                     feed_dict={self.X: batch_x,
                                                self.Y: batch_y})
                print("Step " + str(step) + ", Minibatch Loss= " +
                      "{:.4f}".format(loss) + ", Training Accuracy= " +
                      "{:.3f}".format(acc))
            if step % 100 == 0:
                # evaluate model every 100 iterations
                print("Testing Accuracy:",
                      sess.run(self.accuracy,
                               feed_dict={self.X: reshaped_test,
                                          self.Y: binarized_test_labels}))

        print("Optimization Finished!")

    if is_logging:
        sys.stdout = old_stdout
        log_file.close()
answer = ROWS[:, 12]  # The answer

# Provide list of datetime categories
yearlist = np.array(range(1970, 2050)).astype(str)
monthlist = np.array(range(1, 13)).astype(str)
daylist = np.array(range(1, 32)).astype(str)
hourlist = np.array(range(8, 17)).astype(str)
# Minutes with step size of 5, like 15, 20, 30, 45, etc...
minutelist = np.array(range(0, 61, 5)).astype(str)
weeklist = np.array(range(1, 8)).astype(str)
# ------------------------End Prepare Data-----------------------------

# ------------------------Vectorize-------------------------------
vectorizer = LabelBinarizer()

# Vectorize by going through the SQL result
procedure_name_vec = vectorizer.fit_transform(procedure_name)
print("Procedure Name")
print(procedure_name_vec[0])

provider_name_vec = vectorizer.fit_transform(provider_name)
print("Provider Name")
print(provider_name_vec[0])

answer_vec = vectorizer.fit_transform(answer)  # This is the output
print("Answer shape and %s = %s " % (answer[3], answer_vec[3]))
print(answer_vec.shape)

appt_duration_vec = appt_duration[:, None]
complete_vec = complete[:, None]
cancel_vec = cancel[:, None]
noshow_vec = noshow[:, None]

lb_year = LabelBinarizer().fit(yearlist)
# Step 3: scale the raw pixel intensities to the range [0, 1.0], then
# construct the training (75%) and testing (25%) splits with
# train_test_split()
data = dataset.data.astype("float") / 255.0
(trainX, testX, trainY, testY) = train_test_split(data,
                                                  dataset.target.astype("int"),
                                                  test_size=0.25,
                                                  random_state=42)

# initialize the optimizer and model
print("[INFO] compiling model...")
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)  # reuse the binarizer fitted on the training labels

# Step 4: define the 784-256-128-10 architecture using the Keras
# sequential model; see this tutorial:
# https://keras.io/getting-started/sequential-model-guide/
# inputShape = (height, width, depth)
model = Sequential()
model.add(Dense(256, input_shape=(784,), activation="sigmoid"))
model.add(Dense(128, activation="sigmoid"))
model.add(Dense(10, activation="softmax"))
help="path to models directory") args = vars(ap.parse_args()) # load the testing data, then scale it into the range [0, 1] (testX, testY) = cifar10.load_data()[1] testX = testX.astype("float") / 255.0 # initialize the label names for the CIFAR-10 dataset labelNames = [ "airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck" ] # convert the labels from integers to vectors lb = LabelBinarizer() testY = lb.fit_transform(testY) # construct the path used to collect the models then initialize the # model list modelPaths = os.path.sep.join([args["models"], "*.model"]) modelPaths = list(glob.glob(modelPaths)) models = [] # loop over the model paths, loading the model, and adding it to # the list of models for (i, modelPath) in enumerate(modelPaths): print("[INFO] loading model {}/{}".format(i + 1, len(modelPaths))) models.append(load_model(modelPath)) # initialize the list of predictions print("[INFO] evaluating ensemble...")
output = Feature_set.S_TREE_CROP_NAME
Feature_set = Feature_set.drop(columns=['S_TREE_CROP_NAME'])

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# print(output)
# print(Feature_set)
clf = RandomForestClassifier()
print(Feature_set)

# Encoding to convert string attributes into numbers
from sklearn.preprocessing import LabelBinarizer
croptype_lb = LabelBinarizer()
soiltype_lb = LabelBinarizer()
irrigation_lb = LabelBinarizer()
croptype = croptype_lb.fit_transform(Feature_set.S_TREE_CROP_TYPE.values)
soiltype = soiltype_lb.fit_transform(Feature_set.S_SOIL_TYPE.values)
irrigation = irrigation_lb.fit_transform(Feature_set.S_IRRIGATION.values)

temp = Feature_set.copy()
temp = temp.drop(columns=['S_IRRIGATION', 'S_SOIL_TYPE', 'S_TREE_CROP_TYPE'])
temp['Crop Type'] = pd.DataFrame(croptype)
temp['Irrigation'] = pd.DataFrame(irrigation)
stype = pd.DataFrame(soiltype)
temp["Soil Type"] = (stype[0].astype(str) + stype[1].astype(str) +
                     stype[2].astype(str) + stype[3].astype(str) +
                     stype[4].astype(str) + stype[5].astype(str)).astype(int)

# Final feature set is in temp
Final_Feature_Set = temp.copy()
# print(Final_Feature_Set)
# print("Correlations")
# min normalization
# standardization [upb lwb]
# 3D tensor
# neural network input is always a tensor
# higher-dimension matrix
set1 = np.reshape(set1, (len(set1), 32, len(np.transpose(set1)), 1))
set2 = np.reshape(set2, (len(set2), 32, len(np.transpose(set2)), 1))
set3 = np.reshape(set3, (len(set3), 32, len(np.transpose(set3)), 1))

# Encoding Data Label
# -- 1. binarization
# -- 2. one-hot encoding
label_encoder = LabelBinarizer()
U_L = label_encoder.fit_transform(U_L)
# reuse the binarizer fitted on U_L so both label sets share one class order
T_L = label_encoder.transform(T_L)

# tasks finished:
# 1. data read
# 2. data normalize
# 3. label binarize

# Set the random seed
random_seed = 2

'DATA SPLITTING'
# "Change Here"
# Exp1
# Split the train and the validation set for the fitting
X_train, X_val, Y_train, Y_val = train_test_split(set3,
from sklearn.preprocessing import LabelBinarizer

# Generate data
# fetch_mldata was removed from scikit-learn; fetch_openml replaces it
from sklearn.datasets import fetch_openml

np.random.seed(0)
mnist = fetch_openml('mnist_784', as_frame=False)
X, y = mnist.data, mnist.target
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

lb = LabelBinarizer(neg_label=0, pos_label=1)
y_train = lb.fit_transform(y_train).astype(np.float64)
y_test = lb.transform(y_test).astype(np.float64)

# RandomizedPCA was removed; PCA with the randomized solver is equivalent
ppross = decomposition.PCA(n_components=50, whiten=True,
                           svd_solver='randomized')
ppross.fit(X_train)
X_train = ppross.transform(X_train)
X_test = ppross.transform(X_test)

n = X_train.shape[0]
p = y_train.shape[1]
d = X_train.shape[1]

# sigma = 1113.379020273871
sigma = np.median(pdist(X_train[np.random.choice(X_train.shape[0], 1000), :]))
# sigma = 1. / np.sqrt(2 * 0.00728932)
print('sigma:', sigma)
class RCNN():
    def __init__(self):
        self.init_lr = 1e-4
        self.epochs = 5
        self.bs = 2
        self.baseModel = MobileNetV2(weights="imagenet", include_top=False,
                                     input_tensor=Input(shape=(224, 224, 3)))
        self.model = None
        self.aug = ImageDataGenerator(rotation_range=20,
                                      zoom_range=0.15,
                                      width_shift_range=0.2,
                                      height_shift_range=0.2,
                                      shear_range=0.15,
                                      horizontal_flip=True,
                                      fill_mode="nearest")
        self.trainX, self.trainY = None, None
        self.testX, self.testY = None, None
        self.H = None
        self.lb = None
        self.build_model()

    def load_dataset(self):
        imagePaths = list(paths.list_images(config.BASE_PATH))
        data = []
        labels = []
        for imagePath in imagePaths:
            label = imagePath.split(os.path.sep)[-2]
            image = load_img(imagePath, target_size=config.INPUT_DIMS)
            image = img_to_array(image)
            image = preprocess_input(image)
            data.append(image)
            labels.append(label)
        data = np.array(data, dtype="float32")
        labels = np.array(labels)
        self.lb = LabelBinarizer()
        labels = self.lb.fit_transform(labels)
        labels = to_categorical(labels)
        (self.trainX, self.testX,
         self.trainY, self.testY) = train_test_split(data, labels,
                                                     test_size=0.20,
                                                     stratify=labels,
                                                     random_state=42)
        return self

    def build_model(self):
        headModel = self.baseModel.output
        headModel = AveragePooling2D(pool_size=(7, 7))(headModel)
        headModel = Flatten(name="flatten")(headModel)
        headModel = Dense(128, activation="relu")(headModel)
        headModel = Dropout(0.5)(headModel)
        headModel = Dense(len(config.LABELS), activation="softmax")(headModel)
        self.model = Model(inputs=self.baseModel.input, outputs=headModel)
        for layer in self.baseModel.layers:
            layer.trainable = False
        return self

    def summary(self):
        self.model.summary()

    def compile(self):
        print("[+] Model is compiling...")
        opt = Adam(lr=self.init_lr)
        self.model.compile(loss="binary_crossentropy", optimizer=opt,
                           metrics=["accuracy"])
        return self

    def train(self):
        print("[+] Model is training...")
        self.H = self.model.fit(
            self.aug.flow(self.trainX, self.trainY, batch_size=self.bs),
            steps_per_epoch=len(self.trainX) // self.bs,
            validation_data=(self.testX, self.testY),
            validation_steps=len(self.testX) // self.bs,
            epochs=self.epochs)
        return self

    def evaluate(self):
        print("[INFO] evaluating network...")
        predIdxs = self.model.predict(self.testX, batch_size=self.bs)
        # for each image in the testing set we need to find the index of the
        # label with corresponding largest predicted probability
        predIdxs = np.argmax(predIdxs, axis=1)
        # show a nicely formatted classification report
        print(classification_report(self.testY.argmax(axis=1), predIdxs,
                                    target_names=self.lb.classes_))
        # serialize the model to disk
        print("[+] saving mask detector model...")
        self.model.save(config.MODEL_PATH, save_format="h5")
        # serialize the label encoder to disk
        print("[+] saving label encoder...")
        f = open(config.ENCODER_PATH, "wb")
        f.write(pickle.dumps(self.lb))
        f.close()
        # plot the training loss and accuracy
        N = self.epochs
        plt.style.use("ggplot")
        plt.figure()
        plt.plot(np.arange(0, N), self.H.history["loss"], label="train_loss")
        plt.plot(np.arange(0, N), self.H.history["val_loss"], label="val_loss")
        plt.plot(np.arange(0, N), self.H.history["accuracy"], label="train_acc")
        plt.plot(np.arange(0, N), self.H.history["val_accuracy"], label="val_acc")
        plt.title("Training Loss and Accuracy")
        plt.xlabel("Epoch #")
        plt.ylabel("Loss/Accuracy")
        plt.legend(loc="lower left")
        plt.savefig('test.png')
def LabelBinarize(df):
    lb = LabelBinarizer(sparse_output=True)
    return lb.fit_transform(df).astype(np.int32)
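# A minimal usage sketch; the result is a sparse 0/1 indicator matrix with
# one column per class, in sorted class order:
import pandas as pd

mat = LabelBinarize(pd.Series(["cat", "dog", "cat", "bird"]))
print(mat.shape)  # (4, 3)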
class KernelSVC(BaseClassifier):
    """Estimator for learning kernel SVMs by Newton's method.

    Parameters
    ----------
    alpha : float
        Weight of the penalty term.

    solver : str, 'cg', 'dense'

    max_iter : int
        Maximum number of iterations to perform.

    tol : float
        Tolerance of the stopping criterion.

    kernel : "linear" | "poly" | "rbf" | "sigmoid" | "cosine" | "precomputed"
        Kernel to use. Default: "linear"

    degree : int, default=3
        Degree for poly, rbf and sigmoid kernels. Ignored by other kernels.

    gamma : float, optional
        Kernel coefficient for rbf and poly kernels. Default: 1/n_features.
        Ignored by other kernels.

    coef0 : float, optional
        Independent term in poly and sigmoid kernels. Ignored by other kernels.

    random_state : RandomState or int
        The seed of the pseudo random number generator to use.

    verbose : int
        Verbosity level.

    n_jobs : int
        Number of jobs to use to compute the kernel matrix.

    Example
    -------
    >>> from sklearn.datasets import make_classification
    >>> from lightning.classification import KernelSVC
    >>> X, y = make_classification()
    >>> clf = KernelSVC().fit(X, y)
    >>> accuracy = clf.score(X, y)
    """

    def __init__(self, alpha=1.0, solver="cg", max_iter=50, tol=1e-3,
                 kernel="linear", gamma=0.1, coef0=1, degree=4,
                 random_state=None, verbose=0, n_jobs=1):
        self.alpha = alpha
        self.solver = solver
        self.max_iter = max_iter
        self.tol = tol
        self.kernel = kernel
        self.gamma = gamma
        self.coef0 = coef0
        self.degree = degree
        self.random_state = random_state
        self.verbose = verbose
        self.n_jobs = n_jobs

    def _kernel_params(self):
        return {"gamma": self.gamma,
                "degree": self.degree,
                "coef0": self.coef0}

    def _solve(self, A, b):
        if self.solver == "cg":
            x, info = cg(A, b, tol=self.tol)
        elif self.solver == "dense":
            x = solve(A, b, sym_pos=True)
        return x

    def _fit_binary(self, K, y, rs):
        n_samples = K.shape[0]
        coef = np.zeros(n_samples)
        if n_samples < 1000:
            sv = np.ones(n_samples, dtype=bool)
        else:
            sv = np.zeros(n_samples, dtype=bool)
            sv[:1000] = True
            rs.shuffle(sv)

        for t in range(1, self.max_iter + 1):
            if self.verbose:
                print("Iteration", t, "#SV=", np.sum(sv))

            K_sv = K[sv][:, sv]
            I = np.diag(self.alpha * np.ones(K_sv.shape[0]))

            coef_sv = self._solve(K_sv + I, y[sv])

            coef *= 0
            coef[sv] = coef_sv
            pred = np.dot(K, coef)
            errors = 1 - y * pred
            last_sv = sv
            sv = errors > 0

            if np.array_equal(last_sv, sv):
                if self.verbose:
                    print("Converged at iteration", t)
                break

        return coef

    def _post_process(self, X):
        # We can't know the support vectors when using precomputed kernels.
        if self.kernel != "precomputed":
            sv = np.sum(self.coef_ != 0, axis=0, dtype=bool)
            if np.sum(sv) > 0:
                self.coef_ = np.ascontiguousarray(self.coef_[:, sv])
                mask = safe_mask(X, sv)
                self.support_vectors_ = np.ascontiguousarray(X[mask])
                self.support_indices_ = np.arange(X.shape[0],
                                                  dtype=np.int32)[sv]
            self.n_samples_ = X.shape[0]

        if self.verbose >= 1:
            print("Number of support vectors:", np.sum(sv))

    def fit(self, X, y):
        """Fit model according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : classifier
            Returns self.
        """
        n_samples, n_features = X.shape
        rs = check_random_state(self.random_state)

        self.label_binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1)
        Y = self.label_binarizer_.fit_transform(y)
        self.classes_ = self.label_binarizer_.classes_.astype(np.int32)
        n_vectors = Y.shape[1]

        if self.verbose:
            print("Pre-computing kernel matrix...")

        K = pairwise_kernels(X, filter_params=True, n_jobs=self.n_jobs,
                             metric=self.kernel, **self._kernel_params())

        coef = [self._fit_binary(K, Y[:, i], rs) for i in range(n_vectors)]
        self.coef_ = np.array(coef)
        self.intercept_ = np.zeros(n_vectors, dtype=np.float64)

        self._post_process(X)

        return self

    def decision_function(self, X):
        """Return the decision function for test vectors X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        P : array, shape = [n_classes, n_samples]
            Decision function for X.
        """
        K = pairwise_kernels(X, self.support_vectors_, filter_params=True,
                             n_jobs=self.n_jobs, metric=self.kernel,
                             **self._kernel_params())
        return np.dot(K, self.coef_.T)
# In[34]:

X_train_counts = count_vect.fit_transform(X_train)

# In[35]:

list(count_vect.vocabulary_.items())[0:3]

# In[36]:

len(count_vect.vocabulary_)

# In[56]:

lab_bin = LabelBinarizer()
y_train_bin = lab_bin.fit_transform(y_train)
# transform (not fit_transform) so the test labels reuse the training encoding
y_test_bin = lab_bin.transform(y_test)

# ## Train

# In[58]:

from sklearn.naive_bayes import MultinomialNB

# In[59]:

clf = MultinomialNB().fit(X_train_counts, y_train_bin)

# In[60]:

len(clf.coef_[0])
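# Why transform() instead of a second fit_transform() (illustrative sketch,
# not from the source): refitting on the test labels can silently change the
# column layout when the test set is missing a class.
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer().fit(["neg", "neu", "pos"])      # 3 columns: neg, neu, pos
print(lb.transform(["pos", "neg"]))                   # keeps the training layout
print(LabelBinarizer().fit_transform(["pos", "neg"])) # refit: binary, 1 column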
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pickle

df = pd.read_csv('../data/iowa_recidivism_2019.csv')
y = df['Return to Prison']

# transform target to binary with 1 equal to recidivism
lb_target = LabelBinarizer()
y = pd.Series(lb_target.fit_transform(y).reshape(-1,))
y.index = df.index
X = df.drop('Return to Prison', axis=1)

# train: 19515
# test: 6505
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                    test_size=.25)
X_train.dropna(axis=0, subset=['Sex'], inplace=True)
# keep the labels aligned with the rows that survived the dropna above
y_train = y_train.loc[X_train.index]

class encoded_X_train():
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def oh_encode(self, feature):
                required=True, help="path to the trained model directory")
args = vars(ap.parse_args())

# show information on the process ID
print("[INFO] process ID: {}".format(os.getpid()))

# load the training and testing set and then scale it to the range [0, 1]
((X_train, y_train), (X_test, y_test)) = cifar10.load_data()
X_train = X_train.astype("float") / 255.0
X_test = X_test.astype("float") / 255.0

# convert the labels from integers to vectors: fit on the training labels,
# then reuse that encoding for the test labels
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

# initialize the label names for the CIFAR-10 dataset
labelNames = ["airplane", "automobile", "bird", "cat", "deer",
              "dog", "frog", "horse", "ship", "truck"]

# initialize the optimizer and model
print("[INFO] compiling model...")
opt = SGD(lr=0.01, decay=0.01 / 50, momentum=0.9, nesterov=True)
model = MiniVGGNetwork.build(width=32, height=32, depth=3, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt,
              metrics=["accuracy"])
    data = data.reshape(data.shape[0], 1, 28, 28)
# otherwise, we are using "channels last" ordering, so the design
# matrix shape should be: num_samples x rows x columns x depth
else:
    data = data.reshape(data.shape[0], 28, 28, 1)

(trainX, testX, trainY, testY) = train_test_split(data / 255.0,
                                                  dataset.target.astype("int"),
                                                  test_size=0.25,
                                                  random_state=42)

# convert the labels from integers to vectors
le = LabelBinarizer()
trainY = le.fit_transform(trainY)
testY = le.transform(testY)

print("[INFO] compiling model...")
opt = SGD(lr=0.01)
model = LeNet.build(width=28, height=28, depth=1, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt,
              metrics=["accuracy"])

# train the network
print("[INFO] training network...")
H = model.fit(trainX, trainY, validation_data=(testX, testY), batch_size=128,
def main():
    start_time = time.time()

    train = pd.read_table('../train.tsv', engine='c')
    test = pd.read_table('../test.tsv', engine='c')

    train = train.sample(frac=1).reset_index(drop=True)
    # .loc slicing is end-inclusive, so stop the training rows at 99999 to
    # keep the held-out rows 100000-120000 disjoint from them
    test = train.loc[100000:120000]
    train = train.loc[0:99999]
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    test_labels = np.log1p(test['price'])
    nrow_train = train.shape[0]
    # y = train["price"]
    y = np.log1p(train["price"])
    merge = pd.concat([train, test])
    submission = test[['train_id']]

    del train
    del test
    gc.collect()

    handle_missing_inplace(merge)
    print('[{}] Finished handling missing values'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Finished cutting'.format(time.time() - start_time))

    # note: this is the script's own helper (presumably casting columns to
    # pandas 'category' dtype), not keras.utils.to_categorical
    to_categorical(merge)
    print('[{}] Finished converting categorical'.format(time.time() - start_time))

    cv = CountVectorizer(min_df=NAME_MIN_DF)
    X_name = cv.fit_transform(merge['name'])
    print('[{}] Finished count vectorizing `name`'.format(time.time() - start_time))

    cv = CountVectorizer()
    X_category = cv.fit_transform(merge['category_name'])
    print('[{}] Finished count vectorizing `category_name`'.format(time.time() - start_time))

    tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                         ngram_range=(1, 3), stop_words='english')
    X_description = tv.fit_transform(merge['item_description'])
    print('[{}] Finished TFIDF vectorizing `item_description`'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Finished label binarizing `brand_name`'.format(time.time() - start_time))

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    print('[{}] Finished getting dummies on `item_condition_id` and `shipping`'.format(time.time() - start_time))

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category,
                           X_name)).tocsr()
    print('[{}] Finished creating sparse merge'.format(time.time() - start_time))

    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]
    print("TRAINING SHAPE")
    print(X.shape)
    print("TEST SHAPE")
    print(X_test.shape)

    model = Ridge(solver="sag", fit_intercept=True, random_state=205, alpha=3)
    model.fit(X, y)
    print('[{}] Finished training ridge (sag)'.format(time.time() - start_time))
    predsR = model.predict(X=X_test)
    print('[{}] Finished predicting ridge (sag)'.format(time.time() - start_time))

    model = Ridge(solver="lsqr", fit_intercept=True, random_state=145, alpha=3)
    model.fit(X, y)
    print('[{}] Finished training ridge (lsqr)'.format(time.time() - start_time))
    predsR2 = model.predict(X=X_test)
    print('[{}] Finished predicting ridge (lsqr)'.format(time.time() - start_time))

    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.1,
                                                          random_state=144)
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(valid_X, label=valid_y)
    watchlist = [d_train, d_valid]

    params = {
        'learning_rate': 0.76,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 99,
        'verbosity': -1,
        'metric': 'RMSE',
        'nthread': 4
    }
    params2 = {
        'learning_rate': 0.85,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 110,
        'verbosity': -1,
        'metric': 'RMSE',
        'nthread': 4
    }

    model = lgb.train(params, train_set=d_train, num_boost_round=7500,
                      valid_sets=watchlist, early_stopping_rounds=500,
                      verbose_eval=500)
    predsL = model.predict(X_test)
    print('[{}] Finished predicting lgb 1'.format(time.time() - start_time))

    train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X, y,
                                                              test_size=0.1,
                                                              random_state=101)
    d_train2 = lgb.Dataset(train_X2, label=train_y2)
    d_valid2 = lgb.Dataset(valid_X2, label=valid_y2)
    watchlist2 = [d_train2, d_valid2]

    model = lgb.train(params2, train_set=d_train2, num_boost_round=3000,
                      valid_sets=watchlist2, early_stopping_rounds=50,
                      verbose_eval=500)
    predsL2 = model.predict(X_test)
    print('[{}] Finished predicting lgb 2'.format(time.time() - start_time))

    # weighted blend of the two ridge models and the two lgb models
    preds = predsR2 * 0.15 + predsR * 0.15 + predsL * 0.5 + predsL2 * 0.2

    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_lgbm_ridge_11.csv", index=False)

    print("RMSLE on the held-out split:")
    print(rmsle(preds, test_labels))
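# rmsle() is called above but not defined in this excerpt. Because both preds
# and test_labels are already log1p-transformed, a plain RMSE on that scale is
# the usual definition (an assumption about the missing helper):
def rmsle(y_pred, y_true):
    # RMSE of log-scale values == RMSLE of the original prices
    return np.sqrt(np.mean((np.asarray(y_pred) - np.asarray(y_true)) ** 2))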
ROWS = 10
fig, axes = plt.subplots(ROWS, ROWS, figsize=(10, 10))
for i in range(ROWS):
    for j in range(ROWS):
        k = np.random.choice(range(X_train_orig.shape[0]))
        axes[i][j].set_axis_off()
        axes[i][j].imshow(X_train_orig[k].reshape((28, 28)))
# plt.show()

# Normalize image vectors
X_train = X_train_orig / 255.
X_test = X_test_orig / 255.

# Convert training and test labels to one-hot matrices; transform (rather
# than a second fit_transform) keeps the test encoding consistent
label_binarizer = LabelBinarizer()
Y_train = label_binarizer.fit_transform(Y_train_orig)
Y_test = label_binarizer.transform(Y_test_orig)

print("number of training examples = " + str(X_train.shape[0]))
print("number of test examples = " + str(X_test.shape[0]))
print("X_train shape: " + str(X_train.shape))
print("Y_train shape: " + str(Y_train.shape))
print("X_test shape: " + str(X_test.shape))
print("Y_test shape: " + str(Y_test.shape))

# train the neural network
model = ResNet18(input_shape=(28, 28, 1), classes=24)
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=2, batch_size=32)
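# Illustrative follow-up (not in the source): the fitted binarizer can map
# one-hot network outputs back to the original class labels.
preds = model.predict(X_test)                           # shape (n_samples, 24)
pred_labels = label_binarizer.inverse_transform(preds)  # original labels
print(pred_labels[:10])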
def do_vgg_train(path_input, width, height, basename, vgg_size, fc_size,
                 logLevel="WARN"):
    """Train a VGG-like convolutional network
    """
    logvgg = logging.getLogger(f"{__name__}.console.trainvgg")
    logvgg.setLevel(logLevel)

    model_file = f"{basename}.model"
    label_bin_file = f"{basename}.pickle"
    plot_file = f"{basename}.png"
    logvgg.debug(f"mf {model_file} lbf {label_bin_file} pf {plot_file}")

    data, labels = load_dataset(path_input, width, height, "INFO")

    # partition the data into training and testing splits using 75% of
    # the data for training and the remaining 25% for testing
    (trainX, testX, trainY, testY) = train_test_split(data, labels,
                                                      test_size=0.25)

    # convert the labels from integers to vectors (for 2-class, binary
    # classification you should use Keras' to_categorical function
    # instead as scikit-learn's LabelBinarizer will not return a vector)
    lb = LabelBinarizer()
    trainY = lb.fit_transform(trainY)
    testY = lb.transform(testY)

    # construct the image generator for data augmentation
    # rotation is ok, shear/shift/flip reduced
    aug = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.01,
        height_shift_range=0.01,
        shear_range=0.002,
        zoom_range=0.02,
        horizontal_flip=False,
        fill_mode="nearest",
    )

    if vgg_size == "small":
        # TODO fc_size set from here
        model = SmallVGGNet.build(width=width, height=height, depth=3,
                                  classes=len(lb.classes_))
    elif vgg_size == "middle":
        # default value of fc_size
        if fc_size == -1:
            fc_size = 512
        model = MiddleVGGNet.build(
            width=width,
            height=height,
            depth=3,
            classes=len(lb.classes_),
            fully_connected_size=fc_size,
        )
    else:
        logvgg.critical(f"Unrecognized dimension {vgg_size}, stopping.")
        return -1

    # initialize our initial learning rate, # of epochs to train for, and batch size
    INIT_LR = 0.01
    EPOCHS = 75
    # EPOCHS = 3
    BS = 32  # TODO fiddle with this

    # initialize the model and optimizer (you'll want to use
    # binary_crossentropy for 2-class classification)
    logvgg.info("Training network...")
    opt = SGD(lr=INIT_LR, decay=INIT_LR / EPOCHS)
    model.compile(loss="categorical_crossentropy", optimizer=opt,
                  metrics=["accuracy"])  # TODO fiddle with this

    # save model summary
    summary_file = f"{basename}_summary.txt"
    with open(summary_file, "w") as sf:
        model.summary(line_length=100, print_fn=lambda x: sf.write(f"{x}\n"))
        # using an actual logger: print_fn=logger.info

    # save the model structure in JSON format
    config = model.get_config()
    config_json_file = f"{basename}_structure.json"
    with open(config_json_file, "w") as jf:
        json.dump(config, jf)

    # train the network
    H = model.fit_generator(
        aug.flow(trainX, trainY, batch_size=BS),
        validation_data=(testX, testY),
        steps_per_epoch=len(trainX) // BS,
        epochs=EPOCHS,
    )

    # save the model and label binarizer to disk
    logvgg.info("Serializing network and label binarizer...")
    model.save(model_file)
    with open(label_bin_file, "wb") as f:
        f.write(pickle.dumps(lb))

    # evaluate the network
    logvgg.info("Evaluating network...")
    predictions = model.predict(testX, batch_size=32)
    report = classification_report(testY.argmax(axis=1),
                                   predictions.argmax(axis=1),
                                   target_names=lb.classes_)
    logvgg.info(f"\n{report}")
    report_file = f"{basename}_report.txt"
    with open(report_file, "w") as rf:
        rf.write(report)

    # plot the training loss and accuracy; reflect the actual network size in
    # the title rather than hardcoding "SmallVGGNet"
    N = np.arange(0, EPOCHS)
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(N, H.history["loss"], label="train_loss")
    plt.plot(N, H.history["val_loss"], label="val_loss")
    plt.plot(N, H.history["acc"], label="train_acc")
    plt.plot(N, H.history["val_acc"], label="val_acc")
    plt.title(f"Training Loss and Accuracy ({vgg_size} VGGNet)")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend()
    plt.savefig(plot_file)
    picture = load_img(path, target_size=(224, 224))
    picture = img_to_array(picture)
    picture = preprocess_input(picture)
    data.append(picture)
    labels.append(label)

data = np.array(data, dtype='float32')
labels = np.array(labels)

le = LabelBinarizer()
labels = le.fit_transform(labels)
labels = to_categorical(labels)

(x_train, x_test, y_train, y_test) = train_test_split(data, labels,
                                                      test_size=0.2,
                                                      stratify=labels,
                                                      random_state=42)
print(x_train.shape)
print(y_train.shape)

augmentation = ImageDataGenerator(rotation_range=20, zoom_range=0.15,
                                  width_shift_range=0.2, height_shift_range=0.2,
import numpy as np
import os

images = list(paths.list_images("dataset"))
x = []
Y = []
for image in tqdm(images, desc="Processing Images"):
    label = image.split(os.path.sep)[-2]
    img = load_img(image, target_size=(100, 100))
    img = img_to_array(img)
    img = preprocess_input(img)
    x.append(img)
    Y.append(label)

x_arr = np.array(x, dtype='float32')
Y_arr = np.array(Y)
print(x_arr.shape)
print(Y_arr.shape)

binarizer = LabelBinarizer()
y_arr = binarizer.fit_transform(Y_arr)
y_arr = to_categorical(y_arr)
print(y_arr.shape)

np.save('X_RAW.npy', x_arr)
np.save('Y_RAW.npy', y_arr)
print("Raw Data Saved to disk")
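# Quick sanity check (an assumption, not in the source): reload the saved
# arrays and confirm the shapes round-trip.
X_raw = np.load('X_RAW.npy')
Y_raw = np.load('Y_RAW.npy')
print(X_raw.shape, Y_raw.shape)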
    'latitude', 'longitude', 'review_count', 'is_open', 'Monday', 'Tuesday',
    'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Caters',
    'WheelchairAccessible', 'BikeParking', 'AcceptsInsurance',
    'BusinessAcceptsCreditCards', 'CoatCheck', 'HappyHour', 'GoodForKids',
    'Open24Hours', 'OutdoorSeating', 'HasTV', 'BusinessAcceptsBitcoin',
    'ByAppointmentOnly', 'DogsAllowed', 'DriveThru', 'Smoking', 'NoiseLevel',
    'AgesAllowed', 'Alcohol', 'WiFi', 'Music', 'Ambience', 'BusinessParking',
    'pos_count', 'neg_count', 'checkin_count'
]]

# flatten to 1-D so LabelEncoder does not warn about a column vector
y = np.array(dataset[['stars']]).ravel()
lab_enc = preprocessing.LabelEncoder()
label_dataset_encoded = lab_enc.fit_transform(y)
encoder = LabelBinarizer()
y = encoder.fit_transform(label_dataset_encoded)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.fit_transform(X_test)

'''
# Initializing Neural Network
classifier = Sequential()
# Adding the input layer and the first hidden layer
classifier.add(Dense(output_dim=6, init='uniform', activation='relu',
                     input_dim=8))
# Adding the second hidden layer
classifier.add(Dense(output_dim=6, init='uniform', activation='relu'))
# Adding the output layer
classifier.add(Dense(output_dim=9, init='uniform', activation='sigmoid'))
def one_hot_encode(labels):
    enc = LabelBinarizer()
    return enc.fit_transform(labels)
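# Minimal usage sketch (illustrative, not from the source). Note that
# one_hot_encode creates a fresh LabelBinarizer on every call, so the
# class-to-column mapping is discarded; keep the binarizer around instead if
# you need inverse_transform later.
import numpy as np

encoded = one_hot_encode(np.array([0, 2, 1, 2]))
print(encoded)
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]
#  [0 0 1]]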
# if len(d.split()) == 3:
#     new_data.append(d)
#
# selected = new_data

all_words = []
for s in data:
    for se in s.split():
        all_words.append(se)

words = np.unique(all_words)
vocabulary_size = len(words)

enc = LabelBinarizer()
ohe = enc.fit_transform(words)

input_dic = {}
for w in range(0, len(words)):
    input_dic[words[w]] = np.reshape(ohe[w], (1, vocabulary_size))

context_words = []
for s in data:
    se = s.split()
    context_words.append([se[2], [se[1], se[0]]])
    # context_words.append([se[1], [se[0], se[2]]])
    # context_words.append([se[2], [se[0], se[1]]])

# for s in selected:
#     se = s.split()
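# Sketch of how these pieces feed a CBOW-style model (an assumption about the
# intended use, not shown in this excerpt): average the context one-hots as
# the input vector and use the target word's one-hot as the output vector.
target, context = context_words[0]
x_vec = np.mean([input_dic[w][0] for w in context], axis=0)  # (vocabulary_size,)
y_vec = input_dic[target][0]                                 # (vocabulary_size,)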
unique_val = np.array(labels)
np.unique(unique_val)

plt.figure(figsize=(18, 8))
sns.countplot(x=labels)

train.drop('label', axis=1, inplace=True)

images = train.values
images = np.array([np.reshape(i, (28, 28)) for i in images])
images = np.array([i.flatten() for i in images])

from sklearn.preprocessing import LabelBinarizer
label_binarizer = LabelBinarizer()
labels = label_binarizer.fit_transform(labels)

plt.imshow(images[0].reshape(28, 28))

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(images, labels,
                                                    test_size=0.3,
                                                    random_state=101)

import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout

batch_size = 128
num_classes = 24
epochs = 50

x_train = x_train / 255
x_test = x_test / 255
class OneAgainstRest(MulticlassExtension):
    """The multiclass extension based on the one-against-rest algorithm."""

    def __init__(self,
                 estimator_cls: Callable[[List], Estimator],
                 params: Optional[List] = None) -> None:
        super().__init__()
        self.estimator_cls = estimator_cls
        self.params = params if params is not None else []
        self.label_binarizer_ = None
        self.classes = None
        self.estimators = None

    def train(self, x, y):
        """
        Train multiple estimators, each distinguishing one class from the rest.

        Args:
            x (numpy.ndarray): input points
            y (numpy.ndarray): input labels

        Raises:
            Exception: if all data points are assigned to the same class,
                the prediction would be trivial
        """
        self.label_binarizer_ = LabelBinarizer(neg_label=0)
        Y = self.label_binarizer_.fit_transform(y)
        self.classes = self.label_binarizer_.classes_
        columns = (np.ravel(col) for col in Y.T)
        self.estimators = []
        for column in columns:
            unique_y = np.unique(column)
            if len(unique_y) == 1:
                raise Exception(
                    "given all data points are assigned to the same class, "
                    "the prediction would be boring.")
            estimator = self.estimator_cls(*self.params)
            estimator.fit(x, column)
            self.estimators.append(estimator)

    def test(self, x, y):
        """
        Test the ensemble of one-against-rest estimators.

        Args:
            x (numpy.ndarray): input points
            y (numpy.ndarray): input labels

        Returns:
            float: accuracy
        """
        A = self.predict(x)
        B = y
        _l = len(A)
        diff = np.sum(A != B)
        logger.debug("%d out of %d are wrong", diff, _l)
        return 1 - (diff * 1.0 / _l)

    def predict(self, x):
        """
        Apply the trained estimators for prediction.

        Args:
            x (numpy.ndarray): NxD array

        Returns:
            numpy.ndarray: predicted labels, Nx1 array
        """
        n_samples = _num_samples(x)
        maxima = np.empty(n_samples, dtype=float)
        maxima.fill(-np.inf)
        argmaxima = np.zeros(n_samples, dtype=int)
        # keep, per sample, the index of the estimator with the largest
        # decision value
        for i, e in enumerate(self.estimators):
            pred = np.ravel(e.decision_function(x))
            np.maximum(maxima, pred, out=maxima)
            argmaxima[maxima == pred] = i
        return self.classes[np.array(argmaxima.T)]
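# Usage sketch (an assumption: any class exposing fit/decision_function works
# as the per-class learner, so sklearn's LinearSVC stands in here for the
# framework's Estimator interface).
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_classes=3, n_informative=4)
ovr = OneAgainstRest(estimator_cls=LinearSVC)
ovr.train(X, y)
print("accuracy:", ovr.test(X, y))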
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True, help="path to output model")
ap.add_argument("-o", "--output", required=True,
                help="path to output directory (logs, plots, etc.)")
args = vars(ap.parse_args())

print("[INFO] loading CIFAR-10 data ...")
((train_x, train_y), (test_x, test_y)) = cifar10.load_data()
train_x = train_x.astype("float")
test_x = test_x.astype("float")

# apply mean subtraction using the training-set mean
mean = np.mean(train_x, axis=0)
train_x -= mean
test_x -= mean

lb = LabelBinarizer()
train_y = lb.fit_transform(train_y)
test_y = lb.transform(test_y)

aug = ImageDataGenerator(width_shift_range=0.1, height_shift_range=0.1,
                         horizontal_flip=True, fill_mode="nearest")

fig_path = os.path.sep.join([args["output"], "{}.png".format(os.getpid())])
json_path = os.path.sep.join([args["output"], "{}.json".format(os.getpid())])
# callbacks = [TrainingMonitor(fig_path, json_path=json_path),
#              LearningRateScheduler(poly_decay)]
callbacks = [LearningRateScheduler(poly_decay)]

print("[INFO] compiling model ...")
opt = SGD(lr=INIT_LR, momentum=0.9)
model = MiniGoogleNet.build(width=32, height=32, depth=3, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt,
              metrics=["accuracy"])

print("[INFO] training model ...")
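# poly_decay is referenced in the callbacks above but not defined in this
# excerpt. A common polynomial learning-rate schedule looks like the sketch
# below (an assumption; NUM_EPOCHS is a hypothetical constant that must match
# the training run, and INIT_LR is the same constant used for the optimizer):
def poly_decay(epoch):
    power = 1.0  # linear decay when power == 1
    return INIT_LR * (1 - (epoch / float(NUM_EPOCHS))) ** power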