def partial_3():
    perceptron_3 = Perceptron()
    stop_3 = 0
    counter_3 = 0
    while stop_3 == 0:
        for x in range(1000):
            perceptron_3.partial_fit([data_3[x]], [labels_3[x]],
                                     classes=np.unique(labels_3))
            if perceptron_3.score(data_3, labels_3) == 1:
                stop_3 += 1
            if counter_3 >= 100000:
                return print('No Convergence')
            else:
                counter_3 += 1
    weights = perceptron_3.coef_
    w1 = weights[0][0]
    w2 = weights[0][1]
    w0 = perceptron_3.intercept_[0]
    print("Weights were adjusted {} times".format(counter_3))
    print("Intercept (w0) is {}".format(w0))
    print("Final weight vector is: {} Hence:".format(weights))
    print("w1 is: {}, w2 is: {}".format(w1, w2))
    # Deriving the line based on HW Problem 1.2
    a = -1 * (w1 / w2)
    b = -1 * (w0 / w2)
    print("The equation of the decision boundary line is y = ({})x + ({})".format(a, b))
def train_classifiers(models, train_data):
    classifiers = dict()
    for modelname, model in models.items():
        if settings["classifier"] == "Perceptron":
            classifier = Perceptron()
        elif settings["classifier"] == "PassiveAggressive":
            classifier = PassiveAggressiveClassifier()
        for sample_no, (text, is_acq) in enumerate(train_data):
            bow = dictionary.doc2bow(simple_preprocess(text))
            model_features = sparse2full(model[bow], model.__out_size)
            label = np.array([is_acq])
            # ln.debug("%s, %s " % (model_features, label.shape))
            # partial_fit expects a 2-D feature matrix, hence the wrapping list
            classifier.partial_fit([model_features], label,
                                   classes=np.array([True, False]))
            if sample_no % 500 == 0:
                ln.debug("Classifier for %s trained %s samples so far." % (modelname, sample_no))
        classifiers[modelname] = classifier
        ln.info("Finished training classifier for %s" % modelname)
    return classifiers
def partial_2():
    perceptron_2 = Perceptron()
    stop_2 = 0
    counter_2 = 0
    while stop_2 == 0:
        for x in range(1000):
            perceptron_2.partial_fit([data_2[x]], [labels_2[x]],
                                     classes=np.unique(labels_2))
            if perceptron_2.score(data_2, labels_2) == 1:
                counter_2 += 1
                stop_2 += 1
                break
            else:
                counter_2 += 1
    weights = perceptron_2.coef_
    w1 = weights[0][0]
    w2 = weights[0][1]
    w0 = perceptron_2.intercept_[0]
    print("Weights were adjusted {} times".format(counter_2))
    print("Intercept (w0) is {}".format(w0))
    print("Final weight vector is: {} Hence:".format(weights))
    print("w1 is: {}, w2 is: {}".format(w1, w2))
    # Deriving the line based on HW Problem 1.2
    a = -1 * (w1 / w2)
    b = -1 * (w0 / w2)
    print("The equation of the decision boundary line is y = ({})x + ({})".format(a, b))
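# A minimal standalone sketch (not part of the snippets above) verifying the
# boundary derivation used by partial_2/partial_3: points on the decision
# boundary satisfy w1*x + w2*y + w0 = 0, hence y = -(w1/w2)*x - (w0/w2).
# The toy data below is an assumption for illustration only.
import numpy as np
from sklearn.linear_model import Perceptron

X = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
y = np.array([0, 0, 1, 1])  # linearly separable toy labels

clf = Perceptron().fit(X, y)
w1, w2 = clf.coef_[0]
w0 = clf.intercept_[0]
if w2 != 0:
    a, b = -w1 / w2, -w0 / w2
    print("decision boundary: y = ({})x + ({})".format(a, b))
else:
    # w2 can be exactly 0 when the second feature carries no signal
    print("boundary is vertical: x = {}".format(-w0 / w1))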
def test_basic(self, single_chunk_classification):
    X, y = single_chunk_classification
    a = PartialPerceptron(classes=[0, 1], max_iter=1000, tol=1e-3)
    b = Perceptron(max_iter=1000, tol=1e-3)
    a.fit(X, y)
    b.partial_fit(X, y, classes=[0, 1])
    assert_estimator_equal(a.coef_, b.coef_)
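# Context sketch (an assumption about why the test above can compare the two):
# a single call to sklearn's Perceptron.partial_fit performs exactly one epoch
# over the given data, whereas fit() loops for up to max_iter epochs with a
# stopping criterion, so one partial_fit call generally differs from fit().
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import Perceptron

X, y = make_classification(n_samples=200, random_state=0)
one_epoch = Perceptron(shuffle=False).partial_fit(X, y, classes=[0, 1])
full_fit = Perceptron(max_iter=1000, tol=1e-3, shuffle=False).fit(X, y)
print(one_epoch.score(X, y), full_fit.score(X, y))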
def train(self, parsed_sentences, path, **kwargs):
    all_classes = [
        'O', 'B-per', 'I-per', 'B-gpe', 'I-gpe', 'B-geo', 'I-geo',
        'B-org', 'I-org', 'B-tim', 'I-tim', 'B-art', 'I-art',
        'B-eve', 'I-eve', 'B-nat', 'I-nat'
    ]
    X, y = self.get_minibatch(parsed_sentences, kwargs.get('batch_size', 500))
    vectorizer = DictVectorizer(sparse=False)
    vectorizer.fit(X)
    # n_iter was removed in scikit-learn 0.21; max_iter is the modern equivalent
    clf = Perceptron(verbose=10, n_jobs=-1, max_iter=kwargs.get('n_iter', 5))
    while len(X):
        X = vectorizer.transform(X)
        clf.partial_fit(X, y, all_classes)
        X, y = self.get_minibatch(parsed_sentences, kwargs.get('batch_size', 500))
    clf = Pipeline([('vectorizer', vectorizer), ('classifier', clf)])
    with open(path, 'wb') as model_pkl:
        pickle.dump(clf, model_pkl)
    self._classifier = clf
class CalibratedPerceptron(BaseSKMObject, ClassifierMixin):
    """ Calibrated Perceptron classifier """

    def __init__(self, nominal_attributes=None):
        super().__init__()
        self.perceptron = Perceptron()
        self.cc = None

    def fit(self, X, y, sample_weight=None):
        self.perceptron.fit(X, y)
        if self.cc is None:
            self.cc = CalibratedClassifierCV(self.perceptron, cv='prefit', method='isotonic')
        self.cc.fit(X, y, sample_weight=sample_weight)

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        # print(y)
        self.perceptron.partial_fit(X, y, classes=classes, sample_weight=sample_weight)
        if self.cc is None:
            self.cc = CalibratedClassifierCV(self.perceptron, cv='prefit', method='sigmoid')
        self.cc.fit(X, y, sample_weight=sample_weight)

    def predict(self, X):
        return self.perceptron.predict(X)

    def predict_proba(self, X):
        return self.cc.predict_proba(X)
def training_per(X_train, y_train, X_test, classes, dataset):
    per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
    per.partial_fit(X_train, y_train.values.ravel(), classes)
    y_pred = per.predict(X_test)
    model_filename = os.getcwd() + '/models/' + dataset + '/per_model.pkl'
    with open(model_filename, 'wb') as file_model:
        pickle.dump(per, file_model)
    return y_pred
class DrunkLearningOnline(DrunkLearningBatch):
    """drunk_learning class for online learning"""

    def __init__(self):
        super(DrunkLearningOnline, self).__init__()
        self.clf = Perceptron()
        self.filename = 'modelPerceptron.pkl'

    def partial_fit(self, X, y):
        X = np.array([X])
        y = np.array(y)
        self.clf.partial_fit(X, y, [0, 1])
        joblib.dump(self.clf, self.filename, compress=9)
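# Hypothetical usage of DrunkLearningOnline above (assumes the class and its
# base are importable; the feature values are made up): each call performs one
# online update on a single example and checkpoints the model via joblib.
model = DrunkLearningOnline()
model.partial_fit([0.3, 0.7, 1.2], [1])  # X: one feature vector, y: one label
model.partial_fit([0.1, 0.2, 0.4], [0])
print(model.clf.predict([[0.3, 0.7, 1.2]]))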
def train(cls, parsed_sentences, feature_detector, all_classes, **kwargs):
    X, y = cls.get_minibatch(parsed_sentences, feature_detector,
                             kwargs.get('batch_size', 500))
    vectorizer = DictVectorizer(sparse=False)
    vectorizer.fit(X)
    # n_iter was removed in scikit-learn 0.21; max_iter is the modern equivalent
    clf = Perceptron(verbose=10, n_jobs=-1, max_iter=kwargs.get('n_iter', 5))
    while len(X):
        X = vectorizer.transform(X)
        clf.partial_fit(X, y, all_classes)
        X, y = cls.get_minibatch(parsed_sentences, feature_detector,
                                 kwargs.get('batch_size', 500))
    clf = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', clf)
    ])
    return cls(clf, feature_detector)
def train_and_test_with_perceptron(X_train, y_train, X_test, y_test):
    classes = np.unique(y_train).tolist()
    print(classes)
    per = Perceptron(verbose=10, n_jobs=-1)
    per.partial_fit(X_train, y_train, classes=classes)
    # Report on every label except the majority 'O' tag
    new_class = list(set(classes) - set(['O']))
    print(new_class)
    print(classification_report(y_pred=per.predict(X_test),
                                y_true=y_test,
                                labels=new_class))
def incremental_train_scikit_classifier(sentences, feature_detector,
                                        batch_size, max_iterations):
    initial_corpus_iterator, sentences = itertools.tee(sentences)
    # Compute all labels
    ALL_LABELS = set()
    for sentence in initial_corpus_iterator:
        for w, t in sentence:
            ALL_LABELS.add(t)
    ALL_LABELS = list(ALL_LABELS)
    batch = list(itertools.islice(sentences, batch_size))
    dataset = feature_detector(batch)
    # Split the dataset into featuresets and the predicted labels
    featuresets, labels = zip(*dataset)
    # This vectorizer doesn't need to be fitted
    vectorizer = FeatureHasher(n_features=1000000)
    classifier = Perceptron(tol=0.00001, max_iter=25, n_jobs=-1)
    for _ in range(max_iterations):
        current_corpus_iterator, sentences = itertools.tee(sentences)
        batch_count = 0
        while True:
            batch_count += 1
            print("Training on batch={0}".format(batch_count))
            classifier.partial_fit(vectorizer.transform(featuresets),
                                   labels, ALL_LABELS)
            batch = list(itertools.islice(current_corpus_iterator, batch_size))
            if not batch:
                break
            dataset = feature_detector(batch)
            featuresets, labels = zip(*dataset)
    scikit_classifier = ScikitClassifier(classifier=classifier,
                                         vectorizer=vectorizer)
    return scikit_classifier
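# A minimal sketch of the itertools.tee pattern used above, with toy data:
# tee() forks a one-shot iterator so each epoch can re-read the stream.
# Note that tee buffers consumed items internally, so this trades memory
# for re-iterability.
import itertools

sentences = iter(range(7))
for epoch in range(2):
    epoch_iter, sentences = itertools.tee(sentences)
    while True:
        batch = list(itertools.islice(epoch_iter, 3))
        if not batch:
            break
        print("epoch", epoch, "batch", batch)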
def percey_demo(iris, column, epochs):
    def class_to_targets(target):
        if target == 'Iris-' + iris:
            return 1
        else:
            return 0

    percey = Perceptron()
    print("Training a Perceptron to classify Iris-" + iris +
          " using " + column + " width and " + column + " length.")
    iris_inputs = iris_data[[column + ' width', column + ' length']]
    iris_targets = iris_data['class'].apply(class_to_targets)
    xmin = min(iris_inputs[iris_inputs.columns[0]])
    xmax = max(iris_inputs[iris_inputs.columns[0]])
    xnums = np.arange(xmin, xmax, (xmax - xmin) / 100)
    for x in range(epochs):
        # print(np.unique(iris_targets))
        percey.partial_fit(iris_inputs, iris_targets,
                           classes=np.unique(iris_targets))
        weights = percey.coef_[0]
        threshold = percey.intercept_
        # print(threshold)
        # print(weights)

        def makeline(xval):
            return (-threshold - weights[0] * xval) / weights[1]

        plt.scatter(iris_data[column + " width"],
                    iris_data[column + " length"],
                    c=iris_data['class'].apply(iris_to_color))
        plt.plot(xnums, makeline(xnums), c="orange")
        plt.xlabel(column + " width")
        plt.ylabel(column + " length")
        plt.axis(ymin=min(iris_inputs[iris_inputs.columns[1]]) - 0.5,
                 ymax=max(iris_inputs[iris_inputs.columns[1]]) + 0.5)
        plt.title("Training Perceptron to identify Iris-" + iris +
                  " on epoch=" + str(x) +
                  "\nRed = Iris-setosa, Blue = Iris-versicolor, Green = Iris-virginica")
        plt.show()
def trainPerceptron():
    pathTrain = './corpus_train/train_set/'
    pathTest = './corpus_train/test_set/'
    trainDocs = os.listdir(pathTrain)
    testDocs = os.listdir(pathTest)
    classifier = Perceptron()
    for i in range(0, len(os.listdir(pathTrain))):
        trainDocString = getString(pathTrain + trainDocs[i])
        testDocString = getString(pathTest + testDocs[i])
        trainSentences = getSentences(trainDocString)
        testSentences = getSentences(testDocString)
        docFeatures = getFeatures(trainDocString, trainSentences)
        docTargets = getTargets(trainSentences, testSentences)
        classifier.partial_fit(docFeatures, docTargets, classes=[0, 1])
    return classifier
class ClassifierBolt(Bolt):
    outputs = ['prediction', 'actual', 'id', 'training_count']

    def initialize(self, config, context):
        self.config = config["sgd_config"].copy()
        if self.config['model'] == 'SGD':
            self.clf = SGDClassifier(loss=self.config['loss'],
                                     penalty=self.config['penalty'])
        elif self.config['model'] == 'MLP':
            self.clf = MLPClassifier(
                hidden_layer_sizes=self.config['hidden_layer_sizes'])
        elif self.config['model'] == 'PassiveAggressive':
            self.clf = PassiveAggressiveClassifier()
        elif self.config['model'] == 'Perceptron':
            self.clf = Perceptron(penalty=self.config['penalty'])
        self.trained_count = 0
        self.pure_training_size = config["benchmark_config"]["pure_training_size"]
        self.results = []

    def process(self, tup):
        id, image_data, classification = tup.values
        x = [image_data]
        if self.trained_count >= self.pure_training_size:
            prediction = self.clf.predict(x)[0]
            self.results.append(prediction == classification)
            self.log(
                "{} prediction {} result: {} (predicted: {}, actual: {}) accuracy: {}%, last 32: {}%"
                .format(
                    self.config, len(self.results),
                    prediction == classification, prediction, classification,
                    100 * sum(self.results) // len(self.results),
                    100 * sum(self.results[-32:]) // len(self.results[-32:])))
            self.emit([prediction, classification, id, self.trained_count])
        y = [classification]
        self.clf.partial_fit(x, y, classes=['Active', 'Rest'])
        self.trained_count += 1
        self.log("trained {}".format(self.trained_count))
def fit_perceptron(p: Perceptron, X, Y, times, error=0.1):
    for time in range(1, times + 1):
        p = p.partial_fit(X, Y, classes=np.unique(Y))
        Y_hat = p.predict(X)
        correct = np.equal(Y, Y_hat)
        vals, count = np.unique(correct, return_counts=True)
        # vals is sorted, so False (if present) comes first
        index = 0 if not vals[0] else 1
        if len(vals) == 1:
            count = np.append(count, 0)
        false_num = count[index]
        err = false_num / len(correct)
        if err < error:
            return p, time
    return p, times
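# The np.unique bookkeeping in fit_perceptron computes the misclassification
# rate; an equivalent one-liner (a sketch on made-up toy data, not from the
# original) is np.mean(Y != Y_hat):
import numpy as np
from sklearn.linear_model import Perceptron

X = np.array([[0.0], [1.0], [2.0], [3.0]])
Y = np.array([0, 0, 1, 1])
p = Perceptron().partial_fit(X, Y, classes=np.unique(Y))
err = np.mean(Y != p.predict(X))  # same value as false_num / len(correct)
print("error rate:", err)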
class PerceptronClassifier(object):
    def __init__(self, classes):
        self.classes = classes
        self.model = Perceptron()
        self.w = None

    def predict(self, X):
        X = X.reshape(1, -1)
        try:
            return self.model.predict(X)[0]
        except Exception:
            # Model has not been fitted yet; fall back to the first class
            return self.classes[0]

    def partial_fit(self, X, y, sample_weight=1.0):
        X = X.reshape(1, -1)
        y = y.reshape(1, -1)
        return self.model.partial_fit(X, y,
                                      sample_weight=[sample_weight],
                                      classes=self.classes)
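# Hypothetical usage of PerceptronClassifier above: the wrapper reshapes one
# flat instance into the (1, n_features) matrix sklearn expects, and falls
# back to the first class when predict is called before any training.
import numpy as np

clf = PerceptronClassifier(classes=[0, 1])
x = np.array([0.5, -1.0, 2.0])
print(clf.predict(x))  # -> 0, the untrained fallback
clf.partial_fit(x, np.array([1]), sample_weight=1.0)
print(clf.predict(x))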
class PerceptronMask(BaseSKMObject, ClassifierMixin):
    """ Mask for sklearn.linear_model.Perceptron.

    scikit-multiflow requires a few interfaces not present in scikit-learn,
    so this mask serves as a wrapper for the Perceptron classifier.
    """
    def __init__(self, penalty=None, alpha=0.0001, fit_intercept=True,
                 max_iter=None, tol=None, shuffle=True, verbose=0, eta0=1.0,
                 n_jobs=None, random_state=0, early_stopping=False,
                 validation_fraction=0.1, n_iter_no_change=5,
                 class_weight=None, warm_start=False, n_iter=None):
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.max_iter = max_iter
        self.tol = tol
        self.shuffle = shuffle
        self.verbose = verbose
        self.eta0 = eta0
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.early_stopping = early_stopping
        self.validation_fraction = validation_fraction
        self.n_iter_no_change = n_iter_no_change
        self.class_weight = class_weight
        self.warm_start = warm_start
        self.n_iter = n_iter
        super().__init__()
        self.classifier = Perceptron(penalty=self.penalty,
                                     alpha=self.alpha,
                                     fit_intercept=self.fit_intercept,
                                     max_iter=self.max_iter,
                                     tol=self.tol,
                                     shuffle=self.shuffle,
                                     verbose=self.verbose,
                                     eta0=self.eta0,
                                     n_jobs=self.n_jobs,
                                     random_state=self.random_state,
                                     early_stopping=self.early_stopping,
                                     validation_fraction=self.validation_fraction,
                                     n_iter_no_change=self.n_iter_no_change,
                                     class_weight=self.class_weight,
                                     warm_start=self.warm_start)

    def fit(self, X, y, classes=None, sample_weight=None):
        """ Calls the Perceptron fit function from sklearn.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The features matrix.

        y: Array-like
            The class labels for all samples in X.

        classes: Not used.

        sample_weight:
            Sample weights. If not provided, uniform weights are assumed.

        Returns
        -------
        PerceptronMask
            self
        """
        self.classifier.fit(X=X, y=y, sample_weight=sample_weight)
        return self

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Calls the Perceptron partial_fit from sklearn.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The features matrix.

        y: Array-like
            The class labels for all samples in X.

        classes:
            Array with all possible/known class labels. Required on the
            first call to partial_fit.

        sample_weight:
            Sample weights. If not provided, uniform weights are assumed.

        Returns
        -------
        PerceptronMask
            self
        """
        self.classifier.partial_fit(X=X, y=y, classes=classes,
                                    sample_weight=sample_weight)
        return self

    def predict(self, X):
        """ Uses the current model to predict samples in X.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The features matrix.

        Returns
        -------
        numpy.ndarray
            The predicted labels for all instances in X.
        """
        return np.asarray(self.classifier.predict(X))

    def predict_proba(self, X):
        """ Predicts the probability of each sample belonging to each of the known classes.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            A matrix of the samples to predict.

        Returns
        -------
        numpy.ndarray
            An array of shape (n_samples, n_classes); entry [i, j] is the
            probability that the i-th sample of X belongs to the j-th class.
        """
        return self.classifier._predict_proba_lr(X)
class PerceptronMask(BaseSKMObject, ClassifierMixin):
    """ Mask for sklearn.linear_model.Perceptron.

    scikit-multiflow requires a few interfaces not present in scikit-learn,
    so this mask serves as a wrapper for the Perceptron classifier.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.neural_networks import PerceptronMask
    >>> from skmultiflow.data import SEAGenerator
    >>>
    >>> # Setup a data stream
    >>> stream = SEAGenerator(random_state=1)
    >>>
    >>> # Setup the Perceptron Mask
    >>> perceptron = PerceptronMask()
    >>>
    >>> n_samples = 0
    >>> correct_cnt = 0
    >>> while n_samples < 5000 and stream.has_more_samples():
    >>>     X, y = stream.next_sample()
    >>>     my_pred = perceptron.predict(X)
    >>>     if y[0] == my_pred[0]:
    >>>         correct_cnt += 1
    >>>     perceptron.partial_fit(X, y, classes=stream.target_values)
    >>>     n_samples += 1
    >>>
    >>> # Display the results
    >>> print('Perceptron Mask usage example')
    >>> print('{} samples analyzed'.format(n_samples))
    >>> print("Perceptron's performance: {}".format(correct_cnt / n_samples))
    """
    def __init__(self, penalty=None, alpha=0.0001, fit_intercept=True,
                 max_iter=1000, tol=0.001, shuffle=True, verbose=0, eta0=1.0,
                 n_jobs=None, random_state=0, early_stopping=False,
                 validation_fraction=0.1, n_iter_no_change=5,
                 class_weight=None, warm_start=False):
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.max_iter = max_iter
        self.tol = tol
        self.shuffle = shuffle
        self.verbose = verbose
        self.eta0 = eta0
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.early_stopping = early_stopping
        self.validation_fraction = validation_fraction
        self.n_iter_no_change = n_iter_no_change
        self.class_weight = class_weight
        self.warm_start = warm_start
        super().__init__()
        self.classifier = Perceptron(penalty=self.penalty,
                                     alpha=self.alpha,
                                     fit_intercept=self.fit_intercept,
                                     max_iter=self.max_iter,
                                     tol=self.tol,
                                     shuffle=self.shuffle,
                                     verbose=self.verbose,
                                     eta0=self.eta0,
                                     n_jobs=self.n_jobs,
                                     random_state=self.random_state,
                                     early_stopping=self.early_stopping,
                                     validation_fraction=self.validation_fraction,
                                     n_iter_no_change=self.n_iter_no_change,
                                     class_weight=self.class_weight,
                                     warm_start=self.warm_start)

    def fit(self, X, y, classes=None, sample_weight=None):
        """ Calls the Perceptron fit function from sklearn.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The features matrix.

        y: Array-like
            The class labels for all samples in X.

        classes: Not used.

        sample_weight:
            Sample weights. If not provided, uniform weights are assumed.

        Returns
        -------
        PerceptronMask
            self
        """
        self.classifier.fit(X=X, y=y, sample_weight=sample_weight)
        return self

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Calls the Perceptron partial_fit from sklearn.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The features matrix.

        y: Array-like
            The class labels for all samples in X.

        classes:
            Array with all possible/known class labels. Required on the
            first call to partial_fit.

        sample_weight:
            Sample weights. If not provided, uniform weights are assumed.

        Returns
        -------
        PerceptronMask
            self
        """
        self.classifier.partial_fit(X=X, y=y, classes=classes,
                                    sample_weight=sample_weight)
        return self

    def predict(self, X):
        """ Uses the current model to predict samples in X.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The features matrix.

        Returns
        -------
        numpy.ndarray
            The predicted labels for all instances in X.
        """
        return np.asarray(self.classifier.predict(X))

    def predict_proba(self, X):
        """ Predicts the probability of each sample belonging to each of the known classes.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            A matrix of the samples to predict.

        Returns
        -------
        numpy.ndarray
            An array of shape (n_samples, n_classes); entry [i, j] is the
            probability that the i-th sample of X belongs to the j-th class.
        """
        return self.classifier._predict_proba_lr(X)
array = clones_test.values
X_test = array[:, 3:30]
Y_test = array[:, 2]
print("test loaded")

chunkSize = 1024
# clf = SGDClassifier()
# clf = PassiveAggressiveClassifier()
clf = Perceptron()
for chunk in pd.read_csv(path_train, names=colNames, chunksize=chunkSize):
    chunk = chunk.sample(frac=1).reset_index(drop=True)  # shuffle data
    array = chunk.values
    X_train = array[:, 3:30]
    Y_train = array[:, 2]
    start_time = time.time()
    model = clf.partial_fit(X_train, Y_train,
                            classes=numpy.unique(Y_train.astype(bool)))
    end_time = time.time()
    print("one chunk complete")

filename = 'sgd_model.sav'
pickle.dump(clf, open(filename, 'wb'))
print("model saved")

# load the model from disk
start_time = time.time()
loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test.astype(bool))
# print(result)
for chunk in pd.read_csv(path_test, names=colNames, chunksize=chunkSize):
    print("chunk read complete")
    array = chunk.values
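# Note on the training loop above: sklearn requires `classes` to be identical
# across partial_fit calls, so deriving it from each chunk can raise a
# ValueError whenever a chunk happens to contain only one label. A safer
# pattern (a sketch reusing the snippet's names) fixes the class set up front:
all_classes = numpy.array([False, True])
for chunk in pd.read_csv(path_train, names=colNames, chunksize=chunkSize):
    array = chunk.values
    clf.partial_fit(array[:, 3:30], array[:, 2].astype(bool),
                    classes=all_classes)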
# For looping through chunks of data, set step size
step_size = 1000
percept = Perceptron(n_jobs=-1)
prev = 0
nxt = step_size
X_train = features_to_train[prev:nxt, :]
Y_train = targets_to_train[prev:nxt]
print(len(X_train))
print(len(Y_train))
# Pass the full label set on the first call to partial_fit
percept.partial_fit(X_train, Y_train, classes=np.unique(targets_to_train))
prev += step_size
nxt += step_size
for i in range(len(features_to_train) // step_size - 1):
    X_train = features_to_train[prev:nxt, :]
    Y_train = targets_to_train[prev:nxt]
    percept.partial_fit(X_train, Y_train)
    predicted_targets = percept.predict(features_to_test)
    prev += step_size
    nxt += step_size
# return X
fh = FeatureHasher(n_features=2**20, input_type="string", non_negative=True)
# ohe = OneHotEncoder(categorical_features=columns)

# Train classifier
clf = Perceptron()
train = pd.read_csv("testtrain.csv", chunksize=50000, iterator=True)
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(pd.DataFrame([dayhour(x) for x in chunk.hour],
                                    columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace=True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    clf.partial_fit(Xcat, y_train, classes=all_classes)

# Create a submission file
usecols = cols + ["id"]
X_test = pd.read_csv("testtest.csv", usecols=usecols)
X_test = X_test.join(pd.DataFrame([dayhour(x) for x in X_test.hour],
                                  columns=["wd", "hr"]))
X_test.drop(["hour"], axis=1, inplace=True)
X_enc_test = fh.transform(np.asarray(X_test.astype(str)))
y_act = pd.read_csv("testtest.csv", usecols=['click'])
y_pred = clf.predict(X_enc_test)
with open('logloss.txt', 'a') as f:
    f.write('\n' + str(log_loss(y_act, y_pred)) + '\tPerceptron')
model = train_averaged_perceptron(y_train, X_train, y_vali, X_vali, num_iter=1000)
print("AP. Train-Accuracy: {:.3}".format(model.score(X_train, y_train)))
print("AP. Vali-Accuracy: {:.3}".format(model.score(X_vali, y_vali)))

# Note that Sci-Kit Learn's Perceptron uses an alternative method of training.
# Is it an averaged perceptron or a regular perceptron?
skP = Perceptron()
print("Train sklearn-Perceptron (skP)")
for iter in range(1000):
    # Note we use partial_fit rather than fit to expose the loop to our code!
    skP.partial_fit(X_train, y_train, classes=(0, 1))
    learning_curves["skPerceptron"].add_sample(skP, X_train, y_train, X_vali, y_vali)
print("skP. Train-Accuracy: {:.3}".format(skP.score(X_train, y_train)))
print("skP. Vali-Accuracy: {:.3}".format(skP.score(X_vali, y_vali)))

## TODO Exploration 1: use a loop around partial-fit to generate another graph!
#
## TODO Exploration 1A: Try a MLP (Multi-Layer Perceptron).
mlp = MLPClassifier(hidden_layer_sizes=(32,))
print("Train MLPClassifier (mlp)")
for iter in range(1000):
    # Note we use partial_fit rather than fit to expose the loop to our code!
    mlp.partial_fit(X_train, y_train, classes=(0, 1))
    learning_curves["MLPClassifier"].add_sample(mlp, X_train, y_train, X_vali, y_vali)
labels = dataset[:, 2].reshape((numSamples,))
classif = Perceptron()
print('Fitting a', type(classif).__name__, 'model to the dataset')
# We have to reshape the data here because partial_fit expects a 2D array
# for the X input and an array for the Y input
reshapedData = data.reshape(1000, 1, 2)
reshapedLabel = labels.reshape(1000, 1)
y = reshapedLabel
y_index, x_index, callsToPf, totalLoops, errorRate = 0, 0, 0, 0, 1
while errorRate != 0:
    errorRate = 0
    for x in reshapedData:
        classif.partial_fit(x, y[y_index], classes=[-1, 1])
        callsToPf += 1
        preds = classif.predict(data)
        errorRate = metrics.zero_one_loss(y, preds)
        print(errorRate)
        y_index += 1
        if errorRate == 0:
            break
    totalLoops += 1
    y_index = 0
print("It took", callsToPf, "weight updates to get the errorRate to 0")
print("Final weight vector is", classif.coef_)
# x1 and x2 are the weights
x1 = classif.coef_[0][0]
# print(x_df.head())
vectorizer = DictVectorizer(sparse=False)
x = vectorizer.fit_transform(x_df.to_dict("records"))
# print(x.shape)

# The output class
y = dframe.tag.values
all_classes = np.unique(y)
# print(all_classes.shape)
# print(y.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=0)
print(x_train.shape)
print(y_train.shape)

# n_iter was removed in scikit-learn 0.21; max_iter is the modern equivalent
clf = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
all_classes = list(set(y))
clf.partial_fit(x_train, y_train, all_classes)
joblib.dump(clf, 'clf.model')
print("Done")
clf = joblib.load('clf.model')
print(f1_score(clf.predict(x_test), y_test, average="micro"))
class StreamingLearner(BaseListener):
    """
    Trains a Perceptron classifier on a stream of data (updating with every
    sample) using feature hashing, since the vocabulary cannot be known in
    advance. In this example only English tweets containing a happy :) or
    sad :( emoticon, which serves as the sentiment annotation for the
    message, are used as training and testing data. Every 5th tweet is used
    for evaluation of the model.
    """
    def __init__(self, zmq_sub_string, channel):
        self.classes = ["pos", "neg"]
        self.re_emoticons = re.compile(r":\)|:\(")
        self.vec = HashingVectorizer(n_features=2 ** 20, non_negative=True)
        self.clf = Perceptron()
        self.count = {
            "train": {"pos": 0, "neg": 0},
            "test": {"pos": 0, "neg": 0},
        }
        self.train = 1
        self.eval_count = {
            "pos": {"tp": 0, "fp": 0, "fn": 0},
            "neg": {"tp": 0, "fp": 0, "fn": 0},
        }
        super(StreamingLearner, self).__init__(zmq_sub_string, channel)

    def on_msg(self, tweet):
        print_tick()
        if tweet.get("lang") != "en":
            return  # skip non-English tweets
        emoticons = self.re_emoticons.findall(tweet["text"])
        if not emoticons:
            return  # skip tweets without emoticons
        text = self.re_emoticons.sub("", tweet["text"].replace("\n", ""))
        X = self.vec.transform([text])
        # label for message
        last_emoticon = emoticons[-1]
        if last_emoticon == ":)":
            label = "pos"
        elif last_emoticon == ":(":
            label = "neg"
        y = np.asarray([label])
        if not self.train:
            # use every 5th message for evaluation
            print("")
            print("TEST %s |" % label, text)
            self.count["test"][label] += 1
            y_pred = self.clf.predict(X)
            pred_label, gold_label = y_pred[0], label
            print("PRED: ", pred_label)
            if pred_label == gold_label:
                self.eval_count[gold_label]["tp"] += 1
            else:
                self.eval_count[pred_label]["fp"] += 1
                self.eval_count[gold_label]["fn"] += 1
            pos_acc = (
                self.eval_count["pos"]["tp"] / self.count["test"]["pos"]
            ) if self.count["test"]["pos"] else 0
            neg_acc = (
                self.eval_count["neg"]["tp"] / self.count["test"]["neg"]
            ) if self.count["test"]["neg"] else 0
            print("*** CLF TESTED ON: %s :) samples (Acc %.3f),"
                  " %s :( samples (Acc %.3f)" %
                  (self.count["test"]["pos"], pos_acc,
                   self.count["test"]["neg"], neg_acc))
            print(json.dumps(self.eval_count, indent=2))
            print()
        else:
            self.count["train"][label] += 1
            # set higher sample weight for the underrepresented class
            tc = self.count["train"]
            if label == "pos":
                sample_weight = min(3, max(1, tc["neg"] - tc["pos"]))
            elif label == "neg":
                sample_weight = min(3, max(1, tc["pos"] - tc["neg"]))
            else:
                sample_weight = 0
            print("\nTRAIN %s (weight %s) |" % (label, sample_weight), text)
            print(">>> CLF TRAINED ON: %s :) samples, %s :( samples" % (
                self.count["train"]["pos"], self.count["train"]["neg"]))
            self.clf.partial_fit(X, y, self.classes, [sample_weight])
            self.train += 1
        # use every 5th message for evaluation
        if not self.train % 5:
            self.train = 0
# nbrtest = int(X.shape[0]*0.33)
# nbrtrain = X.shape[0]-nbrtest
# X_train = X[:nbrtrain]
# X = np.delete(X, range(nbrtrain), 0)
# X_test = X
# del(X)
# y_train = y[:nbrtrain]
# y = np.delete(y, range(nbrtrain))
# y_test = y
# del(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=0)
# print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)
new_classes = classes.copy()
print(new_classes.pop())
print(new_classes)
print(classification_report(y_pred=per.predict(X_test),
                            y_true=y_test,
                            labels=new_classes))
# Randomly initialize the current hand (an integer from 0 to 2)
j = np.random.randint(0, 3)

# Convert the past hands (input data) into an array for scikit-learn
Jprev_set = np.array([Jprev])
# Convert the current hand (target) into an array for scikit-learn
jnow_set = np.array([j])

# Define a three-layer neural network
# clf_janken = MLPClassifier(hidden_layer_sizes=(200, ), random_state=None)
# Define a simple perceptron
clf_janken = Perceptron(random_state=None)

# Run one round of online learning on the random input.
# On the first fit, the classifier must be told all possible targets (0, 1, 2)
clf_janken.partial_fit(Jprev_set, jnow_set, classes=[0, 1, 2])

# Initialize the win/draw/loss counts
win = 0
draw = 0
lose = 0

# Flags for keeping track of state
appliStop = False
jankenLoop = False
recognizedHand = 0

# Check for a saved model file
if len(sys.argv) == 2:
    savefile = sys.argv[1]
    try:
ch_prev_set = np.array([ch_prev])

# Randomly initialize this round's hand (an integer from 0 to 2)
j = np.random.randint(0, 3)
# Convert this round's hand (target) into an array for scikit-learn
h_now_set = np.array([j])

# ==== 2. Run the machine learning step ====
# Define a simple perceptron
clf = Perceptron(random_state=None)
# Run one round of online learning on the random input.
# On the first fit, the classifier must be told all possible targets (0, 1, 2)
clf.partial_fit(ch_prev_set, h_now_set, classes=[0, 1, 2])

# ==== 3. Display and evaluate the results ====
def janken_ml(h_choice, result):
    global ch_prev, ch_prev_set, total
    h_choice -= 1
    if h_choice < 0 or h_choice > 2:
        flash("Please enter 0, 1, or 2")
    # Convert the past janken hands (vector form) to scikit-learn format
    ch_prev_set = np.array([ch_prev])
    # Convert this round's janken hand (an integer from 0 to 2) to scikit-learn format
    h_now_set = np.array([h_choice])
    # The computer predicts the human's current hand from the past hands
class Neuron:
    __classes = []
    __savefile_per = ""
    __per = None

    def __init__(self, save_folder, classes):
        """
        :param save_folder:
        :param classes:
        """
        self.__classes = classes
        self.__savefile_per = save_folder + "per.joblib"
        self.__per = Perceptron()

    def train(self, X, y):
        """
        :param X:
        :param y:
        :return:
        """
        self.__per.partial_fit(X, y, classes=self.__classes)
        self.__save_model()

    def predict(self, X):
        """
        :param X:
        :return:
        """
        if self.__per is None:
            self.__load_model()
        return self.__per.predict(X)

    def update(self, X, y):
        """
        :param X:
        :param y:
        :return:
        """
        self.train(X, y)  # If no longer using train(), add save_model()

    def __save_model(self):
        dump(self.__per, self.__savefile_per)

    def __load_model(self):
        if os.path.exists(self.__savefile_per):
            self.__per = load(self.__savefile_per)
        else:
            print("ERROR: Perceptron not initialized. Run train() first.")
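# Hypothetical usage of Neuron above (the save folder is assumed to exist,
# and dump/load are assumed to come from joblib, as the ".joblib" suffix
# suggests): train() does one partial_fit pass and checkpoints to disk.
import numpy as np

net = Neuron("/tmp/models/", classes=[0, 1])
net.train(np.array([[0.0, 1.0], [1.0, 0.0]]), np.array([0, 1]))
print(net.predict(np.array([[1.0, 0.0]])))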
class Parser(ParserI):
    @staticmethod
    def build_labels_dataset(parses, feature_extractor):
        """ Transform a list of parses to a labels dataset """
        labels_X, labels_y = [], []
        for gold_parse in parses:
            for child, head in enumerate(gold_parse.heads()[1:-1]):
                features = feature_extractor(gold_parse, head, child + 1)
                label = gold_parse.labels()[child + 1]
                labels_X.append(features)
                labels_y.append(label)
        return labels_X, labels_y

    @staticmethod
    def build_transition_dataset(parses, feature_extractor):
        """ Transform a list of parses to a transitions dataset """
        transitions_X, transitions_y = [], []
        for gold_parse in parses:
            # Init an empty parse
            dep_parse = DependencyParse(gold_parse.tagged_words()[1:-1])
            # Start from an empty state
            state = ParserState(dep_parse)
            while state.stack or (state.buffer_index + 1) < len(dep_parse):
                features = feature_extractor(state)
                gold_moves = state.next_gold(gold_parse)
                if not gold_moves:
                    # Something is wrong here ...
                    break
                # Pick one of the possible transitions
                t = random.choice(gold_moves)
                # Append the features and transition to the dataset
                transitions_X.append(features)
                transitions_y.append(t)
                # Apply the transition to the state
                state.apply(t)
        return transitions_X, transitions_y

    def __init__(self, feature_detector, label_feature_detector):
        self.feature_extractor = feature_detector
        self.label_feature_detector = label_feature_detector
        self._vectorizer = FeatureHasher()
        self._model = SGDClassifier(loss='modified_huber')
        self._label_vectorizer = FeatureHasher()
        self._label_model = Perceptron()

    def evaluate(self, parses):
        correct_heads, correct_labels, total = 0, 0, 0
        for parse in parses:
            predicted_parse = self.parse(parse.tagged_words()[1:-1])
            heads = np.array(parse.heads()[1:-1])
            predicted_heads = np.array(predicted_parse.heads()[1:-1])
            labels = np.array(parse.labels()[1:-1])
            # Relabel the gold parse with what our model would label
            self.label_parse(parse)
            predicted_labels = np.array(parse.labels()[1:-1])
            total += len(heads)
            correct_heads += np.sum(heads == predicted_heads)
            correct_labels += np.sum(labels == predicted_labels)
        return correct_heads / total, correct_labels / total

    def parse(self, sent, *args, **kwargs):
        """ Parse a tagged sentence """
        state = ParserState(DependencyParse(sent))
        while state.stack or (state.buffer_index + 1) < len(state.parse):
            # Extract the features of the current state
            features = self.feature_extractor(state)
            vectorized_features = self._vectorizer.transform([features])
            # Get probabilities for the next transitions
            predictions = self._model.predict_proba(vectorized_features)[0]
            scores = dict(zip(list(self._model.classes_), list(predictions)))
            # Check what moves are actually valid
            valid_moves = state.next_valid()
            # Get the most probable valid move
            guess = max(valid_moves, key=lambda move: scores[move])
            # Apply the transition to the state
            state.apply(guess)
        self.label_parse(state.parse)  # Add labels too ...
        return state.parse

    def label_parse(self, parse):
        """ Add labels to a dependency parse """
        label_features = []
        for child, head in enumerate(parse.heads()[1:-1]):
            features = self.label_feature_detector(parse, head, child + 1)
            label_features.append(features)
        vectorized_label_features = self._label_vectorizer.transform(label_features)
        predicted_labels = self._label_model.predict(vectorized_label_features)
        parse._labels = [None] + list(predicted_labels) + [None]
        return parse

    def train(self, corpus_iterator, n_iter=5, batch_size=100):
        """ Train a model on a given corpus """
        for _ in range(n_iter):
            # Fork the iterator
            corpus_iterator, parses = itertools.tee(corpus_iterator)
            batch_count = 0
            while True:
                batch_count += 1
                print("Training on batch={0}".format(batch_count))
                batch = list(itertools.islice(parses, batch_size))
                # No more batches
                if not batch:
                    break
                # Train the model on a batch
                self.train_batch(batch)

    def train_batch(self, gold_parses):
        """ Train the model on a single batch """
        t_X, t_Y = self.build_transition_dataset(gold_parses,
                                                 self.feature_extractor)
        self._model.partial_fit(self._vectorizer.transform(t_X), t_Y,
                                classes=Transitions.ALL)
        l_X, l_Y = self.build_labels_dataset(gold_parses,
                                             self.label_feature_detector)
        self._label_model.partial_fit(self._label_vectorizer.transform(l_X),
                                      l_Y, classes=DEPENDENCY_LABELS)
def train(self):
    model = os.path.abspath('1server/nlp/data/model3.joblib')
    if os.path.exists(model):
        model = load(model)
        # train_file = open(self.file_path)
        # lines = [line for line in train_file.read().split("\n")]
        #
        # train_file.close()
        # train_ = []
        # for row in lines:
        #     if parseEntity(row):
        #         train_.append(parseEntity(row))
        # score = model.evaluate(train_[:1500])
        # data_test = [conlltags2tree(iobs) for iobs in train_[1500:]]
        self._chunk = model
        return model
    else:
        # train_file = open(self.file_path)
        # lines = [line for line in train_file.read().split("\n")]
        #
        # train_file.close()
        # train_ = []
        # word_feature = []
        # for row in lines:
        #     if parseEntity(row):
        #         train_.append(parseEntity(row))
        # for sentence in train_:
        #     history = []
        #     untagged_sentence, tags = zip(*sentence)
        #     pprint(sentence)
        #     for index in range(len(sentence)):
        #         featureset = features(untagged_sentence, index, history)
        #         featureset['label'] = tags[index]
        #         word_feature.append((featureset, tags[index]))
        #         history.append(tags[index])
        # feature_key = [k for k, v in word_feature]
        # feature_key_unique = [i for i in feature_key[0]]
        # with open(os.path.abspath('server/nlp/data/list_data_features.csv'),
        #           'w', encoding="utf8", newline="") as csv_feature:
        #     writer = csv.writer(csv_feature)
        #     writer.writerow(feature_key_unique)
        #     for k in feature_key:
        #         writer.writerow(list(k.values()))
        pd = df.read_csv(
            os.path.abspath('server/nlp/data/list_data_features.csv'),
            encoding="ISO-8859-1",
            error_bad_lines=False)
        vectorizer = DictVectorizer(sparse=False)
        pd = pd[:2600]
        pd = pd.fillna(method='ffill')
        y = pd['label'].values
        x = pd.drop('label', axis=1)
        pd_dict = x.to_dict("records")
        print(pd.isnull().sum())
        # x = [vectorizer.fit_transform(i)[0].tolist() for i in pd_dict]
        # x = np.asarray(x)
        # pprint(x[0])
        # y = np.asarray(y)
        x = vectorizer.fit_transform(pd_dict)
        all_classes = np.unique(y)
        x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.2,
                                                            random_state=0)
        clf = Perceptron(verbose=10, n_jobs=-1, max_iter=1000)
        all_classes = list(set(y))
        clf.partial_fit(x_train, y_train, all_classes)
        new_classes = all_classes.copy()
        new_classes.remove('O')
        print(classification_report(y_pred=clf.predict(x_test),
                                    y_true=y_test,
                                    labels=new_classes))
        self._chunk = clf
        # csv_feature.close()
        # vectorizer = DictVectorizer()
        # feature_select = [k for k, v in word_feature[:5000]]
        # le = LabelEncoder()
        # for i in feature_select:
        #     for k in i:
        #         feature_select[i][k] =
        # data_x = vectorizer.fit_transform(feature_select).toarray()
        # data_y = [v for k, v in word_feature[:5000]]
        # all_classes = np.unique(data_y)
        # Y = np.asarray(data_y)
        # X = np.asarray(data_x)
        # clf = GaussianNB()
        # clf.fit(data_x, Y)
        # print(Y.shape)
        # data_x, Y = np.arange(len(all_classes)*2).reshape(
        #     (len(all_classes), 2)), range(len(all_classes))
        # X = np.asarray(data_x)
        # print(type(data_y))
        # x_train, x_test, y_train, y_test = train_test_split(
        #     data_x, Y, test_size=0.2, random_state=0)
        # print(data_x)
        # pprint(dict(word_feature))
        # x = vectorizer.fit_transform(dict(word_feature))
        # print(60*"=")
        # pprint(x)
        # self._chunk = NamedEntityChunker(train_)
        # score = self._chunk.evaluate(
        #     [conlltags2tree([(w, t, iob) for (w, t), iob in iobs]) for iobs in train_[1500:]])
        # dump(self._chunk, os.path.abspath('server/nlp/data/model3.joblib'))
        return self._chunk
Jprev[3 * i:3 * i + 3] = janken_array[j]

# Randomly initialize the current hand (an integer from 0 to 2)
j = np.random.randint(0, 3)

# Convert the past hands (input data) into an array for scikit-learn
Jprev_set = np.array([Jprev])
# Convert the current hand (target) into an array for scikit-learn
jnow_set = np.array([j])

# Define a simple perceptron
clf = Perceptron(random_state=None)

# Run one round of online learning on the random input.
# On the first fit, the classifier must be told all possible targets (0, 1, 2)
clf.partial_fit(Jprev_set, jnow_set, classes=[0, 1, 2])

# In the program rock, scissors, and paper map to 0, 1, 2, but keyboard
# input uses 1, 2, 3 for ease of typing
print('1: rock, 2: scissors, 3: paper')

# Initialize the match results
win = 0
draw = 0
lose = 0

try:
    while True:
        try:
            # Convert the entered number (1-3) to the range 0-2
            j = int(input()) - 1