def classifyData(self, algo=None, saveModel=False):
    """Train the classifier selected by *algo*, predict on the held-out
    test split, and persist the predictions via ``saveResults``.

    Parameters
    ----------
    algo : str
        One of SVM_SGD, NEURAL_NETWORK, RANDOM_FOREST, NAIVE_BAYES.
    saveModel : bool
        When True, dump the fitted classifier to ./model/ with joblib.

    Raises
    ------
    ValueError
        If *algo* does not match any known algorithm constant.
    """
    bench = Benchmark()
    classifier = None
    prediction = None
    if algo == SVM_SGD:
        classifier = SGDClassifier(n_jobs=-1,
                                   loss='hinge',
                                   penalty='l2',
                                   alpha=1e-5,
                                   n_iter=50,
                                   random_state=42)
        # classifier = self.doSVMwithGridSearch()
    elif algo == NEURAL_NETWORK:
        classifier = sknn.mlp.Classifier(
            layers=[
                # Sigmoid, Tanh, Rectifier, Softmax, Linear
                # sknn.mlp.Layer("Tanh", units=300),
                # BUG FIX: the original wrapped these in a generator
                # expression, which put a single generator *object* into the
                # layers list instead of two Layer instances. Spell out the
                # two hidden layers explicitly.
                sknn.mlp.Layer("Linear", units=300),
                sknn.mlp.Layer("Linear", units=300),
                sknn.mlp.Layer("Softmax"),
            ],
            learning_rate=NN_LEARNING_RATE,
            n_iter=10,
            learning_momentum=.9,
            debug=False,
            regularize=None,  # L1, L2, dropout, and batch normalization.
            learning_rule='sgd'  # sgd, momentum, nesterov, adadelta, adagrad, rmsprop, adam
        )
    elif algo == RANDOM_FOREST:
        classifier = RandomForestClassifier(
            n_estimators=NR_FOREST_ESTIMATORS, n_jobs=-1)
    elif algo == NAIVE_BAYES:
        classifier = MultinomialNB()
    else:
        # Fail fast with a clear message instead of hitting an
        # AttributeError on classifier.fit(...) with classifier == None.
        raise ValueError('Unknown algorithm: {}'.format(algo))
    classifier.fit(self.dataset.X_train, self.dataset.Y_train)
    bench.end('Training Data using: ' + algo)
    # save that training model
    if saveModel:
        joblib.dump(classifier,
                    './model/classifier_{}_{}'.format(algo, time.time()),
                    compress=9)
        bench.end('Dumping Classifier Data')
    prediction = classifier.predict(self.dataset.X_test)
    score = classifier.score(self.dataset.X_test, self.dataset.Y_test)
    bench.end('Predicting Data using: ' + algo)
    if algo == NEURAL_NETWORK:
        # sknn predicts column vectors; flatten to a plain label list.
        prediction = [x[0] for x in prediction]
    self.saveResults(prediction, algo, score=score)
def classifyData(self, algo=None, saveModel=False):
    """Train the classifier selected by *algo*, score it on the test split,
    and hand the predictions to ``saveResults``.

    Parameters
    ----------
    algo : str
        One of SVM_SGD, NEURAL_NETWORK, RANDOM_FOREST, NAIVE_BAYES.
    saveModel : bool
        When True, persist the fitted classifier under ./model/ via joblib.

    Raises
    ------
    ValueError
        If *algo* is not one of the known algorithm constants.
    """
    bench = Benchmark()
    classifier = None
    prediction = None
    if algo == SVM_SGD:
        classifier = SGDClassifier(n_jobs=-1,
                                   loss='hinge',
                                   penalty='l2',
                                   alpha=1e-5,
                                   n_iter=50,
                                   random_state=42)
        # classifier = self.doSVMwithGridSearch()
    elif algo == NEURAL_NETWORK:
        classifier = sknn.mlp.Classifier(
            layers=[
                # Sigmoid, Tanh, Rectifier, Softmax, Linear
                # sknn.mlp.Layer("Tanh", units=300),
                # BUG FIX: originally a generator expression sat inside this
                # list, so sknn received a generator object rather than the
                # intended two hidden layers. List them explicitly.
                sknn.mlp.Layer("Linear", units=300),
                sknn.mlp.Layer("Linear", units=300),
                sknn.mlp.Layer("Softmax"),
            ],
            learning_rate=NN_LEARNING_RATE,
            n_iter=10,
            learning_momentum=.9,
            debug=False,
            regularize=None,  # L1, L2, dropout, and batch normalization.
            learning_rule='sgd'  # sgd, momentum, nesterov, adadelta, adagrad, rmsprop, adam
        )
    elif algo == RANDOM_FOREST:
        classifier = RandomForestClassifier(
            n_estimators=NR_FOREST_ESTIMATORS, n_jobs=-1)
    elif algo == NAIVE_BAYES:
        classifier = MultinomialNB()
    else:
        # Raise a descriptive error instead of letting classifier stay None
        # and crashing on the fit() call below.
        raise ValueError('Unknown algorithm: {}'.format(algo))
    classifier.fit(self.dataset.X_train, self.dataset.Y_train)
    bench.end('Training Data using: ' + algo)
    # save that training model
    if saveModel:
        joblib.dump(classifier,
                    './model/classifier_{}_{}'.format(algo, time.time()),
                    compress=9)
        bench.end('Dumping Classifier Data')
    prediction = classifier.predict(self.dataset.X_test)
    score = classifier.score(self.dataset.X_test, self.dataset.Y_test)
    bench.end('Predicting Data using: ' + algo)
    if algo == NEURAL_NETWORK:
        # sknn predictions come back as column vectors; flatten them.
        prediction = [x[0] for x in prediction]
    self.saveResults(prediction, algo, score=score)
from TextAnalyzer import TextAnalyzer

__author__ = 'Raphael'

# Loss functions accepted by sklearn's SGDClassifier.
SGD_CLASSIFIER_LOSS_OPTIONS = (
    'hinge',
    'log',
    'modified_huber',
    'squared_hinge',
    'perceptron',
)

# REVIEW_DATA = './data/Season-1.csv'
REVIEW_DATA = './data/All-seasons.csv'

if __name__ == '__main__':
    # Benchmark each pipeline stage: load CSV, clean, split, tune params.
    bench = Benchmark()
    analyzer = TextAnalyzer(REVIEW_DATA)
    bench.end('Initializing')
    analyzer.createDataFrame(nameFilter=[
        'Kyle', 'Stan', 'Kenny', 'Cartman', 'Butters', 'Jimmy', 'Timmy'
    ])
    bench.end('Reading CSV')
    analyzer.cleanData()
    bench.end('Cleaning Data')
    analyzer.splitData()
    bench.end('Generating Test and Training Data')
    analyzer.determineBestParams()
    bench.end('Determining Best Results')
# Loss functions accepted by sklearn's SGDClassifier.
SGD_CLASSIFIER_LOSS_OPTIONS = (
    'hinge',
    'log',
    'modified_huber',
    'squared_hinge',
    'perceptron',
)

# REVIEW_DATA = './data/Season-1.csv'
REVIEW_DATA = './data/All-seasons.csv'

if __name__ == '__main__':
    # Drive the analysis pipeline end to end, timing every stage.
    bench = Benchmark()
    analyzer = TextAnalyzer(REVIEW_DATA)
    bench.end('Initializing')
    main_characters = [
        'Kyle', 'Stan', 'Kenny', 'Cartman', 'Butters', 'Jimmy', 'Timmy'
    ]
    analyzer.createDataFrame(nameFilter=main_characters)
    bench.end('Reading CSV')
    analyzer.cleanData()
    bench.end('Cleaning Data')
    analyzer.splitData()
    bench.end('Generating Test and Training Data')
    analyzer.determineBestParams()
    bench.end('Determining Best Results')
# NOTE(review): the first two statements are the tail of a method whose `def`
# lies outside this view — presumably the results-saving routine; confirm
# against the full file.
        # Write predictions to ./results/<classifier>.csv and log accuracy.
        output.to_csv("./results/{}.csv".format(classifierName), index=False)
        logger.info('Accuracy: {} %'.format(round(kwargs['score'] * 100, 3)))

    def optimizeParams(self):
        """Store the grid-search parameter space on self.params.

        Keys follow sklearn Pipeline step naming (vect -> tfidf -> clf).
        """
        self.params = {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__alpha': (1e-1, 1e-2, 1e-3),
        }


if __name__ == '__main__':
    # Benchmarked pipeline: load, clean, vectorize, tf-idf, split.
    bench = Benchmark()
    anal = TextAnalyzer(REVIEW_DATA)
    bench.end('Initializing')
    anal.createDataFrame(nameFilter=['Kyle', 'Stan', 'Kenny', 'Cartman',
                                     'Butters', 'Jimmy', 'Timmy'])
    bench.end('Reading CSV')
    anal.cleanData()  # Prepare data in a format that is good for scikitlearn
    bench.end('Cleaning Data')
    anal.vectorizeData(scheme='bagofwords')
    bench.end('Generating Bag of Words Representation')
    anal.genTfIdf()
    # bench.end('Generating TF-IDF Representation')
    anal.splitData()
# NOTE(review): the first two statements are the tail of a method whose `def`
# is outside this view — presumably the results-saving routine; confirm
# against the full file.
        # Write predictions to ./results/<classifier>.csv and log accuracy.
        output.to_csv("./results/{}.csv".format(classifierName), index=False)
        logger.info('Accuracy: {} %'.format(round(kwargs['score'] * 100, 3)))

    def optimizeParams(self):
        """Store the grid-search parameter space on self.params.

        Keys follow sklearn Pipeline step naming (vect -> tfidf -> clf).
        """
        self.params = {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__alpha': (1e-1, 1e-2, 1e-3),
        }


if __name__ == '__main__':
    # Benchmarked pipeline: load, clean, vectorize, tf-idf.
    bench = Benchmark()
    anal = TextAnalyzer(REVIEW_DATA)
    bench.end('Initializing')
    anal.createDataFrame(nameFilter=[
        'Kyle', 'Stan', 'Kenny', 'Cartman', 'Butters', 'Jimmy', 'Timmy'
    ])
    bench.end('Reading CSV')
    anal.cleanData()  # Prepare data in a format that is good for scikitlearn
    bench.end('Cleaning Data')
    anal.vectorizeData(scheme='bagofwords')
    bench.end('Generating Bag of Words Representation')
    anal.genTfIdf()
    # bench.end('Generating TF-IDF Representation')