Example #1
    def classifyData(self, algo=None, saveModel=False):
        """Train the selected classifier, optionally persist it, then score it and save the predictions."""
        bench = Benchmark()
        classifier = None
        prediction = None

        if algo == SVM_SGD:
            classifier = SGDClassifier(n_jobs=-1,
                                       loss='hinge',
                                       penalty='l2',
                                       alpha=1e-5,
                                       n_iter=50,
                                       random_state=42)
            # classifier = self.doSVMwithGridSearch()
        elif algo == NEURAL_NETWORK:
            classifier = sknn.mlp.Classifier(
                layers=[  # Sigmoid, Tanh, Rectifier, Softmax, Linear
                    # sknn.mlp.Layer("Tanh", units=300),
                    # two hidden Linear layers of 300 units each
                    sknn.mlp.Layer("Linear", units=300),
                    sknn.mlp.Layer("Linear", units=300),
                    sknn.mlp.Layer("Softmax"),
                ],
                learning_rate=NN_LEARNING_RATE,
                n_iter=10,
                learning_momentum=.9,
                debug=False,
                regularize=None,  # L1, L2, dropout, and batch normalization.
                learning_rule='sgd'  # sgd, momentum, nesterov, adadelta, adagrad, rmsprop, adam
            )

        elif algo == RANDOM_FOREST:
            classifier = RandomForestClassifier(
                n_estimators=NR_FOREST_ESTIMATORS, n_jobs=-1)

        elif algo == NAIVE_BAYES:
            classifier = MultinomialNB()

        else:
            # fail fast instead of hitting an AttributeError on classifier.fit below
            raise ValueError('Unknown algorithm: {}'.format(algo))

        classifier.fit(self.dataset.X_train, self.dataset.Y_train)

        bench.end('Training Data using: ' + algo)

        # save the trained model
        if saveModel:
            joblib.dump(classifier,
                        './model/classifier_{}_{}'.format(algo, time.time()),
                        compress=9)
            bench.end('Dumping Classifier Data')

        prediction = classifier.predict(self.dataset.X_test)
        score = classifier.score(self.dataset.X_test, self.dataset.Y_test)
        bench.end('Predicting Data using: ' + algo)

        if algo == NEURAL_NETWORK:
            prediction = [x[0] for x in prediction]

        self.saveResults(prediction, algo, score=score)
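
The commented-out classifier = self.doSVMwithGridSearch() line hints at a grid-search variant that is not shown here. Below is a minimal sketch of what such a helper might look like, assuming the same self.dataset attributes and the SGD_CLASSIFIER_LOSS_OPTIONS tuple defined in Example #4; the method body and parameter grid are illustrative, not the project's actual implementation.

    def doSVMwithGridSearch(self):
        # Hypothetical sketch: exhaustively search SGDClassifier settings and
        # return the best estimator, refit on the training split.
        from sklearn.linear_model import SGDClassifier
        from sklearn.model_selection import GridSearchCV

        param_grid = {
            'loss': SGD_CLASSIFIER_LOSS_OPTIONS,  # ('hinge', 'log', ...)
            'penalty': ('l2', 'elasticnet'),
            'alpha': (1e-3, 1e-4, 1e-5),
        }
        search = GridSearchCV(SGDClassifier(random_state=42),
                              param_grid, n_jobs=-1)
        search.fit(self.dataset.X_train, self.dataset.Y_train)
        return search.best_estimator_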

Example #4

from TextAnalyzer import TextAnalyzer

__author__ = 'Raphael'

SGD_CLASSIFIER_LOSS_OPTIONS = (
    'hinge',
    'log',
    'modified_huber',
    'squared_hinge',
    'perceptron',
)

# REVIEW_DATA = './data/Season-1.csv'
REVIEW_DATA = './data/All-seasons.csv'

if __name__ == '__main__':
    bench = Benchmark()

    textAnalyzer = TextAnalyzer(REVIEW_DATA)
    bench.end('Initializing')

    textAnalyzer.createDataFrame(nameFilter=[
        'Kyle', 'Stan', 'Kenny', 'Cartman', 'Butters', 'Jimmy', 'Timmy'
    ])
    bench.end('Reading CSV')

    textAnalyzer.cleanData()
    bench.end('Cleaning Data')

    textAnalyzer.splitData()
    bench.end('Generating Test and Training Data')

    textAnalyzer.determineBestParams()
    bench.end('Determining Best Results')
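
Every example constructs Benchmark() and calls bench.end(label) without showing where that class comes from. Below is a minimal sketch of such a timing helper, assuming it only needs to print the elapsed time for each labelled step and then reset its clock; this is an illustrative stand-in, not the project's actual Benchmark implementation.

import time


class Benchmark(object):
    # Hypothetical sketch: measure and print the time taken by each pipeline step.

    def __init__(self):
        self.start = time.time()

    def end(self, label):
        elapsed = time.time() - self.start
        print('{}: {:.2f} s'.format(label, elapsed))
        self.start = time.time()  # reset for the next step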
Example #6
        output.to_csv("./results/{}.csv".format(classifierName), index=False)
        logger.info('Accuracy: {} %'.format(round(kwargs['score'] * 100, 3)))

    def optimizeParams(self):
        self.params = {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__alpha': (1e-1, 1e-2, 1e-3),
        }
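
    # --- Hypothetical sketch, not part of the original example ---
    # The prefixes 'vect__', 'tfidf__' and 'clf__' in the grid above suggest a
    # scikit-learn Pipeline with steps named 'vect', 'tfidf' and 'clf'. The step
    # classes, the final estimator and the runGridSearch name below are assumptions
    # for illustration only; self.dataset.X_train / Y_train are assumed to hold the
    # raw training texts and labels, and optimizeParams() is assumed to have been
    # called first to populate self.params.
    def runGridSearch(self):
        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.pipeline import Pipeline
        from sklearn.model_selection import GridSearchCV

        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', MultinomialNB()),
        ])
        search = GridSearchCV(pipeline, self.params, n_jobs=-1)
        search.fit(self.dataset.X_train, self.dataset.Y_train)
        return search.best_params_, search.best_score_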


if __name__ == '__main__':
    bench = Benchmark()

    anal = TextAnalyzer(REVIEW_DATA)
    bench.end('Initializing')

    anal.createDataFrame(nameFilter=[
        'Kyle', 'Stan', 'Kenny', 'Cartman', 'Butters', 'Jimmy', 'Timmy'
    ])
    bench.end('Reading CSV')

    anal.cleanData()  # Prepare the data in a format suitable for scikit-learn
    bench.end('Cleaning Data')

    anal.vectorizeData(scheme='bagofwords')
    bench.end('Generating Bag of Words Representation')

    anal.genTfIdf()
    bench.end('Generating TF-IDF Representation')