Example #1
def add_compression(name, file_name_classifier, file_name_vectorizer,
                    file_name_features, file_path):

    with cd(file_path):
        print "loading vectorizer"
        vectorizer = joblib.load(file_name_vectorizer)
        print "loading features"
        feature_reduction_class = joblib.load(file_name_features)
        print "loading classifier"
        classifier = joblib.load(file_name_classifier)

    new_file_name_vectorizer = "%s.joblib" % file_name_vectorizer
    new_file_name_features = "%s.joblib" % file_name_features
    new_file_name_classifier = "%s.joblib" % file_name_classifier

    new_data_path = "%s/%s" % (file_path, name)
    print new_file_name_features
    print new_file_name_vectorizer
    print new_file_name_classifier
    print new_data_path

    if not os.path.exists(new_data_path):
        with cd(file_path):
            os.makedirs(name)

    with cd("%s/%s" % (file_path, name)):
        print "Gzipping vectorizer"
        joblib.dump(vectorizer, new_file_name_vectorizer, compress=("zlib", 9))
        print "Gzipping vectorizer Completed"

        print "Gzipping features"
        joblib.dump(feature_reduction_class,
                    new_file_name_features,
                    compress=("zlib", 9))
        print "Gzipping features Completed"

        print "Gzipping classifier"
        joblib.dump(classifier, new_file_name_classifier, compress=("zlib", 9))
        print "Gzipping classifier completed"

    with cd("%s/%s" % (file_path, name)):
        print "Loading vectorizer"
        with open(new_file_name_vectorizer, 'rb') as f:
            print joblib.load(f)
        print "Loading vectorizer Completed"

        print "Loading features"
        with open(new_file_name_features, 'rb') as f:
            print joblib.load(f)
        print "Loading features Completed"

        print "Loading classifier"
        with open(new_file_name_classifier, 'rb') as f:
            print joblib.load(f)
        print "Loading classifier completed "

        #feature_reduction_class=load(open(file_name_features, 'rb'))
        #classifier= load(open(file_name_classifier, 'rb'))

    return
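
# A minimal, self-contained sketch of the same compress-and-reload round
# trip performed above (the file name and corpus here are illustrative,
# not from the original code):
import joblib
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer().fit(["a tiny corpus", "for a quick demo"])
# zlib at level 9 trades dump/load speed for the smallest file on disk.
joblib.dump(vec, "vectorizer.joblib", compress=("zlib", 9))
restored = joblib.load("vectorizer.joblib")
print(sorted(restored.vocabulary_.items()))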
Example #2
    def count_vectorize(self):
        """
                token_pattern=u'(?u)\\b\\w\\w+\\b' removes single word from the
                vocabullary
                
               
                replacing joblib with cPickle because its too fast as compared
                to jblib bullshit
                """

        #vectorizer = CountVectorizer(preprocessor=preprocess, analyzer=stemmed_words, ngram_range=(2, 6))
        vectorizer = CountVectorizer(ngram_range=(1, 3))

        dtm = vectorizer.fit_transform(self.sentences)
        # dtm is a sparse matrix; use dtm.todense() to convert it into
        # a dense matrix.
        with cd(self.path):
            joblib.dump(vectorizer.vocabulary_,
                        self.file_name_vectorizer,
                        compress=("zlib", 9))
        #dump(vectorizer.vocabulary_, open("%s/%s"%(self.path, self.file_name_vectorizer), 'wb'), HIGHEST_PROTOCOL)

        if self.enable_print:
            print sorted(vectorizer.vocabulary_.items(),
                         key=operator.itemgetter(1))

        print "shape of the document matrix is rows=%s,columns=%s" % dtm.shape
        if self.use_dense_matrix:
            self.dtm = dtm.todense()
        else:
            self.dtm = dtm

        return self.dtm
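
# count_vectorize() persists only vocabulary_, not the fitted estimator.
# A self-contained sketch (toy corpus, illustrative only) of why that is
# enough to rebuild an equivalent vectorizer later:
import joblib
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["the cat sat", "the dog ran"]
vec = CountVectorizer(ngram_range=(1, 3))
dtm = vec.fit_transform(corpus)
joblib.dump(vec.vocabulary_, "vocabulary.joblib", compress=("zlib", 9))

# A CountVectorizer built from a fixed vocabulary needs no fitting;
# transform() alone reproduces the same column layout.
loaded = CountVectorizer(vocabulary=joblib.load("vocabulary.joblib"))
print(loaded.transform(corpus).shape == dtm.shape)  # prints True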
Example #3
    def svm_bagclassifier_prediction(data,
                                     file_name_classifier,
                                     file_name_vectorizer,
                                     file_name_features,
                                     file_path,
                                     bagging=False):

        target, sentences = zip(*data)
        vectorize_class = HouzierVectorizer(sentences, file_path,
                                            file_name_vectorizer, False, False)
        #example_counts= example_counts.toarray()
        vocabulary_to_load = vectorize_class.return_vectorizer()
        #vectorize_class = HouzierVectorizer(examples, True, False)
        #x_vectorize = vectorize_class.count_vectorize()

        loaded_vectorizer = CountVectorizer(vocabulary=vocabulary_to_load)
        sentences_counts = loaded_vectorizer.transform(sentences)

        with cd(file_path):
            feature_reduction_class = joblib.load(file_name_features)
            classifier = joblib.load(file_name_classifier)

            #feature_reduction_class=load(open(file_name_features, 'rb'))
            #classifier= load(open(file_name_classifier, 'rb'))

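        # Note: training (svm_bagclassifier below) applies a TfidfTransformer
        # and l2 normalization before fitting combined_features; those steps
        # are not reproduced here, so raw counts are fed to the transformer.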
        reduced_features = feature_reduction_class.transform(
            sentences_counts.toarray())

        predictions = classifier.predict(reduced_features)
        print accuracy_score(target, predictions)

        return
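
    # A hypothetical call; the labels, sentences, and file names below are
    # illustrative and assume the three artifacts were dumped beforehand
    # by svm_bagclassifier:
    #
    #     data = [("greeting", "hello there"), ("farewell", "bye for now")]
    #     svm_bagclassifier_prediction(data,
    #                                  "svm_classifier.pkl",
    #                                  "vectorizer_vocabulary.pkl",
    #                                  "combined_features.pkl",
    #                                  "/tmp/models")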
Example #4
    def return_vectorizer(self):
        with cd(self.path):
            vocabulary = joblib.load(self.file_name_vectorizer)
        return vocabulary
Example #5
    def svm_bagclassifier(data,
                          file_name_classifier,
                          file_name_vectorizer,
                          file_name_features,
                          file_path,
                          bagging=False):
        """
                file_name_classifier: The name under which the joblib must
                                    store the classifier 
                file_name_vectorizer: The name under which the joblib must
                                store the vocabulary of the trained vectorizer
                file_name_features: The combined_features name under which the
                jblib must store th features vector.

                file_path: The filepath at which all these above files must be
                stored
                
                """
        start = time.time()
        tags, sentences = zip(*data)
        sentences = GeneralMethodsClassifiers.snowball_stemmer(sentences)
        sentences = GeneralMethodsClassifiers.pre_process_text(sentences)
        vectorize_class = HouzierVectorizer(sentences, file_path,
                                            file_name_vectorizer, False, False)

        ##getting features list
        x_vectorize = vectorize_class.count_vectorize()
        tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)

        ##convert them into term frequency
        x_transform = tfidf.fit_transform(x_vectorize)

        X_normalized = preprocessing.normalize(x_transform.toarray(),
                                               norm='l2')
        print "Feature after vectorization of the data [%s, %s]" % x_transform.shape
        ## Feature selection: this dataset is way too high-dimensional,
        ## so run PCA first.
        #pca = PCA()
        pca = KernelPCA(kernel="linear")
        #pca = RandomizedPCA()
        #pca = NMF()

        ## Maybe some of the original features were good too; SelectKBest
        ## keeps the best ones based on a chi2 test.

        selection = SelectKBest(chi2, k=200)
        combined_features = FeatureUnion([("pca", pca),
                                          ("univ_select", selection)])

        X_features = combined_features.fit_transform(X_normalized, tags)
        with cd(file_path):
            joblib.dump(combined_features,
                        file_name_features,
                        compress=("zlib", 9))
        """
                dump(combined_features, open('%s/%s'%(file_path,
                                                      file_name_features),
                                             'wb'), HIGHEST_PROTOCOL)
                """

        print "Feature after feature slection with pca and selectkbest\
                    of the data [%s, %s]" % X_features.shape
        n_estimators = 5
        svc_classifier = SVC(kernel='linear',
                             C=1,
                             gamma="auto",
                             probability=True,
                             decision_function_shape="ovr",
                             class_weight="balanced",
                             cache_size=20000)

        if bagging:
            classifier = OneVsRestClassifier(
                BaggingClassifier(svc_classifier,
                                  max_samples=1.0,
                                  max_features=1.0,
                                  n_jobs=-1,
                                  verbose=3,
                                  n_estimators=n_estimators,
                                  bootstrap=False))
        else:
            classifier = svc_classifier

        classifier.fit(X_features, tags)

        print classifier.classes_
        with cd(file_path):
            joblib.dump(classifier, file_name_classifier, compress=("zlib", 9))
        """
                dump(classifier, open('%s/%s'%(file_path,
                                               file_name_classifier),
                                               'wb'), HIGHEST_PROTOCOL)
                """
        print "Storing Classifier with joblib"
        print time.time() - start
        return
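
# A self-contained sketch of the combined_features idea above: kernel-PCA
# components concatenated with the k best chi2-scored original columns
# (toy shapes and random data, illustrative only):
import numpy as np
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import FeatureUnion
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.rand(40, 30)  # chi2 requires non-negative features
y = rng.randint(0, 2, 40)

combined = FeatureUnion([("pca", KernelPCA(kernel="linear")),
                         ("univ_select", SelectKBest(chi2, k=5))])
X_features = combined.fit_transform(X, y)

clf = SVC(kernel="linear", C=1, probability=True).fit(X_features, y)
print(X_features.shape, clf.classes_)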
Example #6
def store_with_joblib(file_path, _object, file_name):
    with cd(file_path):
        joblib.dump(_object, file_name)

    return
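
# The cd() helper used throughout these examples is never defined here. A
# minimal sketch matching its usage, assuming it is a chdir context manager
# that restores the previous working directory on exit:
import os
from contextlib import contextmanager

@contextmanager
def cd(path):
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        # Return to the original directory even if the body raised.
        os.chdir(previous)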
Example #7
    def svm_bagclassifier(sentiment_data,
                          file_name_classifier,
                          file_name_vectorizer,
                          file_name_features,
                          bagging=False):
        """
                vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
                X_train = vectorizer.fit_transform(sentences)
                """
        import time
        start = time.time()
        sentiments, sentences = zip(*sentiment_data[0:1000])
        sentences = GeneralMethodsClassifiers.snowball_stemmer(sentences)
        sentences = GeneralMethodsClassifiers.pre_process_text(sentences)
        vectorize_class = HouzierVectorizer(
            sentences, "%s/CompiledModels/SentimentClassifiers" % base_dir,
            file_name_vectorizer, False, False)

        ##getting features list
        x_vectorize = vectorize_class.count_vectorize()
        tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)

        ##convert them into term frequency
        x_transform = tfidf.fit_transform(x_vectorize)

        X_normalized = preprocessing.normalize(x_transform.toarray(),
                                               norm='l2')
        print "Feature after vectorization of the data [%s, %s]" % x_transform.shape
        ## Feature selection: this dataset is way too high-dimensional,
        ## so run PCA first.
        #pca = PCA()
        pca = KernelPCA(kernel="linear")
        #pca = RandomizedPCA()
        #pca = NMF()

        ## Maybe some of the original features were good too; SelectKBest
        ## keeps the best ones based on a chi2 test.

        selection = SelectKBest(chi2, k=2)
        combined_features = FeatureUnion([("pca", pca),
                                          ("univ_select", selection)])

        X_features = combined_features.fit_transform(X_normalized, sentiments)
        with cd("%s/CompiledModels/SentimentClassifiers" % base_dir):
            joblib.dump(combined_features,
                        file_name_features,
                        compress=("zlib", 9))
        """
                dump(combined_features,
                     open('%s/%s'%(SentimentClassifiersPath,SentimentFeatureFileName), 'wb'),HIGHEST_PROTOCOL)

                """
        #X_pca = pca.fit_transform(x_transform)

        print "Feature after feature slection with pca and selectkbest\
                    of the data [%s, %s]" % X_features.shape

        #http://stackoverflow.com/questions/32934267/feature-union-of-hetereogenous-features

        #clf = SVC(C=1, kernel="linear", gamma=.001, probability=True, class_weight='auto')

        n_estimators = 3
        svc_classifier = SVC(kernel='linear',
                             C=1,
                             gamma="auto",
                             probability=True,
                             decision_function_shape="ovr",
                             class_weight="balanced",
                             cache_size=20000)

        if bagging:
            classifier = OneVsRestClassifier(
                BaggingClassifier(svc_classifier,
                                  max_samples=1.0,
                                  max_features=1.0,
                                  n_jobs=-1,
                                  verbose=3,
                                  n_estimators=n_estimators,
                                  bootstrap=False))
        else:
            classifier = svc_classifier

        classifier.fit(X_features, sentiments)

        print classifier.classes_
        with cd("%s/CompiledModels/SentimentClassifiers" % base_dir):
            joblib.dump(classifier, file_name_classifier, compress=("zlib", 9))
        """
                dump(file_name_classifier,open('%s/%s'%(SentimentClassifiersPath,
                                                       SentimentClassifierFileName
                                                        ),
                                               'wb'), HIGHEST_PROTOCOL)
                """

        print "Storing Classifier with joblib"
        ## Example of building your own vectorizer:
        ##http://stackoverflow.com/questions/31744519/load-pickled-classifier-data-vocabulary-not-fitted-error
        from sklearn.feature_extraction.text import CountVectorizer
        #count_vectorizer = CountVectorizer()
        examples_negative = [
            'Free Viagra call today!', "I am disappointed in you",
            "i am not good"
        ]
        examples_neutral = [
            "I dont know", "Sun rises in the east",
            "I'm going to attend theLinux users group tomorrow."
        ]
        examples_positive = [
            "hey there, I am too good to be true", "An Awesome man",
            "A beautiful beautiful lady"
        ]

        examples = examples_positive + examples_negative + examples_neutral

        #example_counts= example_counts.toarray()
        vocabulary_to_load = vectorize_class.return_vectorizer()
        #vectorize_class = HouzierVectorizer(examples, True, False)
        #x_vectorize = vectorize_class.count_vectorize()

        loaded_vectorizer = CountVectorizer(vocabulary=vocabulary_to_load)
        example_counts = loaded_vectorizer.transform(examples)

        print example_counts, example_counts.shape

        f = combined_features.transform(example_counts.toarray())

        predictions = classifier.predict(f)
        predict_probabilities = classifier.predict_proba(f)
        for sent, prob, tag in zip(examples, predict_probabilities,
                                   predictions):
            print sent, prob, tag

        print time.time() - start
        return
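
# Each row of predict_proba is ordered by classifier.classes_; a
# self-contained sketch of that pairing (toy data, illustrative only):
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.rand(30, 4)
y = np.where(X[:, 0] > 0.5, "pos", "neg")
clf = SVC(kernel="linear", probability=True).fit(X, y)
for probs in clf.predict_proba(X[:2]):
    print(list(zip(clf.classes_, probs)))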