def add_compression(name, file_name_classifier, file_name_vectorizer,
                    file_name_features, file_path):
    """
    Load the uncompressed classifier, vectorizer vocabulary and feature
    selector from file_path, re-dump zlib-compressed copies into the
    sub-directory file_path/name, then reload each dump once as a sanity
    check.
    """
    with cd(file_path):
        print "loading vectorizer"
        vectorizer = joblib.load(file_name_vectorizer)
        print "loading features"
        feature_reduction_class = joblib.load(file_name_features)
        print "loading classifier"
        classifier = joblib.load(file_name_classifier)

    new_file_name_vectorizer = "%s.joblib" % file_name_vectorizer
    new_file_name_features = "%s.joblib" % file_name_features
    new_file_name_classifier = "%s.joblib" % file_name_classifier
    new_data_path = "%s/%s" % (file_path, name)
    print new_file_name_features
    print new_file_name_vectorizer
    print new_file_name_classifier
    print new_data_path

    if not os.path.exists(new_data_path):
        with cd(file_path):
            os.makedirs(name)

    with cd(new_data_path):
        print "Compressing vectorizer"
        joblib.dump(vectorizer, new_file_name_vectorizer, compress=("zlib", 9))
        print "Compressing vectorizer completed"
        print "Compressing features"
        joblib.dump(feature_reduction_class, new_file_name_features,
                    compress=("zlib", 9))
        print "Compressing features completed"
        print "Compressing classifier"
        joblib.dump(classifier, new_file_name_classifier, compress=("zlib", 9))
        print "Compressing classifier completed"

    # Reload the compressed dumps once to verify they deserialize cleanly.
    with cd(new_data_path):
        print "Loading vectorizer"
        with open(new_file_name_vectorizer, 'rb') as f:
            print joblib.load(f)
        print "Loading vectorizer completed"
        print "Loading features"
        with open(new_file_name_features, 'rb') as f:
            print joblib.load(f)
        print "Loading features completed"
        print "Loading classifier"
        with open(new_file_name_classifier, 'rb') as f:
            print joblib.load(f)
        print "Loading classifier completed"
    return
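# Minimal usage sketch for add_compression(), assuming models were previously
# dumped uncompressed. Every file name and path below is hypothetical; only
# add_compression() itself comes from this module.
add_compression("compressed",
                "intent_classifier.pkl",    # hypothetical classifier dump
                "intent_vectorizer.pkl",    # hypothetical vocabulary dump
                "intent_features.pkl",      # hypothetical FeatureUnion dump
                "/tmp/models")              # hypothetical model directory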
def count_vectorize(self):
    """
    Fit a CountVectorizer on self.sentences and persist its vocabulary.

    Note: the default token_pattern u'(?u)\\b\\w\\w+\\b' drops
    single-character tokens from the vocabulary. An earlier version
    persisted with cPickle instead of joblib for speed.
    """
    #vectorizer = CountVectorizer(preprocessor=preprocess, analyzer=stemmed_words, ngram_range=(2, 6))
    vectorizer = CountVectorizer(ngram_range=(1, 3))
    # fit_transform returns a sparse document-term matrix; use
    # dtm.todense() when a dense matrix is required.
    dtm = vectorizer.fit_transform(self.sentences)
    with cd(self.path):
        joblib.dump(vectorizer.vocabulary_, self.file_name_vectorizer,
                    compress=("zlib", 9))
    if self.enable_print:
        print sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1))
        print "shape of the document matrix is rows=%s, columns=%s" % dtm.shape
    if self.use_dense_matrix:
        self.dtm = dtm.todense()
    else:
        self.dtm = dtm
    return self.dtm
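# Sketch of the round trip count_vectorize() enables, assuming a vocabulary
# was dumped under the hypothetical name "my_vocabulary.pkl". Persisting
# vectorizer.vocabulary_ (rather than the fitted vectorizer) lets a fresh
# process rebuild an equivalent transformer without re-fitting:
import joblib
from sklearn.feature_extraction.text import CountVectorizer

vocabulary = joblib.load("my_vocabulary.pkl")
rebuilt = CountVectorizer(ngram_range=(1, 3), vocabulary=vocabulary)
counts = rebuilt.transform(["some unseen sentence"])  # transform only, no fit
print counts.shape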
def svm_bagclassifier_prediction(data, file_name_classifier, file_name_vectorizer,
                                 file_name_features, file_path, bagging=False):
    target, sentences = zip(*data)
    # Rebuild the vectorizer from the persisted vocabulary rather than
    # re-fitting it, so the feature columns line up with the training run.
    vectorize_class = HouzierVectorizer(sentences, file_path,
                                        file_name_vectorizer, False, False)
    vocabulary_to_load = vectorize_class.return_vectorizer()
    loaded_vectorizer = CountVectorizer(vocabulary=vocabulary_to_load)
    sentences_counts = loaded_vectorizer.transform(sentences)
    with cd(file_path):
        feature_reduction_class = joblib.load(file_name_features)
        classifier = joblib.load(file_name_classifier)
    reduced_features = feature_reduction_class.transform(
        sentences_counts.toarray())
    predictions = classifier.predict(reduced_features)
    print accuracy_score(target, predictions)
    return
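# Usage sketch, assuming the artifacts were produced by svm_bagclassifier()
# under the hypothetical names below. The held-out (tag, sentence) pairs are
# illustrative only.
held_out = [("positive", "what a wonderful day"),
            ("negative", "this is a terrible product")]
svm_bagclassifier_prediction(held_out,
                             "intent_classifier.pkl",
                             "intent_vectorizer.pkl",
                             "intent_features.pkl",
                             "/tmp/models")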
def return_vectorizer(self):
    with cd(self.path):
        vocabulary = joblib.load(self.file_name_vectorizer)
    return vocabulary
def svm_bagclassifier(data, file_name_classifier, file_name_vectorizer,
                      file_name_features, file_path, bagging=False):
    """
    file_name_classifier: the name under which joblib must store the classifier.
    file_name_vectorizer: the name under which joblib must store the vocabulary
                          of the trained vectorizer.
    file_name_features:   the name under which joblib must store the combined
                          feature-selection transformer.
    file_path:            the path at which all of the above files are stored.
    """
    start = time.time()
    tags, sentences = zip(*data)
    sentences = GeneralMethodsClassifiers.snowball_stemmer(sentences)
    sentences = GeneralMethodsClassifiers.pre_process_text(sentences)
    vectorize_class = HouzierVectorizer(sentences, file_path,
                                        file_name_vectorizer, False, False)
    # bag-of-ngrams counts
    x_vectorize = vectorize_class.count_vectorize()
    # convert raw counts into (sublinear) term frequencies
    tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)
    x_transform = tfidf.fit_transform(x_vectorize)
    X_normalized = preprocessing.normalize(x_transform.toarray(), norm='l2')
    print "Features after vectorization of the data [%s, %s]" % x_transform.shape

    ## Feature selection: the dataset is far too high-dimensional, so reduce
    ## with PCA and combine that with a chi2-based univariate selection, in
    ## case some of the original features were good too.
    #pca = PCA()
    pca = KernelPCA(kernel="linear")
    #pca = RandomizedPCA()
    #pca = NMF()
    selection = SelectKBest(chi2, k=200)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    X_features = combined_features.fit_transform(X_normalized, tags)
    with cd(file_path):
        joblib.dump(combined_features, file_name_features, compress=("zlib", 9))
    print "Features after feature selection with pca and selectkbest of the data [%s, %s]" % X_features.shape

    n_estimators = 5
    svc_classifier = SVC(kernel='linear', C=1, gamma="auto", probability=True,
                         decision_function_shape="ovr",
                         class_weight="balanced", cache_size=20000)
    if bagging:
        classifier = OneVsRestClassifier(
            BaggingClassifier(svc_classifier, max_samples=1.0, max_features=1.0,
                              n_jobs=-1, verbose=3,
                              n_estimators=n_estimators, bootstrap=False))
    else:
        classifier = svc_classifier
    classifier.fit(X_features, tags)
    print classifier.classes_
    with cd(file_path):
        joblib.dump(classifier, file_name_classifier, compress=("zlib", 9))
    print "Storing classifier with joblib"
    print time.time() - start
    return
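# End-to-end sketch tying training and prediction together. The data, file
# names and path are hypothetical, and a real run needs enough sentences for
# SelectKBest(chi2, k=200) to have at least 200 features to choose from.
training_data = [("positive", "what a wonderful product"),
                 ("negative", "this is a terrible product"),
                 ("neutral", "the product arrived on monday")]  # illustrative
svm_bagclassifier(training_data,
                  "intent_classifier.pkl",
                  "intent_vectorizer.pkl",
                  "intent_features.pkl",
                  "/tmp/models",
                  bagging=True)
svm_bagclassifier_prediction(training_data,  # sanity check on the training set
                             "intent_classifier.pkl",
                             "intent_vectorizer.pkl",
                             "intent_features.pkl",
                             "/tmp/models")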
def store_with_joblib(file_path, _object, file_name):
    with cd(file_path):
        joblib.dump(_object, file_name)
    return
def svm_bagclassifier(sentiment_data, file_name_classifier, file_name_vectorizer,
                      file_name_features, bagging=False):
    """
    Train the sentiment classifier and persist its artifacts under
    CompiledModels/SentimentClassifiers.

    An alternative vectorizer would be:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(sentences)
    """
    import time
    start = time.time()
    sentiments, sentences = zip(*sentiment_data[0:1000])
    sentences = GeneralMethodsClassifiers.snowball_stemmer(sentences)
    sentences = GeneralMethodsClassifiers.pre_process_text(sentences)
    vectorize_class = HouzierVectorizer(
        sentences, "%s/CompiledModels/SentimentClassifiers" % base_dir,
        file_name_vectorizer, False, False)
    # bag-of-ngrams counts
    x_vectorize = vectorize_class.count_vectorize()
    # convert raw counts into (sublinear) term frequencies
    tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)
    x_transform = tfidf.fit_transform(x_vectorize)
    X_normalized = preprocessing.normalize(x_transform.toarray(), norm='l2')
    print "Features after vectorization of the data [%s, %s]" % x_transform.shape

    ## Feature selection: the dataset is far too high-dimensional, so reduce
    ## with PCA and combine that with a chi2-based univariate selection, in
    ## case some of the original features were good too. See
    ## http://stackoverflow.com/questions/32934267/feature-union-of-hetereogenous-features
    #pca = PCA()
    pca = KernelPCA(kernel="linear")
    #pca = RandomizedPCA()
    #pca = NMF()
    selection = SelectKBest(chi2, k=2)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    X_features = combined_features.fit_transform(X_normalized, sentiments)
    with cd("%s/CompiledModels/SentimentClassifiers" % base_dir):
        joblib.dump(combined_features, file_name_features, compress=("zlib", 9))
    print "Features after feature selection with pca and selectkbest of the data [%s, %s]" % X_features.shape

    #clf = SVC(C=1, kernel="linear", gamma=.001, probability=True, class_weight='auto')
    n_estimators = 3
    svc_classifier = SVC(kernel='linear', C=1, gamma="auto", probability=True,
                         decision_function_shape="ovr",
                         class_weight="balanced", cache_size=20000)
    if bagging:
        classifier = OneVsRestClassifier(
            BaggingClassifier(svc_classifier, max_samples=1.0, max_features=1.0,
                              n_jobs=-1, verbose=3,
                              n_estimators=n_estimators, bootstrap=False))
    else:
        classifier = svc_classifier
    classifier.fit(X_features, sentiments)
    print classifier.classes_
    with cd("%s/CompiledModels/SentimentClassifiers" % base_dir):
        joblib.dump(classifier, file_name_classifier, compress=("zlib", 9))
    print "Storing classifier with joblib"

    ## Smoke test: rebuild the vectorizer from the stored vocabulary and
    ## classify a few hand-written examples. See
    ## http://stackoverflow.com/questions/31744519/load-pickled-classifier-data-vocabulary-not-fitted-error
    from sklearn.feature_extraction.text import CountVectorizer
    examples_negative = ['Free Viagra call today!',
                         "I am disappointed in you",
                         "i am not good"]
    examples_neutral = ["I dont know",
                        "Sun rises in the east",
                        "I'm going to attend the Linux users group tomorrow."]
    examples_positive = ["hey there, I am too good to be true",
                         "An awesome man",
                         "A beautiful beautiful lady"]
    examples = examples_positive + examples_negative + examples_neutral
    vocabulary_to_load = vectorize_class.return_vectorizer()
    loaded_vectorizer = CountVectorizer(vocabulary=vocabulary_to_load)
    example_counts = loaded_vectorizer.transform(examples)
    print example_counts, example_counts.shape
    f = combined_features.transform(example_counts.toarray())
    predictions = classifier.predict(f)
    predict_probabilities = classifier.predict_proba(f)
    for sent, prob, tag in zip(examples, predict_probabilities, predictions):
        print sent, prob, tag
    print time.time() - start
    return
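# Optional refinement for the smoke test above (a drop-in replacement for its
# final print loop): the columns of predict_proba() are ordered the same way
# as classifier.classes_, so each probability can be labelled with its class
# instead of being printed as a bare array.
for sent, prob, tag in zip(examples, predict_probabilities, predictions):
    print sent, tag, dict(zip(classifier.classes_, prob))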