def testProcessingPipeline(filename):
    jsonData = getJSONData(filename)
    ingredient_list = getIngredientList(jsonData)
    # Load the saved feature extractors and trained ensemble classifiers
    tfidf = ClassificationUtils.load_classifier("tfidf.pickle")
    bag_of_words = ClassificationUtils.load_classifier("bag_of_words.pickle")
    adaboost = ClassificationUtils.load_classifier("ada_idf_cook.pickle")
    randomfor = ClassificationUtils.load_classifier("rf_idf_cook.pickle")
    bagging = ClassificationUtils.load_classifier("bagging_idf_cook.pickle")
    # Build both feature representations for the test ingredients
    test_data_tfidf = tfidf.transform(ingredient_list)
    test_data_bag = bag_of_words.transform(ingredient_list)
    # Predict with every classifier on both feature sets, keeping the results
    ada_bag_predict = adaboost.predict(test_data_bag)
    ada_tfidf_predict = adaboost.predict(test_data_tfidf)
    rf_bag_predict = randomfor.predict(test_data_bag)
    rf_tfidf_predict = randomfor.predict(test_data_tfidf)
    bagging_bag_predict = bagging.predict(test_data_bag)
    bagging_tfidf_predict = bagging.predict(test_data_tfidf)
    return {"adaboost_bag": ada_bag_predict, "adaboost_tfidf": ada_tfidf_predict,
            "random_forest_bag": rf_bag_predict, "random_forest_tfidf": rf_tfidf_predict,
            "bagging_bag": bagging_bag_predict, "bagging_tfidf": bagging_tfidf_predict}
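# Hedged usage sketch (not part of the original file): if ground-truth
# cuisine labels are available for the test JSON, the prediction map
# returned above can be scored model by model.
def scoreTestPredictions(predictions, true_labels):
    from sklearn import metrics
    for name in predictions:
        print name, " : ", metrics.accuracy_score(true_labels, predictions[name])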
bagging_predict = bagging.predict(test_data_bag)
end = time.time()

print "Time Taken to Test the Models : ", end - start
print "Accuracy of AdaBoost Algorithm : ", metrics.accuracy_score(test_labels, ada_predict)
print "Accuracy of Random Forests : ", metrics.accuracy_score(test_labels, rf_predict)
print "Accuracy of Extra Trees : ", metrics.accuracy_score(test_labels, extree_predict)
print "Accuracy of Bagging : ", metrics.accuracy_score(test_labels, bagging_predict)

# Saving the bag-of-words vectorizer and the classifiers
# (NOTE: the vectorizer variable is named 'tfidf' but is persisted under the
# bag-of-words filename, matching how the test pipeline loads it)
ClassificationUtils.save_classifier("ada_bag_cook.pickle", adaboost)
ClassificationUtils.save_classifier("rf_bag_cook.pickle", randomforest)
ClassificationUtils.save_classifier("extree_bag_cook.pickle", extratrees)
ClassificationUtils.save_classifier("bagging_bag_cook.pickle", bagging)
ClassificationUtils.save_classifier("bag_of_words.pickle", tfidf)


def printIngredientDistribution():
    print "----------- Distribution of the Recipe Ingredients ------------------"
    for key in ingredient_map.keys():
        print key, " : ", ingredient_map[key]


def printCuisineDistribution():
    print "----------- Distribution of the Cuisines ------------------"
    for key in cuisine_map.keys():
        print key, " : ", cuisine_map[key]
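# Hedged sketch of what ClassificationUtils.save_classifier / load_classifier
# presumably do (the module's source is not shown in these fragments): plain
# pickle round-tripping of a fitted estimator or vectorizer.
import pickle

def save_classifier_sketch(filename, classifier):
    with open(filename, "wb") as f:
        pickle.dump(classifier, f)

def load_classifier_sketch(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)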
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 27 09:37:18 2015

Text Fields of JSON - request_text, request_title, request_text_edit_aware,
requester_subreddits_at_request (JSONArray)
Count of subreddit field - requester_number_of_subreddits_at_request

@author: Rupak Chakraborty
"""

import ClassificationUtils
import scipy

filename = "Random Acts of Pizza/train.json"
jsonData = ClassificationUtils.getJSONData(filename)

subreddits_pizza = set([])
subreddits_non_pizza = set([])
subreddit_pizza_map = {}
subreddit_non_pizza_map = {}
pizza_title_list = []
non_pizza_title_list = []
pizza_text_list = []
non_pizza_text_list = []

# (only the first few numeric suffix fields appear in this fragment)
pizza_request_suffix_map = {
    "requester_upvotes_minus_downvotes_at_request": [],
    "requester_number_of_subreddits_at_request": [],
    "requester_upvotes_plus_downvotes_at_request": [],
    "requester_account_age_in_days_at_request": [],
    "requester_days_since_first_post_on_raop_at_request": [],
}
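# Hedged sketch (assumed; ClassificationUtils' source is not shown here):
# getJSONData most plausibly parses the competition file with pandas, which
# yields the column-addressable object used throughout these scripts.
import pandas as pd

def getJSONData_sketch(filename):
    # train.json is an array of request records; read_json turns it into a
    # DataFrame keyed by field name
    return pd.read_json(filename)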
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
import pandas as pd
import time

import ClassificationUtils

sentiment_data = []
sentiment_labels = []
TRAIN_SIZE = 20000
filename = "Bag of Words Meet Bag of Popcorn (Google Word2Vec)/labeledTrainData.tsv"
data = pd.read_csv(filename, sep="\t")

# Preprocessing the Data
print "Starting Preprocessing Data...."
start = time.time()
for label, review in zip(data["sentiment"], data["review"]):
    sentiment_data.append(ClassificationUtils.textCleaningPipeline(review))
    sentiment_labels.append(label)
end = time.time()
print "Time Taken for Data Preprocessing : ", end - start

# Separating the Training and Test Labels
train_labels = sentiment_labels[:TRAIN_SIZE]
test_labels = sentiment_labels[TRAIN_SIZE:]
train_data = sentiment_data[:TRAIN_SIZE]
test_data = sentiment_data[TRAIN_SIZE:]

# Initializing Feature Extractors
count_vec = CountVectorizer()
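# Hedged sketch (assumed; the real textCleaningPipeline lives in
# ClassificationUtils): a typical cleaning pass of this era lowercases,
# strips markup and punctuation, and drops stopwords. The stopword list
# here is a tiny illustrative stand-in.
import re

STOPWORDS_SKETCH = {"the", "a", "an", "and", "or", "is", "it", "this", "that"}

def textCleaningPipeline_sketch(text):
    text = re.sub(r"<[^>]+>", " ", text)             # remove HTML tags
    text = re.sub(r"[^a-zA-Z]", " ", text).lower()   # keep letters only
    words = [w for w in text.split() if w not in STOPWORDS_SKETCH]
    return " ".join(words)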
# Reconstructed tail of a part-of-speech helper (assumption: the fragment
# begins at this function's return statement; a typical NLTK-based version
# is shown)
import nltk

def extractPOSFeatures(text):
    nouns, adj, verbs = [], [], []
    for word, tag in nltk.pos_tag(nltk.word_tokenize(text)):
        if tag.startswith("NN"):
            nouns.append(word)
        elif tag.startswith("JJ"):
            adj.append(word)
        elif tag.startswith("VB"):
            verbs.append(word)
    return nouns, adj, verbs

# Extract text features
request_text__data = list(jsonData["request_text_edit_aware"])
request_text_title_data = list(jsonData["request_title"])
clean_text_data = []
clean_title_data = []

print "Starting feature loading and cleaning ..."
start = time.time()
for i in range(len(request_text__data)):
    title_string = ClassificationUtils.textCleaningPipeline(request_text_title_data[i])
    text_string = ClassificationUtils.textCleaningPipeline(request_text__data[i])
    clean_text_data.append(text_string)
    clean_title_data.append(title_string)
end = time.time()
print "Time taken to load and clean text features : ", end - start

# Extract numeric (whole-request) features
number_of_downvotes_of_request_at_retrieval = np.array(
    jsonData["number_of_downvotes_of_request_at_retrieval"], dtype=float)
number_of_upvotes_of_request_at_retrieval = np.array(
    jsonData["number_of_upvotes_of_request_at_retrieval"], dtype=float)
request_number_of_comments_at_retrieval = np.array(
    jsonData["request_number_of_comments_at_retrieval"], dtype=float)
def main():
    # Parse args. 'scale' allows the image scale to be set, e.g. 0.25, 0.5, 1.0
    argList = sys.argv[1:]
    shortArg = 'a:d:t:s:S:k:rn:i:me'
    longArg = [
        'algorithm=', 'dataset=', 'train-ratio=', 'scale=', 'test-scale=',
        'folds=', 'rotations', 'noise=', 'noise-intensity=', 'multiprocess',
        'example', 'data-ratio=', 'mrlbp-classifier=', 'noise-train', 'ecs',
        'debug'
    ]
    valid_algorithms = ['RLBP', 'MRLBP', 'MRELBP', 'BM3DELBP', 'NoiseClassifier']
    valid_datasets = ['kylberg']
    valid_noise = ['gaussian', 'speckle', 'salt-pepper']
    valid_mrlbp_classifiers = ['svm', 'knn']

    try:
        args, vals = getopt.getopt(argList, shortArg, longArg)
        for arg, val in args:
            if arg in ('-a', '--algorithm'):
                if val in valid_algorithms:
                    print('Using algorithm:', val)
                    GlobalConfig.set("algorithm", val)
                else:
                    raise ValueError('Invalid algorithm configured, choose one of the following:', valid_algorithms)
            elif arg in ('-d', '--dataset'):
                if val in valid_datasets:
                    print("Using dataset:", val)
                    GlobalConfig.set("dataset", val)
                else:
                    raise ValueError('Invalid dataset configured, choose one of the following:', valid_datasets)
            elif arg in ('-t', '--train-ratio'):  # matches the 'train-ratio=' long option declared above
                if 0 < float(val) <= 1.0:
                    print('Using train-test ratio of', val)
                    GlobalConfig.set('train_ratio', float(val))
                else:
                    raise ValueError('Train-test ratio must be 0 < train-test <= 1.0')
            elif arg in ('-s', '--scale'):
                if 0 < float(val) <= 1.0:
                    print('Using training image scale:', val)
                    GlobalConfig.set('scale', float(val))
                else:
                    raise ValueError('Scale must be 0 < scale <= 1.0')
            elif arg in ('-S', '--test-scale'):
                if 0 < float(val) <= 1.0:
                    print('Using testing image scale:', val)
                    GlobalConfig.set('test_scale', float(val))
                else:
                    raise ValueError('Test scale must be 0 < scale <= 1.0')
            elif arg in ('-k', '--folds'):
                print('Doing {} folds'.format(val))
                GlobalConfig.set("folds", int(val))
            elif arg in ('-r', '--rotations'):
                print('Using rotated image sources')
                GlobalConfig.set("rotate", True)
            elif arg in ('-n', '--noise'):
                if val in valid_noise:
                    print('Applying noise:', val)
                    GlobalConfig.set("noise", val)
                else:
                    raise ValueError('Invalid noise type, choose one of the following:', valid_noise)
            elif arg in ('-i', '--noise-intensity'):
                print('Using noise intensity (sigma / ratio) of:', val)
                GlobalConfig.set("noise_val", float(val))
            elif arg in ('-m', '--multiprocess'):
                cores = psutil.cpu_count()
                print('Using {} processor cores for computing featurevectors'.format(cores))
                GlobalConfig.set('multiprocess', True)
                GlobalConfig.set('cpu_count', cores)
            elif arg in ('-e', '--example'):
                print('Generating algorithm example images')
                GlobalConfig.set('examples', True)
            elif arg == '--data-ratio':
                if 0 < float(val) <= 1.0:
                    print('Using dataset ratio:', val)
                    GlobalConfig.set('data_ratio', float(val))
                else:
                    raise ValueError('Data ratio must be 0 < ratio <= 1.0')
            elif arg == '--mrlbp-classifier':
                if val in valid_mrlbp_classifiers:
                    print("MRLBP algorithm (if configured) will use {} classifier".format(val))
                    GlobalConfig.set('mrlbp_classifier', val)
                else:
                    raise ValueError('Invalid classifier chosen for mrlbp, choose one of the following:', valid_mrlbp_classifiers)
            elif arg == '--noise-train':
                print("Applying noise to the training dataset as well as the test dataset")
                GlobalConfig.set('train_noise', True)
            elif arg == '--ecs':
                print(r"Loading dataset from C:\Local")
                GlobalConfig.set('ECS', True)
            elif arg == '--debug':
                print("Running in debug mode")
                GlobalConfig.set('debug', True)
            else:
                raise ValueError('Unhandled argument provided:', arg)
    except getopt.error as err:
        print(str(err))

    if GlobalConfig.get('ECS'):
        GlobalConfig.set(
            'CWD',
            r'\\filestore.soton.ac.uk\users\ojvl1g17\mydocuments\COMP3200-Texture-Classification')
    else:
        GlobalConfig.set('CWD', os.getcwd())

    if GlobalConfig.get('examples'):
        write_examples()

    # Load configured Dataset
    if GlobalConfig.get('dataset') == 'kylberg':
        if GlobalConfig.get('debug'):
            # To save time in debug mode, only load two classes and a smaller
            # proportion of the samples
            kylberg = DatasetManager.KylbergTextures(
                num_classes=2, data_ratio=GlobalConfig.get('data_ratio'))
        else:
            kylberg = DatasetManager.KylbergTextures(
                num_classes=28, data_ratio=GlobalConfig.get('data_ratio'))
        # Load Dataset & Cross Validator
        dataset = kylberg.load_data()
        cross_validator = kylberg.get_cross_validator()
        print("Dataset loaded")
    elif GlobalConfig.get('dataset') is None:
        raise ValueError('No Dataset configured')
    else:
        raise ValueError('Invalid dataset')

    if GlobalConfig.get('rotate'):
        dataset_folder = GlobalConfig.get('dataset') + '-rotated'
    else:
        dataset_folder = GlobalConfig.get('dataset')

    out_folder = os.path.join(GlobalConfig.get('CWD'), 'out',
                              GlobalConfig.get('algorithm'), dataset_folder)

    # Initialise algorithm
    if GlobalConfig.get('algorithm') == 'RLBP':
        print("Applying RLBP algorithm")
        algorithm = RLBP.RobustLBP()
    elif GlobalConfig.get('algorithm') == 'MRLBP':
        print("Applying MRLBP algorithm")
        algorithm = RLBP.MultiresolutionLBP(p=[8, 16, 24], r=[1, 2, 3])
    elif GlobalConfig.get('algorithm') == 'MRELBP':
        print("Applying MRELBP algorithm")
        algorithm = MRELBP.MedianRobustExtendedLBP(r1=[2, 4, 6, 8],
                                                   p=8,
                                                   w_center=3,
                                                   w_r1=[3, 5, 7, 9])
    elif GlobalConfig.get('algorithm') == 'BM3DELBP':
        print("Applying BM3DELBP algorithm")
        algorithm = BM3DELBP.BM3DELBP()
    elif GlobalConfig.get('algorithm') == 'NoiseClassifier':
        # The noise classifier is normally used inside BM3DELBP; selecting it
        # directly allows the classifier to be benchmarked on its own
        algorithm = NoiseClassifier.NoiseClassifier()
    else:
        raise ValueError('Invalid algorithm choice')

    # Get the Training out directory (i.e. images without scaling/rotation/noise)
    train_out_dir = os.path.join(
        out_folder, algorithm.get_outdir(noisy_image=False, scaled_image=False))
    # Get the Testing out directory (i.e. images with scaling/rotation/noise)
    if GlobalConfig.get('noise') is not None:
        noisy_image = True
    else:
        noisy_image = False
    if GlobalConfig.get('test_scale') is not None:
        scaled_image = True
    else:
        scaled_image = False
    test_out_dir = os.path.join(out_folder,
                                algorithm.get_outdir(noisy_image, scaled_image))

    # Out paths for the noise classifier
    noise_out_dir = os.path.join(
        GlobalConfig.get('CWD'), 'out', 'NoiseClassifier', dataset_folder,
        "scale-{}".format(int(GlobalConfig.get('scale') * 100)))
    test_noise_out_dir = os.path.join(
        GlobalConfig.get('CWD'), 'out', 'NoiseClassifier', dataset_folder,
        algorithm.get_outdir(noisy_image, scaled_image))

    print("Replacing DatasetManager.Image with BM3DELBPImages")
    # Convert DatasetManager.Image into BM3DELBP.BM3DELBPImage
    if GlobalConfig.get('algorithm') in ('NoiseClassifier', 'BM3DELBP'):
        for index, img in enumerate(dataset):
            dataset[index] = BM3DELBP.BM3DELBPImage(img)
            # Also convert rotated images if necessary
            if img.test_rotations is not None:
                for rot_index, rotated_img in enumerate(img.test_rotations):
                    img.test_rotations[rot_index] = BM3DELBP.BM3DELBPImage(rotated_img)

    if GlobalConfig.get('multiprocess'):
        for index, img in enumerate(dataset):
            dataset[index] = (index, img)
        if GlobalConfig.get('rotate'):
            maxtasks = 50
        else:
            maxtasks = None
        if GlobalConfig.get('algorithm') in ('NoiseClassifier', 'BM3DELBP'):
            with Pool(processes=GlobalConfig.get('cpu_count'),
                      maxtasksperchild=maxtasks) as pool:
                # Generate image noise featurevectors
                for index, image in tqdm.tqdm(
                        pool.istarmap(describe_noise_pool,
                                      zip(dataset, repeat(noise_out_dir),
                                          repeat(test_noise_out_dir))),
                        total=len(dataset),
                        desc='Noise Featurevectors'):
                    dataset[index] = image
        else:
            with Pool(processes=GlobalConfig.get('cpu_count'),
                      maxtasksperchild=maxtasks) as pool:
                # Generate texture featurevectors
                for index, image in tqdm.tqdm(
                        pool.istarmap(describe_image_pool,
                                      zip(repeat(algorithm), dataset,
                                          repeat(train_out_dir),
                                          repeat(test_out_dir))),
                        total=len(dataset),
                        desc='Texture Featurevectors'):
                    dataset[index] = image
    else:
        # Process the images without using multiprocessing Pools
        if GlobalConfig.get('algorithm') in ('NoiseClassifier', 'BM3DELBP'):
            for img in dataset:
                # Generate image noise featurevectors
                describe_noise(img, noise_out_dir, test_noise_out_dir)
        else:
            print("BEGINNING TIMER:")
            start = timer()
            for img in dataset:
                # Generate featurevectors
                describe_image(algorithm, img, train_out_dir, test_out_dir)
            end = timer()
            print("TIME TAKEN:", end - start)

    # Train models and perform predictions
    if GlobalConfig.get('algorithm') == 'RLBP':
        predictor = RLBP.RobustLBPPredictor(dataset, cross_validator)
    elif GlobalConfig.get('algorithm') == 'MRLBP':
        print("Performing MRLBP Classification")
        predictor = RLBP.MultiresolutionLBPPredictor(dataset, cross_validator)
    elif GlobalConfig.get('algorithm') == 'MRELBP':
        print("Performing MRELBP Classification")
        predictor = MRELBP.MedianRobustExtendedLBPPredictor(dataset, cross_validator)
    elif GlobalConfig.get('algorithm') == 'BM3DELBP':
        print("Performing BM3DELBP Classification")
        predictor = BM3DELBP.BM3DELBPPredictor(dataset, cross_validator)
    elif GlobalConfig.get('algorithm') == 'NoiseClassifier':
        print("Applying noise classifier")
        predictor = BM3DELBP.NoiseTypePredictor(dataset, cross_validator)
    else:
        raise ValueError('Invalid algorithm choice')

    # Get the test label & test prediction for every fold of cross validation
    y_test, y_predicted = predictor.begin_cross_validation()

    if GlobalConfig.get('algorithm') == 'NoiseClassifier':
        if GlobalConfig.get('noise') is None:
            classes = ['no-noise', 'gaussian', 'speckle', 'salt-pepper']
        else:
            classes = ['gaussian', 'speckle', 'salt-pepper']
    else:
        classes = kylberg.classes

    # Display confusion matrix
    ClassificationUtils.pretty_print_conf_matrix(
        y_test,
        y_predicted,
        classes,
        title='{} Confusion Matrix'.format(GlobalConfig.get('algorithm')),
        out_dir=test_out_dir)

    # Display classification report
    ClassificationUtils.make_classification_report(y_test, y_predicted,
                                                   classes, test_out_dir)
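# Assumed entry point (the script/module name and flag values in this
# example invocation are illustrative, not taken from the fragment):
#
#   python main.py --algorithm MRELBP --dataset kylberg --scale 0.5 \
#                  --noise gaussian --noise-intensity 10 --multiprocess
#
if __name__ == '__main__':
    main()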
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 25 18:18:43 2015

Test Data for the Kaggle Email Spam Competition

@author: Rupak Chakraborty
"""

import numpy as np
import ClassificationUtils

test_filename = "test.csv"
num_features = 100

test_file = open(test_filename, "r")
test_data = test_file.read()
test_data = test_data.split()

# Populate the feature matrix from the test file
test_set = np.zeros((len(test_data), num_features))
ClassificationUtils.populateNumpyData(test_filename, test_set)

# Load the trained classifiers
svm = ClassificationUtils.load_classifier("svm_email.pickle")
rf = ClassificationUtils.load_classifier("rf_email.pickle")
bnb = ClassificationUtils.load_classifier("bnb_email.pickle")
gnb = ClassificationUtils.load_classifier("gnb_email.pickle")
mnb = ClassificationUtils.load_classifier("mnb_email.pickle")

# Predict labels for the test set with each model
svm_predict = svm.predict(test_set)
rf_predict = rf.predict(test_set)
bnb_predict = bnb.predict(test_set)
gnb_predict = gnb.predict(test_set)
mnb_predict = mnb.predict(test_set)
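# Hedged continuation sketch (assumed output filename and column names; the
# competition's real submission format is not shown in this fragment):
# write one model's predictions out as a CSV.
with open("svm_submission.csv", "w") as out_file:
    out_file.write("Id,Label\n")
    for row_id, label in enumerate(svm_predict, start=1):
        out_file.write("{},{}\n".format(row_id, label))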
svm = SVC()
bnb = BernoulliNB(alpha=0.2)
mnb = MultinomialNB(alpha=0.4)
gnb = GaussianNB()
rf = RandomForestClassifier(n_jobs=4, n_estimators=17)
knn = KNeighborsClassifier(n_neighbors=5)

trainDataFile = "train.csv"
trainLabelFile = "train_labels.csv"

fLabel = open(trainLabelFile, "r")
labels = fLabel.read()
class_labels = labels.split()

train_set = np.zeros((len(class_labels), 100))
ClassificationUtils.populateNumpyData(trainDataFile, train_set)

# NOTE: the scores below are computed on the training set, so they measure
# fit rather than generalisation
svm.fit(train_set, class_labels)
predict_svm = svm.predict(train_set)
print "Accuracy For SVM - ", metrics.accuracy_score(class_labels, predict_svm)
print "----------------- Classification Report for SVM ---------------------"
print metrics.classification_report(class_labels, predict_svm)
ClassificationUtils.save_classifier("svm_email.pickle", svm)

bnb.fit(train_set, class_labels)
predict_bnb = bnb.predict(train_set)
print "Accuracy For Bernoulli - ", metrics.accuracy_score(class_labels, predict_bnb)
print "----------------- Classification Report for Bernoulli ---------------------"
print metrics.classification_report(class_labels, predict_bnb)
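# Hedged alternative sketch: a held-out estimate could be obtained with
# k-fold cross validation instead of scoring the training set (the import
# path assumes a modern scikit-learn; older releases exposed this under
# sklearn.cross_validation).
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(svm, train_set, class_labels, cv=5)
print "SVM 5-fold CV accuracy - ", cv_scores.mean()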
# The first 3000 training labels double as the test labels in this fragment
test_label = train_label[0:3000]

bnb = BernoulliNB()
gnb = GaussianNB()
mnb = MultinomialNB()
randfor = RandomForestClassifier(n_jobs=4, n_estimators=23)
supvec = SVC()

start = time.time()
bnb.fit(train_data, train_label)
gnb.fit(train_data, train_label)
mnb.fit(train_data, train_label)
randfor.fit(train_data, train_label)
end = time.time()

ClassificationUtils.save_classifier("bnb_cook.pickle", bnb)
ClassificationUtils.save_classifier("gnb_cook.pickle", gnb)
ClassificationUtils.save_classifier("mnb_cook.pickle", mnb)
ClassificationUtils.save_classifier("rf_cook.pickle", randfor)

print randfor.feature_importances_
print "Time Taken to Train Models : ", end - start

start = time.time()
bernoullipredict = bnb.predict(test_data)
gaussianpredict = gnb.predict(test_data)
multinomialpredict = mnb.predict(test_data)
randforestpredict = randfor.predict(test_data)

print "--------- Classification Report for Bernoulli Bayes ---------------"
print metrics.classification_report(test_label, bernoullipredict)
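# Plausible continuation (assumed, mirroring the Bernoulli report above):
# the remaining predictions are computed but never reported in this fragment.
print "--------- Classification Report for Gaussian Bayes ---------------"
print metrics.classification_report(test_label, gaussianpredict)
print "--------- Classification Report for Multinomial Bayes ---------------"
print metrics.classification_report(test_label, multinomialpredict)
print "--------- Classification Report for Random Forests ---------------"
print metrics.classification_report(test_label, randforestpredict)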
request_text_title_data = list(jsonData["request_title"])
request_number_of_subreddits = list(
    jsonData["requester_number_of_subreddits_at_request"])
requester_pizza_status = list(jsonData["requester_received_pizza"])
requester_subreddits_at_request = list(
    jsonData["requester_subreddits_at_request"])

data_set = []
TRAIN_SIZE = 3500

print "Starting Loading of Dataset... "
start = time.time()
for i in range(len(requester_subreddits_at_request)):
    jsonArray = requester_subreddits_at_request[i]
    subreddit_string = ClassificationUtils.convertJSONArrayToSpaceDelimitedString(jsonArray)
    title_string = ClassificationUtils.textCleaningPipeline(request_text_title_data[i])
    text_string = ClassificationUtils.textCleaningPipeline(request_text__data[i])
    # Concatenate title, body text and subreddit list into one document
    final_string = title_string + " " + text_string + " " + subreddit_string
    data_set.append(final_string)
    requester_pizza_status[i] = class_map[requester_pizza_status[i]]
end = time.time()
print "Time Taken to Load Dataset : ", end - start

# Splitting the data into train and test set
train_data = data_set[:TRAIN_SIZE]
test_data = data_set[TRAIN_SIZE:]
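# Hedged continuation sketch (assumed; names beyond this point are
# illustrative): split the labels the same way, vectorise the combined
# text and fit one classifier end-to-end.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

train_labels = requester_pizza_status[:TRAIN_SIZE]
test_labels = requester_pizza_status[TRAIN_SIZE:]

tfidf = TfidfVectorizer()
train_features = tfidf.fit_transform(train_data)
test_features = tfidf.transform(test_data)

rf = RandomForestClassifier(n_estimators=25)
rf.fit(train_features, train_labels)
print "Accuracy on held-out requests : ", metrics.accuracy_score(
    test_labels, rf.predict(test_features))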