Example #1
# ClassificationUtils and the JSON helpers (getJSONData, getIngredientList) are
# assumed to be imported/defined in the enclosing module.
def testProcessingPipeline(filename):

    # Load the recipe JSON and flatten each recipe into an ingredient string
    jsonData = getJSONData(filename)
    ingredient_list = getIngredientList(jsonData)

    # Restore the fitted vectorizers and trained ensemble classifiers from disk
    tfidf = ClassificationUtils.load_classifier("tfidf.pickle")
    bag_of_words = ClassificationUtils.load_classifier("bag_of_words.pickle")
    adaboost = ClassificationUtils.load_classifier("ada_idf_cook.pickle")
    randomfor = ClassificationUtils.load_classifier("rf_idf_cook.pickle")
    bagging = ClassificationUtils.load_classifier("bagging_idf_cook.pickle")

    # Vectorize the test ingredients with both feature extractors
    test_data_tfidf = tfidf.transform(ingredient_list)
    test_data_bag = bag_of_words.transform(ingredient_list)

    # Run each classifier on both feature representations
    adaboost.predict(test_data_bag)
    adaboost.predict(test_data_tfidf)

    randomfor.predict(test_data_bag)
    randomfor.predict(test_data_tfidf)

    bagging.predict(test_data_bag)
    bagging.predict(test_data_tfidf)
bagging_predict = bagging.predict(test_data_bag)
end = time.time()
print "Time Taken to Test the models : ", end - start

print "Accuracy of AdaBoost Algorithm : ", metrics.accuracy_score(
    test_labels, ada_predict)
print "Accuracy of Random Forests : ", metrics.accuracy_score(
    test_labels, rf_predict)
print "Accuracy of Extra Trees : ", metrics.accuracy_score(
    test_labels, extree_predict)
print "Accuracy of Bagging : ", metrics.accuracy_score(test_labels,
                                                       bagging_predict)

# Saving the tf-idf model and classifiers

ClassificationUtils.save_classifier("ada_bag_cook.pickle", adaboost)
ClassificationUtils.save_classifier("rf_bag_cook.pickle", randomforest)
ClassificationUtils.save_classifier("extree_bag_cook.pickle", extratrees)
ClassificationUtils.save_classifier("bagging_bag_cook.pickle", bagging)
ClassificationUtils.save_classifier("bag_of_words.pickle", tfidf)


def printIngredientDistribution():
    print "----------- Distribution of the Recipe Ingredients ------------------"
    for key in ingredient_map.keys():
        print key, " : ", ingredient_map[key]


def printCuisineDistribution():
    print "----------- Distribution of the Cuisines ------------------"
    for key in cuisine_map.keys():
Example #3
"""
Created on Sun Dec 27 09:37:18 2015
Text Fields of JSON -
request_text
request_title
request_text_edit_aware
requester_subreddits_at_request (JSONArray)
Count of subreddit field - 
requester_number_of_subreddits_at_request
@author: Rupak Chakraborty
"""

import ClassificationUtils
import scipy

filename = "Random Acts of Pizza/train.json"
jsonData = ClassificationUtils.getJSONData(filename)
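# ClassificationUtils.getJSONData is not shown in this listing. Judging by the
# field access below (jsonData["request_title"], etc.), it likely loads the Kaggle
# JSON file into a pandas DataFrame; a hypothetical sketch:
import pandas as pd

def getJSONData(filename):
    # Load the JSON records so each field can be read as a column
    return pd.read_json(filename)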
subreddits_pizza = set([])
subreddits_non_pizza = set([])
subreddit_pizza_map = {}
subreddit_non_pizza_map = {}
pizza_title_list = []
non_pizza_title_list = []
pizza_text_list = []
non_pizza_text_list = []

pizza_request_suffix_map = {
    "requester_upvotes_minus_downvotes_at_request": [],
    "requester_number_of_subreddits_at_request": [],
    "requester_upvotes_plus_downvotes_at_request": [],
    "requester_account_age_in_days_at_request": [],
    "requester_days_since_first_post_on_raop_at_request": [],
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
import pandas as pd
import time

import ClassificationUtils

sentiment_data = []
sentiment_labels = []
TRAIN_SIZE = 20000
filename = "Bag of Words Meet Bag of Popcorn (Google Word2Vec)/labeledTrainData.tsv"
data = pd.read_csv(filename, sep="\t")

# Preprocessing the Data
print "Starting Preprocessing Data...."
start = time.time()

for label, review in zip(data["sentiment"], data["review"]):
    sentiment_data.append(ClassificationUtils.textCleaningPipeline(review))
    sentiment_labels.append(label)

end = time.time()
print "Taken Taken for Data Preprocessing : ", end - start

#Separating the Training and Test Labels

train_labels = sentiment_labels[:TRAIN_SIZE]
test_labels = sentiment_labels[TRAIN_SIZE:]
train_data = sentiment_data[:TRAIN_SIZE]
test_data = sentiment_data[TRAIN_SIZE:]

#Initializing Feature Extractors

count_vec = CountVectorizer()
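# The snippet is cut off after initializing CountVectorizer. Based on the other
# fragments here, the extractors are presumably fitted on the training reviews and
# applied to the test reviews roughly like this (a sketch, not the original code):
tfidf_vec = TfidfVectorizer()

train_counts = count_vec.fit_transform(train_data)   # learn vocabulary on train only
test_counts = count_vec.transform(test_data)
train_tfidf = tfidf_vec.fit_transform(train_data)
test_tfidf = tfidf_vec.transform(test_data)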
    return nouns, adj, verbs


# Extract Text features

request_text__data = list(jsonData["request_text_edit_aware"])
request_text_title_data = list(jsonData["request_title"])
clean_text_data = list([])
clean_title_data = list([])

print "Starting feature loading and cleaning ..."
start = time.time()

for i in range(len(request_text__data)):

    title_string = ClassificationUtils.textCleaningPipeline(
        request_text_title_data[i])
    text_string = ClassificationUtils.textCleaningPipeline(
        request_text__data[i])
    clean_text_data.append(text_string)
    clean_title_data.append(title_string)

end = time.time()
print "Time taken to load and clean text features : ", end - start

# Extract whole features

number_of_downvotes_of_request_at_retrieval = np.array(
    jsonData["number_of_downvotes_of_request_at_retrieval"], dtype=float)
number_of_upvotes_of_request_at_retrieval = np.array(
    jsonData["number_of_upvotes_of_request_at_retrieval"], dtype=float)
request_number_of_comments_at_retrieval = np.array(
Example #6
def main():
    # Parse Args.
    # 'scale' allows the image scale to be set, e.g. 0.25, 0.5, 1.0
    argList = sys.argv[1:]
    shortArg = 'a:d:t:s:S:k:rn:i:me'
    longArg = [
        'algorithm=', 'dataset=', 'train-ratio=', 'scale=', 'test-scale=',
        'folds=', 'rotations', 'noise=', 'noise-intensity=', 'multiprocess',
        'example', 'data-ratio=', 'mrlbp-classifier=', 'noise-train', 'ecs',
        'debug'
    ]

    valid_algorithms = [
        'RLBP', 'MRLBP', 'MRELBP', 'BM3DELBP', 'NoiseClassifier'
    ]
    valid_datasets = ['kylberg']
    valid_noise = ['gaussian', 'speckle', 'salt-pepper']
    valid_mrlbp_classifiers = ['svm', 'knn']
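    # Illustrative invocation (the script name "main.py" is an assumption, not given here):
    #   python main.py --algorithm MRELBP --dataset kylberg --train-ratio 0.8 \
    #       --folds 10 --noise gaussian --noise-intensity 0.1 --multiprocess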

    try:
        args, vals = getopt.getopt(argList, shortArg, longArg)

        for arg, val in args:
            if arg in ('-a', '--algorithm'):
                if val in valid_algorithms:
                    print('Using algorithm:', val)
                    GlobalConfig.set("algorithm", val)
                else:
                    raise ValueError(
                        'Invalid algorithm configured, choose one of the following:',
                        valid_algorithms)
            elif arg in ('-d', '--dataset'):
                if val in valid_datasets:
                    print("Using dataset:", val)
                    GlobalConfig.set("dataset", val)
                else:
                    raise ValueError(
                        'Invalid dataset configured, choose one of the following:',
                        valid_datasets)
            elif arg in ('-t', '--train-ratio'):
                if 0 < float(val) <= 1.0:
                    print('Using train ratio of', val)
                    GlobalConfig.set('train_ratio', float(val))
                else:
                    raise ValueError(
                        'Train-test ratio must be 0 < train-test <= 1.0')
            elif arg in ('-s', '--scale'):
                if 0 < float(val) <= 1.0:
                    print('Using training image scale:', val)
                    GlobalConfig.set('scale', float(val))
                else:
                    raise ValueError('Scale must be 0 < scale <= 1.0')
            elif arg in ('-S', '--test-scale'):
                if 0 < float(val) <= 1.0:
                    print('Using testing image scale:', val)
                    GlobalConfig.set('test_scale', float(val))
                else:
                    raise ValueError('Test scale must be 0 < scale <= 1.0')
            elif arg in ('-k', '--folds'):
                print('Doing {} folds'.format(val))
                GlobalConfig.set("folds", int(val))
            elif arg in ('-r', '--rotations'):
                print('Using rotated image sources')
                GlobalConfig.set("rotate", True)
            elif arg in ('-n', '--noise'):
                if val in valid_noise:
                    print('Applying noise:', val)
                    GlobalConfig.set("noise", val)
                else:
                    raise ValueError(
                        'Invalid noise type, choose one of the following:',
                        valid_noise)
            elif arg in ('-i', '--noise-intensity'):
                print('Using noise intensity (sigma / ratio) of:', val)
                GlobalConfig.set("noise_val", float(val))
            elif arg in ('-m', '--multiprocess'):
                cores = psutil.cpu_count()
                print('Using {} processor cores for computing featurevectors'.
                      format(cores))
                GlobalConfig.set('multiprocess', True)
                GlobalConfig.set('cpu_count', cores)
            elif arg in ('-e', '--example'):
                print('Generating algorithm example images')
                GlobalConfig.set('examples', True)
            elif arg == '--data-ratio':
                if 0 < float(val) <= 1.0:
                    print('Using dataset ratio:', val)
                    GlobalConfig.set('data_ratio', float(val))
                else:
                    raise ValueError('Data ratio must be 0 < ratio <= 1.0')
            elif arg == '--mrlbp-classifier':
                if val in valid_mrlbp_classifiers:
                    print(
                        "MRLBP algorithm (if configured) will use {} classifier"
                        .format(val))
                    GlobalConfig.set('mrlbp_classifier', val)
                else:
                    raise ValueError(
                        'Invalid classifier chosen for mrlbp, choose one of the following:',
                        valid_mrlbp_classifiers)
            elif arg == '--noise-train':
                print(
                    "Applying noise to the training dataset as well as the test dataset"
                )
                GlobalConfig.set('train_noise', True)
            elif arg == '--ecs':
                print("Loading dataset from C:\Local")
                GlobalConfig.set('ECS', True)
            elif arg == '--debug':
                print("Running in debug mode")
                GlobalConfig.set('debug', True)
            else:
                raise ValueError('Unhandled argument provided:', arg)
    except getopt.error as err:
        print(str(err))

    if GlobalConfig.get('ECS'):
        GlobalConfig.set(
            'CWD',
            r'\\filestore.soton.ac.uk\users\ojvl1g17\mydocuments\COMP3200-Texture-Classification'
        )
    else:
        GlobalConfig.set('CWD', os.getcwd())

    if GlobalConfig.get('examples'):
        write_examples()

    # Load configured Dataset
    if GlobalConfig.get('dataset') == 'kylberg':
        if GlobalConfig.get('debug'):
            # To save time in debug mode, load only two classes and the configured fraction of their samples
            kylberg = DatasetManager.KylbergTextures(
                num_classes=2, data_ratio=GlobalConfig.get('data_ratio'))
        else:
            kylberg = DatasetManager.KylbergTextures(
                num_classes=28, data_ratio=GlobalConfig.get('data_ratio'))
        # Load Dataset & Cross Validator
        dataset = kylberg.load_data()
        cross_validator = kylberg.get_cross_validator()

        print("Dataset loaded")
    elif GlobalConfig.get('dataset') is None:
        raise ValueError('No Dataset configured')
    else:
        raise ValueError('Invalid dataset')

    if GlobalConfig.get('rotate'):
        dataset_folder = GlobalConfig.get('dataset') + '-rotated'
    else:
        dataset_folder = GlobalConfig.get('dataset')

    out_folder = os.path.join(GlobalConfig.get('CWD'), 'out',
                              GlobalConfig.get('algorithm'), dataset_folder)
    # Initialise algorithm
    if GlobalConfig.get('algorithm') == 'RLBP':
        print("Applying RLBP algorithm")
        algorithm = RLBP.RobustLBP()
    elif GlobalConfig.get('algorithm') == 'MRLBP':
        print("Applying MRLBP algorithm")
        algorithm = RLBP.MultiresolutionLBP(p=[8, 16, 24], r=[1, 2, 3])
    elif GlobalConfig.get('algorithm') == 'MRELBP':
        print("Applying MRELBP algorithm")
        algorithm = MRELBP.MedianRobustExtendedLBP(r1=[2, 4, 6, 8],
                                                   p=8,
                                                   w_center=3,
                                                   w_r1=[3, 5, 7, 9])
    elif GlobalConfig.get('algorithm') == 'BM3DELBP':
        print("Applying BM3DELBP algorithm")
        algorithm = BM3DELBP.BM3DELBP()
    elif GlobalConfig.get('algorithm') == 'NoiseClassifier':
        # The noise classifier is normally used inside BM3DELBP; selecting it here
        # benchmarks the classifier on its own
        algorithm = NoiseClassifier.NoiseClassifier()
    else:
        raise ValueError('Invalid algorithm choice')

    # Get the Training out directory (i.e. Images without scaling/rotation/noise)
    train_out_dir = os.path.join(
        out_folder, algorithm.get_outdir(noisy_image=False,
                                         scaled_image=False))
    # Get the Testing out directory (i.e. Images with scaling/rotation/noise)
    if GlobalConfig.get('noise') is not None:
        noisy_image = True
    else:
        noisy_image = False
    if GlobalConfig.get('test_scale') is not None:
        scaled_image = True
    else:
        scaled_image = False
    test_out_dir = os.path.join(
        out_folder, algorithm.get_outdir(noisy_image, scaled_image))

    # Out path for noise classifier
    noise_out_dir = os.path.join(
        GlobalConfig.get('CWD'), 'out', 'NoiseClassifier', dataset_folder,
        "scale-{}".format(int(GlobalConfig.get('scale') * 100)))
    test_noise_out_dir = os.path.join(
        GlobalConfig.get('CWD'), 'out', 'NoiseClassifier', dataset_folder,
        algorithm.get_outdir(noisy_image, scaled_image))

    print("Replacing DatasetManager.Image with BM3DELBPImages")
    # Convert DatasetManager.Image into BM3DELBP.BM3DELBPImage
    if GlobalConfig.get('algorithm') == 'NoiseClassifier' or GlobalConfig.get(
            'algorithm') == 'BM3DELBP':
        for index, img in enumerate(dataset):
            dataset[index] = BM3DELBP.BM3DELBPImage(img)
            # Also convert rotated images if necessary
            if img.test_rotations is not None:
                for index, rotated_img in enumerate(img.test_rotations):
                    img.test_rotations[index] = BM3DELBP.BM3DELBPImage(
                        rotated_img)

    if GlobalConfig.get('multiprocess'):
        for index, img in enumerate(dataset):
            dataset[index] = (index, img)

        if GlobalConfig.get('rotate'):
            maxtasks = 50
        else:
            maxtasks = None

        if GlobalConfig.get('algorithm') in ('NoiseClassifier', 'BM3DELBP'):
            with Pool(processes=GlobalConfig.get('cpu_count'),
                      maxtasksperchild=maxtasks) as pool:
                # Generate image noise featurevectors
                for index, image in tqdm.tqdm(pool.istarmap(
                        describe_noise_pool,
                        zip(dataset, repeat(noise_out_dir),
                            repeat(test_noise_out_dir))),
                                              total=len(dataset),
                                              desc='Noise Featurevectors'):
                    dataset[index] = image
        else:
            with Pool(processes=GlobalConfig.get('cpu_count'),
                      maxtasksperchild=maxtasks) as pool:
                # Generate featurevectors
                for index, image in tqdm.tqdm(pool.istarmap(
                        describe_image_pool,
                        zip(repeat(algorithm), dataset, repeat(train_out_dir),
                            repeat(test_out_dir))),
                                              total=len(dataset),
                                              desc='Texture Featurevectors'):
                    dataset[index] = image
    else:
        # Process the images without using multiprocessing Pools
        if GlobalConfig.get('algorithm') in ('NoiseClassifier', 'BM3DELBP'):
            for index, img in enumerate(dataset):
                # Generate image noise featurevectors
                describe_noise(img, noise_out_dir, test_noise_out_dir)
        else:
            print("BEGINNING TIMER:")
            start = timer()
            for index, img in enumerate(dataset):
                # Generate featurevectors
                describe_image(algorithm, img, train_out_dir, test_out_dir)
            end = timer()
            print("TIME TAKEN:", end - start)

    # Train models and perform predictions
    if GlobalConfig.get('algorithm') == 'RLBP':
        predictor = RLBP.RobustLBPPredictor(dataset, cross_validator)
    elif GlobalConfig.get('algorithm') == 'MRLBP':
        print("Performing MRLBP Classification")
        predictor = RLBP.MultiresolutionLBPPredictor(dataset, cross_validator)
    elif GlobalConfig.get('algorithm') == 'MRELBP':
        print("Performing MRELBP Classification")
        predictor = MRELBP.MedianRobustExtendedLBPPredictor(
            dataset, cross_validator)
    elif GlobalConfig.get('algorithm') == 'BM3DELBP':
        print("Performing BM3DELBP Classification")
        predictor = BM3DELBP.BM3DELBPPredictor(dataset, cross_validator)
    elif GlobalConfig.get('algorithm') == 'NoiseClassifier':
        print("Applying noise classifier")
        predictor = BM3DELBP.NoiseTypePredictor(dataset, cross_validator)
    else:
        raise ValueError('Invalid algorithm choice')

    # Get the test label & test prediction for every fold of cross validation
    y_test, y_predicted = predictor.begin_cross_validation()
    if GlobalConfig.get('algorithm') == 'NoiseClassifier':
        if GlobalConfig.get('noise') is None:
            classes = ['no-noise', 'gaussian', 'speckle', 'salt-pepper']
        else:
            classes = ['gaussian', 'speckle', 'salt-pepper']
    else:
        classes = kylberg.classes

    # Display confusion matrix
    ClassificationUtils.pretty_print_conf_matrix(
        y_test,
        y_predicted,
        classes,
        title='{} Confusion Matrix'.format(GlobalConfig.get('algorithm')),
        out_dir=test_out_dir)

    # Display classification report
    ClassificationUtils.make_classification_report(y_test, y_predicted,
                                                   classes, test_out_dir)
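# ClassificationUtils.pretty_print_conf_matrix is not shown in this listing. A hedged
# sketch of what it plausibly does (plot and save a labelled confusion matrix); the
# function below is an assumption, not the project's implementation:
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def pretty_print_conf_matrix(y_true, y_pred, classes, title, out_dir):
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot(xticks_rotation="vertical")
    plt.title(title)
    plt.savefig(os.path.join(out_dir, "confusion_matrix.png"), bbox_inches="tight")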
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 25 18:18:43 2015
Test Data for the Kaggle Email Spam Test Competition
@author: Rupak Chakraborty
"""

import numpy as np
import ClassificationUtils

test_filename = "test.csv"
num_features = 100
test_file = open(test_filename, "r")
test_data = test_file.read()
test_data = test_data.split()
test_set = np.zeros((len(test_data), num_features))
ClassificationUtils.populateNumpyData(test_filename, test_set)
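# ClassificationUtils.populateNumpyData is assumed but not shown. A hypothetical
# sketch: fill the preallocated feature matrix row by row from a comma-separated file.
def populateNumpyData(filename, data_matrix):
    with open(filename, "r") as f:
        for row, line in enumerate(f):
            values = line.strip().split(",")
            data_matrix[row, :] = [float(v) for v in values]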

svm = ClassificationUtils.load_classifier("svm_email.pickle")
rf = ClassificationUtils.load_classifier("rf_email.pickle")
bnb = ClassificationUtils.load_classifier("bnb_email.pickle")
gnb = ClassificationUtils.load_classifier("gnb_email.pickle")
mnb = ClassificationUtils.load_classifier("mnb_email.pickle")

svm_predict = svm.predict(test_set)
rf_predict = rf.predict(test_set)
bnb_predict = bnb.predict(test_set)
gnb_predict = gnb.predict(test_set)
mnb_predict = mnb.predict(test_set)
Example #9
import numpy as np
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import ClassificationUtils

svm = SVC()
bnb = BernoulliNB(alpha=0.2)
mnb = MultinomialNB(alpha=0.4)
gnb = GaussianNB()
rf = RandomForestClassifier(n_jobs=4, n_estimators=17)
knn = KNeighborsClassifier(n_neighbors=5)

trainDataFile = "train.csv"
trainLabelFile = "train_labels.csv"

fLabel = open(trainLabelFile, "r")
labels = fLabel.read()
class_labels = labels.split()
train_set = np.zeros((len(class_labels), 100))
ClassificationUtils.populateNumpyData(trainDataFile, train_set)

svm.fit(train_set, class_labels)
predict_svm = svm.predict(train_set)
print "Accuracy For SVM - ", metrics.accuracy_score(class_labels, predict_svm)
print "----------------- Classification Report for SVM ---------------------"
print metrics.classification_report(class_labels, predict_svm)
ClassificationUtils.save_classifier("svm_email.pickle", svm)

bnb.fit(train_set, class_labels)
predict_bnb = bnb.predict(train_set)
print "Accuracy For Bernoulli - ", metrics.accuracy_score(
    class_labels, predict_bnb)

print "----------------- Classification Report for Bernoulli ---------------------"
print metrics.classification_report(class_labels, predict_bnb)
test_label = train_label[0:3000]

bnb = BernoulliNB()
gnb = GaussianNB()
mnb = MultinomialNB()
randfor = RandomForestClassifier(n_jobs=4, n_estimators=23)
supvec = SVC()

start = time.time()
bnb.fit(train_data, train_label)
gnb.fit(train_data, train_label)
mnb.fit(train_data, train_label)
randfor.fit(train_data, train_label)

end = time.time()
ClassificationUtils.save_classifier("bnb_cook.pickle", bnb)
ClassificationUtils.save_classifier("gnb_cook.pickle", gnb)
ClassificationUtils.save_classifier("mnb_cook.pickle", mnb)
ClassificationUtils.save_classifier("rf_cook.pickle", randfor)

print randfor.feature_importances_
print "Time Taken to Train Models : ", end - start

start = time.time()
bernoullipredict = bnb.predict(test_data)
gaussianpredict = gnb.predict(test_data)
multinomialpredict = mnb.predict(test_data)
randforestpredict = randfor.predict(test_data)

print "--------- Classification Report for Bernoulli Bayes ---------------"
print metrics.classification_report(test_label, bernoullipredict)
request_text_title_data = list(jsonData["request_title"])
request_number_of_subreddits = list(
    jsonData["requester_number_of_subreddits_at_request"])
requester_pizza_status = list(jsonData["requester_received_pizza"])
requester_subreddits_at_request = list(
    jsonData["requester_subreddits_at_request"])
data_set = []
TRAIN_SIZE = 3500

print "Starting Loading of Dataset... "
start = time.time()

for i in range(len(requester_subreddits_at_request)):

    jsonArray = requester_subreddits_at_request[i]
    subreddit_string = ClassificationUtils.convertJSONArrayToSpaceDelimitedString(
        jsonArray)
    title_string = ClassificationUtils.textCleaningPipeline(
        request_text_title_data[i])
    text_string = ClassificationUtils.textCleaningPipeline(
        request_text__data[i])
    final_string = title_string + " " + text_string + " " + subreddit_string
    data_set.append(final_string)
    requester_pizza_status[i] = class_map[requester_pizza_status[i]]

end = time.time()
print "Time Taken to Load Dataset : ", end - start

#Splitting the data into train and test set

train_data = data_set[:TRAIN_SIZE]
test_data = data_set[TRAIN_SIZE:]