def extract_username_socling_features(username):
    """Return the 10 username-level sociolinguistic features (all zeros when tokenization yields nothing)."""
    normalized_tokens = pp.tokenize_and_normalize(username)
    if normalized_tokens:
        feature_list = [
            find_male_name(normalized_tokens),
            find_female_name(normalized_tokens),
            find_male_nickname(normalized_tokens),
            find_female_nickname(normalized_tokens),
            find_male_key_word(normalized_tokens),
            find_female_key_word(normalized_tokens),
            starts_with_o(normalized_tokens),
            starts_with_a(normalized_tokens),
            repeated_alphabet(pp.normalize(username)),
            caps(pp.normalize(username))
        ]
        return feature_list
    else:
        return [0] * 10
def extract_description_socling_features(description):
    """Return the 12 description-level sociolinguistic features (all zeros when the description is missing)."""
    if description is not None:
        normalized_tokens = pp.tokenize_and_normalize(description)
        feature_list = [
            find_male_name(normalized_tokens),
            find_female_name(normalized_tokens),
            find_male_nickname(normalized_tokens),
            find_female_nickname(normalized_tokens),
            find_male_key_word(normalized_tokens),
            find_female_key_word(normalized_tokens),
            repeated_alphabet(pp.normalize(description)),
            caps(pp.normalize(description)),
            possessive_bigrams(normalized_tokens),
            find_snapchat_link(pp.normalize(description)),
            find_instagram_link(pp.normalize(description)),
            find_tumblr_link(pp.normalize(description))
        ]
    else:
        feature_list = [0] * 12
    return feature_list
def extract_tweet_socling_features(tweet):
    """Return the 9 tweet-level sociolinguistic features (all zeros when the tweet is missing)."""
    if tweet is not None:
        text = pp.normalize(tweet)
        feature_list = [
            find_ellipses(text),
            possessive_bigrams(pp.tokenize_and_normalize(tweet)),
            find_self_mentions(text),
            caps(text),
            find_affirmation(text),
            find_laughter(text),
            find_exclaim(text),
            find_question(text),
            repeated_alphabet(text)
        ]
    else:
        feature_list = [0] * 9
    return feature_list
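# Hedged usage sketch (not from the source): concatenating the three
# extractors above into one 31-dimensional vector for a profile. The dict
# keys 'username', 'description' and 'tweet' are hypothetical field names.
def extract_all_socling_features(profile):
    """Concatenate username (10), description (12) and tweet (9) features."""
    return (extract_username_socling_features(profile.get('username', ''))
            + extract_description_socling_features(profile.get('description'))
            + extract_tweet_socling_features(profile.get('tweet')))

# example_profile = {'username': 'joao123', 'description': None, 'tweet': 'haha sim!!'}
# extract_all_socling_features(example_profile)  # -> list of 31 features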
# Fragment: tail of a column-wise gaussianization step (shown in full in
# plot_features below), followed by BIC-based GMM model selection.
                print 'gaussianization on idx %d failed' % idx
                print e
    else:
        try:
            rdata = gaussianizer(data)
            data = np.array(rdata)
        except Exception, e:
            print 'gaussianization failed'
            print e
    gaussianizer.mean = data.mean(axis=1)
    gaussianizer.std = data.std(axis=1)

    if pre_processing == 'normalize':
        print 'normalizing data'
        normalizer = normalize(data)

    # Select the (covariance type, component count) pair that minimises the BIC.
    lowest_bic = np.infty
    bic = []
    n_components_range = range(1, 10)
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            # Fit a mixture of Gaussians with EM
            gmm = GMM(n_components=n_components, covariance_type=cv_type)
            gmm.fit(data)
            bic.append(gmm.bic(data))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
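# Hedged, self-contained sketch (not from the source): the loop above only
# tracks the lowest BIC value; a selection helper would normally also keep
# the winning model. select_gmm_by_bic is a hypothetical name.
import numpy as np
from sklearn.mixture import GMM  # old scikit-learn mixture API, as used above

def select_gmm_by_bic(data, max_components=9):
    """Return the GMM with the lowest BIC over covariance types and sizes."""
    best_gmm, lowest_bic = None, np.infty
    for cv_type in ['spherical', 'tied', 'diag', 'full']:
        for n_components in range(1, max_components + 1):
            gmm = GMM(n_components=n_components, covariance_type=cv_type)
            gmm.fit(data)
            bic = gmm.bic(data)
            if bic < lowest_bic:
                lowest_bic, best_gmm = bic, gmm
    return best_gmm

# example: best = select_gmm_by_bic(np.random.randn(200, 2))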
'''
import csv
import os

from pre_processing import normalize

name_dict = {}       # normalized first name -> 'M', 'F' or 'U' (unknown)
abrv_dict = {}       # normalized abbreviation/nickname -> gender label (filled below)
key_words_dict = {}

csvs_dir = os.path.dirname(__file__)

# Lista_de_Nomes_Portugueses = list of Portuguese first names;
# 'Sim' ("yes") in the third column marks rows to load.
with open(csvs_dir + '/Lista_de_Nomes_Portugueses.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        name = normalize(row[0].decode('utf-8').lower().strip())
        if row[2] == 'Sim':
            if 'M' in row:
                name_dict[name] = 'M'
            elif 'F' in row:
                name_dict[name] = 'F'
            else:
                name_dict[name] = 'U'

# Lista_de_Abreviaturas_Portuguesas = list of Portuguese abbreviations/nicknames.
with open(csvs_dir + '/Lista_de_Abreviaturas_Portuguesas.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        normalized_name = normalize(row[0].decode('utf-8').lower().strip())
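# Hedged sketch (assumption; the real find_* helpers are defined elsewhere in
# the source): the token-level lookups against the dictionaries built above
# could reduce to a shared helper like this. _find_gender is a hypothetical name.
def _find_gender(tokens, lookup, gender):
    """1 if any token maps to the given gender label in the lookup, else 0."""
    return int(any(lookup.get(token) == gender for token in tokens))

# e.g. find_male_name(tokens)       ~ _find_gender(tokens, name_dict, 'M')
#      find_female_nickname(tokens) ~ _find_gender(tokens, abrv_dict, 'F')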
def collect_train_and_test_data(location_id, door_count_placement_view_pair,
                                trainPer, testPer, features, timezone=None,
                                pre_processing=''):
    standardizer, normalizer, gaussianizer = None, None, None
    trainStart, trainEnd = trainPer
    predStart, predEnd = testPer

    # Train and test may come from different installations: accept parallel lists.
    if all([isinstance(location_id, list),
            isinstance(door_count_placement_view_pair, list),
            isinstance(timezone, list)]):
        train_location_id = location_id[0]
        train_placement_view_pair = door_count_placement_view_pair[0]
        train_timezone = timezone[0]
        test_location_id = location_id[1]
        test_placement_view_pair = door_count_placement_view_pair[1]
        test_timezone = timezone[1]
    else:
        train_location_id = test_location_id = location_id
        train_placement_view_pair = test_placement_view_pair = door_count_placement_view_pair
        train_timezone = test_timezone = timezone

    print train_placement_view_pair, test_placement_view_pair
    train_start_time = createDay(trainStart, train_timezone)
    train_end_time = createDay(trainEnd, train_timezone)
    print train_location_id, train_placement_view_pair, train_start_time, train_end_time

    print '\nGetting train data'
    print 'pre-processing step'
    if 'standardize' in pre_processing:
        standardizer = Standardizer(copy=True, with_mean=True, with_std=True)
    if 'gaussianize' in pre_processing:
        gaussianizer = robjects.r('Gaussianize')

    train_X, train_Y = collectData(train_location_id, train_placement_view_pair,
                                   train_start_time, train_end_time, features,
                                   adjusted=True)
    if standardizer is not None:
        train_X = standardizer.transform(train_X, copy=None)
    if gaussianizer is not None:
        from rpy2.robjects.numpy2ri import numpy2ri
        robjects.conversion.py2ri = numpy2ri
        rtrain_X = gaussianizer(train_X)
        train_X = np.array(rtrain_X)
        # Remember the moments so the test data can be gaussianized consistently.
        gaussianizer.mean = train_X.mean(axis=1)
        gaussianizer.std = train_X.std(axis=1)

    # add ones column
    print 'adding constant to train_X'
    if len(train_X.shape) > 1:
        ones_array = np.ones((train_X.shape[0], 1))
        train_X = np.append(train_X, ones_array, 1)
    else:
        train_X = np.dstack((train_X, np.ones(len(train_X))))
    if len(train_X.shape) == 3:
        train_X = train_X[0]

    if 'normalize' in pre_processing:
        print 'normalizing data'
        normalizer = normalize(train_X)

    print '\nGetting test data'
    if trainPer == testPer:
        test_X = train_X
        test_Y = train_Y
    else:
        test_start_time = createDay(predStart, test_timezone)
        test_end_time = createDay(predEnd, test_timezone)
        print test_location_id, test_placement_view_pair, test_start_time, test_end_time
        test_X, test_Y = collectData(test_location_id, test_placement_view_pair,
                                     test_start_time, test_end_time, features,
                                     adjusted=True)

        # pre-process test data with the statistics fitted on the training data
        if 'standardize' in pre_processing:
            test_X = standardizer.transform(test_X, copy=None)
        if 'normalize' in pre_processing:
            test_X = normalizer.transform(test_X, copy=None)
        if 'gaussianize' in pre_processing:
            test_X = (test_X - gaussianizer.mean) / gaussianizer.std

        # add ones column
        print 'adding constant to test_X'
        if len(test_X.shape) > 1:
            ones_array = np.ones((test_X.shape[0], 1))
            test_X = np.append(test_X, ones_array, 1)
        else:
            test_X = np.dstack((test_X, np.ones(len(test_X))))
        if len(test_X.shape) == 3:
            test_X = test_X[0]

    return ((train_X, train_Y), (test_X, test_Y))
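# Self-contained illustration (not from the source) of the "add ones column"
# step above: append a bias column so a linear model can fit an intercept.
# add_bias_column is a hypothetical name.
import numpy as np

def add_bias_column(X):
    """Append a column of ones to a design matrix (1-D input becomes a column)."""
    X = np.asarray(X)
    if X.ndim == 1:
        X = X[:, None]  # flat feature vector -> single-feature column
    return np.hstack([X, np.ones((X.shape[0], 1))])

# add_bias_column(np.arange(3)).shape         -> (3, 2)
# add_bias_column(np.random.rand(4, 2)).shape -> (4, 3)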
def run(location_id, door_count_placement_view_pair, start_time, end_time,
        features, n_components=16, pre_processing='', BALANCE_DATA=False):
    """
    Fits data to one GMM and plots confusion matrix, prediction and error ellipses

    location_id: location_id of installation, e.g. '55' <int> or <str>
    door_count_placement_view_pair: placement and view id pair, e.g. ('3333230','0') (<str>, <str>)
    start_time: for pruning; time object with hour and minute <time>
    end_time: for pruning; time object with hour and minute <time>
    features: list with labels (keys) of features used, [<str>, <str>, ...]
    n_components: number of mixture components
    pre_processing: pre-processing to be applied; accepts 'standardize' and 'gaussianize' with default values <str>
    BALANCE_DATA: hack for trying to balance the data <bool>
    """
    global plotEE, plotPF, plotCM
    n_folds = 4

    print '\npre-processing step'
    standardizer, normalizer, gaussianizer = None, None, None
    if 'standardize' in pre_processing:
        standardizer = Standardizer(copy=True, with_mean=True, with_std=True)
    if 'gaussianize' in pre_processing:
        gaussianizer = robjects.r('Gaussianize')

    train_X, train_Y = collectData(location_id, door_count_placement_view_pair,
                                   start_time, end_time, features,
                                   adjusted=True, pre_processor=standardizer)
    if standardizer is not None:
        train_X = standardizer.transform(train_X, copy=None)
    if gaussianizer is not None:
        from rpy2.robjects.numpy2ri import numpy2ri
        robjects.conversion.py2ri = numpy2ri
        rtrain_X = gaussianizer(train_X)
        train_X = np.array(rtrain_X)
        gaussianizer.mean = train_X.mean(axis=1)
        gaussianizer.std = train_X.std(axis=1)

    # add ones column
    print 'adding constant to train_X'
    if len(train_X.shape) > 1:
        ones_array = np.ones((train_X.shape[0], 1))
        train_X = np.append(train_X, ones_array, 1)
    else:
        train_X = np.dstack((train_X, np.ones(len(train_X))))
    if len(train_X.shape) == 3:
        train_X = train_X[0]

    if 'normalize' in pre_processing:
        print 'normalizing data'
        normalizer = normalize(train_X)

    # assumed: the rest of the function refers to the collected arrays as data/truth
    data, truth = train_X, train_Y

    # use sqrt of target to reduce hypothesis space
    truth[truth < 0] = 0
    truth = np.sqrt(truth).astype(int)
    truth_str = map(str, truth)

    # hack to better balance the data: drop classes whose bin count is far from the mean
    if BALANCE_DATA:
        bins = np.bincount(truth)
        avg = bins.mean()
        dev = bins.std()
        tol = 10
        tosmall = np.where(bins < avg - tol)[0]
        tobig = np.where(bins > avg + tol)[0]
        for item in tosmall:
            data = np.delete(data, np.where(truth == item)[0], axis=0)
            truth = np.delete(truth, np.where(truth == item)[0])
        for item in tobig:
            data = np.delete(data, np.where(truth == item)[0], axis=0)
            truth = np.delete(truth, np.where(truth == item)[0])

    # Unbalanced targets hurt here: StratifiedKFold cannot be used and, more
    # importantly, the GMM assumes equal prior probability for all classes.
    print 'Sample size is', len(truth)
    folds = KFold(len(truth), n_folds=n_folds)  # shuffle=True, random_state=4

    # to only take the first fold:
    # train_index, test_index = next(iter(folds))

    idx = 1  # for plotting
    for train_index, test_index in folds:
        X_train = data[train_index]
        y_train = truth[train_index]
        X_test = data[test_index]
        y_test = truth[test_index]

        # Try GMMs using different types of covariances.
        classifiers = dict(
            (covar_type, GMM(n_components=n_components,
                             covariance_type=covar_type,
                             params='wmc', init_params='wmc', n_iter=10000))
            for covar_type in ['spherical', 'diag', 'tied', 'full'])
        n_classifiers = len(classifiers)

        if plotEE:
            plt.figure(idx, figsize=(3 * n_classifiers / 2, 6))
            plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05,
                                left=.01, right=.99)

        for index, (name, classifier) in enumerate(classifiers.iteritems()):
            # start classifier with known per-class means
            classifier.means_ = np.array([X_train[y_train == i].mean(axis=0)
                                          for i in np.unique(y_train)])
            classifier.fit(X_train)

            y_train_pred = classifier.predict(X_train).astype(int)
            yresid = y_train - y_train_pred
            SSresid = np.sum(yresid ** 2)
            SStotal = (len(y_train) - 1) * np.var(y_train)
            # R^2 instead of np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
            train_accuracy = 1 - SSresid / SStotal

            y_test_pred = classifier.predict(X_test).astype(int)
            yresid = y_test - y_test_pred
            SSresid = np.sum(yresid ** 2)
            SStotal = (len(y_test) - 1) * np.var(y_test)
            # R^2 instead of np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
            test_accuracy = 1 - SSresid / SStotal

            """
            if features not in results:
                results[features] = {}
            results[features][name] = (train_accuracy, test_accuracy)
            """

            if plotEE:
                plt.figure(idx, figsize=(3 * n_classifiers / 2, 6))
                plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15,
                                    wspace=.05, left=.01, right=.99)
                fig1 = plt.subplot(2, n_classifiers / 2, index + 1)
                make_ellipses(classifier, fig1)
                for n, color in enumerate('rgb'):
                    sample = data[truth == n]
                    plt.scatter(sample[:, 0], sample[:, 1], 0.8, color=color,
                                label=truth_str[n])
                for n, color in enumerate('rgb'):
                    sample = X_test[y_test == n]
                    plt.plot(sample[:, 0], sample[:, 1], 'x', color=color)
                plt.text(0.05, 0.9, 'Train accuracy: %.2f' % train_accuracy,
                         transform=fig1.transAxes)
                plt.text(0.05, 0.8, 'Test accuracy: %.2f' % test_accuracy,
                         transform=fig1.transAxes)
                plt.xticks(())
                plt.yticks(())
                plt.title(name)

            if plotPF:
                # plot ground truth and prediction
                x_train = np.arange(len(y_train))
                x_test = np.arange(len(y_test))
                fig2, ax2 = plt.subplots(2, 1, num=idx * n_folds + index,
                                         figsize=(3 * n_classifiers / 2, 6))
                ax2[0].plot(x_train, y_train, label='Train Ground Truth')
                ax2[0].plot(x_train, y_train_pred, label='Train Prediction')
                ax2[1].plot(x_test, y_test, label='Test Ground Truth')
                ax2[1].plot(x_test, y_test_pred, label='Test Prediction')
                # alternative: same four plots with linestyle='none', marker='o'
                ax2[0].legend(loc='upper right', prop=dict(size=12), numpoints=1)
                ax2[0].set_title(str(features))
                ax2[0].set_xlabel('time')
                ax2[0].set_ylabel('occupancy')
                ax2[0].grid()
                ax2[1].legend(loc='upper right', prop=dict(size=12), numpoints=1)
                ax2[1].set_title(str(features))
                ax2[1].set_xlabel('time')
                ax2[1].set_ylabel('occupancy')
                ax2[1].grid()

                print 'y_train \n', y_train
                print 'y_train_pred \n', y_train_pred
                print 'y_test \n', y_test
                print 'y_test_pred \n', y_test_pred

            if plotCM:
                # Plot confusion matrices in a separate window
                cm = confusion_matrix(y_train, y_train_pred)
                plt.matshow(cm)
                plt.title('Confusion matrix')
                plt.colorbar()
                plt.ylabel('True label')
                plt.xlabel('Predicted label')
                plt.show()

        # plt.figure(idx)
        # plt.legend(loc='lower right', prop=dict(size=12))
        idx += 1

    if plotEE or plotPF or plotCM:
        plt.show()
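# Note: the "accuracy" reported above is the coefficient of determination
# R^2, not classification accuracy. A minimal standalone version matching the
# in-line computation (r_squared is a hypothetical name):
import numpy as np

def r_squared(y_true, y_pred):
    """R^2 = 1 - SS_res / SS_tot, with SS_tot = (n - 1) * var as in run()."""
    y_true = np.asarray(y_true, dtype=float)
    resid = y_true - np.asarray(y_pred, dtype=float)
    ss_res = np.sum(resid ** 2)
    # run() scales by (n - 1) * np.var, i.e. ((n - 1) / n) * sum((y - mean)^2);
    # the textbook definition uses sum((y - mean)^2) directly.
    ss_tot = (len(y_true) - 1) * np.var(y_true)
    return 1.0 - ss_res / ss_tot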
def plot_features(location_id, door_count_placement_view_pair, start_time,
                  end_time, features, pre_processing=''):
    """Plots features in pairs by incremental index, e.g. (0,1), (2,3)...

    ARGS
    location_id: location_id of installation, e.g. '55' <int> or <str>
    door_count_placement_view_pair: placement and view id pair, e.g. ('3333230','0') (<str>, <str>)
    start_time: for pruning; time object with hour and minute <time>
    end_time: for pruning; time object with hour and minute <time>
    features: list with labels (keys) of features used, [<str>, <str>, ...]
    pre_processing: pre-processing to be applied; accepts 'standardize' and 'gaussianize' with default values <str>
    """
    print 'pre-processing step'
    standardizer, normalizer, gaussianizer = None, None, None
    dict_of_features = {}
    for feature in features:
        for processing in ['standardize']:
            pre_processing = [processing]
            if 'standardize' in pre_processing:
                standardizer = Standardizer(copy=True, with_mean=True, with_std=True)
            if 'gaussianize' in pre_processing:
                gaussianizer = robjects.r('Gaussianize')

            print 'data mining step'
            data, _ = collectData(location_id, door_count_placement_view_pair,
                                  start_time, end_time, [feature],
                                  adjusted=False, pre_processor=standardizer)
            if standardizer is not None:
                print 'standardizer'
                data = standardizer.transform(data, copy=True)
            if gaussianizer is not None:
                print 'gaussianize'
                from rpy2.robjects.numpy2ri import numpy2ri
                robjects.conversion.py2ri = numpy2ri
                # this is slow and hacky! gaussianize column by column
                if data.ndim == 2 and data.shape[1] > 1:
                    data_transposed = data.T
                    for idx in range(len(data_transposed)):
                        try:
                            rdata = gaussianizer(data_transposed[idx])
                            gaussianized_data = np.array(rdata)
                            gaussianized_data = gaussianized_data.reshape(
                                (len(gaussianized_data),))
                            data[:, idx] = gaussianized_data
                        except Exception, e:
                            print 'gaussianization on idx %d failed' % idx
                            print e
                else:
                    try:
                        rdata = gaussianizer(data)
                        data = np.array(rdata)
                    except Exception, e:
                        print 'gaussianization failed'
                        print e
                gaussianizer.mean = data.mean(axis=1)
                gaussianizer.std = data.std(axis=1)

            if 'normalize' in pre_processing:
                print 'normalizing data'
                normalizer = normalize(data)

            dict_of_features[feature + '_' + pre_processing[0]] = data
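# Hedged sketch (assumption; the excerpt ends before any plotting) of how
# dict_of_features could be plotted in index pairs (0,1), (2,3), ... as the
# docstring promises. plot_feature_pairs is a hypothetical name.
import numpy as np
import matplotlib.pyplot as plt

def plot_feature_pairs(dict_of_features):
    """Scatter consecutive feature columns against each other, per label."""
    for label, data in dict_of_features.items():
        data = np.asarray(data)
        if data.ndim < 2 or data.shape[1] < 2:
            continue  # single-column features have nothing to pair with
        for i in range(0, data.shape[1] - 1, 2):
            plt.figure()
            plt.scatter(data[:, i], data[:, i + 1], s=2)
            plt.title('%s: features (%d, %d)' % (label, i, i + 1))
    plt.show()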