def extract_background(params, data):
    """Fit and remove a low-rank background from `data`.

    If params['background_rank'] > 0, a rank-k approximation of the
    pixels-by-frames matrix is computed and subtracted from the data.
    Returns the background-subtracted data and the (U, Vt) factors,
    or (None, None) when no background is requested.
    """
    if params['background_rank'] > 0:
        print_and_flush('Fitting low-rank background...')
        U, s, Vt = svd(M=data.reshape((-1, data.shape[-1])),
                       n_components=params['background_rank'])
        U = np.reshape(U * s[None, :], data.shape[:2] + U.shape[-1:])
        print_and_flush('Removing background from dataset...')
        data = data - np.dot(U, Vt)
        print_and_flush('Background successfully extracted.')
        return data, (U, Vt)
    return data, (None, None)
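# A minimal usage sketch for extract_background, under the assumption that
# the module-level `svd` is sklearn.utils.extmath.randomized_svd and that
# `print_and_flush` simply prints and flushes stdout. The synthetic movie
# below is made up for illustration.
def _demo_extract_background():
    import numpy as np
    rng = np.random.default_rng(0)
    movie = rng.standard_normal((32, 32, 100))  # (height, width, frames)
    params = {'background_rank': 2}
    residual, (U, Vt) = extract_background(params, movie)
    # residual: (32, 32, 100); U: (32, 32, 2); Vt: (2, 100)
    print(residual.shape, U.shape, Vt.shape)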
def SVD_classifier(df, period_of_interest, prediction_year=2012,
                   epidemic_classification_dict=None, training_year_window='ALL',
                   t0_vector=None, p_vector=None, classifier='SVM',
                   modes=[0, 1], verbose=False):
    '''
    - period_of_interest: tuple with the initial and final dates of the
      period of interest (poi). The poi defines the starting and finishing
      dates for the SVD classifier. E.g. if the poi is 01-02-YYYY through
      28-02-YYYY, the SVD classifier's heatmap will start on 28-02 of the
      previous year and end on 01-02 of the next year.
    - prediction_year: year to classify out of sample.
    - epidemic_classification_dict: dictionary of labels per year,
      e.g. {2001: 1, 2002: 0, 2003: 1}.
    - t0_vector, p_vector: candidate starting dates and period lengths;
      together they define the (p, t0) grid.
    '''
    # Generate grid based on p and t0 vectors
    distance_grid = np.zeros([len(p_vector), len(t0_vector)])

    years = []
    for i in range(df.index.shape[0]):
        years.append(df.index[i].year)
    years = sorted(list(set(years)))
    years_before_prediction = years.index(prediction_year)

    if training_year_window == 'ALL':
        training_years = years[0:years_before_prediction]
        n_years = years_before_prediction
    elif training_year_window < years_before_prediction:
        training_years = years[years_before_prediction -
                               training_year_window:years_before_prediction]
        n_years = training_year_window
    else:
        print("Can't retrieve training window: {0}. Please make sure the "
              "training window is 'ALL' or an int within the number of "
              "available years".format(training_year_window))
        return None

    if verbose:
        print('{0} years detected within dataframe: {1}.'.format(
            len(years), years))
        print('{0} years before prediction: {1}'.format(
            n_years, training_years))

    # Check whether any t0 dates fall within the period of interest
    dates_within_poi = []
    for d in t0_vector:
        if '{0}'.format(prediction_year) + d[4:] in df[
                period_of_interest[0]:period_of_interest[1]].index:
            dates_within_poi.append(d)
    if len(dates_within_poi) > 0:
        print('{0} dates from t0_vector are inside period_of_interest '
              'range: {1}'.format(len(dates_within_poi), dates_within_poi))

    # Enter main loop
    for i, p in enumerate(p_vector):
        for j, t0 in enumerate(t0_vector):
            if verbose:
                print('Reshaping data')
            X = SVDC_reshape_yearly_data_stolerman(
                df=df, t0=t0, p=p, years=training_years,
                upper_bound=period_of_interest[0],
                normalize=True, verbose=False)
            if verbose:
                print('Reshaping data done')
            '''
            Each column of X represents one year of data, in the order of
            training_years. To classify year Y we use Y-1 as the
            out-of-sample input and Y-2, Y-3, ... as the training set.
            Since every year Y is classified with the previous year's
            data, the epidemic classification of year Y becomes the label
            for Y-1.
            '''
            if X is not None:
                X_train = X[:, :-1]
                X_predict = X[:, -1]
                Y_train = []
                for year in training_years[:-1]:
                    # Could be vectorized, but kept as a loop for readability
                    Y_train.append(epidemic_classification_dict[year + 1])
                Y_train = np.vstack(Y_train)
                Y_predict = epidemic_classification_dict[prediction_year]

                # Perform SVD
                U, sigma, VT = svd(X_train, n_components=3, n_iter=15,
                                   random_state=None)
                projections = sigma.reshape([-1, 1]) * VT
                projections = projections.T
                projections = projections[:, modes]
                '''
                Now that we have our projections from the SVD we can
                build the classifier.
                '''
                mod = svm.SVC(kernel='rbf', gamma=1, C=1, cache_size=400,
                              max_iter=100000)
                if verbose:
                    print('Fitting with projections shape {0} and target '
                          'shape {1}'.format(projections.shape, Y_predict))
                mod.fit(projections, Y_train.ravel())
                pred = mod.predict(
                    np.matmul(X_predict.reshape([1, -1]), U[:, modes]))
                distance_grid[i, j] = (pred == Y_predict)
            else:
                distance_grid[i, j] = -1
    return distance_grid
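# A hypothetical call sketch for SVD_classifier. The daily dataframe, date
# strings, and epidemic labels below are made up for illustration, and
# SVDC_reshape_yearly_data_stolerman (defined elsewhere in this module)
# must be importable for the call to run.
def _demo_SVD_classifier():
    import numpy as np
    import pandas as pd
    index = pd.date_range('2000-01-01', '2012-12-31', freq='D')
    df = pd.DataFrame({'precip': np.random.rand(len(index)),
                       'temp': np.random.rand(len(index))}, index=index)
    # Labels keyed by int year, matching epidemic_classification_dict[year + 1]
    labels = {year: (1 if year % 2 else -1) for year in range(2000, 2014)}
    grid = SVD_classifier(df,
                          period_of_interest=('2012-02-01', '2012-02-28'),
                          prediction_year=2012,
                          epidemic_classification_dict=labels,
                          t0_vector=['2012-03-01', '2012-04-01'],
                          p_vector=[30, 45, 60])
    print(grid)  # 1.0 = correct, 0.0 = wrong, -1 = no data for that (p, t0)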
def SVDC_deploy(df, period_of_interest, variables=['precip', 'temp'],
                add_runoff_binary=False, prediction_year=2012,
                epidemic_classification_dict=None, first_training_year=2000,
                t0_vector=None, p_vector=None, classifier='forest',
                modes=[0, 1], decision_map=None, decision_coordinates=None,
                decision_values=None, clustering=True, verbose=False,
                tiebreaker=True):
    '''
    SVDC_deploy performs a decision ensemble based on a series of decision
    groups generated by a clustering analysis of the decision map.

    - period_of_interest: tuple with the initial and final dates of the
      period of interest (poi). The poi defines the starting and finishing
      dates for the SVD classifier. E.g. if the poi is 01-02-YYYY through
      28-02-YYYY, the SVD classifier's heatmap will start on 28-02 of the
      previous year and end on 01-02 of the next year.
    - prediction_year: year to classify out of sample.
    - epidemic_classification_dict: dictionary of labels per year,
      e.g. {2001: 1, 2002: 0, 2003: 1}.
    - decision_map / decision_coordinates / decision_values: heatmap of
      classifier accuracies, its (t0, p) axis labels, and the (min, max)
      accuracy band used to select decision regions.
    '''
    print('Starting SVDC with following parameters:')
    print('Features: {0}'.format(variables))
    print('SVD modes : {0}'.format(modes))
    print('runoff_binary : {0}'.format(add_runoff_binary))
    print('Classifier : {0}'.format(classifier))
    time.sleep(3)

    # Find decision groups
    decision_value_min = decision_values[0]
    decision_value_max = decision_values[1]
    if verbose:
        print('Decision values {0}, {1}'.format(decision_value_min,
                                                decision_value_max))
        print('decision_map min and max values: {0}, {1}.'.format(
            np.min(decision_map), np.max(decision_map)))
        print('MODES:{0}'.format(modes))

    # Turn the n-by-m decision map into a set of samples
    if verbose:
        print("Plotting decision map. Please verify everything's correct.")
        a = plt.subplot(1, 1, 1)
        a_im = a.matshow(decision_map, cmap=plt.cm.hot, aspect='auto',
                         origin='lower')
        plt.ylabel('Decision Map')
        a.yaxis.set_label_position("right")
        a.xaxis.tick_bottom()
        plt.colorbar(a_im, ax=a)
        a.set_xticks(list(range(len(decision_coordinates[0]))), minor=False)
        a.xaxis.set_major_formatter(IndexFormatter(decision_coordinates[0]))
        a.xaxis.set_major_locator(mticker.MaxNLocator(8))
        plt.xticks(rotation=40)
        a.axes.get_xaxis().set_visible(False)
        # plt.show()
        plt.close()

    # Keep only the cells whose accuracy lies inside the decision band
    decision_map[~((decision_map >= decision_value_min) &
                   (decision_map <= decision_value_max))] = 0
    rows, columns = np.where(decision_map > 0)
    roi = np.vstack([rows, columns]).T

    cluster_weights = []
    cluster_coordinates = []
    cluster_t0 = []
    cluster_p = []
    total_value_sum = 0

    if clustering:
        # Use a clustering algorithm to find the decision clusters
        db = DBSCAN(eps=5, min_samples=40).fit(roi)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_

        # Find coordinates for decision clusters
        n_clusters = np.max(labels) + 1
        for cluster_number in range(n_clusters):
            cluster_mask = np.equal(labels, cluster_number)
            cluster_indices = [i for i, val in enumerate(cluster_mask)
                               if val == 1]
            cluster_sum = 0
            for ind in cluster_indices:
                # For each sample within the cluster, get its coordinates
                # within decision_map
                p_coordinate = roi[ind, 0]
                t0_coordinate = roi[ind, 1]
                # Get the corresponding t0 and p values
                cluster_t0.append(decision_coordinates[0][t0_coordinate])
                cluster_p.append(decision_coordinates[1][p_coordinate])
                cluster_sum += decision_map[p_coordinate, t0_coordinate]
            total_value_sum += cluster_sum
            cluster_coordinates.append(cluster_indices)
            cluster_weights.append(cluster_sum)
    else:
        # If clustering is set to False, grab all regions as one big cluster
        cluster_indices = list(range(len(rows)))
        cluster_sum = 0
        n_clusters = 1
        labels = np.array([1] * len(rows))
        if len(cluster_indices) % 2 == 0:
            # Drop one sample so the vote count is odd (avoids ties)
            cluster_indices.pop()
        for ind in cluster_indices:
            # For each sample within the cluster, get its coordinates
            # within decision_map
            p_coordinate = roi[ind, 0]
            t0_coordinate = roi[ind, 1]
            cluster_t0.append(decision_coordinates[0][t0_coordinate])
            cluster_p.append(decision_coordinates[1][p_coordinate])
            cluster_sum += decision_map[p_coordinate, t0_coordinate]
        total_value_sum += cluster_sum
        cluster_coordinates.append(cluster_indices)
        cluster_weights.append(cluster_sum)

    cluster_weights = np.array(cluster_weights) / total_value_sum  # normalize
    all_indices = np.hstack(cluster_coordinates)

    if verbose:
        print('{0} decision clusters were found'.format(np.max(labels) + 1))
        print('Plotting clustered grid, displaying only areas of interest '
              'and clusters with different colors')
        clustered_grid = np.zeros_like(decision_map)
        for i, label in enumerate(labels):
            clustered_grid[roi[i, 0], roi[i, 1]] = label + 1
        fig = plt.figure()
        a = plt.subplot(2, 1, 1)
        a.matshow(decision_map, cmap=plt.cm.hot, aspect='auto',
                  origin='lower')
        plt.title('Original decision_map')
        b = plt.subplot(2, 1, 2)
        b.matshow(clustered_grid, cmap=plt.cm.hot, aspect='auto',
                  origin='lower', vmin=0, vmax=np.max(labels) + 1)
        plt.title('Clusters with classifying acc {0} to {1}'.format(
            decision_value_min, decision_value_max))
        # plt.show()
        time.sleep(1)
        plt.close()
        print('Cluster_weights = {0}'.format(cluster_weights))
        print('cluster_coordinates = {0}'.format(cluster_coordinates))
        print('all_indices {0}'.format(all_indices[:]))

    # Generate grid based on p and t0 vectors
    distance_grid = np.zeros_like(decision_map)
    years = []
    for i in range(df.index.shape[0]):
        years.append(df.index[i].year)
    years = sorted(list(set(years)))
    if verbose:
        print(years)
    if prediction_year in years and first_training_year in years:
        training_years = years[years.index(first_training_year):
                               years.index(prediction_year)]
        n_years = len(training_years)
    else:
        print('Missing either prediction_year or first_training_year')
        time.sleep(10)
        return

    if verbose:
        print('{0} years detected within dataframe: {1}.'.format(
            len(years), years))
        print('{0} years before prediction: {1}'.format(
            n_years, training_years))

    # Check whether any t0 dates fall within the period of interest
    dates_within_poi = []
    for d in cluster_t0:
        if '{0}'.format(prediction_year) + d[4:] in df[
                period_of_interest[0]:period_of_interest[1]].index:
            dates_within_poi.append(d)
    if len(dates_within_poi) > 0:
        print('{0} dates from t0_vector are inside period_of_interest '
              'range: {1}'.format(len(dates_within_poi), dates_within_poi))

    # Enter main loop
    print('Initiating heatmap loop.')
    bar = Bar('Processing', max=len(cluster_p))
    for p, t0, ind in zip(cluster_p, cluster_t0, all_indices):
        bar.next()
        i = roi[ind, 0]
        j = roi[ind, 1]
        if verbose:
            print('Reshaping data')
        X = SVDC_reshape_yearly_data_stolerman(
            df=df[variables], t0=t0, p=p, years=training_years,
            upper_bound=period_of_interest[0],
            normalize=True, verbose=False)
        if verbose:
            print('Reshaping data done')
        '''
        Each column of X represents one year of data, in the order of
        training_years. To classify year Y we use Y-1 as the out-of-sample
        input and Y-2, Y-3, ... as the training set. Since every year Y is
        classified with the previous year's data, the epidemic
        classification of year Y becomes the label for Y-1.
        '''
        if X is not None:
            X_train = X[:, :-1]
            X_predict = X[:, -1]
            Y_train = []
            for year in training_years[:-1]:
                # Could be vectorized, but kept as a loop for readability
                Y_train.append(epidemic_classification_dict[year + 1])
            Y_train = np.vstack(Y_train)

            # Perform SVD
            U, sigma, VT = svd(X_train, n_components=3, n_iter=15,
                               random_state=None)
            projections = sigma.reshape([-1, 1]) * VT
            projections = projections.T
            projections = np.vstack([
                projections[:, modes],
                np.matmul(X_predict.reshape([1, -1]), U[:, modes])
            ])

            if add_runoff_binary:
                # This function returns the delta value stated in
                # Stolerman's paper
                average_runoff = SVDC_get_runoffbinary(
                    df=df, t0=t0, p=p, years=training_years,
                    upper_bound=period_of_interest[0],
                    normalize=True, verbose=False)
                classifier_dataset = np.hstack([projections, average_runoff])
            else:
                classifier_dataset = projections
            classifier_dataset_train = classifier_dataset[:-1, :]
            classifier_dataset_predict = classifier_dataset[-1, :]
            '''
            Now that we have our projections from the SVD we can build
            the classifier.
            '''
            if classifier == 'svm':
                mod = svm.SVC(kernel='rbf', gamma=1, C=1, cache_size=400,
                              max_iter=100000)
            elif classifier == 'forest':
                mod = RandomForestClassifier(n_estimators=10, max_depth=2,
                                             random_state=0)
            if verbose:
                print('Fitting with projections shape {0}'.format(
                    projections.shape))
                print(Y_train, training_years)
            mod.fit(classifier_dataset_train, Y_train.ravel())
            pred = mod.predict(classifier_dataset_predict.reshape(1, -1))
            distance_grid[i, j] = pred
    bar.finish()

    cluster_decisions = []
    for cluster_number in range(n_clusters):
        accumulated_decision = 0
        indices = cluster_coordinates[cluster_number]
        for p_coordinate, t0_coordinate in roi[indices, :]:
            # Decision weighted by classifier accuracy
            accumulated_decision += (
                distance_grid[p_coordinate, t0_coordinate] *
                decision_map[p_coordinate, t0_coordinate])
        if accumulated_decision > 0:
            cluster_decisions.append(1)
        elif accumulated_decision < 0:
            cluster_decisions.append(-1)
        else:
            cluster_decisions.append(0)
    cluster_decisions = np.array(cluster_decisions)
    final_decision = np.sum(cluster_decisions * cluster_weights)

    if verbose:
        fig, axarr = plt.subplots(3, 1, figsize=[4.5, 10])
        a_im = axarr[0].matshow(decision_map, cmap=plt.cm.hot, aspect='auto',
                                origin='lower', vmin=0, vmax=1)
        axarr[0].axes.get_xaxis().set_visible(False)
        plt.colorbar(a_im, ax=axarr[0])
        axarr[0].set_title(
            'Decision map (Year to predict = {2}) \n '
            '(classifying acc {0} to {1})'.format(
                decision_value_min, decision_value_max, prediction_year))
        axarr[0].yaxis.set_label_position("right")
        b_im = axarr[1].matshow(clustered_grid, cmap=plt.cm.tab20c,
                                aspect='auto', origin='lower', vmin=0,
                                vmax=np.max(labels) + 1)
        plt.colorbar(b_im, ax=axarr[1])
        axarr[1].axes.get_xaxis().set_visible(False)
        axarr[1].yaxis.set_label_position("right")
        axarr[1].set_title('Clusters (N={0})'.format(n_clusters))
        c_im = axarr[2].matshow(distance_grid, cmap=plt.cm.hot, aspect='auto',
                                origin='lower', vmin=-1, vmax=1)
        plt.colorbar(c_im, ax=axarr[2])
        axarr[2].set_title(
            'Cluster decisions (final={0})'.format(final_decision))
        axarr[2].yaxis.set_label_position("right")
        axarr[2].set_xticks(list(range(len(decision_coordinates[0]))),
                            minor=False)
        axarr[2].xaxis.set_major_formatter(
            IndexFormatter(decision_coordinates[0]))
        axarr[2].xaxis.set_major_locator(mticker.MaxNLocator(8))
        axarr[2].xaxis.tick_bottom()
        plt.xticks(rotation=40)
        plt.close()
    else:
        axarr = None
        fig = None

    # Count individual classifier votes (grid cells are -1, 0 or 1)
    votes_against = np.sum(distance_grid == -1)
    votes_favor = np.sum(distance_grid == 1)
    total_votes = len(cluster_p)
    if verbose:
        print('Decision ensemble finished with the following votes for '
              'each cluster (1 in favor, -1 against) \n')
        for c in range(n_clusters):
            print('Cluster {0} = {1}'.format(c + 1, cluster_decisions[c]))
        print('Pointwise decision (decision for each classifier) '
              'distribution. \n {0} in favor ({1:.2%}).\n '
              '{2} against ({3:.2%}). \n Total votes {4}'.format(
                  votes_favor, votes_favor / total_votes,
                  votes_against, votes_against / total_votes, total_votes))
    return (final_decision, cluster_decisions, cluster_weights,
            distance_grid, fig, votes_favor, total_votes)
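# A toy illustration (made-up numbers) of the ensemble vote computed at the
# end of SVDC_deploy: each cluster votes +1/-1/0, and the votes are combined
# with weights proportional to the cluster's summed decision-map value.
def _demo_ensemble_vote():
    import numpy as np
    cluster_decisions = np.array([1, -1, 1])
    cluster_sums = np.array([12.0, 3.0, 5.0])  # summed map values per cluster
    cluster_weights = cluster_sums / cluster_sums.sum()
    final_decision = np.sum(cluster_decisions * cluster_weights)
    print(final_decision)  # 0.7: positive -> ensemble predicts an epidemic year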
def SVDC_cross_validation(df, period_of_interest, first_year_insample=2001,
                          last_year_insample=2006, prediction_year=2001,
                          epidemic_classification_dict=None,
                          training_year_window='ALL', t0_vector=None,
                          p_vector=None, classifier='SVM', modes=[0],
                          add_peaks=False, add_runoff_binary=False,
                          verbose=True, variables=['precip', 'temp']):
    '''
    - period_of_interest: tuple with the initial and final dates of the
      period of interest (poi). The poi defines the starting and finishing
      dates for the SVD classifier. E.g. if the poi is 01-02-YYYY through
      28-02-YYYY, the SVD classifier's heatmap will start on 28-02 of the
      previous year and end on 01-02 of the next year.
    - prediction_year: in-sample year to hold out and classify.
    - epidemic_classification_dict: dictionary of labels per year,
      e.g. {2001: 1, 2002: 0, 2003: 1}.

    v2: version two of the heatmap generators uses 3 modes rather than 2
    and also incorporates the average number of peaks as extra dimensions
    prior to the classifier phase.
    '''
    # Generate grid based on p and t0 vectors
    distance_grid = np.zeros([len(p_vector), len(t0_vector)])
    years = []
    for i in range(df.index.shape[0]):
        years.append(df.index[i].year)
    years = sorted(list(set(years)))
    years_before_prediction = years.index(last_year_insample)
    if verbose:
        print('Years before prediction')
        print(years_before_prediction)
        time.sleep(10)

    if training_year_window == 'ALL':
        training_years = years[0:years_before_prediction]
        n_years = years_before_prediction
    elif training_year_window < years_before_prediction:
        training_years = years[years_before_prediction -
                               training_year_window:years_before_prediction]
        n_years = training_year_window
    else:
        print("Can't retrieve training window: {0}. Please make sure the "
              "training window is 'ALL' or an int within the number of "
              "available years".format(training_year_window))
        return None

    if verbose:
        print('{0} years detected within dataframe: {1}.'.format(
            len(years), years))
        print('{0} years before prediction: {1}'.format(
            n_years, training_years))

    # Check whether any t0 dates fall within the period of interest
    dates_within_poi = []
    for d in t0_vector:
        if '{0}'.format(last_year_insample) + d[4:] in df[
                period_of_interest[0]:period_of_interest[1]].index:
            dates_within_poi.append(d)
    if len(dates_within_poi) > 0:
        print('{0} dates from t0_vector are inside period_of_interest '
              'range: {1}'.format(len(dates_within_poi), dates_within_poi))

    # Enter main loop
    print('Initiating heatmap loop.')
    bar = Bar('Processing', max=len(p_vector))
    for i, p in enumerate(p_vector):
        bar.next()
        for j, t0 in enumerate(t0_vector):
            if verbose:
                print('Reshaping data with upper bound : {0}'.format(
                    period_of_interest[0]))
                time.sleep(10)
            X = SVDC_reshape_yearly_data_stolerman(
                df=df[variables], t0=t0, p=p, years=training_years,
                upper_bound=period_of_interest[0],
                normalize=True, verbose=False)
            if verbose:
                print('Reshaping data done')
            '''
            Each column of X represents one year of data, in ascending
            order (2000, 2001, 2002, ...). In the in-sample run, X_predict
            is the column that contains the data from prediction_year;
            the remaining columns form the training set.
            '''
            if X is not None:
                # Assume prediction_year is within the in-sample window
                # (2001 - 2001 gives column 0, 2002 - 2001 gives column 1,
                # etc.) and that there are no missing years.
                X_predict_column = prediction_year - first_year_insample
                X_train_columns = list(range(n_years))
                X_train_columns.remove(X_predict_column)
                X_train_columns = np.array(X_train_columns)
                X_train = X[:, X_train_columns]
                X_predict = X[:, X_predict_column]

                Y_train = []
                in_sample_years = list(
                    range(first_year_insample, last_year_insample + 1))
                in_sample_years.remove(prediction_year)
                for year in in_sample_years:
                    # Could be vectorized, but kept as a loop for readability
                    Y_train.append(epidemic_classification_dict[year])
                Y_train = np.vstack(Y_train)
                Y_predict = epidemic_classification_dict[prediction_year]

                if verbose:
                    print('Data: {0}'.format(X))
                    print('Shape: {0}'.format(X.shape))
                    print('prediction_year:{0}, column:{1}'.format(
                        prediction_year, X_predict_column))
                    print('Training years:{0}, columns:{1}'.format(
                        in_sample_years, X_train_columns))
                    print('Train_x : {0}, predict_x : {1}'.format(
                        X_train, X_predict))
                    print('DICT: {0}'.format(epidemic_classification_dict))
                    print('Y_predict : {0}, prediction_year:{1}'.format(
                        Y_predict, prediction_year))
                    print('Y_train : {0}, in_sample_years:{1}'.format(
                        Y_train, in_sample_years))
                    time.sleep(100)

                # Perform SVD
                U, sigma, VT = svd(X_train, n_components=3, n_iter=15,
                                   random_state=None)
                projections = sigma.reshape([-1, 1]) * VT
                projections = projections.T
                projections = np.vstack([
                    projections[:, modes],
                    np.matmul(X_predict.reshape([1, -1]), U[:, modes])
                ])
                '''
                Merge the SVD projections with the average peak
                frequencies for each year; they should have the same
                length.
                '''
                classifier_dataset = projections
                if add_peaks:
                    # This function returns the delta value stated in
                    # Stolerman's paper
                    average_peak_frequencies = SVDC_get_apfs(
                        df=df, t0=t0, p=p, years=training_years,
                        upper_bound=period_of_interest[0],
                        normalize=True, verbose=False)
                    classifier_dataset = np.hstack(
                        [classifier_dataset, average_peak_frequencies])
                if add_runoff_binary:
                    # This function returns the delta value stated in
                    # Stolerman's paper
                    average_runoff = SVDC_get_runoffbinary(
                        df=df, t0=t0, p=p, years=training_years,
                        upper_bound=period_of_interest[0],
                        normalize=True, verbose=False)
                    classifier_dataset = np.hstack(
                        [classifier_dataset, average_runoff])
                classifier_dataset_train = classifier_dataset[:-1, :]
                classifier_dataset_predict = classifier_dataset[-1, :]
                if verbose:
                    print(classifier_dataset_train,
                          classifier_dataset_predict)

                if classifier == 'svm':
                    mod = svm.SVC(kernel='rbf', gamma=1, C=1,
                                  cache_size=400, max_iter=100000)
                elif classifier == 'forest':
                    mod = RandomForestClassifier(n_estimators=10,
                                                 max_depth=2, random_state=0)
                if verbose:
                    print('Fitting with projections shape {0} and target '
                          'shape {1}'.format(classifier_dataset_train.shape,
                                             Y_predict))
                mod.fit(classifier_dataset_train, Y_train.ravel())
                pred = mod.predict(classifier_dataset_predict.reshape(1, -1))
                distance_grid[i, j] = (pred == Y_predict)
            else:
                distance_grid[i, j] = -1
    bar.finish()
    return distance_grid
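# A hedged sketch of how SVDC_cross_validation might be driven for a
# leave-one-year-out evaluation. `df` and `labels` are assumed to be built
# as in the SVD_classifier demo above; the date strings and parameter
# values are made up for illustration.
def _demo_cross_validation(df, labels):
    import numpy as np
    grids = {}
    for year in range(2001, 2007):  # each in-sample year takes a turn held out
        grids[year] = SVDC_cross_validation(
            df, period_of_interest=('2006-02-01', '2006-02-28'),
            first_year_insample=2001, last_year_insample=2006,
            prediction_year=year, epidemic_classification_dict=labels,
            t0_vector=['2006-03-01'], p_vector=[30], verbose=False)
    # Mean accuracy per grid cell across hold-out years, ignoring -1 (no data)
    stacked = np.stack([np.where(g < 0, np.nan, g) for g in grids.values()])
    print(np.nanmean(stacked, axis=0))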
year_epidemic_classification = year_epidemic_classification[0:n_years]
modes = [0, 1]  # starts from zero

# Enter main loop
for i, p in enumerate(p_vector):
    for j, t0 in enumerate(t0_vector):
        print(p, t0)
        X = SVDC_reshape_yearly_data_stolerman(df, t0=t0, p=p,
                                               years=years, normalize=True)
        # Perform SVD
        U, sigma, VT = svd(X, n_components=3, n_iter=15, random_state=None)
        projections = sigma.reshape([-1, 1]) * VT
        projections = projections.T
        projections = projections[:, modes]
        hull_1, hull_2, coordinates_1, coordinates_2 = get_hulls(
            projections, year_epidemic_classification)
        intersect_boolean, vertices_1, vertices_2 = intersect_hulls(
            hull_1, hull_2, coordinates_1, coordinates_2)
        # if not intersect_boolean:
        #     plot_hulls(projections, hull_1, hull_2, coordinates_1,
        #                coordinates_2, year_epidemic_classification,
        #                class_color)
        #     plot_polygon(projections, hull_1, hull_2, coordinates_1,
        #                  coordinates_2, year_epidemic_classification,
        #                  class_color)
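# get_hulls / intersect_hulls are defined elsewhere in this repo. Below is a
# minimal, self-contained sketch of the idea they implement, assuming the
# hulls are 2-D convex hulls of the per-class SVD projections and that the
# intersection test is a separating-axis check between the two convex
# polygons. The helper names here are hypothetical, not the repo's API.
def _demo_hull_separation():
    import numpy as np
    from scipy.spatial import ConvexHull

    def _separated_on_axis(axis, pts_1, pts_2):
        # Project both point sets onto the axis and compare the intervals
        p1, p2 = pts_1 @ axis, pts_2 @ axis
        return p1.max() < p2.min() or p2.max() < p1.min()

    def hulls_intersect(pts_1, pts_2):
        # Separating axis theorem: two convex shapes are disjoint iff some
        # edge normal of either polygon separates their projections.
        for pts in (pts_1, pts_2):
            hull = ConvexHull(pts)
            verts = pts[hull.vertices]
            for a, b in zip(verts, np.roll(verts, -1, axis=0)):
                normal = np.array([b[1] - a[1], a[0] - b[0]])
                if _separated_on_axis(normal, pts_1, pts_2):
                    return False
        return True

    rng = np.random.default_rng(1)
    class_1 = rng.random((10, 2))
    class_2 = rng.random((10, 2)) + 2.0  # shifted away: should not intersect
    print(hulls_intersect(class_1, class_2))  # False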