def preprocess_dupage(demographics_file, log_file):
    # Read datasets into dataframes
    dupage_demographics = tools.read_file('datasets', demographics_file)
    dupage_logs = tools.read_file('datasets', log_file)

    # Remove null values
    tools.replace_null_values(dupage_logs)
    tools.replace_null_values(dupage_demographics)

    # Group rows in dupage_logs by subject ID
    dupage_logs = tools.join_by_column(dupage_logs, 'IDSUBJ',
                                       ['R24Barrier', 'R24Action'])

    # Fill the visit_counts column with the total number of barriers logged per subject
    tools.create_empty_column(dupage_logs, 'visit_counts', len(dupage_logs['IDSUBJ']))
    count_visits(dupage_logs)

    # Merge the logs with the demographics on subject ID
    inner_merged = pd.merge(dupage_logs, dupage_demographics, on=['IDSUBJ'])

    # Encode categorical columns and standardize barrier/action names
    tools.encode_PDBIRTH(inner_merged)
    encode_income_levels(inner_merged)
    encode_emp_status(inner_merged)
    replace_barrier_names(inner_merged)
    replace_action_names(inner_merged)

    # Write the preprocessed dataset to disk
    cols_list = inner_merged.columns.values
    csv_lists = inner_merged.values
    tools.write_to_csv(cols_list, csv_lists, 'files', 'preprocessed_dupage.csv')
    return inner_merged
def feature_correlations():
    df = tools.read_file('files', 'dupage_chinatown.csv')
    df_features = multilabel_classification.select_features(df)
    cols = df_features.columns.values

    # Calculate correlations between all features
    corr = df_features.corr()

    # For each feature, record its strongest positive and negative correlations
    # and the features they occur with.
    path = (os.environ['PYTHONPATH'] + os.path.sep + 'files' + os.path.sep +
            'feature_correlations.csv')
    lst = []
    for c in cols:
        corrs = find_strongest_correlations(c, cols, corr)
        lst.append(corrs)

    new_df = pd.DataFrame(lst, columns=[
        'Feature', 'Most Positive Correlation', 'Most Positive Correlation-Feature',
        'Most Negative Correlation', 'Most Negative Correlation-Feature'
    ])
    print(new_df['Feature'])

    # Map encoded feature names back to readable labels before writing the report
    decode_features(new_df, columns=[
        'Feature', 'Most Positive Correlation-Feature',
        'Most Negative Correlation-Feature'
    ])
    new_df.to_csv(path_or_buf=path)
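# find_strongest_correlations is called above but defined elsewhere in the project;
# the sketch below only illustrates the behaviour those calls assume: given a column
# name, the candidate columns, and a correlation matrix, return
# [column, strongest positive correlation, its column, strongest negative
# correlation, its column]. The name and the tie/self-correlation handling here are
# assumptions, not the project's actual implementation.
def _find_strongest_correlations_sketch(col, candidate_cols, corr):
    # Exclude the column itself so it cannot be reported as its own best match.
    others = [c for c in candidate_cols if c != col]
    series = corr.loc[col, others]
    most_pos, most_neg = series.idxmax(), series.idxmin()
    return [col, series[most_pos], most_pos, series[most_neg], most_neg]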
def feature_actions_correlations():
    df = tools.read_file('files', 'dupage_chinatown.csv')
    df_features = multilabel_classification.select_features(df)
    feature_cols = df_features.columns

    # Binarize the action labels so each distinct action becomes its own column
    labels = ['action_1', 'action_2', 'action_3']
    Y, mlb = tools.select_multilabels(df, labels)
    df_actions = pd.DataFrame(Y, columns=mlb.classes_)
    action_cols = mlb.classes_

    # Align features and actions on a shared index and merge them side by side
    length = len(df_actions)
    ind = range(length)
    df_features['ind'] = ind
    df_actions['ind'] = ind
    merged = pd.merge(df_features, df_actions, on='ind')
    merged.set_index('ind', inplace=True)
    print(merged)

    # Calculate correlations between all features and actions
    corr = merged.corr()
    print(corr)

    # For each action, record its strongest positive and negative correlations
    # with the feature columns.
    path = (os.environ['PYTHONPATH'] + os.path.sep + 'files' + os.path.sep +
            'feature_action_correlations.csv')
    lst = []
    for c in action_cols:
        corrs = find_strongest_correlations(c, feature_cols, corr)
        lst.append(corrs)

    new_df = pd.DataFrame(lst, columns=[
        'Actions', 'Most Positive Correlation', 'Most Positive Correlation-Feature',
        'Most Negative Correlation', 'Most Negative Correlation-Feature'
    ])
    print(new_df['Actions'].values)

    # Map encoded feature and action names back to readable labels
    decode_features(new_df, columns=[
        'Most Positive Correlation-Feature', 'Most Negative Correlation-Feature'
    ])
    decode_classes(new_df, classes='actions', columns=['Actions'])
    new_df.to_csv(path_or_buf=path)
def build_model(directory, filename):
    df = tools.read_file(directory, filename)
    labels = ['visit_ranges']
    print(select_features_visits(df).columns.values)

    # Split features and labels, then fit a linear-kernel SVM on the training split
    X = select_features_visits(df).values
    Y = df[labels[0]].values
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.7,
                                                        shuffle=True)
    cls = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)
    return cls
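# Hedged usage sketch for build_model (illustrative only): it returns a fitted
# linear-kernel SVC, so a caller can reuse it to predict visit ranges for new rows.
# Which file actually carries the 'visit_ranges' label column is an assumption here.
#
#   cls = build_model('files', 'dupage_chinatown.csv')
#   predictions = cls.predict(select_features_visits(new_df).values)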
def eval_barriers(directory, filename):
    print('barriers')
    df = tools.read_file(directory, filename)
    labels = ['barrier_1', 'barrier_2', 'barrier_3']
    Y, mlb = tools.select_multilabels(df, labels)
    X = select_features(df).values
    '''
    x_scalar = StandardScaler()
    X[:, 0:3] = x_scalar.fit_transform(X[:, 0:3])
    '''
    X = np.expand_dims(X, 2)
    evaluate_model(X, Y, mlb.classes_,
                   output_filename='barrier_misclassifications.csv',
                   metrics_filename='barriermetrics.csv')
def eval_actions(directory, filename):
    print('actions')
    df = tools.read_file(directory, filename)
    labels = ['action_1', 'action_2', 'action_3']
    Y, mlb = tools.select_multilabels(df, labels)
    X = select_features(df).values

    # Write the binarized action label matrix out for inspection
    csv_path2 = os.environ['PYTHONPATH'] + os.path.sep + 'files' + os.path.sep + 'y.csv'
    pd.DataFrame(Y, columns=mlb.classes_).to_csv(path_or_buf=csv_path2)
    '''
    #Scaling should only be applied to non-encoded values
    x_scalar = StandardScaler()
    X[:, 0:3] = x_scalar.fit_transform(X[:, 0:3])
    '''
    X = np.expand_dims(X, 2)
    evaluate_model(X, Y, mlb.classes_, 'action_misclassifications.csv',
                   'actionmetrics.csv')
def evaluate_visits_models(directory, filename):
    df = tools.read_file(directory, filename)
    labels = ['visit_ranges']
    X = select_features_visits(df).values
    Y = df[labels[0]].values

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.2,
                                                        shuffle=True)

    # Fit SVM classifiers with several kernels for comparison
    cls = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)
    lin = svm.LinearSVC(C=1, max_iter=1000).fit(x_train, y_train)
    rbf = svm.SVC(kernel='rbf', gamma=0.5, C=1).fit(x_train, y_train)
    poly = svm.SVC(kernel='poly', degree=3, C=1).fit(x_train, y_train)

    # Predict over the full dataset (not just the held-out split)
    cls_pred = cls.predict(X)
    lin_pred = lin.predict(X)
    poly_pred = poly.predict(X)
    rbf_pred = rbf.predict(X)

    # Accuracy and weighted F1 for each kernel (computed for quick inspection;
    # only the RBF classification report and the confusion matrices are written out)
    cls_accuracy = accuracy_score(Y, cls_pred)
    cls_f1 = f1_score(Y, cls_pred, average='weighted')
    lin_accuracy = accuracy_score(Y, lin_pred)
    lin_f1 = f1_score(Y, lin_pred, average='weighted')
    poly_accuracy = accuracy_score(Y, poly_pred)
    poly_f1 = f1_score(Y, poly_pred, average='weighted')
    rbf_accuracy = accuracy_score(Y, rbf_pred)
    rbf_f1 = f1_score(Y, rbf_pred, average='weighted')

    # Per-class precision, recall, F1, and support for the RBF kernel
    metric_dict = metrics.classification_report(Y, y_pred=rbf_pred,
                                                labels=['L', 'M', 'H', 'VH'],
                                                output_dict=True)
    pth = os.environ['PYTHONPATH'] + os.path.sep + 'files' + os.path.sep + 'rbfmetrics.csv'
    rbfmetrics = pd.DataFrame(metric_dict)
    rbfmetrics['metrics'] = ['Precision', 'Recall', 'F1 Score', 'Support']
    rbfmetrics.to_csv(path_or_buf=pth)

    # Confusion matrices for each kernel, joined into a single table
    linearsvc_con = confusion_matrix(Y, lin_pred, labels=['L', 'M', 'H', 'VH'])
    poly_con = confusion_matrix(Y, poly_pred, labels=['L', 'M', 'H', 'VH'])
    rbf_con = confusion_matrix(Y, rbf_pred, labels=['L', 'M', 'H', 'VH'])

    linearsvc_df = pd.DataFrame(linearsvc_con,
                                columns=['linsvc_L', 'linsvc_M', 'linsvc_H', 'linsvc_VH'])
    linearsvc_df['ranges'] = ['L', 'M', 'H', 'VH']
    poly_df = pd.DataFrame(poly_con,
                           columns=['poly_L', 'poly_M', 'poly_H', 'poly_VH'])
    poly_df['ranges'] = ['L', 'M', 'H', 'VH']
    rbf_df = pd.DataFrame(rbf_con, columns=['rbf_L', 'rbf_M', 'rbf_H', 'rbf_VH'])
    rbf_df['ranges'] = ['L', 'M', 'H', 'VH']

    joined_df = pd.merge(rbf_df, poly_df, on='ranges')
    joined_df = pd.merge(joined_df, linearsvc_df, on='ranges')
    joined_df.set_index('ranges', inplace=True)
    joined_df['classes'] = ['L', 'M', 'H', 'VH']
    print(joined_df)

    pth = os.environ['PYTHONPATH'] + os.path.sep + 'files' + os.path.sep + 'visitmetrics.csv'
    joined_df.to_csv(path_or_buf=pth)
def preprocess_chinatown(demographics_file, log_file):
    # Read datasets into dataframes
    chinatown_demographics = tools.read_file('datasets', demographics_file)
    chinatown_logs = tools.read_file('datasets', log_file)

    # Rename columns so both sites share the DuPage column naming scheme
    chinatown_logs.rename(columns={'Preferred language': 'PDLANG'}, inplace=True)
    chinatown_logs.rename(columns={"Record ID (automatically assigned)": 'IDSUBJ'},
                          inplace=True)
    chinatown_demographics.rename(columns={'age ': 'PDAGE'}, inplace=True)
    chinatown_demographics.rename(columns={'zip_code': 'PDZIP'}, inplace=True)
    chinatown_demographics.rename(columns={'record_id': 'IDSUBJ'}, inplace=True)
    chinatown_demographics.rename(columns={'native_land': 'PDBIRTH'}, inplace=True)
    chinatown_demographics.rename(columns={'marital status ': 'PDMSTAT'}, inplace=True)
    chinatown_demographics.rename(columns={'family_members_in_household': 'PDHSIZE'},
                                  inplace=True)
    chinatown_demographics.rename(columns={'education': 'PDEDU'}, inplace=True)
    chinatown_demographics.rename(columns={'household_income': 'PDINCOME'}, inplace=True)
    chinatown_demographics.rename(columns={'occupational status': 'PDEMP'}, inplace=True)

    # Merge demographics and logs on subject ID
    chinatown_demographics = chinatown_demographics.sort_values(by='IDSUBJ')
    chinatown_logs['IDSUBJ'] = chinatown_logs['IDSUBJ'].astype(float)
    merged_dataset = pd.merge(chinatown_demographics, chinatown_logs, on='IDSUBJ')

    tools.replace_null_values(merged_dataset)
    #encode_PDLANG(chinatown_logs)
    remove_nonenglish_characters(merged_dataset)
    encode_features(merged_dataset)

    # Collapse the per-visit barrier and action columns into single
    # comma-separated R24Barrier and R24Action columns
    barriers = [col for col in merged_dataset.columns if 'Barrier (choose one)' in col]
    merged_dataset['R24Barrier'] = merged_dataset[barriers].agg(','.join, axis=1)

    actions = [col for col in merged_dataset.columns if 'Action taken (choose one)' in col]
    merged_dataset['R24Action'] = merged_dataset[actions].agg(','.join, axis=1)

    # For every visit that has no service date, strip the corresponding 'None'
    # placeholder from the aggregated barrier and action strings
    for num in range(len(merged_dataset['R24Barrier'])):
        nones = 0
        for col in merged_dataset.columns:
            if 'Date of service' in col:
                if merged_dataset.at[num, col] == 'None':
                    nones += 1
        for n in range(nones):
            temp = merged_dataset['R24Barrier'][num]
            if 'None' in temp:
                temp = temp.replace(',None', '')
                merged_dataset.at[num, 'R24Barrier'] = temp
        for n in range(nones):
            temp = merged_dataset['R24Action'][num]
            if 'None' in temp:
                temp = temp.replace(',None', '')
                merged_dataset.at[num, 'R24Action'] = temp

    replace_barrier_names(merged_dataset)
    replace_action_names(merged_dataset)

    # Write the preprocessed dataset to disk
    cols_list = merged_dataset.columns.values
    csv_lists = merged_dataset.values
    tools.write_to_csv(cols_list, csv_lists, 'files', 'preprocessed_chinatown.csv')
    return merged_dataset
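# A minimal driver sketch showing how the functions above appear to fit together.
# Illustrative only: the raw dataset filenames are placeholders, the step that
# combines the two preprocessed sites into 'files/dupage_chinatown.csv' happens
# outside this section, and the evaluation calls assume that combined file already
# carries the visit_ranges, barrier_*, and action_* columns.
if __name__ == '__main__':
    # Hypothetical raw input filenames; replace with the actual dataset names.
    preprocess_dupage('dupage_demographics.csv', 'dupage_logs.csv')
    preprocess_chinatown('chinatown_demographics.csv', 'chinatown_logs.csv')

    # Correlation reports over the combined dataset
    feature_correlations()
    feature_actions_correlations()

    # Model evaluation: visit-range SVMs plus multilabel barrier/action models
    evaluate_visits_models('files', 'dupage_chinatown.csv')
    eval_barriers('files', 'dupage_chinatown.csv')
    eval_actions('files', 'dupage_chinatown.csv')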