Example #1
def preprocess_dupage(demographics_file, log_file):
    # Read the datasets into dataframes
    dupage_demographics = tools.read_file('datasets', demographics_file)
    dupage_logs = tools.read_file('datasets', log_file)

    # Replace null values in both dataframes
    tools.replace_null_values(dupage_logs)
    tools.replace_null_values(dupage_demographics)

    # Group rows in dupage_logs by subject ID
    dupage_logs = tools.join_by_column(dupage_logs, 'IDSUBJ',
                                       ['R24Barrier', 'R24Action'])

    # Fill the visit_counts column with the total number of barriers logged per subject.
    tools.create_empty_column(dupage_logs, 'visit_counts',
                              len(dupage_logs['IDSUBJ']))
    count_visits(dupage_logs)

    inner_merged = pd.merge(dupage_logs, dupage_demographics, on=['IDSUBJ'])
    tools.encode_PDBIRTH(inner_merged)
    encode_income_levels(inner_merged)
    encode_emp_status(inner_merged)

    replace_barrier_names(inner_merged)
    replace_action_names(inner_merged)

    cols_list = inner_merged.columns.values
    csv_lists = inner_merged.values
    tools.write_to_csv(cols_list, csv_lists, 'files',
                       'preprocessed_dupage.csv')
    return inner_merged
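
count_visits is called above but not shown here. A rough sketch of what it might do, assuming tools.join_by_column leaves a default integer index and a comma-joined R24Barrier string per subject (neither detail is visible in this example):

def count_visits(dupage_logs):
    # Count the comma-separated barrier entries logged for each subject and
    # store the total in the visit_counts column created above.
    for i in range(len(dupage_logs['IDSUBJ'])):
        barriers = str(dupage_logs['R24Barrier'][i])
        dupage_logs.at[i, 'visit_counts'] = len(barriers.split(','))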
Example #2
def feature_correlations():
    df = tools.read_file('files', 'dupage_chinatown.csv')
    df_features = multilabel_classification.select_features(df)
    cols = df_features.columns.values

    # Calculate correlations between all features
    corr = df_features.corr()

    # For each feature, find the strongest positive and negative correlations with the other features, and the features where they occur.
    col_name = 'visit_counts'
    path = os.path.join(os.environ['PYTHONPATH'], 'files',
                        'feature_correlations.csv')
    #doc = open(os.environ['PYTHONPATH'] + os.path.sep + 'files' + os.path.sep + 'correlations.txt','w')
    lst = []
    for c in cols:
        corrs = find_strongest_correlations(c, cols, corr)
        lst.append(corrs)
        #doc.write(str(corrs) + '\n')
    new_df = pd.DataFrame(lst,
                          columns=[
                              'Feature', 'Most Positive Correlation',
                              'Most Positive Correlation-Feature',
                              'Most Negative Correlation',
                              'Most Negative Correlation-Feature'
                          ])
    print(new_df['Feature'])
    decode_features(new_df,
                    columns=[
                        'Feature', 'Most Positive Correlation-Feature',
                        'Most Negative Correlation-Feature'
                    ])

    new_df.to_csv(path_or_buf=path)
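
The helper find_strongest_correlations is used above but not defined in this example. A minimal sketch, assuming only the return shape implied by the new_df columns (column name, strongest positive correlation and its partner, strongest negative correlation and its partner); the project's actual implementation may differ:

def find_strongest_correlations(col_name, candidate_cols, corr):
    # Scan the correlation matrix row for col_name and track the most
    # positive and most negative correlations among candidate_cols.
    best_pos, best_pos_col = -1.0, None
    best_neg, best_neg_col = 1.0, None
    for other in candidate_cols:
        if other == col_name:
            continue  # skip the trivial self-correlation of 1.0
        value = corr.at[col_name, other]
        if value > best_pos:
            best_pos, best_pos_col = value, other
        if value < best_neg:
            best_neg, best_neg_col = value, other
    return [col_name, best_pos, best_pos_col, best_neg, best_neg_col]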
Example #3
def feature_actions_correlations():
    df = tools.read_file('files', 'dupage_chinatown.csv')
    df_features = multilabel_classification.select_features(df)
    feature_cols = df_features.columns

    labels = ['action_1', 'action_2', 'action_3']
    Y, mlb = tools.select_multilabels(df, labels)
    df_actions = pd.DataFrame(Y, columns=mlb.classes_)
    action_cols = mlb.classes_

    length = len(df_actions)
    ind = range(length)
    df_features['ind'] = ind
    df_actions['ind'] = ind
    merged = pd.merge(df_features, df_actions, on='ind')
    merged.set_index('ind', inplace=True)
    print(merged)

    # Calculate correlations between all features and action labels
    corr = merged.corr()
    print(corr)

    # For each action, find the strongest positive and negative correlations with the features, and the features where they occur.

    path = os.path.join(os.environ['PYTHONPATH'], 'files',
                        'feature_action_correlations.csv')
    #doc = open(os.environ['PYTHONPATH'] + os.path.sep + 'files' + os.path.sep + 'correlations.txt','w')
    lst = []
    for c in action_cols:
        corrs = find_strongest_correlations(c, feature_cols, corr)
        lst.append(corrs)
        #doc.write(str(corrs) + '\n')
    new_df = pd.DataFrame(lst,
                          columns=[
                              'Actions', 'Most Positive Correlation',
                              'Most Positive Correlation-Feature',
                              'Most Negative Correlation',
                              'Most Negative Correlation-Feature'
                          ])
    print(new_df['Actions'].values)
    decode_features(new_df,
                    columns=[
                        'Most Positive Correlation-Feature',
                        'Most Negative Correlation-Feature'
                    ])
    decode_classes(new_df, classes='actions', columns=['Actions'])
    new_df.to_csv(path_or_buf=path)
Example #4
def build_model(directory, filename):

    df = tools.read_file(directory, filename)
    labels = ['visit_ranges']
    print(select_features_visits(df).columns.values)
    X = select_features_visits(df).values
    Y = df[labels[0]].values
    n_inputs = X.shape[1]

    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.7,
                                                        shuffle=True)

    cls = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)

    return cls
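
A hypothetical usage of build_model; the directory and file name below mirror the preprocessed file read in the other examples and are not part of the original code:

model = build_model('files', 'dupage_chinatown.csv')
df = tools.read_file('files', 'dupage_chinatown.csv')
X_new = select_features_visits(df).values
# Predicted visit_ranges labels, e.g. 'L', 'M', 'H', 'VH'
print(model.predict(X_new)[:10])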
Example #5
def eval_barriers(directory, filename):
    print('barriers')
    df = tools.read_file(directory, filename)

    labels = ['barrier_1', 'barrier_2', 'barrier_3']

    Y, mlb = tools.select_multilabels(df, labels)
    X = select_features(df).values
    '''
    x_scalar = StandardScaler()
    X[:, 0:3] = x_scalar.fit_transform(X[:, 0:3])
    '''
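    # Add a third axis so X matches the 3-D input shape presumably expected by evaluate_model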
    X = np.expand_dims(X, 2)
    evaluate_model(X,
                   Y,
                   mlb.classes_,
                   output_filename='barrier_misclassifications.csv',
                   metrics_filename='barriermetrics.csv')
Example #6
def eval_actions(directory, filename):
    print('actions')
    df = tools.read_file(directory, filename)

    labels = ['action_1', 'action_2', 'action_3']

    Y, mlb = tools.select_multilabels(df, labels)
    X = select_features(df).values

    csv_path2 = os.path.join(os.environ['PYTHONPATH'], 'files', 'y.csv')
    # Write the binarized action labels to files/y.csv
    pd.DataFrame(Y, columns=mlb.classes_).to_csv(path_or_buf=csv_path2)
    '''
    #Scaling should only be applied to non-encoded values
    x_scalar = StandardScaler()
    X[:, 0:3] = x_scalar.fit_transform(X[:, 0:3])
    '''
    X = np.expand_dims(X, 2)
    evaluate_model(X, Y, mlb.classes_, 'action_misclassifications.csv',
                   'actionmetrics.csv')
Example #7
def evaluate_visits_models(directory, filename):
    df = tools.read_file(directory, filename)
    labels = ['visit_ranges']

    X = select_features_visits(df).values

    Y = df[labels[0]].values
    n_inputs = X.shape[1]

    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.2,
                                                        shuffle=True)

    cls = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)
    lin = svm.LinearSVC(C=1, max_iter=1000).fit(x_train, y_train)
    rbf = svm.SVC(kernel='rbf', gamma=0.5, C=1).fit(x_train, y_train)
    poly = svm.SVC(kernel='poly', degree=3, C=1).fit(x_train, y_train)

    # Evaluate each fitted model on the full dataset (training and test rows together)
    cls_pred = cls.predict(X)
    lin_pred = lin.predict(X)
    poly_pred = poly.predict(X)
    rbf_pred = rbf.predict(X)

    cls_accuracy = accuracy_score(Y, cls_pred)
    cls_f1 = f1_score(Y, cls_pred, average='weighted')
    #print('Accuracy (Linear Kernel): ', "%.2f" % (cls_accuracy * 100))
    #print('F1 (Linear Kernel): ', "%.2f" % (cls_f1 * 100))

    lin_accuracy = accuracy_score(Y, lin_pred)
    lin_f1 = f1_score(Y, lin_pred, average='weighted')
    #print('Accuracy (LinearSVC): ', "%.2f" % (lin_accuracy * 100))
    #print('F1 (LinearSVC): ', "%.2f" % (lin_f1 * 100))

    poly_accuracy = accuracy_score(Y, poly_pred)
    poly_f1 = f1_score(Y, poly_pred, average='weighted')
    #print('Accuracy (Polynomial Kernel): ', "%.2f" % (poly_accuracy * 100))
    #print('F1 (Polynomial Kernel): ', "%.2f" % (poly_f1 * 100))

    rbf_accuracy = accuracy_score(Y, rbf_pred)
    rbf_f1 = f1_score(Y, rbf_pred, average='weighted')
    #print('Accuracy (RBF Kernel): ', "%.2f" % (rbf_accuracy * 100))
    #print('F1 (RBF Kernel): ', "%.2f" % (rbf_f1 * 100))

    ## accuracy
    #print("accuracy:", metrics.accuracy_score(Y, y_pred=rbf_pred))
    # precision score
    #print("precision:", metrics.precision_score(Y, y_pred=rbf_pred,average=None,labels=['L','M','H','VH']))
    # recall score
    #print("recall", metrics.recall_score(Y, y_pred=rbf_pred,average=None,labels=['L','M','H','VH']))
    metric_dict = metrics.classification_report(Y,
                                                y_pred=rbf_pred,
                                                labels=['L', 'M', 'H', 'VH'],
                                                output_dict=True)
    pth = os.path.join(os.environ['PYTHONPATH'], 'files', 'rbfmetrics.csv')
    rbfmetrics = pd.DataFrame(metric_dict)
    rbfmetrics['metrics'] = ['Precision', 'Recall', 'F1 Score', 'Support']
    rbfmetrics.to_csv(path_or_buf=pth)

    # Create a confusion matrix for the LinearSVC, polynomial, and RBF models
    #print(confusion_matrix(Y, cls_pred, labels=['L','M','H','VH']))
    linearsvc_con = confusion_matrix(Y, lin_pred, labels=['L', 'M', 'H', 'VH'])
    poly_con = confusion_matrix(Y, poly_pred, labels=['L', 'M', 'H', 'VH'])
    rbf_con = confusion_matrix(Y, rbf_pred, labels=['L', 'M', 'H', 'VH'])

    linearsvc_df = pd.DataFrame(
        linearsvc_con,
        columns=['linsvc_L', 'linsvc_M', 'linsvc_H', 'linsvc_VH'])
    linearsvc_df['ranges'] = ['L', 'M', 'H', 'VH']
    poly_df = pd.DataFrame(poly_con,
                           columns=['poly_L', 'poly_M', 'poly_H', 'poly_VH'])
    poly_df['ranges'] = ['L', 'M', 'H', 'VH']
    rbf_df = pd.DataFrame(rbf_con,
                          columns=['rbf_L', 'rbf_M', 'rbf_H', 'rbf_VH'])
    rbf_df['ranges'] = ['L', 'M', 'H', 'VH']
    joined_df = pd.merge(rbf_df, poly_df, on='ranges')
    joined_df = pd.merge(joined_df, linearsvc_df, on='ranges')
    joined_df.set_index('ranges', inplace=True)
    joined_df['classes'] = ['L', 'M', 'H', 'VH']
    print(joined_df)
    pth = os.path.join(os.environ['PYTHONPATH'], 'files', 'visitmetrics.csv')
    joined_df.to_csv(path_or_buf=pth)
Example #8
def preprocess_chinatown(demographics_file, log_file):
    chinatown_demographics = tools.read_file('datasets', demographics_file)
    chinatown_logs = tools.read_file('datasets', log_file)

    # Standardize column names to the IDSUBJ/PD* scheme used elsewhere in these examples
    chinatown_logs.rename(columns={
        'Preferred language': 'PDLANG',
        'Record ID (automatically assigned)': 'IDSUBJ'
    }, inplace=True)
    chinatown_demographics.rename(columns={
        'age  ': 'PDAGE',
        'zip_code': 'PDZIP',
        'record_id': 'IDSUBJ',
        'native_land': 'PDBIRTH',
        'marital status ': 'PDMSTAT',
        'family_members_in_household': 'PDHSIZE',
        'education': 'PDEDU',
        'household_income': 'PDINCOME',
        'occupational status': 'PDEMP'
    }, inplace=True)

    chinatown_demographics = chinatown_demographics.sort_values(by='IDSUBJ')
    chinatown_logs['IDSUBJ'] = chinatown_logs['IDSUBJ'].astype(float)

    merged_dataset = pd.merge(chinatown_demographics,
                              chinatown_logs,
                              on='IDSUBJ')
    tools.replace_null_values(merged_dataset)
    #encode_PDLANG(chinatown_logs)
    remove_nonenglish_characters(merged_dataset)
    encode_features(merged_dataset)
    # Collect the per-visit 'Barrier (choose one)' columns and join them
    # into a single comma-separated R24Barrier column
    barriers = []
    for col in merged_dataset.columns:
        if 'Barrier (choose one)' in col:
            barriers.append(col)
    merged_dataset['R24Barrier'] = merged_dataset[barriers].agg(','.join,
                                                                axis=1)

    # Collect the per-visit 'Action taken (choose one)' columns and join them
    # into a single comma-separated R24Action column
    actions = []
    for col in merged_dataset.columns:
        if 'Action taken (choose one)' in col:
            actions.append(col)
    merged_dataset['R24Action'] = merged_dataset[actions].agg(','.join,
                                                              axis=1)

    # Strip the ',None' placeholders (visits with no recorded date of service)
    # out of each subject's R24Barrier and R24Action strings
    for num in range(len(merged_dataset['R24Barrier'])):
        nones = 0
        for col in merged_dataset.columns:
            if 'Date of service' in col:
                if merged_dataset.at[num, col] == 'None':
                    nones += 1
        for n in range(nones):
            temp = merged_dataset['R24Barrier'][num]
            if 'None' in temp:
                temp = temp.replace(',None', '')
            merged_dataset.at[num, 'R24Barrier'] = temp
        for n in range(nones):
            temp = merged_dataset['R24Action'][num]
            if 'None' in temp:
                temp = temp.replace(',None', '')
            merged_dataset.at[num, 'R24Action'] = temp

    replace_barrier_names(merged_dataset)
    replace_action_names(merged_dataset)
    cols_list = merged_dataset.columns.values
    csv_lists = merged_dataset.values
    tools.write_to_csv(cols_list, csv_lists, 'files',
                       'preprocessed_chinatown.csv')
    return merged_dataset