Beispiel #1
0
def _main_():
    """Build the question-metadata lookup from qs_metadata.xlsx and shelve it.

    Reads sheet ``Sheet1`` of ``qs_metadata.xlsx`` from the current working
    directory and, for every column of that sheet, maps each distinct value
    returned by ``CM.distinct_from_df`` to the result of ``ids_to_list`` for
    that column/value pair.  The nested mapping
    ``{column_name: {filter_value: ids}}`` is then handed to
    ``create_write_shelve`` under the name ``'qs_metadata'``.
    """
    # widen pandas console output
    pd.set_option('display.width', 320)

    # load the first sheet of the metadata workbook
    workbook = CM.xl_to_dfs(os.getcwd(), '/qs_metadata.xlsx')
    meta_df = workbook['Sheet1']

    # 'column_name': [filter_values] for every column in the sheet
    distinct_by_col = {
        col: CM.distinct_from_df(meta_df, col) for col in list(meta_df)
    }

    # 'column_name': {'filter_value': [question_ids]}
    q_meta_shelve = {}
    for col_name, filter_vals in distinct_by_col.items():
        # columns without any filter values get no entry at all
        if len(filter_vals) == 0:
            continue
        q_meta_shelve[col_name] = {
            filter_val: ids_to_list(meta_df, 'id', str(col_name), filter_val)
            for filter_val in filter_vals
        }

    # persist the nested dict as a shelve
    create_write_shelve(q_meta_shelve, 'qs_metadata')
Beispiel #2
0
def _main_():
    """Build one annual-survey results workbook per RIC.

    Steps, in order:

    1. Decide which question ids each RIC receives, using the question
       metadata shelve: RICs present under ``'addedby'`` get
       ``include_list(ric)``, all others get the ``'core'`` question list.
    2. Read ``qs_metadata.xlsx`` (sheet ``Sheet1``) from the working
       directory and derive a data dictionary (``id``, ``col_title``,
       ``title``, ``q_num``) for each RIC.
    3. Read questions/options and answers from the database, append the
       Communitech shared-client answers, and clean the answers.
    4. For each RIC: join its questions to the answers, pivot to one row
       per respondent/company, drop non-consenting responses, order the
       question columns by ``q_num`` and save the workbook.

    If the pivot raises ``ValueError``, the rows that share a
    ``(rid_cid, col_title)`` pair are written to a ``__dupies`` workbook
    and processing continues with the next RIC.
    """
    # make the damn ric dict: ricname: datasourceID (except CII & OSVP, number is not datasourceid)
    rics = {
        'MaRS Discovery District': {
            'db_name': 'MaRS Discovery District',
            'code': 7
        },
        'RIC Centre': {
            'db_name': 'RIC Centre',
            'code': 9
        },
        'Innovation Factory': {
            'db_name': 'Innovation Factory',
            'code': 12
        },
        'NWOIC': {
            'db_name': 'NWO Innovation Centre',
            'code': 14
        },
        'Invest Ottawa': {
            'db_name': 'Invest Ottawa',
            'code': 16
        },
        'IION': {
            'db_name': 'IION',
            'code': 5
        },
        'CII': {
            'db_name': 'MaRS Centre for Impact Investing',
            'code': -1
        },
        'OSVP': {
            'db_name': 'Ontario Scale-Up Voucher Program',
            'code': -1
        },
        'Innovation Guelph': {
            'db_name': 'Innovation Guelph',
            'code': 15
        },
        'WEtech': {
            'db_name': 'WEtech',
            'code': 2
        },
        'SSMIC': {
            'db_name': 'SSMIC',
            'code': 3
        },
        'TechAlliance': {
            'db_name': 'TechAlliance',
            'code': 6
        },
        'Haltech': {
            'db_name': 'Haltech',
            'code': 8
        },
        'Spark Centre': {
            'db_name': 'Spark Centre',
            'code': 10
        },
        'NORCAT': {
            'db_name': 'NORCAT',
            'code': 1
        },
        'VentureLAB': {
            'db_name': 'ventureLAB',
            'code': 11
        },
        'Innovate Niagara': {
            'db_name': 'Innovate Niagara',
            'code': 17
        },
        'Launch Lab': {
            'db_name': 'Launch Lab',
            'code': 13
        }
        # ,'Communitech': {'db_name': 'Communitech', 'code': 4}
    }

    # NOTE(review): q_meta_name is not assigned in this function; it is
    # assumed to be a module-level name -- confirm.
    with shelve.open(q_meta_name, 'r') as qs_metadata:

        print("Creating ric_qs dict")
        # Read 'addedby' once instead of fetching it from the shelve again
        # on every loop iteration.
        added_by_rics = set(qs_metadata['addedby'].keys())
        ric_qs = {}
        for ric in rics:
            if ric in added_by_rics:
                ric_qs[ric] = include_list(ric)
            # elif ric.lower() == 'communitech':
            #     ric_qs[ric] = qs_metadata['which_survey']['COMMUNITECH']
            else:
                ric_qs[ric] = qs_metadata['core/noncore']['core']

    print("Reading qs_metadata.xlsx to df")
    cwd = os.getcwd()
    user_path = os.path.expanduser("~")
    filename = '/qs_metadata.xlsx'
    meta_dfs = CM.xl_to_dfs(cwd, filename)
    sheetname = 'Sheet1'
    meta_df = meta_dfs[sheetname]

    # create master data dict with qid: concatted name (i.e., <survey_section - readable_name>)
    print("Creating master data dict")
    meta_df = meta_df.sort_values(by=['q_num'], ascending=[True])
    meta_df['col_title'] = meta_df['survey_section'].astype(
        str) + ' - ' + meta_df['readable_name']
    data_dict = meta_df[['id', 'col_title', 'title', 'q_num']]

    # split master data dict into one for each ric
    print("Splitting master data dict into 1 per RIC")
    ric_data_dicts = {}
    for ric, qids in ric_qs.items():
        qids_df = pd.DataFrame(qids, columns=['id'])
        ric_data_dict = pd.merge(qids_df, data_dict, how='inner', on=['id'])
        ric_data_dicts[ric] = ric_data_dict.sort_values(by='q_num')

    # read questions and options from DB
    print("Reading questions and options from DB into qsos df")
    qsos_sql = CM.get_config("config_sql.ini", "ann_survey_18", "all_qsos")
    qsos = DB.pandas_read(qsos_sql)

    # add col_title column to qsos df
    qsos = pd.merge(qsos,
                    meta_df[['id', 'col_title', 'q_num']],
                    how='left',
                    left_on='qid',
                    right_on='id')
    qsos.drop('id', inplace=True, axis=1)
    print("Transforming qsos df")

    # put flag on 'ESSAY', 'TABLE', 'TEXTBOX', 'MENU', 'RADIO' so that their col_title does not change in next step
    qsos['multi_options'] = qsos.q_type.apply(multi_options)

    # for options, make col_title = col_title + "Option: " + [o_label]
    qsos['col_title'] = qsos.apply(opt_col_title, axis=1)
    qsos = qsos[qsos['q_num'] > 0]

    # capture correct order for columns for use later in formatting pivoted datasheets
    col_title_order = pd.Series(qsos.q_num.values,
                                index=qsos.col_title).to_dict()

    # read answers from DB
    print("Reading answers from DB into ans df")
    ans_sql = CM.get_config("config_sql.ini", "ann_survey_18",
                            "sel_ann_survey_res")
    ans = DB.pandas_read(ans_sql)

    # separate process for Communitech shared ventures
    # 1. get list of Communitech shared client answers
    print("Reading Communitech shared clients")
    comm_sql = CM.get_config("config_sql.ini", "ann_survey_18",
                             "sel_communitech_shared")
    comm_ans = DB.pandas_read(comm_sql)
    # 2. concat with rest of answers (?)
    ans = pd.concat([ans, comm_ans])

    # clean ans
    print("Cleaning ans df")
    ans.dropna(subset=['Answer'], inplace=True)
    ans['Answer'] = ans.apply(replacements, axis=1)
    ans['page_pipe'] = ans['page_pipe'].fillna('')

    # Social-impact rows from the MaRS datasheet, reused for the CII
    # workbook.  Stays None if the MaRS iteration did not produce a
    # datasheet (e.g. its pivot failed).
    soc_imp_df = None

    # for each RIC
    print("\nPer RIC df datasheet creation:")
    for ric in ric_qs:

        # turn that RIC's qid list into df
        print("\nRIC: {}".format(ric))
        print("Creating df of questions for {}".format(ric))
        qs_df = pd.DataFrame(ric_qs[ric], columns=['qid'])
        qs_df['ric'] = rics[ric]['db_name']

        # left join that df with qsos df on qid
        qs_df = pd.merge(qs_df, qsos, how='left', on='qid')

        # left join resulting df with ans df
        print("Left join qs with ans")
        ric_survey_results = pd.merge(
            qs_df,
            ans,
            how='left',
            left_on=['qid', 'oid', 'ric'],
            right_on=['QuestionID', 'OptionID', 'RIC_Program'])

        # drop empty answers and sort (chained so that no in-place sort is
        # performed on a filtered slice)
        print("Clean ans")
        ric_survey_results = ric_survey_results[pd.notnull(
            ric_survey_results['Answer'])].sort_values(by='q_num')

        # long-form frame: one row per (respondent-company, column title)
        print("Pivot into datasheet for {}".format(ric))
        long_datasheet = ric_survey_results[[
            'resp_id', 'CompanyID', 'col_title', 'Answer', 'page_pipe'
        ]].drop_duplicates()
        long_datasheet['col_title'] = long_datasheet[
            'col_title'] + ' ' + long_datasheet['page_pipe'].astype(str)
        long_datasheet['rid_cid'] = long_datasheet['resp_id'].astype(
            float).astype(str) + '-' + long_datasheet['CompanyID'].astype(str)
        long_datasheet = long_datasheet[['rid_cid', 'col_title', 'Answer']]

        # Only the pivot is guarded: the handler below needs the long-form
        # frame, which is kept in its own variable so it is still available
        # when the pivot fails.
        try:
            ric_datasheet = long_datasheet.pivot(index='rid_cid',
                                                 columns='col_title',
                                                 values='Answer')
        except ValueError as ex:
            print("!\nERROR FOR {}: {}\n!\n".format(ric, ex))

            # save conflicting answer values when pivot fails
            save_path = path_xl(
                user_path=user_path,
                path_extension=
                "Box Sync/Workbench/BAP/Annual Survey FY2018/DEV - Results to RICs/__dupies/",
                filename=ric + '_dupies' + '.xlsx')
            save_xls([
                long_datasheet[long_datasheet.duplicated(
                    ['rid_cid', 'col_title'], keep=False)]
            ], save_path, ['dupies'])
            continue

        ric_datasheet.reset_index(inplace=True)

        # Split 'rid_cid' back into its two parts on the first '-'.
        # expand=True returns the parts as columns 0 and 1; reindex keeps
        # both columns present even if the split produced fewer.
        id_parts = ric_datasheet['rid_cid'].str.split('-', n=1, expand=True)
        id_parts = id_parts.reindex(columns=[0, 1])
        ric_datasheet['resp_id'] = id_parts[0]
        ric_datasheet['CompanyID'] = id_parts[1]
        ric_datasheet.drop('rid_cid', axis=1, inplace=True)
        # NOTE: errors='ignore' is deprecated in pandas >= 2.2.
        ric_datasheet = ric_datasheet.apply(pd.to_numeric, errors='ignore')

        # remove non-consenting responses (first column whose title
        # mentions 'consent'); typographic apostrophes are normalised first
        for col in list(ric_datasheet):
            if 'consent' in str(col).lower():
                ric_datasheet[col] = ric_datasheet[col].str.replace(
                    u"\u2019", "'")
                ric_datasheet = ric_datasheet[
                    ric_datasheet[col] != "I don't give consent"]
                break

        # re-order columns to reflect q_num ordering; the last two columns
        # are the resp_id / CompanyID columns added above and go first
        cols = list(ric_datasheet)
        id_cols = cols[-2:]
        keyed_q_cols = []
        for q in cols[:-2]:
            if q[-2:] == '.0':
                # assumes the appended page_pipe suffix is exactly 8
                # characters (e.g. ' 12345.0') -- TODO confirm
                order_key = q[:-8]
            else:
                order_key = q.strip()
            keyed_q_cols.append((col_title_order[order_key], q))
        ordered_q_cols = [q for _, q in sorted(keyed_q_cols)]
        ric_datasheet = ric_datasheet[id_cols + ordered_q_cols]

        save_path = path_xl(
            user_path=user_path,
            path_extension=
            "Box Sync/Workbench/BAP/Annual Survey FY2018/DEV - Results to RICs/",
            filename=ric + '.xlsx')

        # pull out social impact companies separately for use later in CII datasheet
        if ric == 'MaRS Discovery District':
            soc_imp_df = ric_datasheet[
                ric_datasheet['social_impact - Motives '] == 'Yes']

        if ric != 'CII':
            results_sheets = [ric_datasheet, ric_data_dicts[ric]]
            sheetnames = ['SurveyData', 'DataDictionary']
        else:
            print('Add extra tabs to {} datasheet'.format(ric))
            if soc_imp_df is None:
                # The MaRS datasheet was not built, so there is nothing to
                # put on the social-impact tab; keep the workbook layout
                # and write an empty sheet instead of crashing.
                print("!\nNo MaRS Discovery District datasheet available; "
                      "social impact tab for {} will be empty\n!".format(ric))
                cii_soc_imp = pd.DataFrame()
            else:
                cii_soc_imp = soc_imp_df
            results_sheets = [
                ric_datasheet, cii_soc_imp, ric_data_dicts[ric],
                ric_data_dicts['MaRS Discovery District']
            ]
            sheetnames = [
                'CII_SurveyData', 'All_RICs_SocialImpact_SurveyData',
                'CII_DataDict', 'MaRS_DataDict'
            ]

        # save to disc
        save_xls(results_sheets, save_path, sheetnames)
        print("Wrote to {}".format(save_path))