############# Example #1

df_expert = prepare_expert_data(data_folder, alternative_map)

df_expert_crowd = pd.concat([df_expert, df_crowd], ignore_index=True)

n_crowd = df_crowd['voter'].nunique()

############# Aggregate data
crowd_agg = get_aggregated_data(df_crowd, alt_names)
expert_agg = get_aggregated_data(df_expert, alt_names)
#expert_agg = aggregate_experts(expert_agg[alt_names], points_rank, team_size, alt_names)
expert_crowd_agg = get_aggregated_data(df_expert_crowd, alt_names)
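
# A minimal sketch of what `get_aggregated_data` is assumed to do here: pivot
# the transactional votes into one row per voter and one column per
# alternative. The helper itself is defined elsewhere in the project; the
# default column names below mirror the call sites in this file.
def get_aggregated_data_sketch(df, alt_names, index_column='voter',
                               column='vote', value='rate'):
    agg = df.pivot_table(index=index_column, columns=column, values=value,
                         aggfunc='mean', fill_value=0).reset_index()
    # keep only the requested alternatives, in a stable order
    return agg[[index_column] + [a for a in alt_names if a in agg.columns]]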

######## Build the ratings matrix and the voter/alternative id lookup tables

ratings, alts_lookup, voters_lookup = create_ratings_and_mapping(
    expert_crowd_agg, alt_names)
train, test = train_test_split(ratings, mask_test_size)
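
# A minimal sketch of the masking-style split assumed for `train_test_split`:
# hide a fraction (`mask_test_size`) of the observed entries from the training
# matrix and keep them aside as the test set.
import numpy as np

def train_test_split_sketch(ratings, mask_test_size, seed=0):
    rng = np.random.default_rng(seed)
    train = ratings.copy()
    test = np.zeros_like(ratings)
    rows, cols = ratings.nonzero()                    # observed entries
    picks = rng.choice(len(rows), size=int(len(rows) * mask_test_size),
                       replace=False)
    train[rows[picks], cols[picks]] = 0               # hidden from training
    test[rows[picks], cols[picks]] = ratings[rows[picks], cols[picks]]
    return train, test
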
# Check the sparsity of the ratings matrix
print("Sparsity of data is: {:.2f} %. ".format(calculate_sparsity(ratings)))
"""
Example #2
0
df_journal['rate'] = df_journal['rate'].astype('float')

# Choose which expert panel to analyse: journal experts or science experts.
df_selected_expert = df_journal   # or: df_science
expert_type = 'journal'           # or: 'science'

#alts_dict = dict(zip(alternative_map['alternative_id'] , alternative_map['alternative_name']))


#### Transactional data of the selected experts plus the crowd voters
#### who rated the same alternatives as the experts
df_expert_crowd = pd.concat([df_selected_expert, df_crowd], ignore_index=True)
#n_crowd = len(df_crowd['voter'].unique())

############# Aggregate data
#crowd_agg = get_aggregated_data(df_crowd, alt_names)
#expert_agg = get_aggregated_data(df_selected_expert, alt_names)
expert_crowd_agg = get_aggregated_data(df_expert_crowd, alt_names)


'''
Create data for factorization
'''

_, alts_lookup, voters_lookup = create_ratings_and_mapping(expert_crowd_agg, alt_names)
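
# A minimal sketch of what `create_ratings_and_mapping` is assumed to return:
# a dense voters x alternatives matrix plus lookup tables mapping alternative
# names and voter names to integer ids.
def create_ratings_and_mapping_sketch(agg, alt_names, voter_col='voter'):
    ratings = agg[alt_names].to_numpy(dtype=float)
    alts_lookup = pd.DataFrame({'alternative': alt_names,
                                'alternative_id': range(len(alt_names))})
    voters_lookup = pd.DataFrame({voter_col: agg[voter_col].values,
                                  'voter_id': range(len(agg))})
    return ratings, alts_lookup, voters_lookup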

#voter_dict = dict(zip(voters_lookup['voter_id'], voters_lookup['voter']))

##### replace voters name with ids in all dataframes
# df_crowd = pd.merge( voters_lookup,df_crowd, how = 'inner', on = 'voter').drop('voter', axis = 1)
# df_expert_crowd = pd.merge( voters_lookup, df_expert_crowd, how = 'inner', on = 'voter').drop('voter', axis = 1)
# df_selected_expert = pd.merge(voters_lookup,  df_selected_expert, how = 'inner', on = 'voter').drop('voter', axis = 1)
# crowd_agg = pd.merge(voters_lookup,  crowd_agg, how = 'inner', on = 'voter').drop('voter', axis = 1)


def Load_TX_Data(expert_type):

    ### Load Data
    exp = pd.read_excel('TX_data/BG/Eksperti_Anketa_BG.xlsx',
                        sheet_name='OdgovoriIzUpitnika')
    drv = pd.read_excel('TX_data/BG/Vozaci_Anketa_BG.xlsx',
                        sheet_name='OdgovoriIzUpitnika')
    crd = pd.read_excel('TX_data/BG/Korisnici_Anketa_BG.xlsx',
                        sheet_name='OdgovoriIzUpitnika')

    ### List of questions needed for analysis
    questions = [
        'Карактеристике возила [Тип/каросерија возила]',
        'Карактеристике возила [Димензије (ширина врата, гепек...)]',
        'Карактеристике возила [Лак улазак/излазак]',
        'Комфор у возилу [Удобност седишта]',
        'Комфор у возилу [Климатизација и грејање]',
        'Комфор у возилу [Чистоћа возила (спољашњост и унутрашњост)]',
        'Комуникациона опрема [Навигациона мапа (ГПС)]',
        'Комуникациона опрема [Флексибилно плаћање (новац, картица)]',
        'Комуникациона опрема [Опрема за резервацију вожњи (апликација, радио веза, ...)]',
        'Безбедност и дизајн [Старост возила]',
        'Безбедност и дизајн [Опрема у возилу (airbag,АБС ...)]',
        'Безбедност и дизајн [Тип/марка и боја возила]',
        'Еколошка подобност [Ниво буке]',
        'Еколошка подобност [Ниво  аерозагађења]',
        'Еколошка подобност [Чиста погонска енергија]'
    ]
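
    # English gloss of the Cyrillic question labels above, for reference:
    # vehicle characteristics (body type; dimensions such as door width and
    # trunk; ease of entry/exit), in-vehicle comfort (seat comfort; air
    # conditioning and heating; exterior/interior cleanliness), communication
    # equipment (GPS navigation map; flexible payment by cash or card; ride
    # reservation equipment such as an app or radio link), safety and design
    # (vehicle age; equipment such as airbag and ABS; make and colour),
    # environmental suitability (noise level; air pollution level; clean
    # propulsion energy). The strings must match the spreadsheet headers
    # verbatim, so they are kept in the original language.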

    #### Dict for renaming crowd-questionnaire columns to the canonical labels
    crd_rename_dict = {
        '15.1.Карактеристике возила [Тип/каросерија возила]':
        'Карактеристике возила [Тип/каросерија возила]',
        '15.2.Карактеристике возила [Димензије (ширина врата, гепек...)]':
        'Карактеристике возила [Димензије (ширина врата, гепек...)]',
        '15.3.Карактеристике возила [Лак улазак/излазак]':
        'Карактеристике возила [Лак улазак/излазак]',
        '15.4.Комфор у возилу [Удобност седишта]':
        'Комфор у возилу [Удобност седишта]',
        '15.5.Комфор у возилу [Климатизација и грејање]':
        'Комфор у возилу [Климатизација и грејање]',
        '15.6.Комфор у возилу [Чистоћа возила (спољашњост и унутрашњост)]':
        'Комфор у возилу [Чистоћа возила (спољашњост и унутрашњост)]',
        '15.7.Комуникациона опрема [Навигациона мапа (ГПС)]':
        'Комуникациона опрема [Навигациона мапа (ГПС)]',
        # NOTE: question 15.8 ('Прикључак за мобилни телефон', a mobile-phone
        # connector) is mapped to the closest canonical label used across the
        # questionnaires; the pairing follows the original alignment.
        '15.8.Комуникациона опрема [Прикључак за мобилни телефон]':
        'Комуникациона опрема [Опрема за резервацију вожњи (апликација, радио веза, ...)]',
        '15.9.Комуникациона опрема [Флексибилно плаћање (новац, картица)]':
        'Комуникациона опрема [Флексибилно плаћање (новац, картица)]',
        '15.10.Безбедност и дизајн [Старост возила]':
        'Безбедност и дизајн [Старост возила]',
        '15.11.Безбедност и дизајн [Опрема у возилу (појас, airbag, ...)]':
        'Безбедност и дизајн [Опрема у возилу (airbag,АБС ...)]',
        # NOTE: the source column is numbered '15.2.' here although '15.12.'
        # would fit the sequence; the key must match the spreadsheet header
        # exactly, so it is left unchanged.
        '15.2.Безбедност и дизајн [Тип/марка и боја возила]':
        'Безбедност и дизајн [Тип/марка и боја возила]',
        '15.13.Еколошка подобност [Ниво буке]':
        'Еколошка подобност [Ниво буке]',
        '15.14.Еколошка подобност [Ниво  аерозагађења]':
        'Еколошка подобност [Ниво  аерозагађења]',
        '15.15.Еколошка подобност [Чиста погонска енергија]':
        'Еколошка подобност [Чиста погонска енергија]'
    }

    #### Dict for renaming driver-questionnaire columns to the canonical labels
    driver_rename_dct = {
        '2.7.1.1.Pristup u vozilo/Tip,karosterija':
        'Карактеристике возила [Тип/каросерија возила]',
        '2.7.1.2.Pristup u vozilo/Dimenzije':
        'Карактеристике возила [Димензије (ширина врата, гепек...)]',
        '2.7.1.3.Pristup u vozilo/Lak Ulazak izlazak':
        'Карактеристике возила [Лак улазак/излазак]',
        '2.7.2.1.Komfor u vozilu/Udobnost sedišta':
        'Комфор у возилу [Удобност седишта]',
        '2.7.2.2.Komfor u vozilu/Klimatizacija i grejanje':
        'Комфор у возилу [Климатизација и грејање]',
        '2.7.2.3.Komfor u vozilu/Čistoća vozila':
        'Комфор у возилу [Чистоћа возила (спољашњост и унутрашњост)]',
        '2.7.3.3.Komunikaciona oprema/GPS':
        'Комуникациона опрема [Навигациона мапа (ГПС)]',
        '2.7.3.1.Komunikaciona oprema/Aplikacija':
        'Комуникациона опрема [Опрема за резервацију вожњи (апликација, радио веза, ...)]',
        '2.7.3.2.Komunikaciona oprema/Radio veza':
        'Комуникациона опрема [Флексибилно плаћање (новац, картица)]',
        '2.7.4.1.Bezbednost i dizajn/Starost vozila':
        'Безбедност и дизајн [Старост возила]',
        # NOTE: the source survey apparently labels 2.7.4.2 'Starost vozila'
        # as well; by position it corresponds to the vehicle-equipment question.
        '2.7.4.2.Bezbednost i dizajn/Starost vozila':
        'Безбедност и дизајн [Опрема у возилу (airbag,АБС ...)]',
        '2.7.4.3.Bezbednost i dizajn/Tip, marka i boja':
        'Безбедност и дизајн [Тип/марка и боја возила]',
        '2.7.5.1.Ekološka podobnost/Nivo buke':
        'Еколошка подобност [Ниво буке]',
        '2.7.5.2.Ekološka podobnost/Nivo aerozagadjenja':
        'Еколошка подобност [Ниво  аерозагађења]',
        '2.7.5.3.Ekološka podobnost/Čista pogonska energija':
        'Еколошка подобност [Чиста погонска енергија]'
    }

    ### Remap textual answers to numbers (driver answers are assumed to be
    ### numeric already; they are coerced with pd.to_numeric further below)
    exp = remap_answers_tx(exp)
    crd = remap_answers_tx(crd)

    ### Rename columns to the canonical question labels
    drv = drv.rename(columns=driver_rename_dct)
    crd = crd.rename(columns=crd_rename_dict)

    ### Select only the attributes of interest; take copies so the id columns
    ### added next do not trigger SettingWithCopyWarning
    e = exp[questions].copy()
    d = drv[questions].copy()
    c = crd[questions].copy()

    #### Create an id for every respondent
    e['id'] = e.index
    d['id'] = d.index
    c['id'] = c.index

    #### create transactional data
    exp_trans = pd.melt(e,
                        id_vars=['id'],
                        value_vars=questions,
                        var_name='question',
                        value_name='rate')

    drv_trans = pd.melt(d,
                        id_vars=['id'],
                        value_vars=questions,
                        var_name='question',
                        value_name='rate')

    crd_trans = pd.melt(c,
                        id_vars=['id'],
                        value_vars=questions,
                        var_name='question',
                        value_name='rate')

    #### create id for each question
    exp_trans['question_id'] = exp_trans.groupby('question').ngroup()

    #### create question map (id to original text)
    question_map = exp_trans[['question_id',
                              'question']].drop_duplicates().reset_index(drop=True)
    drv_trans = pd.merge(drv_trans, question_map, on='question')
    drv_trans['rate'] = pd.to_numeric(drv_trans['rate'], errors='coerce')

    crd_trans = pd.merge(crd_trans, question_map, on='question')
    crd_trans['rate'] = pd.to_numeric(crd_trans['rate'], errors='coerce')

    ### get all alternative names (ids)
    alt_names = list(question_map['question_id'].sort_values())

    # Create voter names, tagging each voter with its group; the '_expert' and
    # '_crowd' suffixes are used later to split ids by group
    exp_trans['voter'] = 'traffic_' + exp_trans['id'].astype(str) + '_expert'
    drv_trans['voter'] = 'driver_' + drv_trans['id'].astype(str) + '_expert'
    crd_trans['voter'] = crd_trans['id'].astype(str) + '_crowd'

    #### Select the attributes for analysis; copies avoid chained-assignment warnings
    df_expert = exp_trans[['voter', 'question_id', 'rate']].copy()
    df_crowd = crd_trans[['voter', 'question_id', 'rate']].copy()
    df_driver = drv_trans[['voter', 'question_id', 'rate']].copy()

    df_expert['rate'] = df_expert['rate'].astype('float')
    df_crowd['rate'] = df_crowd['rate'].astype('float')
    df_driver['rate'] = df_driver['rate'].astype('float')

    df_crowd = df_crowd.dropna()
    df_expert = df_expert.dropna()
    df_driver = df_driver.dropna()
    ## Keep only valid grades (the scale is 1-3); anything else is treated as an error
    df_expert = df_expert.loc[(df_expert['rate'] <= 3)
                              & (df_expert['rate'] > 0)]
    df_driver = df_driver.loc[(df_driver['rate'] <= 3)
                              & (df_driver['rate'] > 0)]
    df_crowd = df_crowd.loc[(df_crowd['rate'] <= 3) & (df_crowd['rate'] > 0)]

    all_votes = pd.concat([df_crowd, df_expert, df_driver])

    if expert_type == 'driver':
        df_selected_expert = df_driver
    else:
        df_selected_expert = df_expert

    df_expert_crowd = pd.concat([df_selected_expert, df_crowd],
                                ignore_index=True)
    #n_crowd = len(df_crowd['voter'].unique())

    ############# Aggregate data
    crowd_agg = get_aggregated_data(df_crowd,
                                    alt_names,
                                    index_column='voter',
                                    column='question_id',
                                    value='rate')
    expert_agg = get_aggregated_data(df_selected_expert,
                                     alt_names,
                                     index_column='voter',
                                     column='question_id',
                                     value='rate')
    expert_crowd_agg = get_aggregated_data(df_expert_crowd,
                                           alt_names,
                                           index_column='voter',
                                           column='question_id',
                                           value='rate')

    ############ Create user mapping

    _, _, voter_map = create_ratings_and_mapping(expert_crowd_agg,
                                                 alt_names,
                                                 voter_col='voter')

    ##### replace voters name with ids in all dataframes
    df_crowd = pd.merge(voter_map, df_crowd, how='inner',
                        on='voter').drop('voter', axis=1)
    df_expert_crowd = pd.merge(voter_map,
                               df_expert_crowd,
                               how='inner',
                               on='voter').drop('voter', axis=1)
    df_selected_expert = pd.merge(voter_map,
                                  df_selected_expert,
                                  how='inner',
                                  on='voter').drop('voter', axis=1)

    crowd_agg = pd.merge(voter_map, crowd_agg, how='inner',
                         on='voter').drop('voter', axis=1)
    # zeros produced by the aggregation mean "no vote"; mark them as missing
    cr_voter = crowd_agg['voter_id']
    crowd_agg = crowd_agg[alt_names].replace(0, np.nan)
    crowd_agg['voter_id'] = cr_voter

    expert_agg = pd.merge(voter_map, expert_agg, how='inner',
                          on='voter').drop('voter', axis=1)
    exp_voter = expert_agg['voter_id']
    expert_agg = expert_agg[alt_names].replace(0, np.nan)
    expert_agg['voter_id'] = exp_voter
    #### extract expert and crowd ids for similarity
    expert_ids = get_user_ids_from_mapping(voter_map, 'expert')
    crowd_ids = get_user_ids_from_mapping(voter_map, 'crowd')
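
    # A minimal sketch of `get_user_ids_from_mapping`, assuming it selects the
    # voter ids whose name contains the given group tag (the '_expert' /
    # '_crowd' suffixes attached above):
    #
    #     def get_user_ids_from_mapping(voter_map, group):
    #         return list(voter_map.loc[voter_map['voter'].str.contains(group),
    #                                   'voter_id'])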

    df_alt_votes = get_aggregated_data(pd.concat(
        [df_crowd, df_selected_expert]),
                                       voter_map['voter_id'],
                                       index_column='question_id',
                                       column='voter_id',
                                       value='rate')
    qu = df_alt_votes['question_id']
    df_alt_votes = df_alt_votes[crowd_ids + expert_ids].replace(0, np.nan)
    df_alt_votes['question_id'] = qu

    result_dict = {
        'question_map': question_map,
        'alt_names': alt_names,
        'df_crowd': df_crowd,
        'df_selected_expert': df_selected_expert,
        'df_driver': df_driver,
        'df_traffic': df_expert,
        'all_votes': all_votes,
        'expert_ids': expert_ids,
        'crowd_ids': crowd_ids,
        'df_alt_votes': df_alt_votes
    }

    return result_dict
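

# Hypothetical usage sketch of Load_TX_Data (key names match result_dict above):
#
#     tx = Load_TX_Data(expert_type='driver')
#     df_crowd, df_expert = tx['df_crowd'], tx['df_selected_expert']
#     print(len(tx['alt_names']), 'questions,',
#           len(tx['crowd_ids']), 'crowd voters,',
#           len(tx['expert_ids']), 'expert voters')
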
def experiment_artifical_data(df_expert_crowd, max_grade=10):

    df_expert_crowd = df_expert_crowd.rename(columns={
        'votes': 'rate',
        'group': 'voter',
        'case': 'alternative_name'
    })
    df_expert_crowd['voter'] = df_expert_crowd[
        'voter'] + '_' + df_expert_crowd['id'].astype(str)
    df_expert_crowd['rate'] = df_expert_crowd['rate'].astype('float')

    alternative_map = crete_alternatives_map(
        df_expert_crowd, alternative_name='alternative_name')
    #alt_names = list(alternative_map['alternative_id'].unique())

    voter_lookup = df_expert_crowd.copy()
    voter_lookup['voter_id'] = voter_lookup.groupby('voter').ngroup()
    voter_lookup = (voter_lookup[['voter', 'voter_id']].drop_duplicates()
                    .reset_index(drop=True).sort_values('voter_id'))

    df_expert_crowd = pd.merge(df_expert_crowd,
                               alternative_map,
                               on='alternative_name')[[
                                   'voter', 'alternative_id', 'rate'
                               ]]
    df_expert_crowd = pd.merge(voter_lookup, df_expert_crowd,
                               on='voter').drop('voter', axis=1)

    expert_ids = get_user_ids_from_mapping(voter_lookup, 'expert')
    crowd_ids = get_user_ids_from_mapping(voter_lookup, 'crowd')

    #df_expert = df_expert_crowd[df_expert_crowd['voter_id'].isin(expert_ids)]
    #df_crowd = df_expert_crowd[df_expert_crowd['voter_id'].isin(crowd_ids)]
    '''
    Optimize grade (absolute satisfaction)
    '''

    df_alt_votes = get_aggregated_data(df_expert_crowd,
                                       voter_lookup['voter_id'],
                                       index_column='alternative_id',
                                       column='voter_id',
                                       value='rate')

    result_optm_abs0 = pd.DataFrame(df_alt_votes['alternative_id'],
                                    columns=['alternative_id'])
    result_optm_abs1 = pd.DataFrame(df_alt_votes['alternative_id'],
                                    columns=['alternative_id'])

    # alpha = 0: the crowd's median grade; alpha = 1: the experts' median grade
    result_optm_abs0['optimal_grade'] = df_alt_votes[crowd_ids].apply(np.median,
                                                                      axis=1)
    result_optm_abs0['alpha'] = 0.0

    result_optm_abs1['optimal_grade'] = df_alt_votes[expert_ids].apply(np.median,
                                                                       axis=1)
    result_optm_abs1['alpha'] = 1.0

    result_optm_abs = pd.concat([result_optm_abs0, result_optm_abs1])

    result_optm_abs = calculate_satisfaction_absolute(df_alt_votes,
                                                      result_optm_abs,
                                                      max_grade, expert_ids,
                                                      crowd_ids)
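
    # A hedged sketch of the satisfaction measure assumed to be produced by
    # `calculate_satisfaction_absolute`: each voter's satisfaction with grade g
    # is taken as 1 - |vote - g| / max_grade, averaged per group to fill the
    # 'crowd_sat' and 'expert_sat' columns used below, e.g.
    #
    #     def group_satisfaction(votes, grade, max_grade):
    #         return float(np.mean(1.0 - np.abs(votes - grade) / max_grade))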

    '''
    Results
    '''
    ###### Nash solution

    cons = [{'type': 'eq', 'fun': lambda_const}]
    bnds = ((0.01, 0.99), (0.01, 0.99), (1, 10))
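
    # Assumed optimisation setup: `lambda_const` (defined elsewhere) is taken
    # to be an equality constraint forcing the two group weights to sum to 1,
    # with bounds keeping both weights in (0, 1) and the grade in [1, 10], in
    # the form consumed by scipy.optimize.minimize, e.g.
    #
    #     def lambda_const(x):          # x = (lambda_crowd, lambda_expert, grade)
    #         return x[0] + x[1] - 1.0  # == 0 when the weights sum to 1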

    res_nash = nash_results(df_alt_votes,
                            max_grade,
                            crowd_ids,
                            expert_ids,
                            cons,
                            bnds,
                            lambda_expert=0.5)

    #res_nash.to_csv('results/results_nash.csv')

    ###### Kalai solution

    res_kalai = kalai_results(df_alt_votes, result_optm_abs, max_grade,
                              crowd_ids, expert_ids)

    res_baseline = calculate_baseline_stats_satisfaction(
        df_alt_votes,
        max_grade,
        crowd_ids,
        expert_ids,
        stats=['np.mean', 'np.median', 'mode'])

    res_overall_sat = avg_satisfaction_by_group(res_kalai, res_nash,
                                                res_baseline).reset_index()

    # Ideal point: the best satisfaction each group can reach per alternative
    max_satisfaction = result_optm_abs[[
        'alternative_id', 'crowd_sat', 'expert_sat'
    ]].groupby(by='alternative_id').agg('max').reset_index()
    max_satisfaction = max_satisfaction.rename(columns={
        'crowd_sat': 'max_crowd_sat',
        'expert_sat': 'max_expert_sat'
    })
    max_satisfaction['max_satisfaction_sum'] = max_satisfaction[
        'max_crowd_sat'] + max_satisfaction['max_expert_sat']
    max_satisfaction['max_satisfaction_area'] = max_satisfaction[
        'max_crowd_sat'] * max_satisfaction['max_expert_sat']

    # Disagreement point: the worst satisfaction each group faces per alternative
    min_satisfaction = result_optm_abs[[
        'alternative_id', 'crowd_sat', 'expert_sat'
    ]].groupby(by='alternative_id').agg('min').reset_index()
    min_satisfaction = min_satisfaction.rename(columns={
        'crowd_sat': 'min_crowd_sat',
        'expert_sat': 'min_expert_sat'
    })
    min_satisfaction['min_satisfaction_sum'] = min_satisfaction[
        'min_crowd_sat'] + min_satisfaction['min_expert_sat']
    min_satisfaction['min_satisfaction_area'] = min_satisfaction[
        'min_crowd_sat'] * min_satisfaction['min_expert_sat']

    ref_satisfaction = pd.merge(max_satisfaction,
                                min_satisfaction,
                                on='alternative_id')

    res_nash = relative_detail_satisfaction_nash(res_nash, max_satisfaction)
    res_kalai = relative_detail_satisfaction_kalai(res_kalai, max_satisfaction)
    res_baseline = relative_detail_satisfaction_baseline(
        res_baseline, max_satisfaction)

    ##### Calculate gain
    res_nash['gain_ratio'] = pd.merge(
        ref_satisfaction, res_nash, on='alternative_id'
    ).apply(lambda x: np.abs(
        ((x['lambda_exp'] * x['max_expert_sat'] +
          (1 - x['lambda_exp']) * x['min_expert_sat']) / x['max_expert_sat']) -
        ((x['lambda_exp'] * x['min_crowd_sat'] +
          (1 - x['lambda_exp']) * x['max_crowd_sat']) / x['max_crowd_sat'])),
            axis=1)

    res_kalai['gain_ratio'] = pd.merge(
        ref_satisfaction, res_kalai, on='alternative_id'
    ).apply(lambda x: np.abs(
        ((x['lambda_exp'] * x['max_expert_sat'] +
          (1 - x['lambda_exp']) * x['min_expert_sat']) / x['max_expert_sat']) -
        ((x['lambda_exp'] * x['min_crowd_sat'] +
          (1 - x['lambda_exp']) * x['max_crowd_sat']) / x['max_crowd_sat'])),
            axis=1)
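
    # The gain ratio blends each group's ideal (max) and disagreement (min)
    # satisfaction with the fitted weight lambda_exp, normalises by the ideal,
    # and reports the absolute difference between the two groups.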

    ## ----------------------------------------------------------------------------

    res_relative_sat = relative_overall_satisfaction(res_nash, res_kalai,
                                                     res_baseline,
                                                     ref_satisfaction)

    #################### Result analysis: attach alternative names to the results
    res_kalai = pd.merge(alternative_map, res_kalai, on='alternative_id')
    res_nash = pd.merge(alternative_map, res_nash, on='alternative_id')
    res_baseline = pd.merge(alternative_map, res_baseline, on='alternative_id')


    return res_kalai, res_nash, res_baseline, res_overall_sat, res_relative_sat
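

# Hypothetical usage sketch of experiment_artifical_data, assuming a
# transactional frame with 'case', 'group' and 'id' columns plus a 'votes'
# column, as renamed at the top of the function ('artificial_votes.csv' is a
# placeholder file name):
#
#     df = pd.read_csv('artificial_votes.csv')
#     res_kalai, res_nash, res_baseline, res_overall_sat, res_relative_sat = \
#         experiment_artifical_data(df, max_grade=10)
#     print(res_relative_sat)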