#df_crowd.dtypes
#type(alt_names[0])
df_expert = prepare_expert_data(data_folder, alternative_map)
df_expert_crowd = pd.concat([df_expert, df_crowd], ignore_index=True)
n_crowd = len(df_crowd['voter'].unique())
#df_expert_crowd['voter'].value_counts()
#a = df_crowd.sort_values(['voter', 'vote'], ascending=(True, True))
#df_crowd[df_crowd[['voter', 'vote']].duplicated()]

############# Aggregate data
crowd_agg = get_aggregated_data(df_crowd, alt_names)
expert_agg = get_aggregated_data(df_expert, alt_names)
#expert_agg = aggregate_experts(expert_agg[alt_names], points_rank, team_size, alt_names)
expert_crowd_agg = get_aggregated_data(df_expert_crowd, alt_names)

######## build the ratings matrix and the id mappings for voters and alternatives
#df_sparse = convert_data_to_sparse_and_create_mapping(df_expert_crowd)[0]
ratings, alts_lookup, voters_lookup = create_ratings_and_mapping(
    expert_crowd_agg, alt_names)
train, test = train_test_split(ratings, mask_test_size)
#print(df_sparse)

# Check sparsity of the ratings matrix
print("Sparsity of data is: {:.2f} %.".format(calculate_sparsity(ratings)))
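# For reference, minimal sketches of the two evaluation helpers used above.
# These are assumptions inferred from the call sites (the project's real
# implementations may differ): calculate_sparsity reports the share of
# unobserved entries, and train_test_split hides a fixed number of observed
# ratings per voter so the factorization can be evaluated on held-out votes.
import numpy as np

def calculate_sparsity_sketch(ratings):
    """Percentage of zero (i.e. unobserved) entries in the voter x alternative matrix."""
    return 100.0 * np.count_nonzero(ratings == 0) / ratings.size

def train_test_split_sketch(ratings, mask_test_size, seed=42):
    """Zero out `mask_test_size` observed entries per voter in train; keep them in test."""
    rng = np.random.default_rng(seed)
    train = ratings.copy()
    test = np.zeros_like(ratings)
    for voter in range(ratings.shape[0]):
        observed = ratings[voter].nonzero()[0]
        if len(observed) <= mask_test_size:
            continue  # too few votes to hold anything out for this voter
        held_out = rng.choice(observed, size=mask_test_size, replace=False)
        train[voter, held_out] = 0.0
        test[voter, held_out] = ratings[voter, held_out]
    return train, test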
df_journal['rate'] = df_journal['rate'].astype('float')
df_selected_expert = df_journal  # alternative: df_science
# expert_type = 'journal'  # alternative: 'science'
#alts_dict = dict(zip(alternative_map['alternative_id'], alternative_map['alternative_name']))

#### transactional data of experts and of crowd members who labeled the same alternatives
df_expert_crowd = pd.concat([df_selected_expert, df_crowd], ignore_index=True)
#n_crowd = len(df_crowd['voter'].unique())

############# Aggregate data
#crowd_agg = get_aggregated_data(df_crowd, alt_names)
#expert_agg = get_aggregated_data(df_selected_expert, alt_names)
expert_crowd_agg = get_aggregated_data(df_expert_crowd, alt_names)

''' Create data for factorization '''
_, alts_lookup, voters_lookup = create_ratings_and_mapping(expert_crowd_agg, alt_names)
#voter_dict = dict(zip(voters_lookup['voter_id'], voters_lookup['voter']))

##### replace voter names with ids in all dataframes
# df_crowd = pd.merge(voters_lookup, df_crowd, how='inner', on='voter').drop('voter', axis=1)
# df_expert_crowd = pd.merge(voters_lookup, df_expert_crowd, how='inner', on='voter').drop('voter', axis=1)
# df_selected_expert = pd.merge(voters_lookup, df_selected_expert, how='inner', on='voter').drop('voter', axis=1)
# crowd_agg = pd.merge(voters_lookup, crowd_agg, how='inner', on='voter').drop('voter', axis=1)
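# For context, sketches of what the two helpers called above are assumed to do,
# with names and signatures taken from their call sites here (the real
# implementations live elsewhere in the project): get_aggregated_data pivots
# transactional votes into one row per voter with one column per alternative,
# and create_ratings_and_mapping converts that table into a dense ratings
# matrix plus integer-id lookup tables.
import numpy as np
import pandas as pd

def get_aggregated_data_sketch(df, alt_names, index_column='voter',
                               column='vote', value='rate'):
    # voter x alternative pivot; unobserved pairs become 0
    agg = df.pivot_table(index=index_column, columns=column, values=value,
                         aggfunc='mean', fill_value=0)
    return agg.reindex(columns=alt_names, fill_value=0).reset_index()

def create_ratings_and_mapping_sketch(agg, alt_names, voter_col='voter'):
    voters_lookup = pd.DataFrame({voter_col: agg[voter_col],
                                  'voter_id': range(len(agg))})
    alts_lookup = pd.DataFrame({'alternative_id': alt_names})
    ratings = agg[alt_names].to_numpy(dtype=float)  # dense voter x alternative matrix
    return ratings, alts_lookup, voters_lookup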
def Load_TX_Data(expert_type):
    ### Load Data
    exp = pd.read_excel('TX_data/BG/Eksperti_Anketa_BG.xlsx',
                        sheet_name='OdgovoriIzUpitnika')
    drv = pd.read_excel('TX_data/BG/Vozaci_Anketa_BG.xlsx',
                        sheet_name='OdgovoriIzUpitnika')
    crd = pd.read_excel('TX_data/BG/Korisnici_Anketa_BG.xlsx',
                        sheet_name='OdgovoriIzUpitnika')

    ### List of questions needed for analysis
    questions = [
        'Карактеристике возила [Тип/каросерија возила]',
        'Карактеристике возила [Димензије (ширина врата, гепек...)]',
        'Карактеристике возила [Лак улазак/излазак]',
        'Комфор у возилу [Удобност седишта]',
        'Комфор у возилу [Климатизација и грејање]',
        'Комфор у возилу [Чистоћа возила (спољашњост и унутрашњост)]',
        'Комуникациона опрема [Навигациона мапа (ГПС)]',
        'Комуникациона опрема [Флексибилно плаћање (новац, картица)]',
        'Комуникациона опрема [Опрема за резервацију вожњи (апликација, радио веза, ...)]',
        'Безбедност и дизајн [Старост возила]',
        'Безбедност и дизајн [Опрема у возилу (airbag,АБС ...)]',
        'Безбедност и дизајн [Тип/марка и боја возила]',
        'Еколошка подобност [Ниво буке]',
        'Еколошка подобност [Ниво аерозагађења]',
        'Еколошка подобност [Чиста погонска енергија]'
    ]

    #### dict for renaming questions of the crowd questionnaire
    crd_rename_dict = {
        '15.1.Карактеристике возила [Тип/каросерија возила]':
            'Карактеристике возила [Тип/каросерија возила]',
        '15.2.Карактеристике возила [Димензије (ширина врата, гепек...)]':
            'Карактеристике возила [Димензије (ширина врата, гепек...)]',
        '15.3.Карактеристике возила [Лак улазак/излазак]':
            'Карактеристике возила [Лак улазак/излазак]',
        '15.4.Комфор у возилу [Удобност седишта]':
            'Комфор у возилу [Удобност седишта]',
        '15.5.Комфор у возилу [Климатизација и грејање]':
            'Комфор у возилу [Климатизација и грејање]',
        '15.6.Комфор у возилу [Чистоћа возила (спољашњост и унутрашњост)]':
            'Комфор у возилу [Чистоћа возила (спољашњост и унутрашњост)]',
        '15.7.Комуникациона опрема [Навигациона мапа (ГПС)]':
            'Комуникациона опрема [Навигациона мапа (ГПС)]',
        '15.8.Комуникациона опрема [Прикључак за мобилни телефон]':
            'Комуникациона опрема [Опрема за резервацију вожњи (апликација, радио веза, ...)]',
        '15.9.Комуникациона опрема [Флексибилно плаћање (новац, картица)]':
            'Комуникациона опрема [Флексибилно плаћање (новац, картица)]',
        '15.10.Безбедност и дизајн [Старост возила]':
            'Безбедност и дизајн [Старост возила]',
        '15.11.Безбедност и дизајн [Опрема у возилу (појас, airbag, ...)]':
            'Безбедност и дизајн [Опрема у возилу (airbag,АБС ...)]',
        # NOTE: key kept verbatim from the export, although the numbering suggests '15.12.'
        '15.2.Безбедност и дизајн [Тип/марка и боја возила]':
            'Безбедност и дизајн [Тип/марка и боја возила]',
        '15.13.Еколошка подобност [Ниво буке]':
            'Еколошка подобност [Ниво буке]',
        '15.14.Еколошка подобност [Ниво аерозагађења]':
            'Еколошка подобност [Ниво аерозагађења]',
        '15.15.Еколошка подобност [Чиста погонска енергија]':
            'Еколошка подобност [Чиста погонска енергија]'
    }

    ## dict for renaming questions of the driver questionnaire
    driver_rename_dct = {
        '2.7.1.1.Pristup u vozilo/Tip,karosterija':
            'Карактеристике возила [Тип/каросерија возила]',
        '2.7.1.2.Pristup u vozilo/Dimenzije':
            'Карактеристике возила [Димензије (ширина врата, гепек...)]',
        '2.7.1.3.Pristup u vozilo/Lak Ulazak izlazak':
            'Карактеристике возила [Лак улазак/излазак]',
        '2.7.2.1.Komfor u vozilu/Udobnost sedišta':
            'Комфор у возилу [Удобност седишта]',
        '2.7.2.2.Komfor u vozilu/Klimatizacija i grejanje':
            'Комфор у возилу [Климатизација и грејање]',
        '2.7.2.3.Komfor u vozilu/Čistoća vozila':
            'Комфор у возилу [Чистоћа возила (спољашњост и унутрашњост)]',
        '2.7.3.3.Komunikaciona oprema/GPS':
            'Комуникациона опрема [Навигациона мапа (ГПС)]',
        '2.7.3.1.Komunikaciona oprema/Aplikacija':
            'Комуникациона опрема [Опрема за резервацију вожњи (апликација, радио веза, ...)]',
        '2.7.3.2.Komunikaciona oprema/Radio veza':
            'Комуникациона опрема [Флексибилно плаћање (новац, картица)]',
        '2.7.4.1.Bezbednost i dizajn/Starost vozila':
            'Безбедност и дизајн [Старост возила]',
        # NOTE: key kept verbatim; the '.../Starost vozila' label repeats in the
        # export while the target here is the equipment question.
        '2.7.4.2.Bezbednost i dizajn/Starost vozila':
            'Безбедност и дизајн [Опрема у возилу (airbag,АБС ...)]',
        '2.7.4.3.Bezbednost i dizajn/Tip, marka i boja':
            'Безбедност и дизајн [Тип/марка и боја возила]',
        '2.7.5.1.Ekološka podobnost/Nivo buke':
            'Еколошка подобност [Ниво буке]',
        '2.7.5.2.Ekološka podobnost/Nivo aerozagadjenja':
            'Еколошка подобност [Ниво аерозагађења]',
        '2.7.5.3.Ekološka podobnost/Čista pogonska energija':
            'Еколошка подобност [Чиста погонска енергија]'
    }

    ### remap textual answers to numbers (driver answers are coerced to numeric
    ### after melting, below)
    exp = remap_answers_tx(exp)
    crd = remap_answers_tx(crd)

    ### rename columns
    drv = drv.rename(columns=driver_rename_dct)
    crd = crd.rename(columns=crd_rename_dict)

    ### select only attributes of interest (.copy() avoids SettingWithCopyWarning
    ### when adding the id column below)
    e = exp[questions].copy()
    d = drv[questions].copy()
    c = crd[questions].copy()

    #### create an id for every user
    e['id'] = e.index
    d['id'] = d.index
    c['id'] = c.index

    #### create transactional data
    exp_trans = pd.melt(e, id_vars=['id'], value_vars=questions,
                        var_name='question', value_name='rate')
    drv_trans = pd.melt(d, id_vars=['id'], value_vars=questions,
                        var_name='question', value_name='rate')
    crd_trans = pd.melt(c, id_vars=['id'], value_vars=questions,
                        var_name='question', value_name='rate')

    #### create an id for each question
    exp_trans['question_id'] = exp_trans.groupby('question').ngroup()

    #### create question map (id to original text)
    question_map = exp_trans[['question_id', 'question']].drop_duplicates().reset_index(drop=True)
    #pd.Series(question_map.question_id.values, index=question_map.question).to_dict()

    drv_trans = pd.merge(drv_trans, question_map, on='question')
    drv_trans['rate'] = pd.to_numeric(drv_trans['rate'], errors='coerce')
    crd_trans = pd.merge(crd_trans, question_map, on='question')
    crd_trans['rate'] = pd.to_numeric(crd_trans['rate'], errors='coerce')

    ### get all alternative names (ids)
    alt_names = list(question_map['question_id'].sort_values())

    ### create voter names (encoding the type of each voter)
    exp_trans['voter'] = exp_trans.apply(
        lambda x: 'traffic_' + str(x['id']) + '_expert', axis=1)
    drv_trans['voter'] = drv_trans.apply(
        lambda x: 'driver_' + str(x['id']) + '_expert', axis=1)
    crd_trans['voter'] = crd_trans.apply(lambda x: str(x['id']) + '_crowd', axis=1)

    #### select attributes for analysis
    df_expert = exp_trans[['voter', 'question_id', 'rate']].copy()
    df_crowd = crd_trans[['voter', 'question_id', 'rate']].copy()
    df_driver = drv_trans[['voter', 'question_id', 'rate']].copy()
    df_expert['rate'] = df_expert['rate'].astype('float')
    df_crowd['rate'] = df_crowd['rate'].astype('float')
    df_driver['rate'] = df_driver['rate'].astype('float')

    ## remove missing values and filter possible errors (valid grades are 1-3)
    df_crowd = df_crowd.dropna()
    df_expert = df_expert.dropna()
    df_driver = df_driver.dropna()
    df_expert = df_expert.loc[(df_expert['rate'] <= 3) & (df_expert['rate'] > 0)]
    df_driver = df_driver.loc[(df_driver['rate'] <= 3) & (df_driver['rate'] > 0)]
    df_crowd = df_crowd.loc[(df_crowd['rate'] <= 3) & (df_crowd['rate'] > 0)]

    # DataFrame.append is removed in pandas 2.x; pd.concat is the replacement
    all_votes = pd.concat([df_crowd, df_expert, df_driver], ignore_index=True)

    if expert_type == 'driver':
        df_selected_expert = df_driver
    else:
        df_selected_expert = df_expert

    df_expert_crowd = pd.concat([df_selected_expert, df_crowd], ignore_index=True)
    #n_crowd = len(df_crowd['voter'].unique())

    ############# Aggregate data
    crowd_agg = get_aggregated_data(df_crowd, alt_names, index_column='voter',
                                    column='question_id', value='rate')
    expert_agg = get_aggregated_data(df_selected_expert, alt_names,
                                     index_column='voter',
                                     column='question_id', value='rate')
    expert_crowd_agg = get_aggregated_data(df_expert_crowd, alt_names,
                                           index_column='voter',
                                           column='question_id', value='rate')

    ############ Create user mapping
    _, _, voter_map = create_ratings_and_mapping(expert_crowd_agg, alt_names,
                                                 voter_col='voter')

    ##### replace voter names with ids in all dataframes
    df_crowd = pd.merge(voter_map, df_crowd, how='inner', on='voter').drop('voter', axis=1)
    df_expert_crowd = pd.merge(voter_map, df_expert_crowd, how='inner', on='voter').drop('voter', axis=1)
    df_selected_expert = pd.merge(voter_map, df_selected_expert, how='inner', on='voter').drop('voter', axis=1)

    crowd_agg = pd.merge(voter_map, crowd_agg, how='inner', on='voter').drop('voter', axis=1)
    cr_voter = crowd_agg['voter_id']
    crowd_agg = crowd_agg[alt_names].replace(0, np.nan)
    crowd_agg['voter_id'] = cr_voter

    expert_agg = pd.merge(voter_map, expert_agg, how='inner', on='voter').drop('voter', axis=1)
    exp_voter = expert_agg['voter_id']
    expert_agg = expert_agg[alt_names].replace(0, np.nan)
    expert_agg['voter_id'] = exp_voter

    #### extract expert and crowd ids for similarity
    expert_ids = get_user_ids_from_mapping(voter_map, 'expert')
    crowd_ids = get_user_ids_from_mapping(voter_map, 'crowd')

    df_alt_votes = get_aggregated_data(pd.concat([df_crowd, df_selected_expert]),
                                       voter_map['voter_id'],
                                       index_column='question_id',
                                       column='voter_id', value='rate')
    qu = df_alt_votes['question_id']
    df_alt_votes = df_alt_votes[crowd_ids + expert_ids].replace(0, np.nan)
    df_alt_votes['question_id'] = qu

    result_dict = {
        'question_map': question_map,
        'alt_names': alt_names,
        'df_crowd': df_crowd,
        'df_selected_expert': df_selected_expert,
        'df_driver': df_driver,
        'df_traffic': df_expert,
        'all_votes': all_votes,
        'expert_ids': expert_ids,
        'crowd_ids': crowd_ids,
        'df_alt_votes': df_alt_votes
    }
    return result_dict
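# Usage sketch (hypothetical; assumes the TX_data/BG Excel exports exist relative
# to the working directory and that the helper functions above are in scope):
# tx = Load_TX_Data(expert_type='driver')
# print(tx['question_map'])
# print('experts:', len(tx['expert_ids']), 'crowd voters:', len(tx['crowd_ids']))
# print('alternatives (question ids):', tx['alt_names'])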
def experiment_artifical_data(df_expert_crowd, max_grade=10):
    # (function name kept as spelled at its call sites)
    df_expert_crowd = df_expert_crowd.rename(columns={
        'votes': 'rate',
        'group': 'voter',
        'case': 'alternative_name'
    })
    df_expert_crowd['voter'] = (df_expert_crowd['voter'] + '_' +
                                df_expert_crowd['id'].astype(str))
    df_expert_crowd['rate'] = df_expert_crowd['rate'].astype('float')

    alternative_map = crete_alternatives_map(df_expert_crowd,
                                             alternative_name='alternative_name')
    #alt_names = list(alternative_map['alternative_id'].unique())

    voter_lookup = df_expert_crowd.copy()
    voter_lookup['voter_id'] = voter_lookup.groupby('voter').ngroup()
    voter_lookup = voter_lookup[['voter', 'voter_id']].drop_duplicates().reset_index(drop=True)
    voter_lookup = voter_lookup.sort_values('voter_id')

    df_expert_crowd = pd.merge(df_expert_crowd, alternative_map,
                               on='alternative_name')[['voter', 'alternative_id', 'rate']]
    df_expert_crowd = pd.merge(voter_lookup, df_expert_crowd, on='voter').drop('voter', axis=1)

    expert_ids = get_user_ids_from_mapping(voter_lookup, 'expert')
    crowd_ids = get_user_ids_from_mapping(voter_lookup, 'crowd')
    #df_expert = df_expert_crowd[df_expert_crowd['voter_id'].isin(expert_ids)]
    #df_crowd = df_expert_crowd[df_expert_crowd['voter_id'].isin(crowd_ids)]

    ''' Optimize grade absolute '''
    df_alt_votes = get_aggregated_data(df_expert_crowd, voter_lookup['voter_id'],
                                       index_column='alternative_id',
                                       column='voter_id', value='rate')

    result_optm_abs0 = pd.DataFrame(df_alt_votes['alternative_id'],
                                    columns=['alternative_id'])
    result_optm_abs1 = pd.DataFrame(df_alt_votes['alternative_id'],
                                    columns=['alternative_id'])
    # the crowd optimum (alpha = 0) and the expert optimum (alpha = 1) are the
    # per-alternative medians of the respective groups' votes
    result_optm_abs0['optimal_grade'] = df_alt_votes[crowd_ids].apply(
        lambda x: np.median(x), axis=1)
    result_optm_abs0['alpha'] = 0.0
    result_optm_abs1['optimal_grade'] = df_alt_votes[expert_ids].apply(
        lambda x: np.median(x), axis=1)
    result_optm_abs1['alpha'] = 1.0
    result_optm_abs = pd.concat([result_optm_abs0, result_optm_abs1])
    result_optm_abs = calculate_satisfaction_absolute(df_alt_votes, result_optm_abs,
                                                      max_grade, expert_ids, crowd_ids)
    # del result_optm_abs0
    # del result_optm_abs1

    ''' ################################ Results '''
    ###### Nash bargaining solution
    cons = [{'type': 'eq', 'fun': lambda_const}]
    bnds = ((0.01, 0.99), (0.01, 0.99), (1, 10))
    res_nash = nash_results(df_alt_votes, max_grade, crowd_ids, expert_ids,
                            cons, bnds, lambda_expert=0.5)
    #res_nash.to_csv('results/results_nash.csv')

    ###### Kalai-Smorodinsky solution
    res_kalai = kalai_results(df_alt_votes, result_optm_abs, max_grade,
                              crowd_ids, expert_ids)

    res_baseline = calculate_baseline_stats_satisfaction(
        df_alt_votes, max_grade, crowd_ids, expert_ids,
        stats=['np.mean', 'np.median', 'mode'])
    res_overal_sat = avg_satisfaction_by_group(res_kalai, res_nash,
                                               res_baseline).reset_index()

    max_satisfaction = result_optm_abs[['alternative_id', 'crowd_sat', 'expert_sat']] \
        .groupby(by='alternative_id').agg('max').reset_index()
    max_satisfaction = max_satisfaction.rename(columns={
        'crowd_sat': 'max_crowd_sat',
        'expert_sat': 'max_expert_sat'
    })
    max_satisfaction['max_satisfaction_sum'] = (
        max_satisfaction['max_crowd_sat'] + max_satisfaction['max_expert_sat'])
    max_satisfaction['max_satisfaction_area'] = (
        max_satisfaction['max_crowd_sat'] * max_satisfaction['max_expert_sat'])

    min_satisfaction = result_optm_abs[['alternative_id', 'crowd_sat', 'expert_sat']] \
        .groupby(by='alternative_id').agg('min').reset_index()
    min_satisfaction = min_satisfaction.rename(columns={
        'crowd_sat': 'min_crowd_sat',
        'expert_sat': 'min_expert_sat'
    })
    min_satisfaction['min_satisfaction_sum'] = (
        min_satisfaction['min_crowd_sat'] + min_satisfaction['min_expert_sat'])
    min_satisfaction['min_satisfaction_area'] = (
        min_satisfaction['min_crowd_sat'] * min_satisfaction['min_expert_sat'])

    ref_satisfaction = pd.merge(max_satisfaction, min_satisfaction, on='alternative_id')

    res_nash = relative_detail_satisfaction_nash(res_nash, max_satisfaction)
    res_kalai = relative_detail_satisfaction_kalai(res_kalai, max_satisfaction)
    res_baseline = relative_detail_satisfaction_baseline(res_baseline, max_satisfaction)

    ##### Calculate gain: the absolute gap between the experts' and the crowd's
    ##### lambda-weighted relative satisfaction (identical formula for both solutions)
    def _gain_ratio(x):
        expert_part = (x['lambda_exp'] * x['max_expert_sat'] +
                       (1 - x['lambda_exp']) * x['min_expert_sat']) / x['max_expert_sat']
        crowd_part = (x['lambda_exp'] * x['min_crowd_sat'] +
                      (1 - x['lambda_exp']) * x['max_crowd_sat']) / x['max_crowd_sat']
        return np.abs(expert_part - crowd_part)

    res_nash['gain_ratio'] = pd.merge(ref_satisfaction, res_nash,
                                      on='alternative_id').apply(_gain_ratio, axis=1)
    res_kalai['gain_ratio'] = pd.merge(ref_satisfaction, res_kalai,
                                       on='alternative_id').apply(_gain_ratio, axis=1)

    ## ----------------------------------------------------------------------
    # res_relative_sat_ext = relative_overall_satisfaction(res_nash_extreme, res_kalai_extreme, res_baseline_extreme, max_satisfaction)
    res_relative_sat = relative_overall_satisfaction(res_nash, res_kalai,
                                                     res_baseline, ref_satisfaction)

    #################### Result analysis - lower uncertainty
    #df_crowd_sample = df_crowd.groupby('vote', group_keys=False).apply(lambda x: x.sample(min(len(x), 3)))
    res_kalai = pd.merge(alternative_map, res_kalai, on='alternative_id')
    res_nash = pd.merge(alternative_map, res_nash, on='alternative_id')
    res_baseline = pd.merge(alternative_map, res_baseline, on='alternative_id')
    # res_kalai_extreme = pd.merge(alternative_map, res_kalai_extreme, on='alternative_id')
    # res_nash_extreme = pd.merge(alternative_map, res_nash_extreme, on='alternative_id')

    return res_kalai, res_nash, res_baseline, res_overal_sat, res_relative_sat
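# Usage sketch for the artificial-data experiment with a hypothetical synthetic
# input frame. Column names follow the rename map at the top of the function:
# 'case' is the alternative, 'group' ('expert'/'crowd') plus 'id' identify a
# voter, and 'votes' is the grade on a 1..max_grade scale. The call itself is
# commented out because it depends on the optimization helpers defined elsewhere
# in the project (nash_results, kalai_results, lambda_const, ...).
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
synthetic_votes = pd.DataFrame({
    'case': np.repeat(['case_A', 'case_B'], 20),
    'group': np.tile(np.repeat(['expert', 'crowd'], 10), 2),
    'id': np.tile(np.arange(10), 4),
    'votes': rng.integers(1, 11, size=40),  # grades on the 1..10 scale
})
# res_kalai, res_nash, res_baseline, res_overal_sat, res_relative_sat = \
#     experiment_artifical_data(synthetic_votes, max_grade=10)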