I = [] for inspec_item, id_ in zip(inspec.values.flatten().tolist(), df.id_.values.flatten().tolist()): temp = pd.DataFrame(inspec_item, columns=['date','score','grade','inspector']) temp['id_'] = id_ temp['inspec_id'] = pd.Series(zip(temp.id_,temp.index)).apply(lambda x: '%s_%s' % (x[0],x[1])) I.append(temp) I = pd.concat(I, axis=0).reset_index(drop=True) if drop_flag: I.drop_duplicates(inplace=True) return I def get_features_NC(df, min_date, city_tag, i_cols): I = get_NC_inspections(df) R = lib.state_yelp_reviews(df, min_date, city_tag) y, x = lib.merge_inspec_dates(I, df, R, i_cols) print y.info() X = lib.summarize_reviews(x) return pd.merge(y, X, left_on=['inspec_id','business_id','id_'], right_index=True, how='outer') # -----------MAIN----------------------------- ############################################### if __name__ == '__main__': NC = open_pickle('../data/char/charlotte_yelp_merge.pkl') df_NC = get_features_NC(NC, '2011-06-30', 'charlotte', ['score','grade']) save_to_pickle(df_NC, '../data/char/charlotte_yelp_features.pkl')
I.violations.fillna('', inplace=True) I['n_violations'] = I.violations.apply(lambda x: len(x.split(',')) if len(x) > 0 else 0) I['id_'] = I.permit_number I['inspec_id'] = I.serial_number I = I[I.permit_number.isin(df.permit_number.unique())] if drop_flag: I.drop_duplicates(inplace=True) return I[I.permit_number.isin(df.permit_number.unique())] def get_features_NV(df, min_date, city_tag, i_cols): if 'id_' not in df.columns: df['id_'] = df.permit_number I = get_NV_inspections(df) R = lib.state_yelp_reviews(df, min_date, city_tag) y, x = lib.merge_inspec_dates(I, df, R, i_cols) X = lib.summarize_reviews(x) return pd.merge(y, X, left_on=['inspec_id','business_id','id_'], right_index=True, how='inner') # -----------MAIN----------------------------- ############################################### if __name__ == '__main__': NV = open_pickle('../data/vegas/vegas_yelp_merge.pkl') df_NV = get_features_NV(NV, '1989-07-01', 'vegas', ['demerits','grade', 'n_violations']) save_to_pickle(df_NV, '../data/vegas/vegas_yelp_features.pkl')
V = pd.concat(V, axis=0) V['cdc risk factor'].fillna('', inplace=True) V['critical'] = (~V['cdc risk factor'].isin(['','good retail practice'])).astype(int) df['n_violations'] = V.groupby(['id_','inspec_id']).count().reset_index(level=0).critical df['n_critical'] = V.groupby(['id_','inspec_id']).sum().reset_index(level=0).critical df['n_critical'].fillna(0, inplace=True) df['n_violations'].fillna(0, inplace=True) return V, df def get_features_WI(df, min_date, city_tag, i_cols): I = get_WI_inspections(df) V, I = get_WI_violations(I) R = lib.state_yelp_reviews(df, min_date, city_tag) y, x = lib.merge_inspec_dates(I, df, R, i_cols) X = lib.summarize_reviews(x) return pd.merge(y, X, left_on=['inspec_id','business_id','id_'], right_index=True, how='inner') # -----------MAIN----------------------------- ############################################### if __name__ == '__main__': WI = open_pickle('../data/mad/madison_yelp_merge.pkl') df_WI = get_features_WI(WI, '2011-06-30', 'madison', ['n_critical', 'n_violations']) save_to_pickle(df_WI, '../data/mad/madison_yelp_features.pkl')