def get_df(cols, load_from_temp, temp_path):
    """
    Build (or reload from a CSV cache) the training Dataframe and load the
    weather Dataframe.

    Parameters
    ==========
    cols: list of str, names of the features to create with FeatureFactory.
        'ASS_ASSIGNMENT', 'DATE' and 'CSPL_RECEIVED_CALLS' are always part of
        the selected columns, whether listed here or not. The list passed in
        is never modified.
    load_from_temp: bool, if True the training Dataframe is read back from
        `temp_path` instead of being rebuilt (`cols` is then ignored).
    temp_path: str or None, path of the CSV cache. Required when
        `load_from_temp` is True; when rebuilding, the result is written
        there unless it is None.

    Returns
    =======
    (train_df, weather_df): the training Dataframe and the weather Dataframe
        (with its index reset to a regular column).
    """
    weather_df = load_weather_df(CONFIG.preprocessed_meteo_path_complete)

    if load_from_temp:
        assert temp_path is not None, \
            'temp_path is required when load_from_temp is True'
        logger.info('Loading train Dataframe...')
        train_df = pd.read_csv(temp_path, encoding='latin-1', index_col=0,
                               parse_dates=['DATE'])
    else:
        # Work on a copy: the mandatory columns are added below and the
        # caller's list must not be altered as a side effect.
        cols = list(cols)

        logger.info('Loading train Dataframe...')
        train_df = load_train_df(CONFIG.preprocessed_train_path_means)
        logger.info('Loading weather Dataframe...')

        logger.info('Creating features...')
        ff = FeatureFactory(train_df, weather_df)
        for col in cols:
            logger.info('Creating %s feature...', col)
            ff(col)

        # Mandatory columns: 'DATE' and 'ASS_ASSIGNMENT' go first, the
        # 'CSPL_RECEIVED_CALLS' column goes last.
        if 'ASS_ASSIGNMENT' not in cols:
            cols = ['ASS_ASSIGNMENT'] + cols
        if 'DATE' not in cols:
            cols = ['DATE'] + cols
        if 'CSPL_RECEIVED_CALLS' not in cols:
            cols = cols + ['CSPL_RECEIVED_CALLS']

        logger.info('Selecting features...')
        ff.select_features(cols)
        train_df = ff.X

        if temp_path is not None:
            # Written with the same encoding the cache is read back with
            # above, so that non-ASCII text survives the round trip.
            train_df.to_csv(temp_path, encoding='latin-1')

    weather_df.reset_index(inplace=True)
    return train_df, weather_df
def complete_meteo_with_zeros(in_path, out_path=None):
    """
    Complete the weather Dataframe so that it holds one row per day from
    2011-01-01 to 2012-12-31 (both included).

    Despite the function name, missing values of 'NUMB_FROZEN_DEPT' and
    'NUMB_WET_DEPT' are filled with the mean of each column computed on the
    input data, not with zeros.

    Parameters
    ==========
    in_path: path handed to `load_weather_df` to load the weather Dataframe.
        The loaded Dataframe is expected to expose 'DATE' once its index is
        reset.
    out_path: str or None, where the completed Dataframe is saved as CSV.
        None to skip saving.

    Returns
    =======
    The completed weather Dataframe, indexed by 'DATE'.
    """
    filled_columns = ['NUMB_FROZEN_DEPT', 'NUMB_WET_DEPT']

    logger.debug('Loading Dataframe...')
    weather_df = load_weather_df(in_path)
    # Means are computed before the merge, on the observed days only, and
    # only for the columns that are actually filled.
    means = weather_df[filled_columns].mean()
    weather_df.reset_index(inplace=True)

    logger.debug('Generating empty Dataframe...')
    first_day = date(2011, 1, 1)
    last_day = date(2012, 12, 31)
    n_days = (last_day - first_day).days + 1
    dates = [first_day + td(days=offset) for offset in range(n_days)]
    # Same dtype as the weather 'DATE' column so that the merge keys match.
    zero_df = pd.DataFrame({'DATE': dates}, dtype=weather_df.dtypes['DATE'])

    logger.debug('Completing Dataframe...')
    weather_df = zero_df.merge(weather_df, how='left', on='DATE')
    for column in filled_columns:
        # Explicit assignment: an in-place fillna on a column selection is
        # not guaranteed to update the parent Dataframe.
        weather_df[column] = weather_df[column].fillna(means[column])
    weather_df.set_index('DATE', inplace=True)

    if out_path is not None:
        logger.debug('Saving Dataframe...')
        weather_df.to_csv(out_path)
    return weather_df
def compare_calls(scale, out_path, assignments=None, datetime=None):
    """
    Plot the number of calls to compare them.

    Parameters
    ==========
    scale: 'DATETIME', 'DAY', 'WEEK' or 'YEAR', calls are averaged on all
        smaller scales, and plotted for larger scales (the 'DAY' scale sums
        the calls instead of averaging them).
    out_path: str, folder in which figures should be saved.
    assignments: str or list of str, assignments to take into account.
        None to take all columns into account.
    datetime: if 'DATETIME', the datetime to filter on. Only its
        `isoweekday()`, `hour` and `minute` are used. Note that this
        parameter shadows the `datetime` module inside the function.

    Figures
    =======
    'DATETIME': one file, `<scale>.jpg`, with one curve per assignment plus
        the two weather curves for the same week day.
    'DAY', 'WEEK': one file per assignment, `<scale>_<assignment>.jpg`.
    'YEAR': one file, `<scale>_absolute_values.jpg`.

    Example
    =======
    Week comparison:
    For each day of the week, take the average number of calls,
    then compare for each week of the year.
    """
    assert scale in ['DATETIME', 'DAY', 'WEEK', 'YEAR'], \
        'unknown scale: %r' % (scale,)
    if assignments is not None:
        if isinstance(assignments, str):
            assignments = [assignments]
        assert not set(assignments).difference(CONFIG.submission_assignments), \
            'unknown assignment(s) requested'
    else:
        assignments = CONFIG.submission_assignments

    df = load_train_df(CONFIG.preprocessed_train_path)
    df = df[df["ASS_ASSIGNMENT"].isin(assignments)]
    # if remove_days_off:
    #     df = df[df["DAY_OFF"] == 0]
    # df.drop("DAY_OFF", axis=1, inplace=True)
    ff = FeatureFactory(df)
    for column in ["WEEK_NUMBER", "WEEK_DAY", "TIME"]:
        ff(column)
    df = ff.X

    if scale == 'DATETIME':
        assert datetime is not None, \
            "datetime is required when scale is 'DATETIME'"
        week_day = datetime.isoweekday()
        # TIME is compared against the hour expressed as a decimal number.
        time_of_day = datetime.hour + float(datetime.minute) / 60
        df = df[ff("WEEK_DAY") == week_day]
        df = df[df['TIME'] == time_of_day]
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment].reset_index()
            plt.plot(df_assignment['CSPL_RECEIVED_CALLS'])
        # Overlay the weather of the days falling on the same week day.
        weather_df = load_weather_df(CONFIG.preprocessed_meteo_path)
        good_days = [d for d in weather_df.index if d.isoweekday() == week_day]
        weather_df = weather_df.loc[good_days, :].reset_index()
        plt.plot(weather_df['NUMB_FROZEN_DEPT'])
        plt.plot(weather_df['NUMB_WET_DEPT'])
        plt.savefig(os.path.join(out_path, scale + ".jpg"))
        # Clear the figure like the other scales do, so that curves do not
        # leak into the next figure drawn.
        plt.clf()

    elif scale == 'DAY':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER", "WEEK_DAY", "TIME"])
        df = grouped["CSPL_RECEIVED_CALLS"].sum().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # One curve per (week, week day) actually present in the data,
            # without assuming how weeks and week days are numbered.
            for _, df_day in df_assignment.groupby(["WEEK_NUMBER", "WEEK_DAY"]):
                plt.plot(df_day['TIME'], df_day["CSPL_RECEIVED_CALLS"])
            plt.savefig(os.path.join(out_path, scale + "_" + assignment + ".jpg"))
            plt.clf()

    elif scale == 'WEEK':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER", "WEEK_DAY"])
        df = grouped["CSPL_RECEIVED_CALLS"].mean().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # One curve per week actually present in the data, without
            # assuming how weeks are numbered.
            for _, df_week in df_assignment.groupby("WEEK_NUMBER"):
                plt.plot(df_week['WEEK_DAY'], df_week["CSPL_RECEIVED_CALLS"])
            plt.savefig(os.path.join(out_path, scale + "_" + assignment + ".jpg"))
            plt.clf()

    elif scale == 'YEAR':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER"])
        df = grouped["CSPL_RECEIVED_CALLS"].mean().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # After the groupby above only 'WEEK_NUMBER' is left as a time
            # axis ('WEEK_DAY' has been averaged out).
            plt.plot(df_assignment['WEEK_NUMBER'],
                     df_assignment["CSPL_RECEIVED_CALLS"])
        # plt.axis([0, 52, 0, 50])
        plt.savefig(os.path.join(out_path, scale + "_absolute_values.jpg"))
        plt.clf()