コード例 #1
0
def get_df(cols, load_from_temp, temp_path):
    """
    Build (or reload from a CSV cache) the training DataFrame, and load the weather DataFrame.

    Parameters
    ==========
    cols: list of str, names of the features to create through FeatureFactory.
        The caller's list is never modified.
    load_from_temp: bool, if True the training DataFrame is read back from `temp_path`
        instead of being rebuilt from the preprocessed training file.
    temp_path: str or None, CSV path used as a cache. When the DataFrame is rebuilt it is
        written there (skipped if None). Must not be None when `load_from_temp` is True.

    Returns
    =======
    (train_df, weather_df): the training DataFrame and the weather DataFrame,
        the latter with its index reset to a regular column.
    """
    # Work on a private copy: the original extended the caller's list in place
    # through `cols += [...]` whenever the two leading columns were already present.
    cols = list(cols)

    logger.info('Loading weather Dataframe...')
    weather_df = load_weather_df(CONFIG.preprocessed_meteo_path_complete)

    if not load_from_temp:
        logger.info('Loading train Dataframe...')
        train_df = load_train_df(CONFIG.preprocessed_train_path_means)

        logger.info('Creating features...')
        ff = FeatureFactory(train_df, weather_df)
        for col in cols:
            logger.info('Creating %s feature...', col)
            ff(col)

        # These columns are always selected, whether or not they were requested:
        # DATE first, then ASS_ASSIGNMENT, and CSPL_RECEIVED_CALLS last.
        selected = list(cols)
        if 'ASS_ASSIGNMENT' not in selected:
            selected.insert(0, 'ASS_ASSIGNMENT')
        if 'DATE' not in selected:
            selected.insert(0, 'DATE')
        if 'CSPL_RECEIVED_CALLS' not in selected:
            selected.append('CSPL_RECEIVED_CALLS')

        logger.info('Selecting features...')
        ff.select_features(selected)
        train_df = ff.X
        if temp_path is not None:
            train_df.to_csv(temp_path)
    else:
        assert temp_path is not None, 'temp_path is required when load_from_temp is True'
        logger.info('Loading train Dataframe...')
        train_df = pd.read_csv(temp_path, encoding='latin-1', index_col=0, parse_dates=['DATE'])

    weather_df.reset_index(inplace=True)
    return train_df, weather_df
コード例 #2
0
def complete_meteo_with_zeros(in_path, out_path=None):
    """
    Complete the weather DataFrame so that every day from 2011-01-01 to 2012-12-31 has a row.

    Despite the function name, days missing from the input are not filled with zeros:
    their NUMB_FROZEN_DEPT and NUMB_WET_DEPT values are set to the mean of the
    corresponding column over the loaded data. Any other column is left as NaN
    on the added days.

    Parameters
    ==========
    in_path: path given to `load_weather_df` to load the weather data.
    out_path: str or None, CSV path where the completed DataFrame is written.
        None to skip saving.

    Returns
    =======
    The completed weather DataFrame, indexed by DATE.
    """
    filled_columns = ['NUMB_FROZEN_DEPT', 'NUMB_WET_DEPT']

    logger.debug('Loading Dataframe...')
    weather_df = load_weather_df(in_path)
    # Only the means of the filled columns are needed; restricting the computation
    # avoids failing on (or silently dropping) any non-numeric column.
    means = weather_df[filled_columns].mean()
    weather_df = weather_df.reset_index()

    logger.debug('Generating empty Dataframe...')
    first_day = date(2011, 1, 1)
    last_day = date(2012, 12, 31)
    n_days = (last_day - first_day).days
    dates = [first_day + td(days=offset) for offset in range(n_days + 1)]
    # Use the same dtype as the loaded DATE column so that the merge keys match.
    zero_df = pd.DataFrame({'DATE': dates}, dtype=weather_df.dtypes['DATE'])

    logger.debug('Completing Dataframe...')
    weather_df = zero_df.merge(weather_df, how='left', on='DATE')
    for column in filled_columns:
        # Assign the result back: `df[col].fillna(..., inplace=True)` acts on an
        # intermediate Series and does not update the frame under Copy-on-Write.
        weather_df[column] = weather_df[column].fillna(means[column])
    weather_df.set_index('DATE', inplace=True)

    if out_path is not None:
        logger.debug('Saving Dataframe...')
        weather_df.to_csv(out_path)
    return weather_df
コード例 #3
0
def compare_calls(scale, out_path, assignments=None, datetime=None):
    """
    Plot the number of calls to compare them, and save the figures as .jpg files.

    Parameters
    ==========
    scale: 'DATETIME', 'DAY', 'WEEK' or 'YEAR', calls are averaged on all smaller scales,
        and plotted for larger scales.
    out_path: str, folder in which figures should be saved.
    assignments: str or list of str, assignments to take into account.
        None to take all submission assignments into account.
    datetime: only used (and required) when scale is 'DATETIME': object exposing
        `isoweekday()`, `hour` and `minute`, giving the week day and time slot to filter on.

    Output files
    ============
    'DATETIME': one figure, "DATETIME.jpg", with one curve per assignment plus the weather
        columns NUMB_FROZEN_DEPT and NUMB_WET_DEPT for the same week day.
    'DAY', 'WEEK': one figure per assignment, "<scale>_<assignment>.jpg".
    'YEAR': one figure for all assignments, "YEAR_absolute_values.jpg".

    Example
    =======
    Week comparison: For each day of the week, take the average number of calls, then compare for each week of the year.
    """
    assert scale in ['DATETIME', 'DAY', 'WEEK', 'YEAR']
    if assignments is None:
        assignments = CONFIG.submission_assignments
    else:
        if isinstance(assignments, str):
            assignments = [assignments]
        assert not set(assignments).difference(CONFIG.submission_assignments)

    df = load_train_df(CONFIG.preprocessed_train_path)
    df = df[df["ASS_ASSIGNMENT"].isin(assignments)]
    ff = FeatureFactory(df)
    for column in ["WEEK_NUMBER", "WEEK_DAY", "TIME"]:
        ff(column)
    df = ff.X

    if scale == 'DATETIME':
        assert datetime is not None
        # NOTE(review): assumes WEEK_DAY follows the ISO convention (Monday=1 .. Sunday=7),
        # as implied by the comparison with isoweekday() -- confirm against FeatureFactory.
        week_day = datetime.isoweekday()
        time_slot = datetime.hour + float(datetime.minute) / 60
        # Filter on the columns already built above instead of relying on the
        # return value of a second FeatureFactory call.
        df = df[df["WEEK_DAY"] == week_day]
        df = df[df['TIME'] == time_slot]
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment].reset_index()
            plt.plot(df_assignment['CSPL_RECEIVED_CALLS'])
        weather_df = load_weather_df(CONFIG.preprocessed_meteo_path)
        good_days = [d for d in weather_df.index if d.isoweekday() == week_day]
        weather_df = weather_df.loc[good_days, :].reset_index()
        plt.plot(weather_df['NUMB_FROZEN_DEPT'])
        plt.plot(weather_df['NUMB_WET_DEPT'])
        plt.savefig(os.path.join(out_path, scale + ".jpg"))
        # Clear the figure like the other scales do, so curves do not leak into the next plot.
        plt.clf()

    elif scale == 'DAY':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER", "WEEK_DAY", "TIME"])
        # NOTE(review): this scale sums the calls whereas 'WEEK' and 'YEAR' average them;
        # kept as in the original -- confirm this is intended.
        df = grouped["CSPL_RECEIVED_CALLS"].sum().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # One curve per (week, week day) actually present in the data, rather than
            # hard-coded ranges that depend on how the features are numbered.
            for _, df_day in df_assignment.groupby(["WEEK_NUMBER", "WEEK_DAY"]):
                plt.plot(df_day['TIME'], df_day["CSPL_RECEIVED_CALLS"])
            plt.savefig(os.path.join(out_path, scale + "_" + assignment + ".jpg"))
            plt.clf()

    elif scale == 'WEEK':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER", "WEEK_DAY"])
        df = grouped["CSPL_RECEIVED_CALLS"].mean().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # One curve per week actually present in the data.
            for _, df_week in df_assignment.groupby("WEEK_NUMBER"):
                plt.plot(df_week['WEEK_DAY'], df_week["CSPL_RECEIVED_CALLS"])
            plt.savefig(os.path.join(out_path, scale + "_" + assignment + ".jpg"))
            plt.clf()

    elif scale == 'YEAR':
        grouped = df.groupby(["ASS_ASSIGNMENT", "WEEK_NUMBER"])
        df = grouped["CSPL_RECEIVED_CALLS"].mean().reset_index()
        for assignment in assignments:
            print(assignment)
            df_assignment = df[df['ASS_ASSIGNMENT'] == assignment]
            # WEEK_DAY no longer exists after grouping by week only: the x axis is the week number.
            plt.plot(df_assignment['WEEK_NUMBER'], df_assignment["CSPL_RECEIVED_CALLS"])
        plt.savefig(os.path.join(out_path, scale + "_absolute_values.jpg"))
        plt.clf()