def test_is_holiday(self):
    """Check ``is_holiday`` for two 2017 dates and for an unsupported year."""
    d = SchoolHolidayDates()

    # One date expected to be a holiday, one expected not to be.
    self.assertTrue(d.is_holiday(datetime.date(2017, 12, 25)))
    self.assertFalse(d.is_holiday(datetime.date(2017, 12, 1)))

    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists in unittest on Python 3.12+.
    with self.assertRaisesRegex(UnsupportedYearException, 'No data for year: 2005'):
        d.is_holiday(datetime.date(2005, 2, 7))
def processing(dataInt):
    """Impute, enrich and trim the input frame.

    Steps, in order:

    1. Impute every column except ``timestamp`` and the location columns
       with ``mice``.
    2. Join the object-dtype columns back and drop the location columns.
    3. Call ``conv`` on the frame, then parse ``timestamp``.
    4. Add ``rangeInYear``, ``season``, ``is_business_day`` and
       ``is_holiday``.
    5. Drop the helper columns ``rangeInYear``, ``datetime_perso``, ``date``
       and ``timestamp``.

    Args:
        dataInt: DataFrame holding at least ``timestamp`` and the
            ``loc_*`` / ``loc_secondary_*`` columns. The function works on a
            copy of it.

    Returns:
        A new DataFrame with the imputed columns and the derived features.
    """
    location_columns = ['loc_1', 'loc_2', 'loc_secondary_1',
                        'loc_secondary_2', 'loc_secondary_3']

    ## missing values
    df = dataInt.copy()
    df_num = df.drop(['timestamp'] + location_columns, axis=1)
    # The builtin `object` selects the same columns as the `np.object` alias,
    # which NumPy deprecated in 1.20 and removed in 1.24.
    df_non_num = df.select_dtypes(include=[object])
    imputed_values = mice(df_num.values)
    data_mice = pd.DataFrame(imputed_values,
                             columns=df_num.columns,
                             index=list(df.index.values))
    d_clean = data_mice.join(df_non_num)

    ## drop the useless variables
    d_tr = d_clean.drop(location_columns, axis=1)

    ## create extra attributes
    # NOTE(review): conv presumably adds `datetime_perso` and `date` in place,
    # since both are used or dropped below - confirm against its definition.
    conv(d_tr)
    d_tr['timestamp'] = pd.to_datetime(d_tr.timestamp,
                                       format='%Y-%m-%dT%H:%M:%S.%f')

    ## create season and rangeInYear
    # `timestamp` was parsed just above, so it is used directly here.
    d_tr['rangeInYear'] = d_tr['timestamp'].dt.strftime('%j').astype(int)
    d_tr['season'] = d_tr['rangeInYear'].apply(lambda day: get_season(day))

    ## create working days
    d_tr['is_business_day'] = d_tr['datetime_perso'].apply(
        lambda e: int(business_day(e)))

    # Is it a holiday for zone A, B or C?
    school_holidays = SchoolHolidayDates()
    # Use the value's own .date(): datetime.date(f) only works when `datetime`
    # is the class and raises TypeError when it is the module.
    d_tr['is_holiday'] = d_tr['datetime_perso'].apply(
        lambda f: int(school_holidays.is_holiday(f.date())))

    dataInt1 = d_tr.drop(['rangeInYear', 'datetime_perso', 'date', 'timestamp'],
                         axis=1)
    return dataInt1
class OperationsStatsTransformer(BaseTransformer):
    """Add time-of-day and holiday columns to the operations data.

    ``transform`` reads the frame with ``read_csv``, adds ``phase_journee``,
    ``est_jour_ferie`` and ``est_vacances_scolaires``, drops the coordinate
    and date columns, and writes the result with ``to_csv``.
    """

    DATE_COLUMNS = ["date_heure_reception_alerte_locale", "date_heure_reception_alerte"]

    def __init__(self, filepath):
        super(OperationsStatsTransformer, self).__init__(filepath)
        # Bank holidays already looked up, keyed by year.
        self.bank_holidays = {}
        self.school_holiday = SchoolHolidayDates()

    def transform(self, output):
        """Compute the derived columns and write the frame to ``output``."""
        df = self.read_csv()

        derived_columns = (
            ("phase_journee", self.phase_journee),
            ("est_jour_ferie", self.est_jour_ferie),
            ("est_vacances_scolaires", self.est_vacances_scolaires),
        )
        for column_name, compute in derived_columns:
            df[column_name] = df.apply(compute, axis=1)

        # The inputs of the derived columns are not part of the output.
        columns_to_drop = ["latitude", "longitude"] + self.DATE_COLUMNS
        df.drop(columns_to_drop, axis=1, inplace=True)
        self.to_csv(df, output)

    def phase_journee(self, row):
        """Return the part of the day for ``row``.

        Returns ``"nuit"``, ``"déjeuner"``, ``"matinée"`` or ``"après-midi"``,
        or ``np.nan`` when the sunrise/sunset computation raises
        ``ValueError``.

        Raises:
            ValueError: if the local hour matches none of the ranges.
        """
        local_time = row["date_heure_reception_alerte_locale"]
        utc_time = row["date_heure_reception_alerte"]
        latitude = row["latitude"]
        longitude = row["longitude"]

        try:
            sunrise, sunset = SunriseSunset(utc_time, latitude, longitude).calculate()
        except ValueError:
            return np.nan

        # sunset -> sunrise
        # NOTE(review): this matches only when the computed sunset is not
        # after the computed sunrise - confirm what calculate() returns.
        if sunset <= utc_time <= sunrise:
            return "nuit"

        hour = local_time.hour
        # 12:00 -> 13:59
        if 12 <= hour <= 13:
            return "déjeuner"
        # sunrise -> 11:59 (any hour up to and including 11)
        if hour <= 11:
            return "matinée"
        # 14:00 -> sunset
        if hour >= 14:
            return "après-midi"
        raise ValueError("Date is invalid " + str(local_time))

    def est_jour_ferie(self, row):
        """Return whether the local alert date is a bank holiday."""
        day = row["date_heure_reception_alerte_locale"].date()
        year = day.year
        # Look each year up once and reuse the result for later rows.
        if year not in self.bank_holidays:
            self.bank_holidays[year] = JoursFeries.for_year(year).values()
        return day in self.bank_holidays[year]

    def est_vacances_scolaires(self, row):
        """Return whether the local alert date is a school holiday.

        Returns ``np.nan`` when ``UnsupportedYearException`` is raised.
        """
        try:
            day = row["date_heure_reception_alerte_locale"].date()
            return self.school_holiday.is_holiday(day)
        except UnsupportedYearException:
            return np.nan
def test_is_holiday(self):
    """Check ``is_holiday`` results and the two errors it raises."""
    d = SchoolHolidayDates()

    # One date expected to be a holiday, one expected not to be.
    self.assertTrue(d.is_holiday(datetime.date(2017, 12, 25)))
    self.assertFalse(d.is_holiday(datetime.date(2017, 12, 1)))

    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists in unittest on Python 3.12+.
    with self.assertRaisesRegex(
        UnsupportedYearException, "No data for year: 1985"
    ):
        d.is_holiday(datetime.date(1985, 2, 7))

    # A datetime.datetime is rejected; only datetime.date is accepted.
    with self.assertRaisesRegex(ValueError, "date should be a datetime.date"):
        d.is_holiday(datetime.datetime(2017, 12, 1, 2, 0))
def test_is_holiday(self):
    """Check ``is_holiday`` results and the two errors it raises."""
    d = SchoolHolidayDates()

    # One date expected to be a holiday, one expected not to be.
    self.assertTrue(d.is_holiday(datetime.date(2017, 12, 25)))
    self.assertFalse(d.is_holiday(datetime.date(2017, 12, 1)))

    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists in unittest on Python 3.12+.
    with self.assertRaisesRegex(UnsupportedYearException, "No data for year: 1985"):
        d.is_holiday(datetime.date(1985, 2, 7))

    # A datetime.datetime is rejected; only datetime.date is accepted.
    with self.assertRaisesRegex(ValueError, "date should be a datetime.date"):
        d.is_holiday(datetime.datetime(2017, 12, 1, 2, 0))
# Feature engineering on d_tr, merge with the targets, then train/test split.

# Parse the timestamp strings into datetime values.
d_tr['timestamp'] = pd.to_datetime(d_tr.timestamp, format='%Y-%m-%dT%H:%M:%S.%f')

## create season and rangeInYear
s = pd.to_datetime(pd.Series(d_tr['timestamp']))
d_tr['rangeInYear'] = s.dt.strftime('%j').astype(int)

## create working days
d_tr['is_business_day'] = d_tr['datetime_perso'].apply(
    lambda e: int(business_day(e)))

# Is it a holiday for zone A, B or C?
d = SchoolHolidayDates()
# Use the value's own .date(): datetime.date(f) only works when `datetime`
# is the class and raises TypeError when it is the module.
d_tr['is_holiday'] = d_tr['datetime_perso'].apply(
    lambda f: int(d.is_holiday(f.date())))

d_tr['season'] = d_tr['rangeInYear'].apply(lambda day: get_season(day))

# Drop the helper columns once the features are built.
d_tr = d_tr.drop(['rangeInYear', 'datetime_perso', 'date', 'timestamp'], axis=1)

# Left merge keeps every row of d_tr.
# NOTE(review): the consumption_* columns presumably come from dataOut - confirm.
d_ready = pd.merge(d_tr, dataOut, on='ID', how='left')

## handle the dummies and the numerical variables
# Rows with a known consumption_1 form the training set, the rest the test set.
train_new = d_ready[d_ready['consumption_1'].notnull()]
train_new = train_new.drop(['ID'], axis=1)
test_new = d_ready[d_ready['consumption_1'].isnull()]
test_new = test_new.drop(['consumption_1', 'consumption_2', 'ID'], axis=1)

# Bare expressions kept from the notebook; they have no effect in a script.
train_new.shape
test_new.shape
# Exploratory cell: derive calendar features on dataInt and plot mean consumption.

# Bare expressions kept from the notebook; they have no effect in a script.
ts2.head()
df.tail()

## create season and rangeInYear
s = pd.to_datetime(pd.Series(dataInt['timestamp']))
dataInt['rangeInYear'] = s.dt.strftime('%j').astype(int)
# dataInt_Xy = dataInt_Xy.drop_duplicates(subset = ['rangeInYear'])  # TO BE REMOVED
dataInt['season'] = dataInt['rangeInYear'].apply(lambda day: get_season(day))

## create working days
dataInt['is_business_day'] = dataInt['datetime_perso'].apply(
    lambda e: int(business_day(e)))

# Is it a holiday for zone A, B or C?
d = SchoolHolidayDates()
# Use the value's own .date(): datetime.date(f) only works when `datetime`
# is the class and raises TypeError when it is the module.
dataInt['is_holiday'] = dataInt['datetime_perso'].apply(
    lambda f: int(d.is_holiday(f.date())))

# One bar chart of the mean target per group, in the original order.
# Titles are kept exactly as written, leading spaces included.
bar_charts = [
    ('month', 'consumption_1', 'monthly consum 1'),
    ('month', 'consumption_2', 'monthly consum 2'),
    ('day of week', 'consumption_1', 'daily consum 1'),
    ('day of week', 'consumption_2', 'daily consum 2'),
    ('hour', 'consumption_1', 'hourly consum 1'),
    ('hour', 'consumption_2', 'hourly consum 2'),
    ('is_business_day', 'consumption_1', ' business day consum 1'),
    ('is_business_day', 'consumption_2', 'business day consum 2'),
    ('is_holiday', 'consumption_1', ' holiday day consum 1'),
    ('is_holiday', 'consumption_2', 'holiday day consum 2'),
]
for group_column, target_column, chart_title in bar_charts:
    dataInt.groupby(group_column)[target_column].mean().plot.bar(
        fontsize=14, figsize=(10, 7), title=chart_title)