def get_dataset(self, key, info):
    """Return the dataset identified by *key* wrapped in a ``Projectable``.

    Geolocation keys (``latitude`` / ``longitude``) return the arrays from
    ``self.get_lonlats()``.  Channel keys return either the raw counts or the
    calibrated data: solar calibration for channels 1, 2 and 3a, thermal
    calibration for channels 3b, 4 and 5.

    Args:
        key: dataset identifier; only ``key.name`` and ``key.calibration``
            are read here.
        info: not used by this method.

    Returns:
        A ``Projectable`` holding the requested data.

    Raises:
        KeyError: if ``key.name`` is neither a geolocation name nor a known
            AVHRR channel name.
    """
    if self._data is None:
        self.read()

    if key.name in ['latitude', 'longitude']:
        lons, lats = self.get_lonlats()
        if key.name == 'latitude':
            return Projectable(lats, id=key)
        else:
            return Projectable(lons, id=key)

    # 3a and 3b share the same slot in the image data.
    avhrr_channel_index = {'1': 0, '2': 1, '3a': 2, '3b': 2, '4': 3, '5': 4}
    index = avhrr_channel_index[key.name]

    mask = False
    if key.name in ['3a', '3b'] and self._is3b is None:
        ch3a = bfield(self._data["id"]["id"], 10)
        self._is3b = np.logical_not(ch3a)

    # Mask out the lines where the *other* channel-3 variant was active.
    if key.name == '3a':
        mask = np.tile(self._is3b, (1, 2048))
    elif key.name == '3b':
        mask = np.tile(np.logical_not(self._is3b), (1, 2048))

    data = self._data["image_data"][:, :, index]
    if key.calibration == 'counts':
        return Projectable(data, mask=mask,
                           area=self.get_lonlats(), units='1')

    pg_spacecraft = ''.join(self.platform_name.split()).lower()

    # Fractional days since the start of the year.  No trailing 'Z' on the
    # string: numpy deprecated timezone suffixes and treats the value as UTC
    # either way.
    jdays = (np.datetime64(self.start_time) - np.datetime64(
        str(self.year) + '-01-01T00:00:00')) / np.timedelta64(1, 'D')

    if index < 2 or key.name == '3a':
        data = calibrate_solar(data, index, self.year, jdays, pg_spacecraft)
        units = '%'

    if index > 2 or key.name == '3b':
        if self.times is None:
            self.times = time_seconds(self._data["timecode"], self.year)
        # Builtin ``int`` replaces the ``np.int`` alias, which was removed
        # from NumPy.
        line_numbers = (
            np.round((self.times - self.times[-1]) /
                     np.timedelta64(166666667, 'ns'))).astype(int)
        line_numbers -= line_numbers[0]
        if self.prt is None:
            self.prt, self.ict, self.space = self.get_telemetry()
        chan = index + 1
        data = calibrate_thermal(data, self.prt, self.ict[:, chan - 3],
                                 self.space[:, chan - 3], line_numbers,
                                 chan, pg_spacecraft)
        units = 'K'

    # TODO: check if entirely masked before returning
    return Projectable(data, mask=mask, units=units)
def _coerce_scalar_to_timedelta_type(r, unit='ns'):
    """Coerce the scalar *r* to a ``timedelta64[ns]`` value.

    Strings are parsed with ``_get_string_converter``, integers are scaled
    with ``tslib.cast_from_unit`` and ``np.timedelta64`` inputs are cast to
    *unit*.  ``tslib.iNaT`` is returned unchanged.  On numpy < 1.7 under
    Python 2 a ``datetime.timedelta`` is returned instead.

    Raises:
        AssertionError: if the value cannot be turned into a timedelta.
    """
    # kludgy here until we have a timedelta scalar
    # handle the numpy < 1.7 case

    def _from_nanoseconds(value):
        # old numpy has no usable timedelta64 scalar: fall back to timedelta
        return (timedelta(microseconds=value / 1000.0)
                if _np_version_under1p7 else np.timedelta64(value))

    if isinstance(r, compat.string_types):
        parse = _get_string_converter(r, unit=unit)
        r = _from_nanoseconds(parse())
    elif r == tslib.iNaT:
        return r
    elif isinstance(r, np.timedelta64):
        r = r.astype("m8[{0}]".format(unit.lower()))
    elif is_integer(r):
        r = _from_nanoseconds(tslib.cast_from_unit(r, unit))

    if _np_version_under1p7:
        if not isinstance(r, timedelta):
            raise AssertionError(
                "Invalid type for timedelta scalar: %s" % type(r))
        if not compat.PY3:
            return r
        # convert to microseconds in timedelta64
        r = np.timedelta64(
            int(r.total_seconds()*1e9 + r.microseconds*1000))

    if isinstance(r, timedelta):
        r = np.timedelta64(r)
    elif not isinstance(r, np.timedelta64):
        raise AssertionError(
            "Invalid type for timedelta scalar: %s" % type(r))
    return r.astype('timedelta64[ns]')
def __init__(self):
    """Collect the rows whose reporting interval deviates from the norm.

    For every timeseries returned by ``get_timeseries_list`` the gap between
    consecutive ``period_end`` values is compared with the most frequent gap
    of that series; rows with a different gap are stored, concatenated, in
    ``self.violation_values``.  When nothing deviates the attribute is an
    empty ``DataFrame``.
    """
    data_frame = get_joined_frame()
    add_standardized_period(data_frame)

    # build a timeseries for each indicator/region pair
    timeseries_list = get_timeseries_list(data_frame)

    deviations = []
    for timeseries in timeseries_list:
        date_diffs = timeseries.period_end.diff()

        # leap-year hack: convert 366 day differences to 365 day differences
        date_diffs[date_diffs == np.timedelta64(366, "D")] = \
            np.timedelta64(365, "D")

        # a little unusual that it's a series, but there can be a tie for
        # most frequent
        mode_series = date_diffs.mode()
        if mode_series.empty:
            continue

        # NOTE(review): the first diff is NaT, which compares unequal to the
        # mode, so the first row of each series is always selected here --
        # confirm this is intended.
        deviation_rows = timeseries[date_diffs != mode_series[0]]
        if not deviation_rows.empty:
            deviations.append(deviation_rows)

    # pd.concat raises ValueError on an empty list, so guard that case.
    if deviations:
        self.violation_values = pd.concat(deviations)
    else:
        self.violation_values = pd.DataFrame()
def test_timedelta64_conversions(self):
    """Check division of a timedelta Series by ``np.timedelta64`` scalars.

    Covers ``series / scalar``, ``astype`` to a coarser unit,
    ``scalar / series`` and conversion to object dtype.
    """
    startdate = Series(date_range('2013-01-01', '2013-01-03'))
    enddate = Series(date_range('2013-03-01', '2013-03-03'))

    s1 = enddate - startdate
    s1[2] = np.nan

    for m in [1, 3, 10]:
        for unit in ['D', 'h', 'm', 's', 'ms', 'us', 'ns']:

            # op
            expected = s1.apply(lambda x: x / np.timedelta64(m, unit))
            result = s1 / np.timedelta64(m, unit)
            assert_series_equal(result, expected)

            if m == 1 and unit != 'ns':

                # astype
                result = s1.astype("timedelta64[{0}]".format(unit))
                assert_series_equal(result, expected)

            # reverse op
            expected = s1.apply(
                lambda x: Timedelta(np.timedelta64(m, unit)) / x)
            result = np.timedelta64(m, unit) / s1
            # The reverse-op result was computed but never compared.
            assert_series_equal(result, expected)

    # astype
    s = Series(date_range('20130101', periods=3))
    result = s.astype(object)
    self.assertIsInstance(result.iloc[0], datetime)
    self.assertTrue(result.dtype == np.object_)

    result = s1.astype(object)
    self.assertIsInstance(result.iloc[0], timedelta)
    self.assertTrue(result.dtype == np.object_)
def test_ufunc_coercions(self):
    """Adding/subtracting timedelta64 values keeps a ``DatetimeIndex``.

    Both the operator form and the numpy ufunc form are exercised, for a
    scalar delta and for an array of deltas; the resulting ``freq`` is
    checked each time.
    """
    idx = date_range('2011-01-01', periods=3, freq='2D', name='x')

    def check(results, expected, freq):
        for outcome in results:
            assert isinstance(outcome, DatetimeIndex)
            tm.assert_index_equal(outcome, expected)
            assert outcome.freq == freq

    # scalar delta: the spacing of the index is untouched
    step = np.timedelta64(1, 'D')
    check([idx + step, np.add(idx, step)],
          date_range('2011-01-02', periods=3, freq='2D', name='x'),
          '2D')
    check([idx - step, np.subtract(idx, step)],
          date_range('2010-12-31', periods=3, freq='2D', name='x'),
          '2D')

    # array delta: the spacing changes element by element
    steps = np.array([np.timedelta64(days, 'D') for days in (1, 2, 3)])
    check([idx + steps, np.add(idx, steps)],
          DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-08'],
                        freq='3D', name='x'),
          '3D')
    check([idx - steps, np.subtract(idx, steps)],
          DatetimeIndex(['2010-12-31', '2011-01-01', '2011-01-02'],
                        freq='D', name='x'),
          'D')
def formula(individu, period, parameters):
    """Compute ``exoneration_cotisations_zrr`` for *period*.

    The exemption is ``taux_exoneration * assiette_allegement`` for eligible
    individuals: a CDI, or a CDD lasting more than 365 days, in a firm of at
    most 50 employees located in a ``zone_revitalisation_rurale``, with less
    than one year elapsed since the start of the contract.
    """
    assiette = individu('assiette_allegement', period)
    type_contrat = individu('contrat_de_travail_duree', period)
    TypesContrat = type_contrat.possible_values
    debut_contrat = individu('contrat_de_travail_debut', period)
    fin_contrat = individu('contrat_de_travail_fin', period)
    effectif = individu('effectif_entreprise', period)
    smic = individu('smic_proratise', period)
    en_zrr = individu('zone_revitalisation_rurale', period)

    # TODO: move to parameters file
    cdd_assez_long = fin_contrat > debut_contrat + timedelta64(365, 'D')

    # boolean arrays: ``+`` acts as OR and ``*`` as AND
    est_cdi = type_contrat == TypesContrat.cdi
    est_cdd_eligible = (type_contrat == TypesContrat.cdd) * (cdd_assez_long)
    contrat_eligible = est_cdi + est_cdd_eligible

    anciennete = datetime64(period.start) + timedelta64(1, 'D') - debut_contrat
    dans_la_premiere_annee = (
        anciennete.astype('timedelta64[Y]') < timedelta64(1, 'Y'))

    eligible = (
        contrat_eligible * (effectif <= 50) * en_zrr * dans_la_premiere_annee)

    # TODO: move to parameters file
    if period.start.year < 2015:
        taux_max = .281
    else:
        taux_max = .2655
    seuil_max = 2.4
    seuil_min = 1.5

    taux = compute_taux_exoneration(
        assiette, smic, taux_max, seuil_max, seuil_min)
    return taux * assiette * eligible
def test_timestamp_and_series(self):
    """Subtracting a tz-aware timestamp from its Series yields timedeltas."""
    stamps = Series(
        date_range('2014-03-17', periods=2, freq='D', tz='US/Eastern'))
    anchor = stamps[0]

    deltas = Series([np.timedelta64(days, 'D') for days in (0, 1)])

    # both operand orders are checked
    assert_series_equal(stamps - anchor, deltas)
    assert_series_equal(anchor - stamps, -deltas)
def testResampleData(self):
    """Check ``param_finder._resampleData`` for up- and down-sampling.

    Eight hourly samples are resampled to a 30 minute and to a 2 hour
    interval; both the interpolated values and the new timestamps are
    compared with the expected ones.
    """

    def makeTimestamps(stepHours, count):
        # ``range`` instead of ``xrange`` so this also runs on Python 3.
        start = datetime.datetime(2000, 1, 1, tzinfo=dateutil.tz.tzlocal())
        return numpy.array([
            numpy.datetime64(start + datetime.timedelta(hours=stepHours * i))
            for i in range(count)])

    def totalTimestampError(actual, expected):
        return (numpy.sum(
            numpy.abs(actual - expected))).item().total_seconds()

    timestamps = makeTimestamps(1, 8)
    values = numpy.linspace(0, 7, 8)

    # test upsampling by a factor of 2
    newSamplingInterval = numpy.timedelta64(1800, 's')
    (newTimeStamps, newValues) = param_finder._resampleData(
        timestamps, values, newSamplingInterval)

    trueNewTimeStamps = makeTimestamps(0.5, 15)
    self.assertTrue(numpy.allclose(newValues, numpy.linspace(0, 7, 15)))
    timestampError = totalTimestampError(newTimeStamps, trueNewTimeStamps)
    self.assertAlmostEqual(timestampError, 0)

    # test down-sampling by a factor of 2
    newSamplingInterval = numpy.timedelta64(7200, 's')
    (newTimeStamps, newValues) = param_finder._resampleData(
        timestamps, values, newSamplingInterval)

    trueNewTimeStamps = makeTimestamps(2, 4)
    timestampError = totalTimestampError(newTimeStamps, trueNewTimeStamps)
    self.assertTrue(numpy.allclose(newValues, numpy.linspace(0, 6, 4)))
    self.assertAlmostEqual(timestampError, 0)
def _as_timedelta64_scalar(time, unit=None):
    """Convert *time* to a ``np.timedelta64`` scalar.

    Accepted inputs are ``'H:M:S.ms'``-style strings (optionally prefixed
    with ``T``), datetime-like values, ``timedelta`` / ``np.timedelta64``
    objects and plain numbers.  Numbers are interpreted in *unit* (seconds
    when no unit is given) and are moved to finer units until they are
    close to an integer.

    Raises:
        ValueError: for a multi-component string combined with an explicit
            unit, a string with more than three components, or a float that
            stays fractional down to the finest known unit.
    """
    if unit:
        unit_args = [unit]
        flt_unit = unit
    else:
        unit_args = []
        flt_unit = 's'

    # turn 'H:M:S.ms', 'M:S.ms', 'S.ms' into floating point seconds
    if isinstance(time, string_types):
        fields = [float(t) for t in time.lstrip('T').split(':')]
        fields.reverse()  # seconds first, then minutes, then hours
        if len(fields) > 1 and unit is not None:
            raise ValueError("When giving time as a string, units are automatic")
        if len(fields) > 3:
            raise ValueError("Timedelta as string only goes up to hours")
        time = sum(
            (factor * t for factor, t in zip([1, 60, 60 * 60], fields)), 0.0)
        flt_unit = 's'

    # turn floating point time into integer with the correct unit
    if is_datetime_like(time):
        time = as_datetime64(time) - as_datetime64(np.timedelta64(0, 's'))
    elif isinstance(time, (np.timedelta64, timedelta)):
        time = np.timedelta64(time).astype(
            _format_unit(unit, base=DELTA_BASE))
    elif isinstance(time, (int, float, np.integer, np.floating)):
        orig_time = time
        orig_flt_unit = flt_unit
        unit_idx = TIME_UNITS.index(flt_unit)
        # step down through the units until the value is (nearly) integral
        while not np.isclose(time, int(np.round(time)),
                             rtol=1e-4, atol=1e-18):
            if unit_idx <= 0:
                raise ValueError("Floating point time {0} [{1}] is too precise "
                                 "for any time unit?".format(orig_time, orig_flt_unit))
            unit_idx -= 1
            time *= TIME_SCALE[unit_idx]
            flt_unit = TIME_UNITS[unit_idx]
        time = np.timedelta64(int(np.round(time)), flt_unit)
        unit = flt_unit
        unit_args = [flt_unit]

    return np.timedelta64(time, *unit_args)
def test_cf_timedelta(self):
    """Round-trip timedeltas through the CF encode/decode helpers.

    Every example is encoded with ``conventions.encode_cf_timedelta`` and,
    when units are given, decoded again with
    ``conventions.decode_cf_timedelta``; values and dtypes must match.
    """
    import re

    examples = [
        ('1D', 'days', np.int64(1)),
        (['1D', '2D', '3D'], 'days', np.array([1, 2, 3], 'int64')),
        ('1h', 'hours', np.int64(1)),
        ('1ms', 'milliseconds', np.int64(1)),
        ('1us', 'microseconds', np.int64(1)),
        (['NaT', '0s', '1s'], None, [np.nan, 0, 1]),
        (['30m', '60m'], 'hours', [0.5, 1.0]),
    ]

    # Compare the version numerically: a plain string comparison orders
    # '0.9' after '0.16'.
    pandas_version = tuple(
        int(part) for part in re.findall(r'\d+', pd.__version__)[:2])
    if pandas_version >= (0, 16):
        # not quite sure why, but these examples don't work on older pandas
        examples.extend([(np.timedelta64('NaT', 'ns'), 'days', np.nan),
                         (['NaT', 'NaT'], 'days', [np.nan, np.nan])])

    for timedeltas, units, numbers in examples:
        timedeltas = pd.to_timedelta(timedeltas, box=False)
        numbers = np.array(numbers)

        expected = numbers
        actual, _ = conventions.encode_cf_timedelta(timedeltas, units)
        self.assertArrayEqual(expected, actual)
        self.assertEqual(expected.dtype, actual.dtype)

        if units is not None:
            expected = timedeltas
            actual = conventions.decode_cf_timedelta(numbers, units)
            self.assertArrayEqual(expected, actual)
            self.assertEqual(expected.dtype, actual.dtype)

    expected = np.timedelta64('NaT', 'ns')
    actual = conventions.decode_cf_timedelta(np.array(np.nan), 'days')
    self.assertArrayEqual(expected, actual)
def test_timedelta_conversions(self):
    """``Timedelta`` built from ``datetime.timedelta`` equals its m8[ns] twin."""
    pairs = [
        (timedelta(seconds=1), np.timedelta64(1, 's')),
        (timedelta(microseconds=1), np.timedelta64(1, 'us')),
        (timedelta(days=1), np.timedelta64(1, 'D')),
    ]
    for py_delta, np_delta in pairs:
        assert Timedelta(py_delta) == np_delta.astype('m8[ns]')
def close_gaps(ts, verbose = False): ts = ts.copy() ts.data = ts.data.sort_index() if type(ts.data).__name__ == 'Panel': data = ts.data.items.values index = ts.data.items else: data = ts.data.index.values index = ts.data.index index_df = _pd.DataFrame(index = index) dt = data[1:] - data[:-1] dt = dt / _np.timedelta64(1,'s') median = _np.median(dt) if median > (1.1 * ts._data_period) or median < (0.9 * ts._data_period): _warnings.warn('There is a periode and median missmatch (%0.1f,%0.1f), this is either due to an error in the assumed period or becuase there are too many gaps in the _timeseries.'%(median,ts._data_period)) point_dist = (index.values[1:] - index.values[:-1]) / _np.timedelta64(1, 's') where = point_dist > 2 * ts._data_period off_periods = _np.array([index[:-1][where], index[1:][where]]).transpose() if verbose: print('found %i gaps'%(off_periods.shape[0])) for i, op in enumerate(off_periods): no_periods = round((op[1] - op[0])/ _np.timedelta64(1,'s')) / ts._data_period out = _pd.date_range(start = op[0], periods= no_periods, freq= '%i s'%ts._data_period) out = out[1:] out = _pd.DataFrame(index = out) index_df = _pd.concat([index_df, out]) index_df.sort_index(inplace=True) ts.data = ts.data.reindex(index_df.index) return ts
def rolling_correlation(data, correlant, window, min_good_ratio = 0.67, verbose = True):
    """Pearson correlation of *data* and *correlant* in a sliding window.

    time as here: http://docs.scipy.org/doc/numpy/reference/arrays.datetime.html#datetime-units

    Args:
        data: timeseries object; its first data column is correlated.
        correlant: timeseries object that is aligned to *data* first.
        window: ``(value, unit)`` pair passed to ``np.timedelta64``, giving
            the window length.
        min_good_ratio: minimum fraction of rows without NaN a window needs;
            windows below it yield NaN.
        verbose: if true, print the window size in data points.

    Returns:
        A ``TimeSeries`` with a ``pearson_r`` column, indexed by the centre
        time of each window.
    """
    correlant = correlant.align_to(data)  # I do align before merge, because it is more suffisticated!
    merged = data.copy()
    merged.data['correlant'] = correlant.data

    # express the window as a number of data points
    data_period = _np.timedelta64(int(merged._data_period), 's')
    window = _np.timedelta64(window[0], window[1])
    window = int(window/data_period)
    if verbose:
        print('Each window contains %s data points of which at least %s are not nan.'%(window, int(window * min_good_ratio)))

    min_good = window * min_good_ratio
    size = merged.data.shape[0]-window + 1
    # ``_pd.TimeSeries`` was only an alias of ``_pd.Series`` and has been
    # removed from pandas.
    timestamps = _pd.Series(_pd.to_datetime(_pd.Series(_np.zeros(size))))
    pear_r = _np.zeros(size)
    for i in range(size):
        secment = TimeSeries(merged.data.iloc[i:i+window,:])
        secment._data_period = merged._data_period
        if secment.data.dropna().shape[0] < min_good:
            pear_r[i]= _np.nan
        else:
            corr = secment.correlate_to(secment, data_column=merged.data.columns[0], correlant_column=merged.data.columns[1])
            pear_r[i] = corr.pearson_r[0]
        # label the window with its centre time
        timestamps.iloc[i] = secment.data.index[0] + ((secment.data.index[-1] - secment.data.index[0])/2.)

    pear_r_ts = TimeSeries(_pd.DataFrame(pear_r, index = timestamps, columns = ['pearson_r']))
    pear_r_ts._data_period = merged._data_period
    pear_r_ts._y_label = 'r'
    return pear_r_ts
def test_timestamp_and_series(self):
    """Timestamp/Series subtraction gives day offsets in both directions."""
    timestamp_series = Series(
        date_range("2014-03-17", periods=2, freq="D", tz="US/Eastern")
    )
    first_timestamp = timestamp_series[0]

    zero_days = np.timedelta64(0, "D")
    one_day = np.timedelta64(1, "D")
    expected = Series([zero_days, one_day])

    forward = timestamp_series - first_timestamp
    assert_series_equal(forward, expected)

    backward = first_timestamp - timestamp_series
    assert_series_equal(backward, -expected)
def test_timedelta_ops_scalar(self):
    """Adding/subtracting equivalent timedelta scalars to a timestamp.

    The same offset is expressed as a pandas timedelta, a
    ``datetime.timedelta``, ``np.timedelta64`` values and pandas offsets;
    every form must give the same result.
    """
    _skip_if_numpy_not_friendly()

    def check(base, offsets, expected_add, expected_sub):
        for offset in offsets:
            # assertEqual: the ``assertEquals`` alias no longer exists in
            # current Python releases.
            result = base + offset
            self.assertEqual(result, expected_add)

            result = base - offset
            self.assertEqual(result, expected_sub)

    # GH 6808
    check(pd.to_datetime('20130101 09:01:12.123456'),
          [pd.to_timedelta(10,unit='s'),
           timedelta(seconds=10),
           np.timedelta64(10,'s'),
           np.timedelta64(10000000000,'ns'),
           pd.offsets.Second(10)],
          pd.to_datetime('20130101 09:01:22.123456'),
          pd.to_datetime('20130101 09:01:02.123456'))

    check(pd.to_datetime('20130102 09:01:12.123456'),
          [pd.to_timedelta('1 day, 00:00:10'),
           pd.to_timedelta('1 days, 00:00:10'),
           timedelta(days=1,seconds=10),
           np.timedelta64(1,'D')+np.timedelta64(10,'s'),
           pd.offsets.Day()+pd.offsets.Second(10)],
          pd.to_datetime('20130103 09:01:22.123456'),
          pd.to_datetime('20130101 09:01:02.123456'))
def test_timedelta_conversions(self):
    """``ct`` maps ``datetime.timedelta`` values onto matching m8[ns] values."""
    one_second = ct(timedelta(seconds=1))
    self.assertEqual(one_second, np.timedelta64(1, 's').astype('m8[ns]'))

    one_microsecond = ct(timedelta(microseconds=1))
    self.assertEqual(one_microsecond,
                     np.timedelta64(1, 'us').astype('m8[ns]'))

    one_day = ct(timedelta(days=1))
    self.assertEqual(one_day, np.timedelta64(1, 'D').astype('m8[ns]'))
def test_nat_items(self):
    """NaT values match NaT of the same kind only.

    Datetime NaTs are accepted against each other regardless of unit, and so
    are timedelta NaTs; a datetime NaT versus a timedelta NaT, or a NaT
    versus a real value, must be reported as different.
    """
    units = (None, "s", "ns")

    # not a datetime
    dts = [np.datetime64("NaT") if unit is None else np.datetime64("NaT", unit)
           for unit in units]
    # not a timedelta
    tds = [np.timedelta64("NaT") if unit is None else np.timedelta64("NaT", unit)
           for unit in units]

    # same kind: equal as scalars and as lists, list vs scalar is not
    for same_kind in (dts, tds):
        for a, b in itertools.product(same_kind, same_kind):
            self._assert_func(a, b)
            self._assert_func([a], [b])
            self._test_not_equal([a], b)

    real_datetime = ("2017-01-01", "s")
    real_timedelta = (123, "s")
    for a, b in itertools.product(tds, dts):
        self._test_not_equal(a, b)
        self._test_not_equal(a, [b])
        self._test_not_equal([a], [b])
        self._test_not_equal([a], np.datetime64(*real_datetime))
        self._test_not_equal([b], np.datetime64(*real_datetime))
        self._test_not_equal([a], np.timedelta64(*real_timedelta))
        self._test_not_equal([b], np.timedelta64(*real_timedelta))
def test_split_key_cmp(self):
    """``SplitKey`` objects compare by timestamp and sampling."""
    make_key = carbonara.SplitKey.from_timestamp_and_sampling

    early = numpy.datetime64("2015-01-01T15:03")
    early_twin = numpy.datetime64("2015-01-01T15:03")
    late = numpy.datetime64("2015-01-05T15:03")
    one_minute = numpy.timedelta64(60, 's')
    five_minutes = numpy.timedelta64(300, 's')

    # equality
    self.assertEqual(make_key(early, one_minute),
                     make_key(early, one_minute))
    self.assertEqual(make_key(early, one_minute),
                     make_key(early_twin, one_minute))

    # inequality on either component
    self.assertNotEqual(make_key(early, one_minute),
                        make_key(late, one_minute))
    self.assertNotEqual(make_key(early, one_minute),
                        make_key(early, five_minutes))

    # ordering
    self.assertLess(make_key(early, one_minute),
                    make_key(late, one_minute))
    self.assertLessEqual(make_key(early, one_minute),
                         make_key(early, one_minute))
    self.assertGreater(make_key(late, one_minute),
                       make_key(early, one_minute))
    self.assertGreaterEqual(make_key(late, one_minute),
                            make_key(late, one_minute))
def calcReturn(log, transactions, date):
    """Compute the daily buy-and-hold and strategy returns for *date*.

    Args:
        log: time-indexed frame with ``EUR``, ``BTC``, ``Bid`` and ``Ask``
            columns.
        transactions: mapping/frame indexable by *date*; the selected rows
            need ``AmountBTC``, ``AmountEUR``, ``Bid`` and ``Ask``.  A failed
            lookup is treated as "no transactions".
        date: day to evaluate, as a ``'YYYY-MM-DD'`` string.

    Returns:
        ``(openPrice, closePrice, retHold, retStrategy)``.  The opening
        values come from the last log row of the previous day when there is
        one, otherwise from the first row of *date*.
    """
    day_begin = np.datetime64(date + 'T00:01')
    day_end = np.datetime64(date + 'T23:59')
    one_day = np.timedelta64(1, 'D')

    startt = str(day_begin)[:16]
    endt = str(day_end)[:16]
    starty = str(day_begin - one_day)[:16]
    endy = str(day_end - one_day)[:16]

    dft = log[startt : endt]
    dfy = log[starty : endy]

    try:
        tdf = transactions[date]
    except Exception:
        # best effort: no usable transactions for that day
        tdf = []

    def portfolio_value(row):
        return float(row['EUR'] + row['BTC'] * row['Bid'])

    def mid_price(row):
        return float((row['Bid'] + row['Ask'])/2)

    # Rows are picked with iloc so that float() is applied to scalars rather
    # than to one-element Series.
    opening = dfy.iloc[-1] if len(dfy) > 0 else dft.iloc[0]
    closing = dft.iloc[-1]

    startValue = portfolio_value(opening)
    openPrice = mid_price(opening)
    endValue = portfolio_value(closing)
    closePrice = mid_price(closing)

    if len(tdf) != 0:
        # value of the day's transactions, computed without writing a new
        # column into the caller's data
        traded = tdf['AmountBTC']*(tdf['Bid']+tdf['Ask'])/2 + tdf['AmountEUR']
        endValue = endValue - float(traded.sum())

    retStrategy = endValue / startValue - 1
    retHold = closePrice / openPrice - 1
    return openPrice, closePrice, retHold, retStrategy
def test_74_percentile_serialized(self):
    """A '74pct' aggregate survives serialize/unserialize and a merge."""

    def build_resampled():
        raw = carbonara.TimeSerie.from_data(
            [datetime64(2014, 1, 1, 12, 0, 0),
             datetime64(2014, 1, 1, 12, 0, 4),
             datetime64(2014, 1, 1, 12, 0, 9)],
            [3, 5, 6])
        return self._resample(raw, numpy.timedelta64(60, 's'), '74pct')

    def check_single_point(serie):
        self.assertEqual(1, len(serie))
        self.assertEqual(5.48, serie[datetime64(2014, 1, 1, 12, 0, 0)][1])

    ts = build_resampled()
    check_single_point(ts)

    # Serialize and unserialize
    key = ts.get_split_key()
    o, s = ts.serialize(key)
    saved_ts = carbonara.AggregatedTimeSerie.unserialize(
        s, key, ts.aggregation)
    self.assertEqual(ts.aggregation, saved_ts.aggregation)

    ts = build_resampled()
    saved_ts.merge(ts)

    # NOTE(review): these checks look at ``ts`` rather than the merged
    # ``saved_ts`` -- confirm that is the intended target.
    check_single_point(ts)
def test_fetch_nano(self):
    """Aggregation on a 200 ms grid with sub-second timestamps."""
    ts = {'sampling': numpy.timedelta64(200, 'ms'),
          'size': 10, 'agg': 'mean'}
    tsb = carbonara.BoundTimeSerie(block_size=ts['sampling'])

    # same callback for both batches
    on_truncate = functools.partial(self._resample_and_merge, agg_dict=ts)

    first_batch = numpy.array([
        (datetime64(2014, 1, 1, 11, 46, 0, 200123), 4),
        (datetime64(2014, 1, 1, 11, 46, 0, 340000), 8),
        (datetime64(2014, 1, 1, 11, 47, 0, 323154), 50),
        (datetime64(2014, 1, 1, 11, 48, 0, 590903), 4),
        (datetime64(2014, 1, 1, 11, 48, 0, 903291), 4)],
        dtype=carbonara.TIMESERIES_ARRAY_DTYPE)
    tsb.set_values(first_batch, before_truncate_callback=on_truncate)

    second_batch = numpy.array([
        (datetime64(2014, 1, 1, 11, 48, 0, 821312), 5)],
        dtype=carbonara.TIMESERIES_ARRAY_DTYPE)
    tsb.set_values(second_batch, before_truncate_callback=on_truncate)

    expected_points = [
        (datetime64(2014, 1, 1, 11, 46, 0, 200000), 6.0),
        (datetime64(2014, 1, 1, 11, 47, 0, 200000), 50.0),
        (datetime64(2014, 1, 1, 11, 48, 0, 400000), 4.0),
        (datetime64(2014, 1, 1, 11, 48, 0, 800000), 4.5)
    ]
    self.assertEqual(expected_points, list(ts['return'].fetch()))

    # granularity is reported in nanoseconds
    self.assertEqual(numpy.timedelta64(200000000, 'ns'),
                     ts['return'].aggregation.granularity)
def time_features_enricher(dataset):
    """
    Feature engineering on time related fields.

    Adds parsed datetime columns plus day-of-week/hour/month features for the
    search time, check-in and check-out dates, the booking window (whole days
    between search and check-in, 1000 when unknown) and the length of stay in
    days.  The frame is modified in place; nothing is returned.

    :param dataset: train/test dataset
    """
    dataset['date_time_dt'] = pd.to_datetime(dataset.date_time, format='%Y-%m-%d %H:%M:%S')
    dataset['date_time_dow'] = dataset.date_time_dt.dt.dayofweek
    dataset['date_time_hour'] = dataset.date_time_dt.dt.hour
    dataset['date_time_month'] = dataset.date_time_dt.dt.month

    # handle one error format case in test set
    dataset.loc[dataset.srch_ci == '2161-10-00', 'srch_ci'] = '2016-01-20'
    dataset['srch_ci_dt'] = pd.to_datetime(dataset.srch_ci, format='%Y-%m-%d')
    dataset['srch_ci_dow'] = dataset.srch_ci_dt.dt.dayofweek
    dataset['srch_ci_month'] = dataset.srch_ci_dt.dt.month

    dataset['srch_co_dt'] = pd.to_datetime(dataset.srch_co, format='%Y-%m-%d')
    dataset['srch_co_dow'] = dataset.srch_co_dt.dt.dayofweek
    dataset['srch_co_month'] = dataset.srch_co_dt.dt.month

    one_day = np.timedelta64(1, 'D')

    # fillna + astype on the Series itself: ``map(int, ...)`` is a lazy
    # iterator on Python 3 and an in-place fillna on a selected column is not
    # guaranteed to reach the frame.  astype truncates toward zero like int().
    booking_window = (dataset['srch_ci_dt'] - dataset['date_time_dt']) / one_day
    dataset['booking_window'] = booking_window.fillna(1000).astype('int64')

    dataset['length_of_stay'] = (dataset['srch_co_dt'] - dataset['srch_ci_dt']) / one_day
def function(self, simulation, period):
    """Compute ``exoneration_cotisations_zrr`` for the month of *period*.

    Eligible are CDI contracts, or CDD contracts lasting more than 365 days,
    in firms of at most 50 employees located in a
    ``zone_revitalisation_rurale``, during the first year of the contract.

    Returns:
        ``(period, exoneration)`` where *period* is ``period.this_month``.
    """
    period = period.this_month
    assiette = simulation.calculate('assiette_allegement', period)
    duree_contrat = simulation.calculate('contrat_de_travail_duree', period)  # 0: CDI, 1:CDD
    debut = simulation.calculate('contrat_de_travail_debut', period)
    fin = simulation.calculate('contrat_de_travail_fin', period)
    effectif = simulation.calculate('effectif_entreprise', period)
    smic = simulation.calculate('smic_proratise', period)
    zrr = simulation.calculate('zone_revitalisation_rurale', period)

    # TODO: move to legislation parameters file
    cdd_de_plus_d_un_an = fin > debut + timedelta64(365, 'D')

    # boolean arrays: ``+`` acts as OR and ``*`` as AND
    cdi = duree_contrat == 0
    cdd_eligible = (duree_contrat == 1) * (cdd_de_plus_d_un_an)
    contrat_eligible = cdi + cdd_eligible

    ecoule = datetime64(period.start) + timedelta64(1, 'D') - debut
    premiere_annee = ecoule.astype('timedelta64[Y]') < timedelta64(1, 'Y')

    eligible = contrat_eligible * (effectif <= 50) * zrr * premiere_annee

    # TODO: move to legislation parameters file
    if period.start.year < 2015:
        taux_max = .281
    else:
        taux_max = .2655
    seuil_max = 2.4
    seuil_min = 1.5

    taux = compute_taux_exoneration(assiette, smic, taux_max, seuil_max, seuil_min)
    exoneration = taux * assiette * eligible
    return period, exoneration
def transform_data(train, test):
    """
    Transform train and test data to include new variables.

    Both frames are modified in place: calendar features derived from the
    ``time`` column (minutes counted from an arbitrary start date), a log
    scaled ``accuracy`` and two combinations of ``x`` and ``y``.

    Returns:
        The same ``(train, test)`` frames.
    """
    # Time
    # Arbitrary date chosen; the unit is given positionally, which is the
    # documented np.datetime64 signature.
    initial_date = np.datetime64('2014-01-01T01:01', 'm')

    for frame in (train, test):
        # Whole minutes (truncated like int()) converted in one vectorised
        # step instead of one np.timedelta64 per row.
        minutes = np.asarray(frame['time'].values).astype('int64')
        d_times = pd.DatetimeIndex(initial_date + minutes.astype('timedelta64[m]'))
        frame['hour'] = d_times.hour
        frame['weekday'] = d_times.weekday
        frame['day_of_month'] = d_times.day
        frame['month'] = d_times.month
        frame['year'] = d_times.year

    # Accuracy
    for frame in (train, test):
        frame['accuracy'] = np.log10(frame['accuracy']) * 10.0

    # Combine x and y attributes
    eps = 0.00001  # avoids a division by zero when y == 0
    for frame in (train, test):
        frame['x_d_y'] = frame.x.values / (frame.y.values + eps)
    for frame in (train, test):
        frame['x_t_y'] = frame.x.values * frame.y.values

    # Return data
    return train, test
def test_decode_standard_calendar_multidim_time_inside_timestamp_range(
        calendar, enable_cftimeindex):
    """Decode a 2-D array of CF times that fit in the Timestamp range.

    Two daily date ranges are encoded with ``cftime.date2num``, stored as the
    columns of one array and decoded again; the result must be ``M8[ns]`` and
    match the original dates to within one second.
    """
    if enable_cftimeindex:
        pytest.importorskip('cftime')

    cftime = _import_cftime()

    units = 'days since 0001-01-01'
    date_ranges = [
        pd.date_range('2001-04-01', end='2001-04-05', freq='D'),
        pd.date_range('2001-05-01', end='2001-05-05', freq='D'),
    ]

    # one column per date range
    mdim_time = np.empty((len(date_ranges[0]), 2), )
    for column, dates in enumerate(date_ranges):
        mdim_time[:, column] = cftime.date2num(
            dates.to_pydatetime(), units, calendar=calendar)

    actual = coding.times.decode_cf_datetime(
        mdim_time, units, calendar=calendar,
        enable_cftimeindex=enable_cftimeindex)
    assert actual.dtype == np.dtype('M8[ns]')

    abs_diffs = [abs(actual[:, column] - dates.values)
                 for column, dates in enumerate(date_ranges)]

    # once we no longer support versions of netCDF4 older than 1.1.5,
    # we could do this check with near microsecond accuracy:
    # https://github.com/Unidata/netcdf4-python/issues/355
    tolerance = np.timedelta64(1, 's')
    for abs_diff in abs_diffs:
        assert (abs_diff <= tolerance).all()
def PlotEwmaPredictions(daily, name):
    """Plot the observed prices and a one-year linear EWMA extrapolation.

    The slope is an exponentially weighted moving average (span 180) of the
    day-to-day differences of ``ppg``; the prediction starts from the last
    ``ewma`` value of the filled series and extends 365 days past its end.

    Args:
        daily: frame with a ``ppg`` column, passed to
            ``timeseries.FillMissing``.
        name: label used for the scatter plot of the observed values.
    """
    # use EWMA to estimate slopes
    filled = timeseries.FillMissing(daily)
    diffs = filled.ppg.diff()
    if hasattr(diffs, 'ewm'):
        # ``pandas.ewma`` was removed; ``ewm(...).mean()`` is its replacement
        filled['slope'] = diffs.ewm(span=180).mean()
    else:
        # very old pandas without the ``ewm`` accessor
        filled['slope'] = pandas.ewma(diffs, span=180)

    # extract the last inter and slope (by position: the index holds dates)
    start = filled.index[-1]
    inter = filled.ewma.iloc[-1]
    slope = filled.slope.iloc[-1]

    # reindex the DataFrame, adding a year to the end
    dates = pandas.date_range(filled.index.min(),
                              filled.index.max() + np.timedelta64(365, 'D'))
    predicted = filled.reindex(dates)

    # generate predicted values and add them to the end
    predicted['date'] = predicted.index
    one_day = np.timedelta64(1, 'D')
    predicted['days'] = (predicted.date - start) / one_day
    predict = inter + slope * predicted.days
    # assign the result back instead of filling a selected column in place
    predicted['ewma'] = predicted.ewma.fillna(predict)

    # plot the actual values and predictions
    thinkplot.Scatter(daily.ppg, alpha=0.1, label=name)
    thinkplot.Plot(predicted.ewma)
    thinkplot.Save()
def get_holidays(self, start, end, cal="FX"):
    """Return the holidays between *start* and *end* as a ``DatetimeIndex``.

    Args:
        start: first date of interest (anything ``np.datetime64`` and
            ``pandas.Timestamp`` accept).
        end: last date of interest.
        cal: ``"FX"`` yields Christmas Day and New Year's Day of every year
            in the range; ``"WEEKDAY"`` yields the Saturdays and Sundays.
            Any other value yields an empty index.

    Returns:
        Sorted holidays lying within one day of the requested range.
    """
    # TODO use Pandas CustomBusinessDays to get more calendars
    holidays_list = []

    if cal == "FX":
        # filter for Christmas & New Year's Day
        # Years follow the requested range rather than a fixed 1970-2019
        # window; one extra year covers the "end + 1 day" bound below.
        first_year = pandas.Timestamp(start).year
        last_year = pandas.Timestamp(end).year + 1
        for year in range(first_year, last_year + 1):
            holidays_list.append(str(year) + "-12-25")
            holidays_list.append(str(year) + "-01-01")

    if cal == "WEEKDAY":
        bday = CustomBusinessDay(weekmask="Sat Sun")
        holidays_list = pandas.date_range(start, end, freq=bday)

    # ``sort_values`` replaces the removed ``Index.order``
    holidays_list = pandas.to_datetime(holidays_list).sort_values()

    # floor start date
    start = np.datetime64(start) - np.timedelta64(1, "D")

    # ceiling end date
    end = np.datetime64(end) + np.timedelta64(1, "D")

    holidays_list = [x for x in holidays_list if x >= start and x <= end]

    return pandas.to_datetime(holidays_list)
def test_timedelta_ops_scalar(self):
    """Equivalent timedelta scalars shift a timestamp identically."""
    # GH 6808
    scenarios = [
        (pd.to_datetime('20130101 09:01:12.123456'),
         pd.to_datetime('20130101 09:01:22.123456'),
         pd.to_datetime('20130101 09:01:02.123456'),
         [pd.to_timedelta(10, unit='s'),
          timedelta(seconds=10),
          np.timedelta64(10, 's'),
          np.timedelta64(10000000000, 'ns'),
          pd.offsets.Second(10)]),
        (pd.to_datetime('20130102 09:01:12.123456'),
         pd.to_datetime('20130103 09:01:22.123456'),
         pd.to_datetime('20130101 09:01:02.123456'),
         [pd.to_timedelta('1 day, 00:00:10'),
          pd.to_timedelta('1 days, 00:00:10'),
          timedelta(days=1, seconds=10),
          np.timedelta64(1, 'D') + np.timedelta64(10, 's'),
          pd.offsets.Day() + pd.offsets.Second(10)]),
    ]

    for base, expected_add, expected_sub, offsets in scenarios:
        for offset in offsets:
            assert base + offset == expected_add
            assert base - offset == expected_sub
def test_timedelta(self, freq):
    """Shifting a ``DatetimeIndex`` by ``timedelta`` and checking its freq.

    Also covers GH4134: subtracting offsets / ``np.timedelta64`` values from
    an index and from a Series must agree.
    """
    plus_one_day = timedelta(1)
    minus_one_day = timedelta(-1)

    index = date_range('1/1/2000', periods=50, freq=freq)
    shifted = index + plus_one_day
    back = shifted + minus_one_day
    tm.assert_index_equal(index, back)

    if freq == 'D':
        daily = pd.tseries.offsets.Day(1)
        for candidate in (index, shifted, back):
            assert candidate.freq == daily
    else:  # freq == 'B'
        business_daily = pd.tseries.offsets.BusinessDay(1)
        assert index.freq == business_daily
        assert shifted.freq is None
        assert back.freq == pd.tseries.offsets.BusinessDay(1)

    result = index - plus_one_day
    expected = index + minus_one_day
    tm.assert_index_equal(result, expected)

    # GH4134, buggy with timedeltas
    rng = date_range('2013', '2014')
    s = Series(rng)
    via_index_offset = rng - pd.offsets.Hour(1)
    via_series_td64 = DatetimeIndex(s - np.timedelta64(100000000))
    via_index_td64 = rng - np.timedelta64(100000000)
    via_series_offset = DatetimeIndex(s - pd.offsets.Hour(1))
    tm.assert_index_equal(via_index_offset, via_series_offset)
    tm.assert_index_equal(via_series_td64, via_index_td64)
def test_is_datetimelike_array_all_nan_nat_like(self):
    """Datetime-like detection on arrays made only of NaN/NaT values."""
    # (values, dtype, is_datetime, is_datetime64, is_timedelta)
    scenarios = [
        ([np.nan, pd.NaT, np.datetime64('nat')], None,
         True, True, False),
        ([np.nan, pd.NaT, np.timedelta64('nat')], None,
         False, False, True),
        ([np.nan, pd.NaT, np.datetime64('nat'), np.timedelta64('nat')], None,
         False, False, False),
        ([np.nan, pd.NaT], None,
         True, True, True),
        ([np.nan, np.nan], object,
         False, False, False),
    ]
    for values, dtype, is_dt, is_dt64, is_td in scenarios:
        if dtype is None:
            arr = np.array(values)
        else:
            arr = np.array(values, dtype=dtype)
        assert bool(lib.is_datetime_array(arr)) == is_dt
        assert bool(lib.is_datetime64_array(arr)) == is_dt64
        assert bool(lib.is_timedelta_or_timedelta64_array(arr)) == is_td

    # a single shared timezone is detected, mixed timezones are not
    same_tz = np.array([pd.Timestamp('20130101', tz='US/Eastern'),
                        pd.Timestamp('20130102', tz='US/Eastern')],
                       dtype=object)
    assert lib.is_datetime_with_singletz_array(same_tz)

    mixed_tz = np.array([pd.Timestamp('20130101', tz='US/Eastern'),
                         pd.Timestamp('20130102', tz='CET')],
                        dtype=object)
    assert not lib.is_datetime_with_singletz_array(mixed_tz)
def BCEuropeanCallDirichlet(self, V, t, op):
    """Apply Dirichlet boundary values for a European call to *V* in place.

    The lower boundary is set to 0 and the upper boundary to
    ``S[-1] - K * exp(-r * t)``, with *t* converted to years of 365 days.

    Args:
        V: value vector; its first and last entries are overwritten.
        t: time span as a timedelta that can be divided by
            ``np.timedelta64``.
        op: option object providing ``getStrike()``.

    Returns:
        The same array *V*.
    """
    # Convert to a float number of years *before* multiplying by the rate:
    # float * timedelta64 is truncated to whole ticks of t's unit, which
    # loses precision for coarse units such as days.
    years = t / np.timedelta64(365, 'D')
    V[0] = 0
    V[-1] = self.S[-1] - op.getStrike() * np.exp(-self.getR() * years)
    return V
def test_convert_timedelta_type_non_pandas_types() -> None:
    """``convert_timedelta_type`` handles stdlib and numpy timedeltas."""
    # 3000 days
    stdlib_delta = datetime.timedelta(3000)
    assert bus.convert_timedelta_type(stdlib_delta) == 259200000000.0

    # 3000 milliseconds
    numpy_delta = np.timedelta64(3000, 'ms')
    assert bus.convert_timedelta_type(numpy_delta) == 3000.
import numpy as np
import pandas as pd
from datetime import datetime
import pytest
import empyrical

from vectorbt import defaults
from vectorbt.records.drawdowns import Drawdowns

from tests.utils import isclose

# Shared fixtures for the tests in this module.

# unit-less timedelta64 holding 86400000000000 ticks
day_dt = np.timedelta64(86400000000000)

# five consecutive days starting 2018-01-01
index = pd.DatetimeIndex([datetime(2018, 1, day) for day in range(1, 6)])

# rising, falling and peaked series
ts = pd.DataFrame(
    {
        'a': [1, 2, 3, 4, 5],
        'b': [5, 4, 3, 2, 1],
        'c': [1, 2, 3, 2, 1],
    },
    index=index,
)
ret = ts.pct_change()

defaults.returns['year_freq'] = '252 days'  # same as empyrical
def summary_data_from_transaction_data(transactions, customer_id_col, datetime_col,
                                       monetary_value_col=None, datetime_format=None,
                                       observation_period_end=None, freq='D',
                                       freq_multiplier=1):
    """
    Build a per-customer summary table from raw transaction rows.

    Input rows have the form ``customer_id, datetime [, monetary_value]``;
    the result has the form
    ``customer_id, frequency, recency, T [, monetary_value]``.

    Parameters
    ----------
    transactions: :obj: DataFrame
        DataFrame holding at least the customer id and datetime columns.
    customer_id_col: string
        name of the column identifying the customer.
    datetime_col: string
        name of the column holding the purchase datetime.
    monetary_value_col: string, optional
        name of the column holding the transaction value. Only needed when a
        ``monetary_value`` column is wanted in the result.
    observation_period_end: datetime, optional
        string or datetime marking the end of the study. When omitted, the
        maximum of ``datetime_col`` is used.
    datetime_format: string, optional
        explicit timestamp format handed to ``pd.to_datetime``.
    freq: string, optional
        Default 'D' for days, 'W' for weeks, 'M' for months... etc. Full list here:
        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects
    freq_multiplier: int, optional
        Default 1. ``recency`` and ``T`` are divided by this value, which
        allows e.g. ``freq='D'`` with ``freq_multiplier=7`` to yield
        fractional weeks instead of whole weeks.

    Returns
    -------
    :obj: DataFrame:
        customer_id, frequency, recency, T [, monetary_value]
    """
    # Resolve the end of the observation window, then snap it to the start
    # of its `freq` period.
    end_source = (transactions[datetime_col].max()
                  if observation_period_end is None
                  else observation_period_end)
    observation_period_end = (
        pd.to_datetime(end_source, format=datetime_format)
        .to_period(freq)
        .to_timestamp()
    )

    # Label every transaction as first/repeated.
    repeated_transactions = _find_first_transactions(
        transactions,
        customer_id_col,
        datetime_col,
        monetary_value_col,
        datetime_format,
        observation_period_end,
        freq
    )
    # Convert the period-valued datetime column back to timestamps.
    period_index = pd.Index(repeated_transactions[datetime_col])
    repeated_transactions[datetime_col] = period_index.to_timestamp()

    # First purchase, last purchase and number of orders per customer.
    per_customer_dates = repeated_transactions.groupby(customer_id_col, sort=False)[datetime_col]
    customers = per_customer_dates.agg(['min', 'max', 'count'])

    # The first order does not count as a repeat purchase.
    customers['frequency'] = customers['count'] - 1

    one_period = np.timedelta64(1, freq)
    customers['T'] = (observation_period_end - customers['min']) / one_period / freq_multiplier
    customers['recency'] = (customers['max'] - customers['min']) / one_period / freq_multiplier

    summary_columns = ['frequency', 'recency', 'T']

    if monetary_value_col:
        # Blank out the value of each first purchase so that it is ignored
        # by the per-customer mean below.
        is_first = repeated_transactions['first']
        first_purchases = repeated_transactions[is_first].index
        repeated_transactions.loc[first_purchases, monetary_value_col] = np.nan

        mean_value = repeated_transactions.groupby(customer_id_col)[monetary_value_col].mean()
        customers['monetary_value'] = mean_value.fillna(0)
        summary_columns.append('monetary_value')

    return customers[summary_columns].astype(float)
# from bokeh.models.ranges import FactorRange import numpy as np from datetime import date, datetime, timedelta def nptodt(dt): return datetime.strftime( datetime.strptime(str(dt), '%Y-%m-%d').date(), '%b-%d') output_file('bokehtrial.html') N = 50 stdate = np.datetime64('2020-01-30') enddate = np.datetime64( (datetime.utcnow() + timedelta(hours=5, minutes=30)).date()) N = int((enddate - stdate) / np.timedelta64(1, 'D')) x = [str(x) for x in range(1, N + 1)] dates = [nptodt(x) for x in np.arange(stdate, stdate + N)] stdate = date(2020, 1, 30) enddate = (datetime.utcnow() + timedelta(hours=5, minutes=30) - timedelta(days=1)).date() # Confirmed newconf = np.array([np.random.randint(N) for i in range(N)]) cumconf = np.zeros(N) cumconf[0] = newconf[0] for i in range(1, N): cumconf[i] = newconf[i] + cumconf[i - 1] oldconf = list(cumconf - newconf) cumconf = list(cumconf) newconf = list(newconf)
def moving_average(time_series, window_len):
    """Calculates the moving average of an unevenly spaced time series.

    Each value is weighted by the time it remained unchanged, which
    conceptually matches smart recording on GPS devices: a sample is taken
    when some value changes sufficiently, so until the next sample the
    previous one is assumed to be roughly constant. The term "area" below
    means a sum of time-weighted values.

    This implementation follows the SMA_last algorithm proposed in
    (Eckner, 2017) (see README for citation).

    The input series is not modified.

    Args:
      time_series: A pandas.Series of the values to average, indexed with
          timestamps.
      window_len: The size of the moving average window, in seconds.

    Returns:
      A numpy array of length len(time_series) containing the moving average
      values (an empty array for an empty series).
    """
    n_samples = len(time_series)
    if n_samples == 0:
        return np.zeros(0)

    # Whole seconds elapsed since the first sample. Kept in a local array so
    # that the caller's index is left untouched.
    elapsed = time_series.index - time_series.index[0]
    times = np.asarray((elapsed / np.timedelta64(1, 's')).astype('int'))
    values = np.asarray(time_series)

    window_area = values[0] * window_len

    # It may not always be possible to construct a window of length exactly
    # equal to window_len using timestamps present in the data. To handle
    # this, the left side of the window is allowed to fall between timestamps
    # (the right side is always fixed to a timestamp in the data). left_area
    # is the area of that inter-timestamp region on the left side.
    left_area = window_area

    out = np.zeros(n_samples)
    out[0] = values[0]

    # `left` is the left side of the window and `right` is the right side.
    left = 0
    for right in range(1, n_samples):
        # Remove the last iteration's left_area, as a new right window bound
        # may change the left_area required in this iteration.
        window_area -= left_area

        # Expand window to the right.
        window_area += values[right - 1] * (times[right] - times[right - 1])

        # Shrink window from the left if expanding to the right has created
        # too large a window. new_left_time may fall between timestamps
        # present in the data, which is handled by left_area.
        new_left_time = times[right] - window_len
        while times[left] < new_left_time:
            window_area -= values[left] * (times[left + 1] - times[left])
            left += 1

        # Add left side inter-timestamp area to window.
        left_area = values[max(0, left - 1)] * (times[left] - new_left_time)
        window_area += left_area

        out[right] = window_area / window_len

    return out
def BCEuropeanPutDirichlet(self, V, t, op):
    """Apply Dirichlet boundary values for a European put to ``V`` in place.

    The first entry of ``V`` is set to ``strike * exp(-r * tau)``, where
    ``tau`` is ``t`` expressed as a fraction of a 365-day year, and the last
    entry is set to 0.

    Parameters
    ----------
    V : indexable
        Value vector; its first and last entries are overwritten.
    t : timedelta-like
        Duration that can be divided by ``np.timedelta64(365, 'D')``.
    op : object
        Must provide ``getStrike()``.

    Returns
    -------
    The same ``V`` object that was passed in.
    """
    # Convert the duration to a float year fraction *before* multiplying by
    # the rate: multiplying a float by np.timedelta64 yields a timedelta64
    # truncated to whole time units, which silently corrupts the exponent.
    year_fraction = t / np.timedelta64(365, 'D')
    discount = np.exp(-self.getR() * year_fraction)
    V[0] = op.getStrike() * discount
    V[-1] = 0
    return V
def getMatchingEvents(self, solve=True):
    """Return a list of dictionaries matching input parameters.

    The search page for the year/quarter of ``self.jptime`` is downloaded,
    every ``<option>`` entry is parsed into an event, and the events within
    ``self.radius`` of (``self.lat``, ``self.lon``) and within ``self.dt``
    seconds of ``self.jptime`` are kept.

    Args:
        solve (bool):
            If set to True, then this method should return a list
            with a maximum of one event.

    Returns:
        list: List of event dictionaries, with fields:
              - time Event time (UTC)
              - lat Event latitude
              - lon Event longitude
              - depth Event depth
              - mag Event magnitude
              - cgi_value The ``value`` attribute of the matching option
    """
    # Build the search URL for the year and (zero-padded) quarter.
    quarter = str(QUARTERS[self.jptime.month])
    if len(quarter) == 1:
        quarter = '0' + quarter
    url = SEARCH_URL.replace('YEAR', str(self.jptime.year))
    url = url.replace('QUARTER', quarter)

    page = requests.get(url).text
    soup = BeautifulSoup(page, features="lxml")
    options = soup.find('select').find_all('option')

    # One tuple per parsed option: (time, lat, lon, depth, mag, value).
    rows = []
    for option in options:
        if 'Data not found' in option.text:
            break
        eventstr = option.contents[0]
        timestr, latstr, lonstr, depstr, magstr = [
            re.search(pattern, eventstr).group()
            for pattern in (TIMEPAT, LATPAT, LONPAT, DEPPAT, MAGPAT)
        ]
        lat = float(latstr.replace('N', ''))
        lon = float(lonstr.replace('E', ''))
        depth = float(depstr.replace('km', ''))
        mag = float(magstr.replace('M', ''))
        etime = datetime.strptime(timestr, TIMEFMT)
        rows.append((np.datetime64(etime), lat, lon, depth, mag,
                     option.get('value')))

    events = []
    if not rows:
        return events

    times, lats, lons, depths, mags, values = [
        np.array(column) for column in zip(*rows)
    ]

    # Spatial filter.
    distances = geodetic_distance(self.lon, self.lat, lons, lats)
    near = distances <= self.radius

    # Temporal filter on the absolute time difference.
    dtimes = np.abs(np.datetime64(self.jptime) - times)
    recent = dtimes <= np.timedelta64(int(self.dt), 's')

    keep = near & recent
    matched = zip(times[keep], lats[keep], lons[keep],
                  depths[keep], mags[keep], values[keep])
    for etime, elat, elon, edep, emag, evalue in matched:
        # Shift the catalogue time by JST_OFFSET to obtain the reported time.
        utime = UTCDateTime(str(etime)) - JST_OFFSET
        events.append({
            'time': utime,
            'lat': elat,
            'lon': elon,
            'depth': edep,
            'mag': emag,
            'cgi_value': evalue
        })

    if solve and len(events) > 1:
        events = [self.solveEvents(events)]

    return events
def test_infer_dtype_datetime(self):
    """lib.infer_dtype on arrays of datetime-like scalars."""

    def inferred(elements, skipna=True, **array_kwargs):
        # Build the array exactly as given, then infer its dtype label.
        return lib.infer_dtype(np.array(elements, **array_kwargs),
                               skipna=skipna)

    assert inferred([Timestamp('2011-01-01'),
                     Timestamp('2011-01-02')]) == 'datetime'
    assert inferred([np.datetime64('2011-01-01'),
                     np.datetime64('2011-01-01')],
                    dtype=object) == 'datetime64'
    assert inferred([datetime(2011, 1, 1),
                     datetime(2012, 2, 1)]) == 'datetime'

    # starts with nan
    for n in [pd.NaT, np.nan]:
        for value, expected in [(pd.Timestamp('2011-01-02'), 'datetime'),
                                (np.datetime64('2011-01-02'), 'datetime64'),
                                (datetime(2011, 1, 1), 'datetime')]:
            assert inferred([n, value]) == expected

        # missing value on both sides
        for value, expected in [(pd.Timestamp('2011-01-02'), 'datetime'),
                                (np.datetime64('2011-01-02'), 'datetime64'),
                                (datetime(2011, 1, 1), 'datetime')]:
            assert inferred([n, value, n]) == expected

    # different type of nat
    assert inferred([np.timedelta64('nat'), np.datetime64('2011-01-02')],
                    skipna=False, dtype=object) == 'mixed'
    assert inferred([np.datetime64('2011-01-02'), np.timedelta64('nat')],
                    skipna=False, dtype=object) == 'mixed'

    # mixed datetime
    assert inferred([datetime(2011, 1, 1),
                     pd.Timestamp('2011-01-02')]) == 'datetime'

    # should be datetime?
    assert inferred([np.datetime64('2011-01-01'),
                     pd.Timestamp('2011-01-02')]) == 'mixed'
    assert inferred([pd.Timestamp('2011-01-02'),
                     np.datetime64('2011-01-01')]) == 'mixed'

    # datetime mixed with numbers or strings
    assert inferred([np.nan, pd.Timestamp('2011-01-02'), 1]) == 'mixed-integer'
    assert inferred([np.nan, pd.Timestamp('2011-01-02'), 1.1]) == 'mixed'
    assert inferred([np.nan, '2011-01-01',
                     pd.Timestamp('2011-01-02')]) == 'mixed'
# Flag, for every row, whether its person also appears in each of the
# activities_6 / activities_8 / activities_10 collections.
for suffix, same_time_people in (('6', activities_6),
                                 ('8', activities_8),
                                 ('10', activities_10)):
    t = set(same_time_people)
    train_data['same_time_activ_' + suffix] = train_data.people_id.apply(
        lambda person, members=t: person in members)

# number of selected activities per person
train_data['occur'] = train_data.people_id.apply(
    dict(train_data.people_id.value_counts()).get)

# mean of the time interval between activities
# The sum of consecutive differences of the sorted distinct dates telescopes
# to (latest - earliest); computing it directly avoids relying on the
# iteration order of a set, which is arbitrary.
for pep, dates in train_data.groupby('people_id')['date_x']:
    if dates.nunique() > 1:
        span_days = (dates.max() - dates.min()) / np.timedelta64(1, 'D')
        mean_time = span_days / (len(dates) - 1)
    else:
        mean_time = 0
    people.loc[people.people_id == pep, 'mean_time'] = mean_time

train_data = pd.merge(train_data, people.loc[:, ['people_id', 'mean_time']],
                      on='people_id')

# percentage of groups that are in the test and not in the train
# (bare expression: only displays a value when run interactively)
test_train.loc[test_train.group_1.isin(groups) == False, 'group_1'].shape[0] / test_train.shape[0]

# the first and the last activity selected
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...)
# is the supported spelling.
activity_by_date = ['people_id', 'date_x', 'activity_category']

first_activitie = (train_data.loc[:, activity_by_date]
                   .sort_values(by=['people_id', 'date_x'])
                   .drop_duplicates(['people_id'], keep='first'))
first_activitie.rename(columns={'activity_category': 'first activity'},
                       inplace=True)
first_activitie.drop('date_x', axis=1, inplace=True)

last_activity = (train_data.loc[:, activity_by_date]
                 .sort_values(by=['people_id', 'date_x'])
                 .drop_duplicates(['people_id'], keep='last'))
last_activity.rename(columns={'activity_category': 'last_activity'},
                     inplace=True)
last_activity.drop('date_x', axis=1, inplace=True)
def gantt(data, monthly=True):
    """
    Make a Gantt plot, which shows the temporal data availability for each station.

    Parameters
    ----------
    data : pandas DataFrame
        A Pandas daily DataFrame with DatetimeIndex where each column corresponds to a station.
    monthly : boolean, default True
        If True, availability is evaluated per month (a month is kept when it
        has fewer than 7 missing days), which gives a more fluid graph.
        If False, availability is evaluated per day.

    Returns
    -------
    fig : plotly Figure
    """

    def _ymd(stamp):
        # 'year-month-day' without zero padding, as expected downstream.
        return str(stamp.year) + '-' + str(stamp.month) + '-' + str(stamp.day)

    def _is_consecutive(previous, current):
        # Month steps are counted on the calendar: np.timedelta64(1, 'M') is
        # not a fixed duration and is rejected by recent pandas versions.
        if monthly:
            months_apart = ((current.year - previous.year) * 12
                            + (current.month - previous.month))
            return months_apart == 1
        return round((current - previous) / np.timedelta64(1, 'D'), 0) == 1

    # Re-index on a gap-free daily range so missing days show up as NaN.
    date_index = pd.date_range(data.index[0], data.index[-1], freq='D')
    data = data.reindex(date_index)

    resource = 'Available data'
    periods = []
    for column in data.columns:
        series = data[column]
        if monthly:
            missing = series.isnull().groupby(
                pd.Grouper(freq='1MS')).sum().to_frame()
            # A month with 7 or more missing days is considered missing.
            series_drop = missing.loc[missing[column] < 7]
        else:
            series_drop = series.dropna()

        if series_drop.shape[0] > 1:
            stamps = series_drop.index
            start = stamps[0]
            # Close the current period whenever two kept stamps are not
            # adjacent, and open a new one at the later stamp.
            for previous, current in zip(stamps[:-1], stamps[1:]):
                if not _is_consecutive(previous, current):
                    periods.append(
                        dict(Task=column, Start=_ymd(start),
                             Finish=_ymd(previous), Resource=resource))
                    start = current
            periods.append(
                dict(Task=column, Start=_ymd(start),
                     Finish=_ymd(stamps[-1]), Resource=resource))
        else:
            print('Station {} has no months with significant data'.format(
                column))

    periods = pd.DataFrame(periods)
    start_year = periods['Start'].apply(lambda x: int(x[:4])).min()
    # The axis must reach the latest *finish*, not the latest start.
    finish_year = periods['Finish'].apply(lambda x: int(x[:4])).max()

    colors = {'Available data': 'rgb(0,191,255)'}
    fig = ff.create_gantt(periods,
                          colors=colors,
                          index_col='Resource',
                          show_colorbar=True,
                          showgrid_x=True,
                          showgrid_y=True,
                          group_tasks=True)

    # One tick every two years, labelled with the year.
    ticks = pd.date_range('1/1/' + str(start_year),
                          '12/31/' + str(finish_year + 1),
                          freq='2AS')
    fig.layout.xaxis.tickvals = ticks
    fig.layout.xaxis.ticktext = ticks.year
    return fig
def test_infer_dtype_all_nan_nat_like(self):
    """lib.infer_dtype on arrays that hold only missing-value markers."""

    def inferred(elements, skipna, **array_kwargs):
        # Build the array exactly as given, then infer its dtype label.
        return lib.infer_dtype(np.array(elements, **array_kwargs),
                               skipna=skipna)

    assert inferred([np.nan, np.nan], skipna=True) == 'floating'

    # a nan and None mix results in mixed
    for elements in ([np.nan, np.nan, None], [None, np.nan, np.nan]):
        arr = np.array(elements)
        assert lib.infer_dtype(arr, skipna=True) == 'empty'
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

    # pd.NaT
    for elements in ([pd.NaT],
                     [pd.NaT, np.nan],
                     [np.nan, pd.NaT],
                     [np.nan, pd.NaT, np.nan],
                     [None, pd.NaT, None]):
        assert inferred(elements, skipna=False) == 'datetime'

    # np.datetime64(nat)
    assert inferred([np.datetime64('nat')], skipna=False) == 'datetime64'
    for n in [np.nan, pd.NaT, None]:
        assert inferred([n, np.datetime64('nat'), n],
                        skipna=False) == 'datetime64'
        assert inferred([pd.NaT, n, np.datetime64('nat'), n],
                        skipna=False) == 'datetime64'

    # np.timedelta64(nat)
    assert inferred([np.timedelta64('nat')], skipna=False,
                    dtype=object) == 'timedelta'
    for n in [np.nan, pd.NaT, None]:
        assert inferred([n, np.timedelta64('nat'), n],
                        skipna=False) == 'timedelta'
        assert inferred([pd.NaT, n, np.timedelta64('nat'), n],
                        skipna=False) == 'timedelta'

    # datetime / timedelta mixed
    assert inferred([pd.NaT, np.datetime64('nat'),
                     np.timedelta64('nat'), np.nan],
                    skipna=False) == 'mixed'
    assert inferred([np.timedelta64('nat'), np.datetime64('nat')],
                    skipna=False, dtype=object) == 'mixed'
# Unit codes passed as the second argument to np.datetime64 / np.timedelta64
# when building the NaT values below.
m8_units = ["as", "ps", "ns", "us", "ms", "s", "m", "h", "D", "W", "M", "Y"]

# Missing-value markers: None, NaT, float/complex NaNs of several widths,
# unit-less datetime64/timedelta64 NaT, plus one datetime64 NaT and one
# timedelta64 NaT for every unit in m8_units.
na_vals = (
    [
        None,
        NaT,
        float("NaN"),
        complex("NaN"),
        np.nan,
        np.float64("NaN"),
        np.float32("NaN"),
        np.complex64(np.nan),
        np.complex128(np.nan),
        np.datetime64("NaT"),
        np.timedelta64("NaT"),
    ]
    + [np.datetime64("NaT", unit) for unit in m8_units]
    + [np.timedelta64("NaT", unit) for unit in m8_units]
)

# Positive and negative infinities as Python floats, Python complex numbers
# and NumPy constants.
inf_vals = [
    float("inf"),
    float("-inf"),
    complex("inf"),
    complex("-inf"),
    np.inf,
    np.NINF,
]

int_na_vals = [
class TestTypeInference(object):
    """Tests for lib.infer_dtype and the related lib.is_*_array,
    lib.infer_datetimelike_array and lib.to_object_array helpers."""

    # Dummy class used for testing with Python objects
    class Dummy():
        pass

    def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype):
        # see pandas/conftest.py
        inferred_dtype, values = any_skipna_inferred_dtype

        # make sure the inferred dtype of the fixture is as requested
        assert inferred_dtype == lib.infer_dtype(values, skipna=True)

    @pytest.mark.parametrize('skipna', [True, False])
    def test_length_zero(self, skipna):
        """Empty inputs: a typed empty array keeps its dtype label, an
        untyped empty input is 'empty'."""
        result = lib.infer_dtype(np.array([], dtype='i4'), skipna=skipna)
        assert result == 'integer'

        result = lib.infer_dtype([], skipna=skipna)
        assert result == 'empty'

        # GH 18004
        arr = np.array(
            [np.array([], dtype=object), np.array([], dtype=object)])
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'empty'

    def test_integers(self):
        """Integer scalars of mixed widths, with and without a string."""
        arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'integer'

        arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'mixed-integer'

        arr = np.array([1, 2, 3, 4, 5], dtype='i4')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'integer'

    def test_deprecation(self):
        # GH 24050
        arr = np.array([1, 2, 3], dtype=object)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = lib.infer_dtype(arr)  # default: skipna=None -> warn
            assert result == 'integer'

    def test_bools(self):
        """Boolean scalars; a NaN only breaks the inference when
        skipna=False."""
        arr = np.array([True, False, True, True, True], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'boolean'

        arr = np.array([np.bool_(True), np.bool_(False)], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'boolean'

        arr = np.array([True, False, True, 'foo'], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'mixed'

        arr = np.array([True, False, True], dtype=bool)
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'boolean'

        arr = np.array([True, np.nan, False], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'boolean'

        result = lib.infer_dtype(arr, skipna=False)
        assert result == 'mixed'

    def test_floats(self):
        """Float scalars of mixed widths and float-typed arrays."""
        arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'floating'

        arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'],
                       dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'mixed-integer'

        arr = np.array([1, 2, 3, 4, 5], dtype='f4')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'floating'

        arr = np.array([1, 2, 3, 4, 5], dtype='f8')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'floating'

    def test_decimals(self):
        # GH15690
        arr = np.array([Decimal(1), Decimal(2), Decimal(3)])
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'decimal'

        arr = np.array([1.0, 2.0, Decimal(3)])
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'mixed'

        arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)])
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'decimal'

        arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'decimal'

    # complex is compatible with nan, so skipna has no effect
    @pytest.mark.parametrize('skipna', [True, False])
    def test_complex(self, skipna):
        # gets cast to complex on array construction
        arr = np.array([1.0, 2.0, 1 + 1j])
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'complex'

        arr = np.array([1.0, 2.0, 1 + 1j], dtype='O')
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'mixed'

        # gets cast to complex on array construction
        arr = np.array([1, np.nan, 1 + 1j])
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'complex'

        arr = np.array([1.0, np.nan, 1 + 1j], dtype='O')
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'mixed'

        # complex with nans stays complex
        arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype='O')
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'complex'

        # test smaller complex dtype; will pass through _try_infer_map fastpath
        arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype=np.complex64)
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'complex'

    def test_string(self):
        # Placeholder: no assertions yet.
        pass

    def test_unicode(self):
        """Unicode strings with a NaN: 'mixed' unless the NaN is skipped."""
        arr = [u'a', np.nan, u'c']
        result = lib.infer_dtype(arr, skipna=False)
        assert result == 'mixed'

        arr = [u'a', np.nan, u'c']
        result = lib.infer_dtype(arr, skipna=True)
        expected = 'unicode' if PY2 else 'string'
        assert result == expected

    @pytest.mark.parametrize('dtype, missing, skipna, expected',
                             [(float, np.nan, False, 'floating'),
                              (float, np.nan, True, 'floating'),
                              (object, np.nan, False, 'floating'),
                              (object, np.nan, True, 'empty'),
                              (object, None, False, 'mixed'),
                              (object, None, True, 'empty')])
    @pytest.mark.parametrize('box', [pd.Series, np.array])
    def test_object_empty(self, box, missing, dtype, skipna, expected):
        # GH 23421
        arr = box([missing, missing], dtype=dtype)

        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == expected

    def test_datetime(self):
        """An Index built from datetime objects reports 'datetime64'."""
        dates = [datetime(2012, 1, x) for x in range(1, 20)]
        index = Index(dates)
        assert index.inferred_type == 'datetime64'

    def test_infer_dtype_datetime(self):
        """lib.infer_dtype on arrays of datetime-like scalars."""
        arr = np.array([Timestamp('2011-01-01'),
                        Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        arr = np.array(
            [np.datetime64('2011-01-01'), np.datetime64('2011-01-01')],
            dtype=object)
        assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

        arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)])
        assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        # starts with nan
        for n in [pd.NaT, np.nan]:
            arr = np.array([n, pd.Timestamp('2011-01-02')])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

            arr = np.array([n, np.datetime64('2011-01-02')])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

            arr = np.array([n, datetime(2011, 1, 1)])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

            arr = np.array([n, pd.Timestamp('2011-01-02'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

            arr = np.array([n, np.datetime64('2011-01-02'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

            arr = np.array([n, datetime(2011, 1, 1), n])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        # different type of nat
        arr = np.array([np.timedelta64('nat'),
                        np.datetime64('2011-01-02')], dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([np.datetime64('2011-01-02'),
                        np.timedelta64('nat')], dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        # mixed datetime
        arr = np.array([datetime(2011, 1, 1),
                        pd.Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        # should be datetime?
        arr = np.array(
            [np.datetime64('2011-01-01'), pd.Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

        arr = np.array(
            [pd.Timestamp('2011-01-02'), np.datetime64('2011-01-01')])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

        arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer'

        arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

        arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

    def test_infer_dtype_timedelta(self):
        """lib.infer_dtype on arrays of timedelta-like scalars."""
        arr = np.array([pd.Timedelta('1 days'),
                        pd.Timedelta('2 days')])
        assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

        arr = np.array([np.timedelta64(1, 'D'),
                        np.timedelta64(2, 'D')], dtype=object)
        assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

        arr = np.array([timedelta(1), timedelta(2)])
        assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

        # starts with nan
        for n in [pd.NaT, np.nan]:
            arr = np.array([n, Timedelta('1 days')])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

            arr = np.array([n, np.timedelta64(1, 'D')])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

            arr = np.array([n, timedelta(1)])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

            arr = np.array([n, pd.Timedelta('1 days'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

            arr = np.array([n, np.timedelta64(1, 'D'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

            arr = np.array([n, timedelta(1), n])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

        # different type of nat
        arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

    def test_infer_dtype_period(self):
        # GH 13664
        arr = np.array(
            [pd.Period('2011-01', freq='D'), pd.Period('2011-02', freq='D')])
        assert lib.infer_dtype(arr, skipna=True) == 'period'

        arr = np.array(
            [pd.Period('2011-01', freq='D'), pd.Period('2011-02', freq='M')])
        assert lib.infer_dtype(arr, skipna=True) == 'period'

        # starts with nan
        for n in [pd.NaT, np.nan]:
            arr = np.array([n, pd.Period('2011-01', freq='D')])
            assert lib.infer_dtype(arr, skipna=True) == 'period'

            arr = np.array([n, pd.Period('2011-01', freq='D'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'period'

        # different type of nat
        arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

    @pytest.mark.parametrize(
        "data",
        [[datetime(2017, 6, 12, 19, 30), datetime(2017, 3, 11, 1, 15)],
         [Timestamp("20170612"), Timestamp("20170311")],
         [
             Timestamp("20170612", tz='US/Eastern'),
             Timestamp("20170311", tz='US/Eastern')
         ],
         [date(2017, 6, 12), Timestamp("20170311", tz='US/Eastern')],
         [np.datetime64("2017-06-12"), np.datetime64("2017-03-11")],
         [np.datetime64("2017-06-12"), datetime(2017, 3, 11, 1, 15)]])
    def test_infer_datetimelike_array_datetime(self, data):
        """Pairs of datetime-like values are labelled 'datetime'."""
        assert lib.infer_datetimelike_array(data) == "datetime"

    @pytest.mark.parametrize("data", [[
        timedelta(2017, 6, 12), timedelta(2017, 3, 11)
    ], [timedelta(2017, 6, 12), date(2017, 3, 11)
        ], [np.timedelta64(2017, "D"), np.timedelta64(6, "s")
            ], [np.timedelta64(2017, "D"), timedelta(2017, 3, 11)]])
    def test_infer_datetimelike_array_timedelta(self, data):
        """Pairs starting with a timedelta-like value are labelled
        'timedelta'."""
        assert lib.infer_datetimelike_array(data) == "timedelta"

    def test_infer_datetimelike_array_date(self):
        """A list of datetime.date objects is labelled 'date'."""
        arr = [date(2017, 6, 12), date(2017, 3, 11)]
        assert lib.infer_datetimelike_array(arr) == "date"

    @pytest.mark.parametrize(
        "data",
        [["2017-06-12", "2017-03-11"], [20170612, 20170311],
         [20170612.5, 20170311.8], [Dummy(), Dummy()],
         [Timestamp("20170612"), Timestamp("20170311", tz='US/Eastern')],
         [Timestamp("20170612"), 20170311],
         [timedelta(2017, 6, 12), Timestamp("20170311", tz='US/Eastern')]])
    def test_infer_datetimelike_array_mixed(self, data):
        """Non-datetime-like or inconsistent pairs are labelled 'mixed'."""
        assert lib.infer_datetimelike_array(data) == "mixed"

    @pytest.mark.parametrize(
        "first, expected",
        [[[None], "mixed"], [[np.nan], "mixed"], [[pd.NaT], "nat"],
         [[datetime(2017, 6, 12, 19, 30), pd.NaT], "datetime"],
         [[np.datetime64("2017-06-12"), pd.NaT], "datetime"],
         [[date(2017, 6, 12), pd.NaT], "date"],
         [[timedelta(2017, 6, 12), pd.NaT], "timedelta"],
         [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"]])
    @pytest.mark.parametrize("second", [None, np.nan])
    def test_infer_datetimelike_array_nan_nat_like(self, first, second,
                                                   expected):
        # NOTE: appends to the parametrized list object in place.
        first.append(second)
        assert lib.infer_datetimelike_array(first) == expected

    def test_infer_dtype_all_nan_nat_like(self):
        """lib.infer_dtype on arrays that hold only missing-value markers."""
        arr = np.array([np.nan, np.nan])
        assert lib.infer_dtype(arr, skipna=True) == 'floating'

        # a nan and None mix results in mixed
        arr = np.array([np.nan, np.nan, None])
        assert lib.infer_dtype(arr, skipna=True) == 'empty'
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([None, np.nan, np.nan])
        assert lib.infer_dtype(arr, skipna=True) == 'empty'
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        # pd.NaT
        arr = np.array([pd.NaT])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([pd.NaT, np.nan])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([np.nan, pd.NaT])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([np.nan, pd.NaT, np.nan])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([None, pd.NaT, None])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        # np.datetime64(nat)
        arr = np.array([np.datetime64('nat')])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

        for n in [np.nan, pd.NaT, None]:
            arr = np.array([n, np.datetime64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

            arr = np.array([pd.NaT, n, np.datetime64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

        arr = np.array([np.timedelta64('nat')], dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

        for n in [np.nan, pd.NaT, None]:
            arr = np.array([n, np.timedelta64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

            arr = np.array([pd.NaT, n, np.timedelta64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

        # datetime / timedelta mixed
        arr = np.array(
            [pd.NaT, np.datetime64('nat'), np.timedelta64('nat'), np.nan])
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([np.timedelta64('nat'), np.datetime64('nat')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

    def test_is_datetimelike_array_all_nan_nat_like(self):
        """The lib.is_*_array predicates on arrays made only of NaN/NaT
        values, plus the single-timezone check."""
        arr = np.array([np.nan, pd.NaT, np.datetime64('nat')])
        assert lib.is_datetime_array(arr)
        assert lib.is_datetime64_array(arr)
        assert not lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')])
        assert not lib.is_datetime_array(arr)
        assert not lib.is_datetime64_array(arr)
        assert lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array(
            [np.nan, pd.NaT, np.datetime64('nat'), np.timedelta64('nat')])
        assert not lib.is_datetime_array(arr)
        assert not lib.is_datetime64_array(arr)
        assert not lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array([np.nan, pd.NaT])
        assert lib.is_datetime_array(arr)
        assert lib.is_datetime64_array(arr)
        assert lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array([np.nan, np.nan], dtype=object)
        assert not lib.is_datetime_array(arr)
        assert not lib.is_datetime64_array(arr)
        assert not lib.is_timedelta_or_timedelta64_array(arr)

        assert lib.is_datetime_with_singletz_array(
            np.array([
                pd.Timestamp('20130101', tz='US/Eastern'),
                pd.Timestamp('20130102', tz='US/Eastern')
            ], dtype=object))
        assert not lib.is_datetime_with_singletz_array(
            np.array([
                pd.Timestamp('20130101', tz='US/Eastern'),
                pd.Timestamp('20130102', tz='CET')
            ], dtype=object))

    @pytest.mark.parametrize("func", [
        'is_datetime_array', 'is_datetime64_array', 'is_bool_array',
        'is_timedelta_or_timedelta64_array', 'is_date_array', 'is_time_array',
        'is_interval_array', 'is_period_array'
    ])
    def test_other_dtypes_for_array(self, func):
        """Every listed predicate rejects plain string and integer arrays."""
        func = getattr(lib, func)
        arr = np.array(['foo', 'bar'])
        assert not func(arr)

        arr = np.array([1, 2])
        assert not func(arr)

    def test_date(self):
        """datetime.date values: 'date', or 'mixed' when a NaN is present
        and not skipped."""
        dates = [date(2012, 1, day) for day in range(1, 20)]
        index = Index(dates)
        assert index.inferred_type == 'date'

        dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan]
        result = lib.infer_dtype(dates, skipna=False)
        assert result == 'mixed'

        result = lib.infer_dtype(dates, skipna=True)
        assert result == 'date'

    def test_is_numeric_array(self):
        """lib.is_float_array / lib.is_integer_array on numeric arrays."""
        assert lib.is_float_array(np.array([1, 2.0]))
        assert lib.is_float_array(np.array([1, 2.0, np.nan]))
        assert not lib.is_float_array(np.array([1, 2]))

        assert lib.is_integer_array(np.array([1, 2]))
        assert not lib.is_integer_array(np.array([1, 2.0]))

    def test_is_string_array(self):
        """lib.is_string_array, including the effect of skipna on a NaN."""
        assert lib.is_string_array(np.array(['foo', 'bar']))
        assert not lib.is_string_array(
            np.array(['foo', 'bar', np.nan], dtype=object), skipna=False)
        assert lib.is_string_array(
            np.array(['foo', 'bar', np.nan], dtype=object), skipna=True)
        assert not lib.is_string_array(np.array([1, 2]))

    def test_to_object_array_tuples(self):
        """Smoke test: lib.to_object_array_tuples accepts plain tuples and
        namedtuples without raising (results are not inspected)."""
        r = (5, 6)
        values = [r]
        result = lib.to_object_array_tuples(values)

        try:
            # make sure record array works
            from collections import namedtuple
            record = namedtuple('record', 'x y')
            r = record(5, 6)
            values = [r]
            result = lib.to_object_array_tuples(values)  # noqa
        except ImportError:
            pass

    def test_object(self):
        # GH 7431
        # cannot infer more than this as only a single element
        arr = np.array([None], dtype='O')
        result = lib.infer_dtype(arr, skipna=False)
        assert result == 'mixed'
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'empty'

    def test_to_object_array_width(self):
        # see gh-13320
        rows = [[1, 2, 3], [4, 5, 6]]

        expected = np.array(rows, dtype=object)
        out = lib.to_object_array(rows)
        tm.assert_numpy_array_equal(out, expected)

        # a min_width smaller than the rows leaves the result unchanged
        expected = np.array(rows, dtype=object)
        out = lib.to_object_array(rows, min_width=1)
        tm.assert_numpy_array_equal(out, expected)

        # a larger min_width pads each row with None
        expected = np.array([[1, 2, 3, None, None],
                             [4, 5, 6, None, None]], dtype=object)
        out = lib.to_object_array(rows, min_width=5)
        tm.assert_numpy_array_equal(out, expected)

    def test_is_period(self):
        """lib.is_period is true only for a pd.Period scalar."""
        assert lib.is_period(pd.Period('2011-01', freq='M'))
        assert not lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))
        assert not lib.is_period(pd.Timestamp('2011-01'))
        assert not lib.is_period(1)
        assert not lib.is_period(np.nan)

    def test_categorical(self):
        # GH 8974
        from pandas import Categorical, Series
        arr = Categorical(list('abc'))
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'categorical'

        result = lib.infer_dtype(Series(arr), skipna=True)
        assert result == 'categorical'

        arr = Categorical(list('abc'), categories=['cegfab'], ordered=True)
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'categorical'

        result = lib.infer_dtype(Series(arr), skipna=True)
        assert result == 'categorical'
def dcr_coin(self):
    """
    Pull the Coinmetrics v2 Community API data for DCR and enrich it.

    Steps performed:
        - adds coin age metric (days since the first row's date)
        - adds coin age metric (supply) = Supply / 21M
        - overwrites PriceUSD / PriceBTC / CapMrktCurUSD / notes with the
          early price data (founders' $0.49 price and Bittrex) stored in a
          local CSV, which is not included in Coinmetrics

    The CSV is expected to provide 'date', 'PriceUSD', 'PriceBTC' and
    'notes' columns (those are the only ones read here).

    OUTPUT DATAFRAME COLUMNS:
        'date', 'blk','age_days','age_sply','btc_blk_est',
        'DailyIssuedNtv', 'DailyIssuedUSD', 'inf_pct_ann', 'S2F',
        'AdrActCnt', 'BlkCnt', 'BlkSizeByte', 'BlkSizeMeanByte',
        'CapMVRVCur', 'CapMrktCurUSD', 'CapRealUSD', 'DiffMean',
        'FeeMeanNtv','FeeMeanUSD', 'FeeMedNtv', 'FeeMedUSD', 'FeeTotNtv', 'FeeTotUSD',
        'PriceBTC', 'PriceUSD', 'PriceRealUSD', 'SplyCur',
        'TxCnt', 'TxTfrCnt', 'TxTfrValAdjNtv', 'TxTfrValAdjUSD',
        'TxTfrValMeanNtv', 'TxTfrValMeanUSD', 'TxTfrValMedNtv',
        'TxTfrValMedUSD', 'TxTfrValNtv', 'TxTfrValUSD',
        'notes'
    """
    df = Coinmetrics_api('dcr', "2016-02-08", today).convert_to_pd()

    # Coin age since launch in days (launch = date of the row labelled 0)
    df['age_days'] = (df['date'] - df.loc[0, 'date']) / np.timedelta64(1, 'D')
    # Coin age since launch in terms of supply
    df['age_sply'] = df['SplyCur'] / 21e6

    print('...adding PriceUSD and CapMrktCurUSD for $0.49 (founders, 8/9-Feb-2016)')
    print('and Bittrex (10-02-2016 to 16-05-2016)...')

    # Early price data:
    #   founders $0.49 for 8/9 Feb 2016
    #   Bittrex up to 16-May-2016 (saved in a relative-path csv)
    # Forward slashes keep the relative path usable on POSIX as well as Windows.
    df_early = pd.read_csv(
        "dcronchain/resources/data/dcr_pricedata_2016-02-08_2016-05-16.csv"
    )
    # Convert to tz-aware (UTC) datetimes so they compare against df['date']
    df_early['date'] = pd.to_datetime(df_early['date'], utc=True)

    df['notes'] = ''  # free-text column, filled from the early data below

    # Swap in the early price data one CSV row at a time.  Scalars are
    # assigned (rather than Series) so the result does not depend on df and
    # df_early happening to share the same row labels.
    for early in df_early.itertuples(index=False):
        on_day = df['date'] == early.date
        df.loc[on_day, 'PriceUSD'] = float(early.PriceUSD)
        df.loc[on_day, 'PriceBTC'] = float(early.PriceBTC)
        # Market cap is recomputed from the price that was just swapped in
        df.loc[on_day, 'CapMrktCurUSD'] = (
            df.loc[on_day, 'PriceUSD'] * df.loc[on_day, 'SplyCur']
        )
        df.loc[on_day, 'notes'] = early.notes

    # Restructure final dataset
    df = df[[
        'date', 'blk', 'age_days', 'age_sply', 'btc_blk_est',
        'DailyIssuedNtv', 'DailyIssuedUSD', 'inf_pct_ann', 'S2F',
        'AdrActCnt', 'BlkCnt', 'BlkSizeByte', 'BlkSizeMeanByte',
        'CapMVRVCur', 'CapMrktCurUSD', 'CapRealUSD', 'DiffMean',
        'FeeMeanNtv', 'FeeMeanUSD', 'FeeMedNtv', 'FeeMedUSD',
        'FeeTotNtv', 'FeeTotUSD',
        'PriceBTC', 'PriceUSD', 'PriceRealUSD', 'SplyCur',
        'TxCnt', 'TxTfrCnt', 'TxTfrValAdjNtv', 'TxTfrValAdjUSD',
        'TxTfrValMeanNtv', 'TxTfrValMeanUSD', 'TxTfrValMedNtv',
        'TxTfrValMedUSD', 'TxTfrValNtv', 'TxTfrValUSD',
        'notes'
    ]]
    #Reformat datetime
    #df['date'] = df['date'].dt.strftime('%d-%m-%y')
    return df
from sklearn.metrics import confusion_matrix

# Load the campaign responses and the raw transaction log.
df_response = pd.read_csv('/content/drive/MyDrive/CRM/Retail_Data_Response.csv')
df_transactions = pd.read_csv('/content/drive/MyDrive/CRM/Retail_Data_Transactions.csv', parse_dates=['trans_date'])

df_response.head()
df_transactions.head()

print(df_transactions['trans_date'].min())
print(df_transactions['trans_date'].max())

# Recency is measured in days back from the campaign date.
campaign_date = dt.datetime(2015, 3, 17)
df_transactions['recent'] = campaign_date - df_transactions['trans_date']
# Dividing by one day yields float days.  (A discarded
# `.astype('timedelta64[D]')` call used to sit here; it had no effect and
# raises on pandas >= 2.0, so it was dropped.)
df_transactions['recent'] = df_transactions['recent'] / np.timedelta64(1, 'D')
df_transactions.head()

## create data set with RFM variables
df_rfm = df_transactions.groupby('customer_id').agg({
    'recent': 'min',         # Recency: days since the latest purchase
    'customer_id': len,      # Frequency: number of transactions
    'tran_amount': 'sum',    # Monetary Value: total spend
})

df_rfm.rename(columns={'recent': 'recency',
                       'customer_id': 'frequency',
                       'tran_amount': 'monetary_value'}, inplace=True)
#df_rfm['ticket_size'] = df_rfm['monetary_value'] / df_rfm['frequency']
def test_short_format_converters(self):
    """Check ``ct`` on short-form timedelta strings (ns/us/ms/s/d).

    Every accepted string must convert to the matching nanosecond-resolution
    ``np.timedelta64``; strings with an unknown unit or no number must raise
    ``ValueError``.
    """
    def to_ns(delta):
        return delta.astype('m8[ns]')

    # (input string, expected value in nanoseconds)
    nanosecond_cases = [
        ('10', 10),
        ('10ns', 10),
        ('100', 100),
        ('100ns', 100),
        ('1000', 1000),
        ('1000ns', 1000),
        ('1000NS', 1000),
        ('10us', 10000),
        ('100us', 100000),
        ('1000us', 1000000),
        ('1000Us', 1000000),
        ('1000uS', 1000000),
        ('1ms', 1000000),
        ('10ms', 10000000),
        ('100ms', 100000000),
        ('1000ms', 1000000000),
        ('-1s', -1000000000),
        ('1s', 1000000000),
        ('10s', 10000000000),
        ('100s', 100000000000),
        ('1000s', 1000000000000),
    ]
    for text, nanos in nanosecond_cases:
        self.assertEqual(ct(text), np.timedelta64(nanos, 'ns'))

    # (input string, expected value in days); the last two cover
    # surrounding/embedded whitespace
    day_cases = [
        ('1d', 1),
        ('-1d', -1),
        ('1D', 1),
        ('10D', 10),
        ('100D', 100),
        ('1000D', 1000),
        ('10000D', 10000),
        (' 10000D ', 10000),
        (' - 10000D ', -10000),
    ]
    for text, days in day_cases:
        self.assertEqual(ct(text), to_ns(np.timedelta64(days, 'D')))

    # invalid
    for bad_text in ('1foo', 'foo'):
        self.assertRaises(ValueError, ct, bad_text)
with pytest.raises(TypeError, match=r"Expected integer or floating point"): da_time.interpolate_na("t", max_gap="1H", use_coordinate=False) with pytest.raises(ValueError, match=r"Could not convert 'huh' to timedelta64"): da_time.interpolate_na("t", max_gap="huh") @requires_bottleneck @pytest.mark.parametrize( "time_range_func", [pd.date_range, pytest.param(xr.cftime_range, marks=requires_cftime)], ) @pytest.mark.parametrize("transform", [lambda x: x, lambda x: x.to_dataset(name="a")]) @pytest.mark.parametrize( "max_gap", ["3H", np.timedelta64(3, "h"), pd.to_timedelta("3H")] ) def test_interpolate_na_max_gap_time_specifier( da_time, max_gap, transform, time_range_func ): da_time["t"] = time_range_func("2001-01-01", freq="H", periods=11) expected = transform( da_time.copy(data=[np.nan, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan, 10]) ) actual = transform(da_time).interpolate_na("t", max_gap=max_gap) assert_allclose(actual, expected) @requires_bottleneck @pytest.mark.parametrize( "coords",
""" import pandas as pd import numpy as np import matplotlib.pyplot as plt '''data exploring''' inputfile = "F:\spyder\datamining\chapter10\demo\data\water_heater.xls" data = pd.read_excel(inputfile) data[u'发生时间'] = pd.to_datetime(data[u'发生时间'], format='%Y%m%d%H%M%S') data = data[data[u'水流量'] > 0] '''Pandas计算出的时间间隔数据的类型是np.timedelta64, 不是Python标准库中的timedelta类型,因此没有total_minutes()函数, 需要除以np.timedelta64的1分钟来计算间隔了多少分钟。''' data[u'用水停顿时间间隔'] = data[u'发生时间'].diff() / np.timedelta64(1, 'm') #计算间隔了多少分钟 data = data.fillna(0) '''step1: check maximum, minimum of each column''' data_explore = data.describe().T print(data_explore) data_explore['null'] = len(data) - data_explore['count'] # numbers of nulls explore = data_explore[['min', 'max', 'null']] explore.columns = [u'最小值', u'最大值', u'空值数'] print(explore) '''step2: Discretization and surface division''' Ti = list(data[u'用水停顿时间间隔']) #将要面元化的数据转化为一维列表 timegaplist = [ 0.0, 0.1, 0.2, 0.3, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2100 ] #确定划分区间 cats = pd.cut(Ti, timegaplist, right=False) #包括区间左端,类似于[0, 0.1),(默认则包含又端点) x = pd.value_counts(cats)
df_inicial = df_inicial.assign(dias=pd.to_datetime(df_inicial['date']) - data_base) df_inicial = df_inicial.assign( vacinados=pd.to_numeric(df_inicial['people_fully_vaccinated'])) # primeiro dia de ocorrencia de imunização completa print('A primeira imunização completa no Brasil ocorreu em:', data_base) # gerando a coluna com dias de vacinação df_inicial = df_inicial.assign(dias=pd.to_datetime(df_inicial['date']) - data_base) df_inicial = df_inicial.assign( vacinados=pd.to_numeric(df_inicial['people_fully_vaccinated'])) # convertendo tipos de dados df_inicial['dias'] = df_inicial['dias'] / np.timedelta64(1, 'D') df_inicial['dias'] = df_inicial['dias'].astype(int) df_inicial['vacinados'] = df_inicial['vacinados'].astype(int) ## Utilizando regressão linear para previsão dos dados. lin_reg = LinearRegression() x = pd.DataFrame(df_inicial['dias']) y = pd.DataFrame(df_inicial['vacinados']) # transformando colunas do dataframe em array numpy x = x.iloc[:, 0].values.reshape(-1, 1) y = y.iloc[:, 0].values.reshape(-1, 1) # calculando o score do modelo utilizado reg = LinearRegression().fit(x, y) # treinando modelo com os dados historicos print('O valor de score do modelo é = ',
class TestNumericArraylikeArithmeticWithDatetimeLike:
    """Arithmetic between numeric array-likes and timedelta-/datetime-like operands.

    `lefts` is a module-level collection and `numeric_idx`, `box` and
    `three_days` are pytest fixtures; all are defined outside this class.
    """

    # TODO: also check name retention
    @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series])
    @pytest.mark.parametrize(
        "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype)
    )
    def test_mul_td64arr(self, left, box_cls):
        """Numeric array-like * timedelta64 array gives timedeltas, in either order."""
        # GH#22390
        right = np.array([1, 2, 3], dtype="m8[s]")
        right = box_cls(right)

        # NOTE(review): the expected values imply every `left` holds
        # 10, 20, 30 -- confirm against the definition of `lefts`.
        expected = pd.TimedeltaIndex(["10s", "40s", "90s"])
        if isinstance(left, pd.Series) or box_cls is pd.Series:
            # a Series on either side makes the result a Series
            expected = pd.Series(expected)

        result = left * right
        tm.assert_equal(result, expected)

        result = right * left
        tm.assert_equal(result, expected)

    # TODO: also check name retention
    @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series])
    @pytest.mark.parametrize(
        "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype)
    )
    def test_div_td64arr(self, left, box_cls):
        """timedelta64 array / numeric works; numeric / timedelta64 raises TypeError."""
        # GH#22390
        right = np.array([10, 40, 90], dtype="m8[s]")
        right = box_cls(right)

        expected = pd.TimedeltaIndex(["1s", "2s", "3s"])
        if isinstance(left, pd.Series) or box_cls is pd.Series:
            expected = pd.Series(expected)

        result = right / left
        tm.assert_equal(result, expected)

        result = right // left
        tm.assert_equal(result, expected)

        # dividing a number by a timedelta is not defined
        msg = "Cannot divide"
        with pytest.raises(TypeError, match=msg):
            left / right

        with pytest.raises(TypeError, match=msg):
            left // right

    # TODO: de-duplicate with test_numeric_arr_mul_tdscalar
    def test_ops_series(self):
        """Timedelta scalar * integer Series is commutative and yields timedeltas."""
        # regression test for GH#8813
        td = Timedelta("1 day")
        other = pd.Series([1, 2])
        expected = pd.Series(pd.to_timedelta(["1 day", "2 days"]))
        tm.assert_series_equal(expected, td * other)
        tm.assert_series_equal(expected, other * td)

    # TODO: also test non-nanosecond timedelta64 and Tick objects;
    #  see test_numeric_arr_rdiv_tdscalar for note on these failing
    @pytest.mark.parametrize(
        "scalar_td",
        [
            Timedelta(days=1),
            Timedelta(days=1).to_timedelta64(),
            Timedelta(days=1).to_pytimedelta(),
        ],
        ids=lambda x: type(x).__name__,
    )
    def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box):
        """Boxed numeric index * one-day scalar (three scalar flavours), both orders."""
        # GH#19333
        index = numeric_idx

        # NOTE(review): expects `numeric_idx` to hold 0..4 -- confirm in the fixture.
        expected = pd.TimedeltaIndex([pd.Timedelta(days=n) for n in range(5)])

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = index * scalar_td
        tm.assert_equal(result, expected)

        commute = scalar_td * index
        tm.assert_equal(commute, expected)

    @pytest.mark.parametrize(
        "scalar_td",
        [
            Timedelta(days=1),
            Timedelta(days=1).to_timedelta64(),
            Timedelta(days=1).to_pytimedelta(),
        ],
        ids=lambda x: type(x).__name__,
    )
    def test_numeric_arr_mul_tdscalar_numexpr_path(self, scalar_td, box):
        """Same multiplication on a 20,000-element int64 array.

        The test name indicates the size is chosen to reach the numexpr code
        path; the threshold itself is not visible here.
        """
        arr = np.arange(2 * 10 ** 4).astype(np.int64)
        obj = tm.box_expected(arr, box, transpose=False)

        # reinterpret the integers as day counts, then convert to nanoseconds
        expected = arr.view("timedelta64[D]").astype("timedelta64[ns]")
        expected = tm.box_expected(expected, box, transpose=False)

        result = obj * scalar_td
        tm.assert_equal(result, expected)

        result = scalar_td * obj
        tm.assert_equal(result, expected)

    def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box):
        """Timedelta scalar / numeric works; numeric / timedelta scalar raises."""
        index = numeric_idx[1:3]

        expected = TimedeltaIndex(["3 Days", "36 Hours"])

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = three_days / index
        tm.assert_equal(result, expected)

        msg = "cannot use operands with types dtype"
        with pytest.raises(TypeError, match=msg):
            index / three_days

    @pytest.mark.parametrize(
        "other",
        [
            pd.Timedelta(hours=31),
            pd.Timedelta(hours=31).to_pytimedelta(),
            pd.Timedelta(hours=31).to_timedelta64(),
            pd.Timedelta(hours=31).to_timedelta64().astype("m8[h]"),
            np.timedelta64("NaT"),
            np.timedelta64("NaT", "D"),
            pd.offsets.Minute(3),
            pd.offsets.Second(0),
        ],
    )
    def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box):
        """Adding/subtracting a timedelta-like and a numeric container raises TypeError."""
        left = tm.box_expected(numeric_idx, box)
        # the message differs by operand type and box, so any alternative is accepted
        msg = (
            "unsupported operand type|"
            "Addition/subtraction of integers and integer-arrays|"
            "Instead of adding/subtracting|"
            "cannot use operands with types dtype|"
            "Concatenation operation is not implemented for NumPy arrays"
        )
        with pytest.raises(TypeError, match=msg):
            left + other
        with pytest.raises(TypeError, match=msg):
            other + left
        with pytest.raises(TypeError, match=msg):
            left - other
        with pytest.raises(TypeError, match=msg):
            other - left

    @pytest.mark.parametrize(
        "other",
        [
            pd.Timestamp.now().to_pydatetime(),
            pd.Timestamp.now(tz="UTC").to_pydatetime(),
            pd.Timestamp.now().to_datetime64(),
            pd.NaT,
        ],
    )
    @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning")
    def test_add_sub_datetimelike_invalid(self, numeric_idx, other, box):
        """Adding/subtracting a datetime-like and a numeric container raises TypeError."""
        # GH#28080 numeric+datetime64 should raise; Timestamp raises
        #  NullFrequencyError instead of TypeError so is excluded.
        left = tm.box_expected(numeric_idx, box)

        msg = (
            "unsupported operand type|"
            "Cannot (add|subtract) NaT (to|from) ndarray|"
            "Addition/subtraction of integers and integer-arrays|"
            "Concatenation operation is not implemented for NumPy arrays"
        )
        with pytest.raises(TypeError, match=msg):
            left + other
        with pytest.raises(TypeError, match=msg):
            other + left
        with pytest.raises(TypeError, match=msg):
            left - other
        with pytest.raises(TypeError, match=msg):
            other - left
def test_construction(self):
    """Exercise ``Timedelta`` construction.

    Covers numeric values with a unit, keyword arguments (including numpy
    scalar types), a range of string spellings and abbreviations, invalid
    inputs, string/value round-tripping, float values, offsets and unicode
    strings.  Expected values are compared as int64 nanosecond counts via
    ``.value`` where a numpy reference is used.
    """

    # 10 days expressed as an int64 nanosecond count
    expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8')
    self.assertEqual(Timedelta(10, unit='d').value, expected)
    self.assertEqual(Timedelta(10.0, unit='d').value, expected)
    self.assertEqual(Timedelta('10 days').value, expected)
    self.assertEqual(Timedelta(days=10).value, expected)
    self.assertEqual(Timedelta(days=10.0).value, expected)

    # 10 days + 10 seconds, spelled four different ways
    expected += np.timedelta64(10, 's').astype('m8[ns]').view('i8')
    self.assertEqual(Timedelta('10 days 00:00:10').value, expected)
    self.assertEqual(Timedelta(days=10, seconds=10).value, expected)
    self.assertEqual(
        Timedelta(days=10, milliseconds=10 * 1000).value, expected)
    self.assertEqual(
        Timedelta(days=10, microseconds=10 * 1000 * 1000).value, expected)

    # test construction with np dtypes
    # GH 8757
    # maps each Timedelta keyword to the numpy unit code it must equal
    timedelta_kwargs = {'days': 'D', 'seconds': 's', 'microseconds': 'us',
                        'milliseconds': 'ms', 'minutes': 'm', 'hours': 'h',
                        'weeks': 'W'}
    npdtypes = [np.int64, np.int32, np.int16, np.float64, np.float32,
                np.float16]
    for npdtype in npdtypes:
        for pykwarg, npkwarg in timedelta_kwargs.items():
            expected = np.timedelta64(1, npkwarg).astype('m8[ns]').view('i8')
            self.assertEqual(
                Timedelta(**{pykwarg: npdtype(1)}).value, expected)

    # rounding cases
    self.assertEqual(Timedelta(82739999850000).value, 82739999850000)
    self.assertTrue('0 days 22:58:59.999850' in str(Timedelta(
        82739999850000)))
    self.assertEqual(Timedelta(123072001000000).value, 123072001000000)
    self.assertTrue('1 days 10:11:12.001' in str(Timedelta(
        123072001000000)))

    # string conversion with/without leading zero
    # GH 9570
    self.assertEqual(Timedelta('0:00:00'), timedelta(hours=0))
    self.assertEqual(Timedelta('00:00:00'), timedelta(hours=0))
    self.assertEqual(Timedelta('-1:00:00'), -timedelta(hours=1))
    self.assertEqual(Timedelta('-01:00:00'), -timedelta(hours=1))

    # more strings & abbrevs
    # GH 8190
    self.assertEqual(Timedelta('1 h'), timedelta(hours=1))
    self.assertEqual(Timedelta('1 hour'), timedelta(hours=1))
    self.assertEqual(Timedelta('1 hr'), timedelta(hours=1))
    self.assertEqual(Timedelta('1 hours'), timedelta(hours=1))
    self.assertEqual(Timedelta('-1 hours'), -timedelta(hours=1))
    self.assertEqual(Timedelta('1 m'), timedelta(minutes=1))
    self.assertEqual(Timedelta('1.5 m'), timedelta(seconds=90))
    self.assertEqual(Timedelta('1 minute'), timedelta(minutes=1))
    self.assertEqual(Timedelta('1 minutes'), timedelta(minutes=1))
    self.assertEqual(Timedelta('1 s'), timedelta(seconds=1))
    self.assertEqual(Timedelta('1 second'), timedelta(seconds=1))
    self.assertEqual(Timedelta('1 seconds'), timedelta(seconds=1))
    self.assertEqual(Timedelta('1 ms'), timedelta(milliseconds=1))
    self.assertEqual(Timedelta('1 milli'), timedelta(milliseconds=1))
    self.assertEqual(Timedelta('1 millisecond'), timedelta(milliseconds=1))
    self.assertEqual(Timedelta('1 us'), timedelta(microseconds=1))
    self.assertEqual(Timedelta('1 micros'), timedelta(microseconds=1))
    self.assertEqual(Timedelta('1 microsecond'), timedelta(microseconds=1))
    # sub-microsecond values cannot be expressed with datetime.timedelta,
    # so the reference is another Timedelta
    self.assertEqual(Timedelta('1.5 microsecond'),
                     Timedelta('00:00:00.000001500'))
    self.assertEqual(Timedelta('1 ns'), Timedelta('00:00:00.000000001'))
    self.assertEqual(Timedelta('1 nano'), Timedelta('00:00:00.000000001'))
    self.assertEqual(Timedelta('1 nanosecond'),
                     Timedelta('00:00:00.000000001'))

    # combos
    self.assertEqual(Timedelta('10 days 1 hour'),
                     timedelta(days=10, hours=1))
    self.assertEqual(Timedelta('10 days 1 h'), timedelta(days=10, hours=1))
    self.assertEqual(Timedelta('10 days 1 h 1m 1s'), timedelta(
        days=10, hours=1, minutes=1, seconds=1))
    # a leading minus sign negates the whole duration
    self.assertEqual(Timedelta('-10 days 1 h 1m 1s'), -
                     timedelta(days=10, hours=1, minutes=1, seconds=1))
    # (the assertion above is repeated verbatim here)
    self.assertEqual(Timedelta('-10 days 1 h 1m 1s'), -
                     timedelta(days=10, hours=1, minutes=1, seconds=1))
    self.assertEqual(Timedelta('-10 days 1 h 1m 1s 3us'), -
                     timedelta(days=10, hours=1, minutes=1, seconds=1,
                               microseconds=3))
    self.assertEqual(Timedelta('-10 days 1 h 1.5m 1s 3us'), -
                     timedelta(days=10, hours=1, minutes=1, seconds=31,
                               microseconds=3))

    # currently invalid as it has a - on the hhmmdd part (only allowed on
    # the days)
    self.assertRaises(ValueError,
                      lambda: Timedelta('-10 days -1 h 1.5m 1s 3us'))

    # only leading neg signs are allowed
    self.assertRaises(ValueError,
                      lambda: Timedelta('10 days -1 h 1.5m 1s 3us'))

    # no units specified
    self.assertRaises(ValueError, lambda: Timedelta('3.1415'))

    # invalid construction
    tm.assertRaisesRegexp(ValueError, "cannot construct a Timedelta",
                          lambda: Timedelta())
    tm.assertRaisesRegexp(ValueError, "unit abbreviation w/o a number",
                          lambda: Timedelta('foo'))
    tm.assertRaisesRegexp(ValueError,
                          "cannot construct a Timedelta from the passed "
                          "arguments, allowed keywords are ",
                          lambda: Timedelta(day=10))

    # roundtripping both for string and value
    for v in ['1s', '-1s', '1us', '-1us', '1 day', '-1 day',
              '-23:59:59.999999', '-1 days +23:59:59.999999', '-1ns',
              '1ns', '-23:59:59.999999999']:

        td = Timedelta(v)
        self.assertEqual(Timedelta(td.value), td)

        # str does not normally display nanos
        if not td.nanoseconds:
            self.assertEqual(Timedelta(str(td)), td)
        self.assertEqual(Timedelta(td._repr_base(format='all')), td)

    # floats
    # 10.5 s must equal 10 s + 500 ms
    expected = np.timedelta64(
        10, 's').astype('m8[ns]').view('i8') + np.timedelta64(
            500, 'ms').astype('m8[ns]').view('i8')
    self.assertEqual(Timedelta(10.5, unit='s').value, expected)

    # offset
    self.assertEqual(to_timedelta(pd.offsets.Hour(2)),
                     Timedelta('0 days, 02:00:00'))
    self.assertEqual(Timedelta(pd.offsets.Hour(2)),
                     Timedelta('0 days, 02:00:00'))
    self.assertEqual(Timedelta(pd.offsets.Second(2)),
                     Timedelta('0 days, 00:00:02'))

    # unicode
    # GH 11995
    expected = Timedelta('1H')
    result = pd.Timedelta(u'1H')
    self.assertEqual(result, expected)
    self.assertEqual(to_timedelta(pd.offsets.Hour(2)),
                     Timedelta(u'0 days, 02:00:00'))

    self.assertRaises(ValueError, lambda: Timedelta(u'foo bar'))
def test_unique_1d(self):
    """Check ``unique`` on 1-D input for many dtypes and every combination
    of the ``return_index`` / ``return_inverse`` / ``return_counts`` flags,
    followed by a series of ticket-specific regression checks.
    """

    def check_all(a, b, i1, i2, c, dt):
        """Assert unique(a) against b (values), i1 (first-occurrence
        indices), i2 (inverse) and c (counts) for all eight flag
        combinations; ``dt`` is only used in the failure message.
        """
        base_msg = 'check {0} failed for type {1}'

        msg = base_msg.format('values', dt)
        v = unique(a)
        assert_array_equal(v, b, msg)

        # positional flags are (return_index, return_inverse, return_counts)
        msg = base_msg.format('return_index', dt)
        v, j = unique(a, True, False, False)
        assert_array_equal(v, b, msg)
        assert_array_equal(j, i1, msg)

        msg = base_msg.format('return_inverse', dt)
        v, j = unique(a, False, True, False)
        assert_array_equal(v, b, msg)
        assert_array_equal(j, i2, msg)

        msg = base_msg.format('return_counts', dt)
        v, j = unique(a, False, False, True)
        assert_array_equal(v, b, msg)
        assert_array_equal(j, c, msg)

        msg = base_msg.format('return_index and return_inverse', dt)
        v, j1, j2 = unique(a, True, True, False)
        assert_array_equal(v, b, msg)
        assert_array_equal(j1, i1, msg)
        assert_array_equal(j2, i2, msg)

        msg = base_msg.format('return_index and return_counts', dt)
        v, j1, j2 = unique(a, True, False, True)
        assert_array_equal(v, b, msg)
        assert_array_equal(j1, i1, msg)
        assert_array_equal(j2, c, msg)

        msg = base_msg.format('return_inverse and return_counts', dt)
        v, j1, j2 = unique(a, False, True, True)
        assert_array_equal(v, b, msg)
        assert_array_equal(j1, i2, msg)
        assert_array_equal(j2, c, msg)

        msg = base_msg.format(('return_index, return_inverse '
                               'and return_counts'), dt)
        v, j1, j2, j3 = unique(a, True, True, True)
        assert_array_equal(v, b, msg)
        assert_array_equal(j1, i1, msg)
        assert_array_equal(j2, i2, msg)
        assert_array_equal(j3, c, msg)

    # input: a 7-element pattern repeated 10 times
    a = [5, 7, 1, 2, 1, 5, 7]*10
    b = [1, 2, 5, 7]                      # sorted unique values
    i1 = [2, 3, 0, 1]                     # index of first occurrence of each
    i2 = [2, 3, 0, 1, 0, 2, 3]*10         # inverse: position of a[k] within b
    c = np.multiply([2, 1, 2, 2], 10)     # occurrence counts

    # test for numeric arrays
    types = []
    types.extend(np.typecodes['AllInteger'])
    types.extend(np.typecodes['AllFloat'])
    types.append('datetime64[D]')
    types.append('timedelta64[D]')
    for dt in types:
        aa = np.array(a, dt)
        bb = np.array(b, dt)
        check_all(aa, bb, i1, i2, c, dt)

    # test for object arrays
    dt = 'O'
    aa = np.empty(len(a), dt)
    aa[:] = a
    bb = np.empty(len(b), dt)
    bb[:] = b
    check_all(aa, bb, i1, i2, c, dt)

    # test for structured arrays
    dt = [('', 'i'), ('', 'i')]
    aa = np.array(list(zip(a, a)), dt)
    bb = np.array(list(zip(b, b)), dt)
    check_all(aa, bb, i1, i2, c, dt)

    # test for ticket #2799
    aa = [1. + 0.j, 1 - 1.j, 1]
    assert_array_equal(np.unique(aa), [1. - 1.j, 1. + 0.j])

    # test for ticket #4785
    # a list of tuples: expected values/inverse are those of the flattened input
    a = [(1, 2), (1, 2), (2, 3)]
    unq = [1, 2, 3]
    inv = [0, 1, 0, 1, 1, 2]
    a1 = unique(a)
    assert_array_equal(a1, unq)
    a2, a2_inv = unique(a, return_inverse=True)
    assert_array_equal(a2, unq)
    assert_array_equal(a2_inv, inv)

    # test for chararrays with return_inverse (gh-5099)
    a = np.chararray(5)
    a[...] = ''
    a2, a2_inv = np.unique(a, return_inverse=True)
    assert_array_equal(a2_inv, np.zeros(5))

    # test for ticket #9137
    # empty input must still give intp index/inverse arrays
    a = []
    a1_idx = np.unique(a, return_index=True)[1]
    a2_inv = np.unique(a, return_inverse=True)[1]
    a3_idx, a3_inv = np.unique(a, return_index=True,
                               return_inverse=True)[1:]
    assert_equal(a1_idx.dtype, np.intp)
    assert_equal(a2_inv.dtype, np.intp)
    assert_equal(a3_idx.dtype, np.intp)
    assert_equal(a3_inv.dtype, np.intp)

    # test for ticket 2111 - float
    # repeated NaNs are expected to collapse into one entry, sorted last
    a = [2.0, np.nan, 1.0, np.nan]
    ua = [1.0, 2.0, np.nan]
    ua_idx = [2, 0, 1]
    ua_inv = [1, 2, 0, 2]
    ua_cnt = [1, 1, 2]
    assert_equal(np.unique(a), ua)
    assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
    assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
    assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))

    # test for ticket 2111 - complex
    # any value with a NaN component is expected to count as the same NaN
    a = [2.0-1j, np.nan, 1.0+1j, complex(0.0, np.nan), complex(1.0, np.nan)]
    ua = [1.0+1j, 2.0-1j, complex(0.0, np.nan)]
    ua_idx = [2, 0, 3]
    ua_inv = [1, 2, 0, 2, 2]
    ua_cnt = [1, 1, 3]
    assert_equal(np.unique(a), ua)
    assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
    assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
    assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))

    # test for ticket 2111 - datetime64
    nat = np.datetime64('nat')
    a = [np.datetime64('2020-12-26'), nat, np.datetime64('2020-12-24'), nat]
    ua = [np.datetime64('2020-12-24'), np.datetime64('2020-12-26'), nat]
    ua_idx = [2, 0, 1]
    ua_inv = [1, 2, 0, 2]
    ua_cnt = [1, 1, 2]
    assert_equal(np.unique(a), ua)
    assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
    assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
    assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))

    # test for ticket 2111 - timedelta
    nat = np.timedelta64('nat')
    a = [np.timedelta64(1, 'D'), nat, np.timedelta64(1, 'h'), nat]
    ua = [np.timedelta64(1, 'h'), np.timedelta64(1, 'D'), nat]
    ua_idx = [2, 0, 1]
    ua_inv = [1, 2, 0, 2]
    ua_cnt = [1, 1, 2]
    assert_equal(np.unique(a), ua)
    assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
    assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
    assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))

    # test for gh-19300
    # input consisting only of NaNs
    all_nans = [np.nan] * 4
    ua = [np.nan]
    ua_idx = [0]
    ua_inv = [0, 0, 0, 0]
    ua_cnt = [4]
    assert_equal(np.unique(all_nans), ua)
    assert_equal(np.unique(all_nans, return_index=True), (ua, ua_idx))
    assert_equal(np.unique(all_nans, return_inverse=True), (ua, ua_inv))
    assert_equal(np.unique(all_nans, return_counts=True), (ua, ua_cnt))
def stockForDate(dateToPrice, date):
    """Return the price for ``date``, falling back to the latest earlier day.

    If ``date`` is not a key of ``dateToPrice`` the lookup steps back one
    day at a time until a key is found.

    Args:
        dateToPrice: mapping from dates to prices; keys must be hashable and
            orderable against ``date``.
        date: the day to look up; must support subtracting
            ``np.timedelta64(1, 'D')``.

    Returns:
        The value stored for ``date`` or for the closest earlier key.

    Raises:
        KeyError: if the mapping is empty or holds no key on or before
            ``date`` (previously this looped forever).
    """
    if date in dateToPrice:
        return dateToPrice[date]
    if not dateToPrice:
        raise KeyError(date)

    requested = date
    earliest = min(dateToPrice)
    one_day = np.timedelta64(1, 'D')
    # Stop once we are before the first key.  `date >= earliest` is also
    # False for NaT, which would otherwise never terminate.
    while date >= earliest:
        date = date - one_day
        if date in dateToPrice:
            return dateToPrice[date]
    raise KeyError(requested)
def split_timestamp_to_dt(sample):
    """Combine a split (seconds, nanoseconds) timestamp into one datetime64.

    Args:
        sample: mapping with a 'secs' entry (POSIX seconds since the epoch)
            and a 'nano' entry (integer nanoseconds to add on top).

    Returns:
        ``np.datetime64`` with nanosecond resolution, in UTC.
    """
    # Function-scope import: only the `datetime` class is known to be in
    # scope in this file.
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # tz-aware call yields the same UTC wall-clock time.
    utc_moment = datetime.fromtimestamp(sample['secs'], tz=timezone.utc)
    # numpy expects a naive datetime, so drop tzinfo (the value is already UTC).
    naive_utc = utc_moment.replace(tzinfo=None)
    return np.datetime64(naive_utc) + np.timedelta64(sample['nano'], 'ns')
_CFTIME_DATETIME_UNITS_TESTS) def test_infer_cftime_datetime_units(calendar, date_args, expected): date_type = _all_cftime_date_types()[calendar] dates = [date_type(*args) for args in date_args] assert expected == coding.times.infer_datetime_units(dates) @pytest.mark.parametrize( ['timedeltas', 'units', 'numbers'], [('1D', 'days', np.int64(1)), (['1D', '2D', '3D'], 'days', np.array([1, 2, 3], 'int64')), ('1h', 'hours', np.int64(1)), ('1ms', 'milliseconds', np.int64(1)), ('1us', 'microseconds', np.int64(1)), (['NaT', '0s', '1s'], None, [np.nan, 0, 1]), (['30m', '60m'], 'hours', [0.5, 1.0]), (np.timedelta64('NaT', 'ns'), 'days', np.nan), (['NaT', 'NaT'], 'days', [np.nan, np.nan])]) def test_cf_timedelta(timedeltas, units, numbers): timedeltas = pd.to_timedelta(timedeltas, box=False) numbers = np.array(numbers) expected = numbers actual, _ = coding.times.encode_cf_timedelta(timedeltas, units) assert_array_equal(expected, actual) assert expected.dtype == actual.dtype if units is not None: expected = timedeltas actual = coding.times.decode_cf_timedelta(numbers, units) assert_array_equal(expected, actual) assert expected.dtype == actual.dtype
levels = [_nonempty_index(l) for l in idx.levels] codes = [[0, 0] for i in idx.levels] try: return pd.MultiIndex(levels=levels, codes=codes, names=idx.names) except TypeError: # older pandas versions return pd.MultiIndex(levels=levels, labels=codes, names=idx.names) raise TypeError("Don't know how to handle index of " "type {0}".format(typename(type(idx)))) _simple_fake_mapping = { "b": np.bool_(True), "V": np.void(b" "), "M": np.datetime64("1970-01-01"), "m": np.timedelta64(1), "S": np.str_("foo"), "a": np.str_("foo"), "U": np.unicode_("foo"), "O": "foo", } def _scalar_from_dtype(dtype): if dtype.kind in ("i", "f", "u"): return dtype.type(1) elif dtype.kind == "c": return dtype.type(complex(1, 0)) elif dtype.kind in _simple_fake_mapping: o = _simple_fake_mapping[dtype.kind] return o.astype(dtype) if dtype.kind in ("m", "M") else o
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# ## Import raw tweet data and merge it with lagged spot price data
#
# Data source:

# In[2]:

# Import raw Twitter data
df = pd.read_csv('./data-streaming-tweets.csv')

# Convert the UTC timestamps used by Twitter to Eastern Time (New York).
# A real timezone conversion is used instead of a fixed -5 h shift so that
# tweets posted during daylight-saving time land on the correct local day.
df['date'] = pd.to_datetime(df['date'], utc=True).dt.tz_convert('America/New_York')
df['date'] = df['date'].dt.date  # Remove time part, only keep date.

# Remove data for 01-Dec-2019 as we cut off at the end of Nov.  The date is
# spelled out rather than read from a hard-coded row number, which breaks as
# soon as the CSV changes.
cutoff_day = pd.Timestamp('2019-12-01').date()
df = df[df['date'] != cutoff_day]

# Import spot price information
SpotPrice = pd.read_csv('./ChangeDateNew.csv')
SpotPrice['date'] = pd.to_datetime(SpotPrice['date']).dt.date
# NOTE(review): shift(-1) pulls the value from the *following* row; whether
# that is a lag or a lead depends on the sort order of the CSV -- confirm.
SpotPrice['Lag_PriceChange'] = SpotPrice.Price_Change.shift(-1)  # Lag the price change

# Merge spot price with raw data
df = pd.merge(df, SpotPrice, how='left', on='date')
df.tail()
def testAttributeAsDict(self):
    """Round-trip a populated ``Node4`` through the protobuf and JSON providers.

    For each provider the test checks that the deserialized node matches the
    source node field by field, that serializing mismatched reference types
    raises ``TypeError``, and that ``Node6`` references (single and list)
    survive a round trip. The JSON pass additionally pushes the serialized
    value through ``json.dumps``/``json.loads`` and does not compare the
    slice fields ``j.b`` and ``k[0].b``.
    """
    # Optional pandas-backed attributes; left empty when pandas is missing.
    pandas_fields = {}
    if pd:
        frame = pd.DataFrame(
            {
                'a': [1, 2, 3],
                'b': [to_text('测试'), to_binary('属性'), 'c'],
            },
            index=[[0, 0, 1], ['测试', '属性', '测试']],
        )
        pandas_fields.update(
            w=frame.columns,
            ww=frame.index,
            x=frame['b'],
            y=frame,
            z=[frame.columns, frame.index, frame['a'], frame],
        )

    mixed_values = (
        1234,
        to_text('测试'),
        '属性',
        None,
        np.datetime64('1066-10-13'),
        np.timedelta64(1, 'D'),
        np.dtype([('x', 'i4'), ('y', 'f4')]),
    )
    slice_values = (
        slice(10),
        slice(0, 2),
        None,
        slice(2, 0, -1),
        slice('a', 'b'),
        slice(datetime.datetime.now(), datetime.datetime.now()),
    )
    node4 = Node4(
        a=to_binary('中文'),
        b=np.random.randint(4, size=(3, 4)),
        c=np.datetime64(datetime.datetime.now()),
        d=np.timedelta64(datetime.timedelta(seconds=1234)),
        e=np.dtype('int'),
        f={'a': [True, False, False], 'd': [False, None]},
        h=mixed_values,
        i=slice_values,
        j=Node5(a='aa', b=slice(1, 100, 3)),
        k=[Node5(a='bb', b=slice(200, -1, -4)), None],
        l=Node6(b=3, nid=1),
        m=Node6(b=4, nid=2),
        n=[Node5(a='cc', b=slice(100, -2, -5)), None],
        **pandas_fields
    )

    def check_restored(restored, with_slice_fields):
        # Compare a deserialized Node4 against the source node, field by field.
        self.assertEqual(node4.a, restored.a)
        np.testing.assert_array_equal(node4.b, restored.b)
        for attr in ('c', 'd', 'e', 'f'):
            self.assertEqual(getattr(node4, attr), getattr(restored, attr))
        self.assertFalse(hasattr(restored, 'g'))
        for attr in ('h', 'i'):
            self.assertEqual(getattr(node4, attr), getattr(restored, attr))

        self.assertEqual(node4.j.a, restored.j.a)
        if with_slice_fields:
            self.assertEqual(node4.j.b, restored.j.b)
        self.assertEqual(node4.k[0].a, restored.k[0].a)
        if with_slice_fields:
            self.assertEqual(node4.k[0].b, restored.k[0].b)
        self.assertIsNone(restored.k[1])

        # The Node6 values stored in l and m are expected back as Node7.
        for attr in ('l', 'm'):
            self.assertIsInstance(getattr(restored, attr), Node7)
            self.assertEqual(getattr(node4, attr).b, getattr(restored, attr).b)

        self.assertIsInstance(restored.n[0], Node5)
        self.assertEqual(node4.n[0].a, restored.n[0].a)
        self.assertEqual(node4.n[0].b, restored.n[0].b)
        self.assertIsNone(restored.n[1])

        if pd:
            pd.testing.assert_index_equal(node4.w, restored.w)
            pd.testing.assert_index_equal(node4.ww, restored.ww)
            pd.testing.assert_series_equal(node4.x, restored.x)
            pd.testing.assert_frame_equal(node4.y, restored.y)
            pd.testing.assert_index_equal(node4.z[0], restored.z[0])
            pd.testing.assert_index_equal(node4.z[1], restored.z[1])
            pd.testing.assert_series_equal(node4.z[2], restored.z[2])
            pd.testing.assert_frame_equal(node4.z[3], restored.z[3])

    # ---- protobuf provider ----
    pbs = ProtobufSerializeProvider()
    check_restored(Node4.deserialize(pbs, node4.serialize(pbs)),
                   with_slice_fields=True)

    with self.assertRaises(TypeError):
        Node4(j=Node6()).serialize(pbs)

    with self.assertRaises(TypeError):
        Node7(nid=1, r=Node6(nid=0)).serialize(pbs)

    with self.assertRaises(TypeError):
        Node7(nid=1, rl=[Node6(nid=0)]).serialize(pbs)

    holder = Node6(nid=0, r=Node6(nid=0))
    restored_holder = Node6.deserialize(pbs, holder.serialize(pbs))
    self.assertIsInstance(restored_holder.r, Node6)

    holder = Node6(nid=0, rl=[Node6(nid=0)])
    restored_holder = Node6.deserialize(pbs, holder.serialize(pbs))
    self.assertIsInstance(restored_holder.rl[0], Node6)

    # ---- JSON provider ----
    jss = JsonSerializeProvider()
    # Push the serialized value through real JSON text before deserializing.
    json_text = json.dumps(node4.serialize(jss))
    payload = json.loads(json_text, object_hook=OrderedDict)
    check_restored(Node4.deserialize(jss, payload), with_slice_fields=False)

    with self.assertRaises(TypeError):
        Node4(j=Node6()).serialize(jss)

    with self.assertRaises(TypeError):
        Node7(r=Node6()).serialize(jss)

    with self.assertRaises(TypeError):
        Node7(nid=1, rl=[Node6(nid=0)]).serialize(jss)

    holder = Node6(r=Node6())
    restored_holder = Node6.deserialize(jss, holder.serialize(jss))
    self.assertIsInstance(restored_holder.r, Node6)

    holder = Node6(nid=0, rl=[Node6(nid=0)])
    restored_holder = Node6.deserialize(jss, holder.serialize(jss))
    self.assertIsInstance(restored_holder.rl[0], Node6)
def _nonempty_index(idx): typ = type(idx) if typ is pd.RangeIndex: return pd.RangeIndex(2, name=idx.name) elif typ in _numeric_index_types: return typ([1, 2], name=idx.name) elif typ is pd.Index: return pd.Index(["a", "b"], name=idx.name) elif typ is pd.DatetimeIndex: start = "1970-01-01" # Need a non-monotonic decreasing index to avoid issues with # partial string indexing see https://github.com/dask/dask/issues/2389 # and https://github.com/pandas-dev/pandas/issues/16515 # This doesn't mean `_meta_nonempty` should ever rely on # `self.monotonic_increasing` or `self.monotonic_decreasing` try: return pd.date_range(start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name) except ValueError: # older pandas versions data = [start, "1970-01-02"] if idx.freq is None else None return pd.DatetimeIndex(data, start=start, periods=2, freq=idx.freq, tz=idx.tz, name=idx.name) elif typ is pd.PeriodIndex: return pd.period_range(start="1970-01-01", periods=2, freq=idx.freq, name=idx.name) elif typ is pd.TimedeltaIndex: start = np.timedelta64(1, "D") try: return pd.timedelta_range(start=start, periods=2, freq=idx.freq, name=idx.name) except ValueError: # older pandas versions start = np.timedelta64(1, "D") data = [start, start + 1] if idx.freq is None else None return pd.TimedeltaIndex(data, start=start, periods=2, freq=idx.freq, name=idx.name) elif typ is pd.CategoricalIndex: if len(idx.categories) == 0: data = pd.Categorical(_nonempty_index(idx.categories), ordered=idx.ordered) else: data = pd.Categorical.from_codes([-1, 0], categories=idx.categories, ordered=idx.ordered) return pd.CategoricalIndex(data, name=idx.name) elif typ is pd.MultiIndex: levels = [_nonempty_index(l) for l in idx.levels] codes = [[0, 0] for i in idx.levels] try: return pd.MultiIndex(levels=levels, codes=codes, names=idx.names) except TypeError: # older pandas versions return pd.MultiIndex(levels=levels, labels=codes, names=idx.names) raise TypeError("Don't know how to handle index of " "type 
{0}".format(typename(type(idx))))