Example #1
    def get_dataset(self, key, info):
        if self._data is None:
            self.read()

        if key.name in ['latitude', 'longitude']:
            lons, lats = self.get_lonlats()
            if key.name == 'latitude':
                return Projectable(lats, id=key)
            else:
                return Projectable(lons, id=key)

        avhrr_channel_index = {'1': 0,
                               '2': 1,
                               '3a': 2,
                               '3b': 2,
                               '4': 3,
                               '5': 4}
        index = avhrr_channel_index[key.name]
        mask = False
        if key.name in ['3a', '3b'] and self._is3b is None:
            ch3a = bfield(self._data["id"]["id"], 10)
            self._is3b = np.logical_not(ch3a)

        if key.name == '3a':
            mask = np.tile(self._is3b, (1, 2048))
        elif key.name == '3b':
            mask = np.tile(np.logical_not(self._is3b), (1, 2048))

        data = self._data["image_data"][:, :, index]
        if key.calibration == 'counts':
            return Projectable(data,
                               mask=mask,
                               area=self.get_lonlats(),
                               units='1')

        pg_spacecraft = ''.join(self.platform_name.split()).lower()

        jdays = (np.datetime64(self.start_time) - np.datetime64(str(
            self.year) + '-01-01T00:00:00Z')) / np.timedelta64(1, 'D')
        if index < 2 or key.name == '3a':
            data = calibrate_solar(data, index, self.year, jdays,
                                   pg_spacecraft)
            units = '%'

        if index > 2 or key.name == '3b':
            if self.times is None:
                self.times = time_seconds(self._data["timecode"], self.year)
            line_numbers = (
                np.round((self.times - self.times[-1]) /
                         np.timedelta64(166666667, 'ns'))).astype(np.int64)
            line_numbers -= line_numbers[0]
            if self.prt is None:
                self.prt, self.ict, self.space = self.get_telemetry()
            chan = index + 1
            data = calibrate_thermal(data, self.prt, self.ict[:, chan - 3],
                                     self.space[:, chan - 3], line_numbers,
                                     chan, pg_spacecraft)
            units = 'K'
        # TODO: check if entirely masked before returning
        return Projectable(data, mask=mask, units=units)
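
The jdays computation above relies on datetime64 subtraction producing a timedelta64, which becomes fractional days when divided by a one-day timedelta64. A minimal standalone sketch of the same pattern, with hypothetical values standing in for self.start_time and self.year:

import numpy as np

start_time = '2009-06-15T12:00:00'  # hypothetical scan start
year = 2009
# datetime64 - datetime64 -> timedelta64; dividing by 1 day gives float days
jdays = ((np.datetime64(start_time) -
          np.datetime64(str(year) + '-01-01T00:00:00')) / np.timedelta64(1, 'D'))
print(jdays)  # 165.5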
Example #2
def _coerce_scalar_to_timedelta_type(r, unit='ns'):
    # kludgy here until we have a timedelta scalar
    # handle the numpy < 1.7 case

    def conv(v):
        if _np_version_under1p7:
            return timedelta(microseconds=v/1000.0)
        return np.timedelta64(v)

    if isinstance(r, compat.string_types):
        converter = _get_string_converter(r, unit=unit)
        r = converter()
        r = conv(r)
    elif r == tslib.iNaT:
        return r
    elif isinstance(r, np.timedelta64):
        r = r.astype("m8[{0}]".format(unit.lower()))
    elif is_integer(r):
        r = tslib.cast_from_unit(r, unit)
        r = conv(r)

    if _np_version_under1p7:
        if not isinstance(r, timedelta):
            raise AssertionError("Invalid type for timedelta scalar: %s" % type(r))
        if compat.PY3:
            # convert to nanoseconds in timedelta64; total_seconds()
            # already includes the microsecond component
            r = np.timedelta64(int(round(r.total_seconds() * 1e9)))
        else:
            return r

    if isinstance(r, timedelta):
        r = np.timedelta64(r)
    elif not isinstance(r, np.timedelta64):
        raise AssertionError("Invalid type for timedelta scalar: %s" % type(r))
    return r.astype('timedelta64[ns]')
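
On current pandas the whole coercion above is handled by pd.to_timedelta, which likewise normalizes strings, integers with a unit, and timedelta-like scalars to timedelta64[ns]-backed values. A rough sketch of the equivalence:

import numpy as np
import pandas as pd

# each of these ends up as a Timedelta backed by timedelta64[ns]
print(pd.to_timedelta('1s'))                    # 0 days 00:00:01
print(pd.to_timedelta(1500, unit='ms'))         # 0 days 00:00:01.500000
print(pd.to_timedelta(np.timedelta64(2, 'D')))  # 2 days 00:00:00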
Example #3
    def __init__(self):
        data_frame = get_joined_frame()
        add_standardized_period(data_frame)

        # build a timeseries for each indicator/region pair
        timeseries_list = get_timeseries_list(data_frame)

        deviations = []

        for timeseries in timeseries_list:
            date_diffs = timeseries.period_end.diff()

            # leap-year hack: convert 366 day differences to 365 day differences
            date_diffs[date_diffs == np.timedelta64(366, "D")] = np.timedelta64(365, "D")

            # a little unusual that it's a series, but there can be a tie for most frequent
            mode_series = date_diffs.mode()

            if mode_series.empty:
                continue

            deviation_rows = timeseries[date_diffs != mode_series[0]]

            if not deviation_rows.empty:
                deviations.append(deviation_rows)

        self.violation_values = pd.concat(deviations)
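
The deviation check above hinges on Series.diff over datetimes returning timedelta64 values whose mode marks the expected spacing. A self-contained sketch of that idea with hypothetical period ends:

import numpy as np
import pandas as pd

period_end = pd.Series(pd.to_datetime(
    ['2000-12-31', '2001-12-31', '2002-12-31', '2004-06-30']))
date_diffs = period_end.diff()  # NaT, 365 days, 365 days, 547 days
mode = date_diffs.mode()[0]     # 365 days
# flags the first row (NaT diff) and the irregular 547-day gap,
# mirroring the comparison used above
print(period_end[date_diffs != mode])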
Example #4
    def test_timedelta64_conversions(self):
        startdate = Series(date_range('2013-01-01', '2013-01-03'))
        enddate = Series(date_range('2013-03-01', '2013-03-03'))

        s1 = enddate - startdate
        s1[2] = np.nan

        for m in [1, 3, 10]:
            for unit in ['D', 'h', 'm', 's', 'ms', 'us', 'ns']:

                # op
                expected = s1.apply(lambda x: x / np.timedelta64(m, unit))
                result = s1 / np.timedelta64(m, unit)
                assert_series_equal(result, expected)

                if m == 1 and unit != 'ns':

                    # astype
                    result = s1.astype("timedelta64[{0}]".format(unit))
                    assert_series_equal(result, expected)

                # reverse op
                expected = s1.apply(
                    lambda x: Timedelta(np.timedelta64(m, unit)) / x)
                result = np.timedelta64(m, unit) / s1
                assert_series_equal(result, expected)

        # astype
        s = Series(date_range('20130101', periods=3))
        result = s.astype(object)
        self.assertIsInstance(result.iloc[0], datetime)
        self.assertTrue(result.dtype == np.object_)

        result = s1.astype(object)
        self.assertIsInstance(result.iloc[0], timedelta)
        self.assertTrue(result.dtype == np.object_)
Example #5
    def test_ufunc_coercions(self):
        idx = date_range('2011-01-01', periods=3, freq='2D', name='x')

        delta = np.timedelta64(1, 'D')
        for result in [idx + delta, np.add(idx, delta)]:
            assert isinstance(result, DatetimeIndex)
            exp = date_range('2011-01-02', periods=3, freq='2D', name='x')
            tm.assert_index_equal(result, exp)
            assert result.freq == '2D'

        for result in [idx - delta, np.subtract(idx, delta)]:
            assert isinstance(result, DatetimeIndex)
            exp = date_range('2010-12-31', periods=3, freq='2D', name='x')
            tm.assert_index_equal(result, exp)
            assert result.freq == '2D'

        delta = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D'),
                          np.timedelta64(3, 'D')])
        for result in [idx + delta, np.add(idx, delta)]:
            assert isinstance(result, DatetimeIndex)
            exp = DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-08'],
                                freq='3D', name='x')
            tm.assert_index_equal(result, exp)
            assert result.freq == '3D'

        for result in [idx - delta, np.subtract(idx, delta)]:
            assert isinstance(result, DatetimeIndex)
            exp = DatetimeIndex(['2010-12-31', '2011-01-01', '2011-01-02'],
                                freq='D', name='x')
            tm.assert_index_equal(result, exp)
            assert result.freq == 'D'
Example #6
    def formula(individu, period, parameters):
        assiette_allegement = individu('assiette_allegement', period)
        contrat_de_travail_duree = individu('contrat_de_travail_duree', period)
        TypesContratDeTravailDuree = contrat_de_travail_duree.possible_values
        contrat_de_travail_debut = individu('contrat_de_travail_debut', period)
        contrat_de_travail_fin = individu('contrat_de_travail_fin', period)
        effectif_entreprise = individu('effectif_entreprise', period)
        smic_proratise = individu('smic_proratise', period)
        zone_revitalisation_rurale = individu('zone_revitalisation_rurale', period)

        duree_cdd_eligible = contrat_de_travail_fin > contrat_de_travail_debut + timedelta64(365, 'D')
        # TODO: move to parameters file
        contrat_de_travail_eligible = (
            contrat_de_travail_duree == TypesContratDeTravailDuree.cdi) + (
            (contrat_de_travail_duree == TypesContratDeTravailDuree.cdd) * (duree_cdd_eligible)
            )

        duree_validite = (
            datetime64(period.start) + timedelta64(1, 'D') - contrat_de_travail_debut
            ).astype('timedelta64[Y]') < timedelta64(1, 'Y')

        eligible = (
            contrat_de_travail_eligible
            * (effectif_entreprise <= 50)
            * zone_revitalisation_rurale
            * duree_validite
            )

        taux_max = .281 if period.start.year < 2015 else .2655  # TODO: move to parameters file
        seuil_max = 2.4
        seuil_min = 1.5
        taux_exoneration = compute_taux_exoneration(assiette_allegement, smic_proratise, taux_max, seuil_max, seuil_min)
        exoneration_cotisations_zrr = taux_exoneration * assiette_allegement * eligible

        return exoneration_cotisations_zrr
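
The duree_cdd_eligible test above is plain elementwise datetime64 arithmetic: adding a timedelta64 shifts the whole array, and the comparison is vectorized. In isolation, with hypothetical contract dates:

import numpy as np

debut = np.array(['2015-01-01', '2015-06-01'], dtype='datetime64[D]')
fin = np.array(['2016-03-01', '2015-12-01'], dtype='datetime64[D]')
# eligible only where the contract lasts more than 365 days
print(fin > debut + np.timedelta64(365, 'D'))  # [ True False]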
Example #7
    def test_timestamp_and_series(self):
        timestamp_series = Series(date_range('2014-03-17', periods=2, freq='D', tz='US/Eastern'))
        first_timestamp = timestamp_series[0]

        delta_series = Series([np.timedelta64(0, 'D'), np.timedelta64(1, 'D')])
        assert_series_equal(timestamp_series - first_timestamp, delta_series)
        assert_series_equal(first_timestamp - timestamp_series, -delta_series)
Example #8
  def testResampleData(self):
    # test upsampling by a factor of 2
    timestamps = numpy.array([numpy.datetime64(
      datetime.datetime(2000, 1, 1, tzinfo=dateutil.tz.tzlocal()) +
      datetime.timedelta(hours=i)) for i in range(8)])
    values = numpy.linspace(0, 7, 8)
    newSamplingInterval = numpy.timedelta64(1800, 's')
    (newTimeStamps, newValues) = param_finder._resampleData(timestamps,
                                                            values,
                                                            newSamplingInterval)

    trueNewTimeStamps = numpy.array([numpy.datetime64(
      datetime.datetime(2000, 1, 1, tzinfo=dateutil.tz.tzlocal()) +
      datetime.timedelta(hours=0.5 * i)) for i in range(15)])
    self.assertTrue(numpy.allclose(newValues, numpy.linspace(0, 7, 15)))
    timestampError = (numpy.sum(
      numpy.abs(newTimeStamps - trueNewTimeStamps))).item().total_seconds()
    self.assertAlmostEqual(timestampError, 0)

    # test down-sampling by a factor of 2
    newSamplingInterval = numpy.timedelta64(7200, 's')
    (newTimeStamps, newValues) = param_finder._resampleData(timestamps,
                                                            values,
                                                            newSamplingInterval)
    trueNewTimeStamps = numpy.array([numpy.datetime64(
      datetime.datetime(2000, 1, 1, tzinfo=dateutil.tz.tzlocal()) +
      datetime.timedelta(hours=2 * i)) for i in range(4)])
    timestampError = (numpy.sum(
      numpy.abs(newTimeStamps - trueNewTimeStamps))).item().total_seconds()
    self.assertTrue(numpy.allclose(newValues, numpy.linspace(0, 6, 4)))
    self.assertAlmostEqual(timestampError, 0)
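
The evenly spaced resampled timestamps that _resampleData is expected to produce can also be generated directly, since np.arange accepts datetime64 bounds with a timedelta64 step. A small sketch:

import numpy as np

start = np.datetime64('2000-01-01T00:00:00')
step = np.timedelta64(1800, 's')  # 30-minute sampling interval
# [00:00, 00:30, 01:00, 01:30]
print(np.arange(start, start + np.timedelta64(2, 'h'), step))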
Example #9
def _as_timedelta64_scalar(time, unit=None):
    unit_args = [unit] if unit else []
    flt_unit = unit if unit else 's'
    # turn 'H:M:S.ms', 'M:S.ms', 'S.ms' into floating point seconds
    if isinstance(time, string_types):  # and ':' in time
        time = [float(t) for t in time.lstrip('T').split(':')][::-1]
        if len(time) > 1 and unit is not None:
            raise ValueError("When giving time as a string, units are automatic")
        if len(time) > 3:
            raise ValueError("Timedelta as string only goes up to hours")
        t_flt = 0.0
        for factor, t in zip([1, 60, 60 * 60], time):
            t_flt += factor * t
        time = t_flt
        flt_unit = 's'
    # turn floating point time into integer with the correct unit
    if is_datetime_like(time):
        time = as_datetime64(time) - as_datetime64(np.timedelta64(0, 's'))
    elif isinstance(time, (np.timedelta64, timedelta)):
        time = np.timedelta64(time).astype(_format_unit(unit, base=DELTA_BASE))
    elif isinstance(time, (int, float, np.integer, np.floating)):
        orig_time, orig_flt_unit = time, flt_unit
        unit_idx = TIME_UNITS.index(flt_unit)
        while not np.isclose(time, int(np.round(time)), rtol=1e-4, atol=1e-18):
            if unit_idx <= 0:
                raise ValueError("Floating point time {0} [{1}] is too precise "
                                 "for any time unit?".format(orig_time, orig_flt_unit))
            unit_idx -= 1
            time *= TIME_SCALE[unit_idx]
            flt_unit = TIME_UNITS[unit_idx]
        time = np.timedelta64(int(np.round(time)), flt_unit)
        unit, unit_args = flt_unit, [flt_unit]
    return np.timedelta64(time, *unit_args)
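
The string branch above folds 'H:M:S' into floating-point seconds before building the timedelta64: the colon-separated fields are reversed so seconds come first, then weighted by 1, 60 and 3600. The core of that fold in isolation:

import numpy as np

time = [float(t) for t in '01:30:15'.lstrip('T').split(':')][::-1]
t_flt = 0.0
for factor, t in zip([1, 60, 60 * 60], time):
    t_flt += factor * t
print(np.timedelta64(int(round(t_flt)), 's'))  # 5415 seconds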
Example #10
    def test_cf_timedelta(self):
        examples = [
            ('1D', 'days', np.int64(1)),
            (['1D', '2D', '3D'], 'days', np.array([1, 2, 3], 'int64')),
            ('1h', 'hours', np.int64(1)),
            ('1ms', 'milliseconds', np.int64(1)),
            ('1us', 'microseconds', np.int64(1)),
            (['NaT', '0s', '1s'], None, [np.nan, 0, 1]),
            (['30m', '60m'], 'hours', [0.5, 1.0]),
        ]
        if pd.__version__ >= '0.16':
            # not quite sure why, but these examples don't work on older pandas
            examples.extend([(np.timedelta64('NaT', 'ns'), 'days', np.nan),
                             (['NaT', 'NaT'], 'days', [np.nan, np.nan])])

        for timedeltas, units, numbers in examples:
            timedeltas = pd.to_timedelta(timedeltas, box=False)
            numbers = np.array(numbers)

            expected = numbers
            actual, _ = conventions.encode_cf_timedelta(timedeltas, units)
            self.assertArrayEqual(expected, actual)
            self.assertEqual(expected.dtype, actual.dtype)

            if units is not None:
                expected = timedeltas
                actual = conventions.decode_cf_timedelta(numbers, units)
                self.assertArrayEqual(expected, actual)
                self.assertEqual(expected.dtype, actual.dtype)

        expected = np.timedelta64('NaT', 'ns')
        actual = conventions.decode_cf_timedelta(np.array(np.nan), 'days')
        self.assertArrayEqual(expected, actual)
Example #11
    def test_timedelta_conversions(self):
        assert (Timedelta(timedelta(seconds=1)) ==
                np.timedelta64(1, 's').astype('m8[ns]'))
        assert (Timedelta(timedelta(microseconds=1)) ==
                np.timedelta64(1, 'us').astype('m8[ns]'))
        assert (Timedelta(timedelta(days=1)) ==
                np.timedelta64(1, 'D').astype('m8[ns]'))
Example #12
def close_gaps(ts, verbose=False):
    ts = ts.copy()
    ts.data = ts.data.sort_index()
    if type(ts.data).__name__ == 'Panel':
        data = ts.data.items.values
        index = ts.data.items
    else:
        data = ts.data.index.values
        index = ts.data.index

    index_df = _pd.DataFrame(index = index)

    dt = data[1:] - data[:-1]
    dt = dt / _np.timedelta64(1,'s')

    median = _np.median(dt)

    if median > (1.1 * ts._data_period) or median < (0.9 * ts._data_period):
        _warnings.warn('There is a period/median mismatch (%0.1f, %0.1f); this is either due to an error in the assumed period or because there are too many gaps in the timeseries.' % (median, ts._data_period))

    point_dist = (index.values[1:] - index.values[:-1]) / _np.timedelta64(1, 's')
    where = point_dist > 2 * ts._data_period
    off_periods = _np.array([index[:-1][where], index[1:][where]]).transpose()
    if verbose:
        print('found %i gaps'%(off_periods.shape[0]))
    for i, op in enumerate(off_periods):
        no_periods = round((op[1] - op[0])/ _np.timedelta64(1,'s')) / ts._data_period
        out = _pd.date_range(start = op[0], periods= no_periods, freq= '%i s'%ts._data_period)
        out = out[1:]
        out = _pd.DataFrame(index = out)
        index_df = _pd.concat([index_df, out])

    index_df.sort_index(inplace=True)
    ts.data = ts.data.reindex(index_df.index)
    return ts
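
The gap detection above reduces index spacing to float seconds by dividing timedelta64 differences by np.timedelta64(1, 's'). A compact sketch of that reduction, assuming a hypothetical 10 s data period:

import numpy as np
import pandas as pd

index = pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 00:00:10',
                        '2020-01-01 00:00:50']).values
point_dist = (index[1:] - index[:-1]) / np.timedelta64(1, 's')  # [10., 40.]
data_period = 10
print(point_dist > 2 * data_period)  # [False  True] -> one gap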
Example #13
def rolling_correlation(data, correlant, window, min_good_ratio=0.67, verbose=True):
    """Time units as documented here: http://docs.scipy.org/doc/numpy/reference/arrays.datetime.html#datetime-units"""

    correlant = correlant.align_to(data)  # align before merging, since align is more sophisticated
    merged = data.copy()
    merged.data['correlant'] = correlant.data

    data_period = _np.timedelta64(int(merged._data_period), 's')
    window = _np.timedelta64(window[0], window[1])
    window = int(window/data_period)
    if verbose:
        print('Each window contains %s data points of which at least %s are not nan.'%(window, int(window * min_good_ratio)))

    min_good = window * min_good_ratio
    size = merged.data.shape[0]-window + 1
    timestamps = _pd.Series(_pd.to_datetime(_pd.Series(_np.zeros(size))))
    pear_r = _np.zeros(size)
    for i in range(size):
        secment = TimeSeries(merged.data.iloc[i:i+window,:])
        secment._data_period = merged._data_period
    #     print(secment.data.dropna().shape[0] < min_good)
        if secment.data.dropna().shape[0] < min_good:
            pear_r[i]= _np.nan
        else:
            corr = secment.correlate_to(secment, data_column=merged.data.columns[0], correlant_column=merged.data.columns[1])
            pear_r[i] = corr.pearson_r[0]
        timestamps.iloc[i] = secment.data.index[0] + ((secment.data.index[-1] - secment.data.index[0])/2.)
    #     break

    pear_r_ts = TimeSeries(_pd.DataFrame(pear_r, index = timestamps, columns = ['pearson_r']))
    pear_r_ts._data_period = merged._data_period
    pear_r_ts._y_label = 'r'
    return pear_r_ts
Example #14
    def test_timestamp_and_series(self):
        timestamp_series = Series(date_range("2014-03-17", periods=2, freq="D", tz="US/Eastern"))
        first_timestamp = timestamp_series[0]

        delta_series = Series([np.timedelta64(0, "D"), np.timedelta64(1, "D")])
        assert_series_equal(timestamp_series - first_timestamp, delta_series)
        assert_series_equal(first_timestamp - timestamp_series, -delta_series)
Example #15
    def test_timedelta_ops_scalar(self):
        _skip_if_numpy_not_friendly()

        # GH 6808
        base = pd.to_datetime('20130101 09:01:12.123456')
        expected_add = pd.to_datetime('20130101 09:01:22.123456')
        expected_sub = pd.to_datetime('20130101 09:01:02.123456')

        for offset in [pd.to_timedelta(10,unit='s'),
                       timedelta(seconds=10),
                       np.timedelta64(10,'s'),
                       np.timedelta64(10000000000,'ns'),
                       pd.offsets.Second(10)]:
            result = base + offset
            self.assertEquals(result, expected_add)

            result = base - offset
            self.assertEquals(result, expected_sub)

        base = pd.to_datetime('20130102 09:01:12.123456')
        expected_add = pd.to_datetime('20130103 09:01:22.123456')
        expected_sub = pd.to_datetime('20130101 09:01:02.123456')

        for offset in [pd.to_timedelta('1 day, 00:00:10'),
                       pd.to_timedelta('1 days, 00:00:10'),
                       timedelta(days=1,seconds=10),
                       np.timedelta64(1,'D')+np.timedelta64(10,'s'),
                       pd.offsets.Day()+pd.offsets.Second(10)]:
            result = base + offset
            self.assertEquals(result, expected_add)

            result = base - offset
            self.assertEquals(result, expected_sub)
Example #16
    def test_timedelta_conversions(self):
        self.assertEqual(ct(timedelta(seconds=1)),
                         np.timedelta64(1, 's').astype('m8[ns]'))
        self.assertEqual(ct(timedelta(microseconds=1)),
                         np.timedelta64(1, 'us').astype('m8[ns]'))
        self.assertEqual(ct(timedelta(days=1)),
                         np.timedelta64(1, 'D').astype('m8[ns]'))
Example #17
    def test_nat_items(self):
        # not a datetime
        nadt_no_unit = np.datetime64("NaT")
        nadt_s = np.datetime64("NaT", "s")
        nadt_d = np.datetime64("NaT", "ns")
        # not a timedelta
        natd_no_unit = np.timedelta64("NaT")
        natd_s = np.timedelta64("NaT", "s")
        natd_d = np.timedelta64("NaT", "ns")

        dts = [nadt_no_unit, nadt_s, nadt_d]
        tds = [natd_no_unit, natd_s, natd_d]
        for a, b in itertools.product(dts, dts):
            self._assert_func(a, b)
            self._assert_func([a], [b])
            self._test_not_equal([a], b)

        for a, b in itertools.product(tds, tds):
            self._assert_func(a, b)
            self._assert_func([a], [b])
            self._test_not_equal([a], b)

        for a, b in itertools.product(tds, dts):
            self._test_not_equal(a, b)
            self._test_not_equal(a, [b])
            self._test_not_equal([a], [b])
            self._test_not_equal([a], np.datetime64("2017-01-01", "s"))
            self._test_not_equal([b], np.datetime64("2017-01-01", "s"))
            self._test_not_equal([a], np.timedelta64(123, "s"))
            self._test_not_equal([b], np.timedelta64(123, "s"))
Example #18
    def test_split_key_cmp(self):
        dt1 = numpy.datetime64("2015-01-01T15:03")
        dt1_1 = numpy.datetime64("2015-01-01T15:03")
        dt2 = numpy.datetime64("2015-01-05T15:03")
        td = numpy.timedelta64(60, 's')
        td2 = numpy.timedelta64(300, 's')

        self.assertEqual(
            carbonara.SplitKey.from_timestamp_and_sampling(dt1, td),
            carbonara.SplitKey.from_timestamp_and_sampling(dt1, td))
        self.assertEqual(
            carbonara.SplitKey.from_timestamp_and_sampling(dt1, td),
            carbonara.SplitKey.from_timestamp_and_sampling(dt1_1, td))
        self.assertNotEqual(
            carbonara.SplitKey.from_timestamp_and_sampling(dt1, td),
            carbonara.SplitKey.from_timestamp_and_sampling(dt2, td))
        self.assertNotEqual(
            carbonara.SplitKey.from_timestamp_and_sampling(dt1, td),
            carbonara.SplitKey.from_timestamp_and_sampling(dt1, td2))

        self.assertLess(
            carbonara.SplitKey.from_timestamp_and_sampling(dt1, td),
            carbonara.SplitKey.from_timestamp_and_sampling(dt2, td))
        self.assertLessEqual(
            carbonara.SplitKey.from_timestamp_and_sampling(dt1, td),
            carbonara.SplitKey.from_timestamp_and_sampling(dt1, td))

        self.assertGreater(
            carbonara.SplitKey.from_timestamp_and_sampling(dt2, td),
            carbonara.SplitKey.from_timestamp_and_sampling(dt1, td))
        self.assertGreaterEqual(
            carbonara.SplitKey.from_timestamp_and_sampling(dt2, td),
            carbonara.SplitKey.from_timestamp_and_sampling(dt2, td))
Example #19
def calcReturn(log, transactions, date):
    startt = str(np.datetime64(date + 'T00:01'))[:16]
    endt   = str(np.datetime64(date + 'T23:59'))[:16]
    starty = str(np.datetime64(date + 'T00:01') - np.timedelta64(1, 'D'))[:16]
    endy   = str(np.datetime64(date + 'T23:59') - np.timedelta64(1, 'D'))[:16]

    dft = log[startt : endt]
    dfy = log[starty : endy]

    try:
        tdf = transactions[date]
    except KeyError:
        tdf = []

    if len(dfy) > 0:
        startValue = float(dfy.tail(1)['EUR'] + \
                           dfy.tail(1)['BTC'] * dfy.tail(1)['Bid'])
        openPrice  = float((dfy.tail(1)['Bid'] + dfy.tail(1)['Ask'])/2)
    else:
        startValue = float(dft.head(1)['EUR'] + \
                           dft.head(1)['BTC'] * dft.head(1)['Bid'])
        openPrice  = float((dft.head(1)['Bid'] + dft.head(1)['Ask'])/2)
        
    endValue   = float(dft.tail(1)['EUR'] + \
                       dft.tail(1)['BTC'] * dft.tail(1)['Bid'])
    closePrice = float((dft.tail(1)['Bid'] + dft.tail(1)['Ask'])/2)
    
    if len(tdf) != 0:
        tdf['EUR']=tdf['AmountBTC']*(tdf['Bid']+tdf['Ask'])/2 + tdf['AmountEUR']
        endValue   = endValue - float(tdf.sum()['EUR'])
    
    retStrategy= endValue / startValue - 1
    retHold    = closePrice / openPrice - 1
    return openPrice, closePrice, retHold, retStrategy
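
The previous-day window above comes from shifting datetime64 values by one day and slicing the string representation back to minute precision. The round-trip in isolation, with a hypothetical trading day:

import numpy as np

date = '2021-03-15'
starty = str(np.datetime64(date + 'T00:01') - np.timedelta64(1, 'D'))[:16]
endy = str(np.datetime64(date + 'T23:59') - np.timedelta64(1, 'D'))[:16]
print(starty, endy)  # 2021-03-14T00:01 2021-03-14T23:59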
Example #20
    def test_74_percentile_serialized(self):
        ts = carbonara.TimeSerie.from_data(
            [datetime64(2014, 1, 1, 12, 0, 0),
             datetime64(2014, 1, 1, 12, 0, 4),
             datetime64(2014, 1, 1, 12, 0, 9)],
            [3, 5, 6])
        ts = self._resample(ts, numpy.timedelta64(60, 's'), '74pct')

        self.assertEqual(1, len(ts))
        self.assertEqual(5.48, ts[datetime64(2014, 1, 1, 12, 0, 0)][1])

        # Serialize and unserialize
        key = ts.get_split_key()
        o, s = ts.serialize(key)
        saved_ts = carbonara.AggregatedTimeSerie.unserialize(
            s, key, ts.aggregation)

        self.assertEqual(ts.aggregation, saved_ts.aggregation)

        ts = carbonara.TimeSerie.from_data(
            [datetime64(2014, 1, 1, 12, 0, 0),
             datetime64(2014, 1, 1, 12, 0, 4),
             datetime64(2014, 1, 1, 12, 0, 9)],
            [3, 5, 6])
        ts = self._resample(ts, numpy.timedelta64(60, 's'), '74pct')
        saved_ts.merge(ts)

        self.assertEqual(1, len(ts))
        self.assertEqual(5.48, ts[datetime64(2014, 1, 1, 12, 0, 0)][1])
Example #21
    def test_fetch_nano(self):
        ts = {'sampling': numpy.timedelta64(200, 'ms'),
              'size': 10, 'agg': 'mean'}
        tsb = carbonara.BoundTimeSerie(block_size=ts['sampling'])

        tsb.set_values(numpy.array([
            (datetime64(2014, 1, 1, 11, 46, 0, 200123), 4),
            (datetime64(2014, 1, 1, 11, 46, 0, 340000), 8),
            (datetime64(2014, 1, 1, 11, 47, 0, 323154), 50),
            (datetime64(2014, 1, 1, 11, 48, 0, 590903), 4),
            (datetime64(2014, 1, 1, 11, 48, 0, 903291), 4)],
            dtype=carbonara.TIMESERIES_ARRAY_DTYPE),
            before_truncate_callback=functools.partial(
                self._resample_and_merge, agg_dict=ts))

        tsb.set_values(numpy.array([
            (datetime64(2014, 1, 1, 11, 48, 0, 821312), 5)],
            dtype=carbonara.TIMESERIES_ARRAY_DTYPE),
            before_truncate_callback=functools.partial(
                self._resample_and_merge, agg_dict=ts))

        self.assertEqual([
            (datetime64(2014, 1, 1, 11, 46, 0, 200000), 6.0),
            (datetime64(2014, 1, 1, 11, 47, 0, 200000), 50.0),
            (datetime64(2014, 1, 1, 11, 48, 0, 400000), 4.0),
            (datetime64(2014, 1, 1, 11, 48, 0, 800000), 4.5)
        ], list(ts['return'].fetch()))
        self.assertEqual(numpy.timedelta64(200000000, 'ns'),
                         ts['return'].aggregation.granularity)
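
The closing assertion works because timedelta64 equality is unit-aware, so a granularity stored in nanoseconds still compares equal to the 200 ms sampling used to build the series:

import numpy as np

print(np.timedelta64(200, 'ms') == np.timedelta64(200000000, 'ns'))  # True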
Example #22
def time_features_enricher(dataset):
	"""
    Feature engineering on time related fields
    :param dataset: train/test dataset
	"""
	dataset['date_time_dt'] = pd.to_datetime(dataset.date_time, format = '%Y-%m-%d %H:%M:%S')
	dataset['date_time_dow'] = dataset.date_time_dt.dt.dayofweek
	dataset['date_time_hour'] = dataset.date_time_dt.dt.hour
	dataset['date_time_month'] = dataset.date_time_dt.dt.month

	dataset.loc[dataset.srch_ci == '2161-10-00', 'srch_ci'] = '2016-01-20' #handle one error format case in test set

	dataset['srch_ci_dt'] = pd.to_datetime(dataset.srch_ci, format = '%Y-%m-%d')
	dataset['srch_ci_dow'] = dataset.srch_ci_dt.dt.dayofweek
	dataset['srch_ci_month'] = dataset.srch_ci_dt.dt.month

	dataset['srch_co_dt'] = pd.to_datetime(dataset.srch_co, format = '%Y-%m-%d')
	dataset['srch_co_dow'] = dataset.srch_co_dt.dt.dayofweek
	dataset['srch_co_month'] = dataset.srch_co_dt.dt.month

	dataset['booking_window'] = (dataset['srch_ci_dt'] - dataset['date_time_dt'])/np.timedelta64(1, 'D')
	dataset['booking_window'].fillna(1000, inplace=True)
	dataset['booking_window'] = dataset['booking_window'].astype(int)

	dataset['length_of_stay'] = (dataset['srch_co_dt'] - dataset['srch_ci_dt'])/np.timedelta64(1, 'D')
Example #23
    def function(self, simulation, period):
        period = period.this_month
        assiette_allegement = simulation.calculate('assiette_allegement', period)
        contrat_de_travail_duree = simulation.calculate('contrat_de_travail_duree', period)  # 0: CDI, 1:CDD
        contrat_de_travail_debut = simulation.calculate('contrat_de_travail_debut', period)
        contrat_de_travail_fin = simulation.calculate('contrat_de_travail_fin', period)
        effectif_entreprise = simulation.calculate('effectif_entreprise', period)
        smic_proratise = simulation.calculate('smic_proratise', period)
        zone_revitalisation_rurale = simulation.calculate('zone_revitalisation_rurale', period)

        duree_cdd_eligible = contrat_de_travail_fin > contrat_de_travail_debut + timedelta64(365, 'D')
        # TODO: move to legislation parameters file
        contrat_de_travail_eligible = (
            contrat_de_travail_duree == 0) + (
            (contrat_de_travail_duree == 1) * (duree_cdd_eligible)
            )

        duree_validite = (
            datetime64(period.start) + timedelta64(1, 'D') - contrat_de_travail_debut
            ).astype('timedelta64[Y]') < timedelta64(1, 'Y')

        eligible = (
            contrat_de_travail_eligible *
            (effectif_entreprise <= 50) *
            zone_revitalisation_rurale *
            duree_validite
            )
        taux_max = .281 if period.start.year < 2015 else .2655  # TODO: move to legislation parameters file
        seuil_max = 2.4
        seuil_min = 1.5
        taux_exoneration = compute_taux_exoneration(assiette_allegement, smic_proratise, taux_max, seuil_max, seuil_min)
        exoneration_cotisations_zrr = taux_exoneration * assiette_allegement * eligible

        return period, exoneration_cotisations_zrr
Example #24
def transform_data(train, test):
    """ Transform train and test data to include new variables.
    """
    
    # Time
    initial_date = np.datetime64('2014-01-01T01:01', 'm')  # Arbitrary date chosen
    d_times = pd.DatetimeIndex(initial_date + np.timedelta64(int(mn), 'm') for mn in train['time'].values)    
    train['hour'] = d_times.hour
    train['weekday'] = d_times.weekday
    train['day_of_month'] = d_times.day
    train['month'] = d_times.month
    train['year'] = d_times.year

    d_times = pd.DatetimeIndex(initial_date + np.timedelta64(int(mn), 'm') for mn in test['time'].values)    
    test['hour'] = d_times.hour
    test['weekday'] = d_times.weekday
    test['day_of_month'] = d_times.day
    test['month'] = d_times.month
    test['year'] = d_times.year  
            
    # Accuracy 
    train['accuracy'] = np.log10(train['accuracy']) * 10.0
    test['accuracy'] = np.log10(test['accuracy']) * 10.0
      
    # Combine x and y attributes
    eps = 0.00001  
    train['x_d_y'] = train.x.values / (train.y.values + eps) 
    test['x_d_y'] = test.x.values / (test.y.values + eps)    
    
    train['x_t_y'] = train.x.values * train.y.values 
    test['x_t_y'] = test.x.values * test.y.values 
            
    # Return data
    return train, test  
Example #25
def test_decode_standard_calendar_multidim_time_inside_timestamp_range(
        calendar, enable_cftimeindex):
    if enable_cftimeindex:
        pytest.importorskip('cftime')

    cftime = _import_cftime()

    units = 'days since 0001-01-01'
    times1 = pd.date_range('2001-04-01', end='2001-04-05', freq='D')
    times2 = pd.date_range('2001-05-01', end='2001-05-05', freq='D')
    noleap_time1 = cftime.date2num(times1.to_pydatetime(),
                                   units, calendar=calendar)
    noleap_time2 = cftime.date2num(times2.to_pydatetime(),
                                   units, calendar=calendar)
    mdim_time = np.empty((len(noleap_time1), 2), )
    mdim_time[:, 0] = noleap_time1
    mdim_time[:, 1] = noleap_time2

    expected1 = times1.values
    expected2 = times2.values

    actual = coding.times.decode_cf_datetime(
        mdim_time, units, calendar=calendar,
        enable_cftimeindex=enable_cftimeindex)
    assert actual.dtype == np.dtype('M8[ns]')

    abs_diff1 = abs(actual[:, 0] - expected1)
    abs_diff2 = abs(actual[:, 1] - expected2)
    # once we no longer support versions of netCDF4 older than 1.1.5,
    # we could do this check with near microsecond accuracy:
    # https://github.com/Unidata/netcdf4-python/issues/355
    assert (abs_diff1 <= np.timedelta64(1, 's')).all()
    assert (abs_diff2 <= np.timedelta64(1, 's')).all()
Example #26
def PlotEwmaPredictions(daily, name):
    """Plots EWMA-based predictions for a daily series.

    daily: DataFrame of daily prices with a ppg column
    name: string label for the plotted series
    """

    # use EWMA to estimate slopes
    filled = timeseries.FillMissing(daily)
    filled['slope'] = pandas.ewma(filled.ppg.diff(), span=180)

    # extract the last inter and slope
    start = filled.index[-1]
    inter = filled.ewma[-1]
    slope = filled.slope[-1]

    # reindex the DataFrame, adding a year to the end
    dates = pandas.date_range(filled.index.min(), 
                              filled.index.max() + np.timedelta64(365, 'D'))
    predicted = filled.reindex(dates)

    # generate predicted values and add them to the end
    predicted['date'] = predicted.index
    one_day = np.timedelta64(1, 'D')
    predicted['days'] = (predicted.date - start) / one_day
    predict = inter + slope * predicted.days
    predicted.ewma.fillna(predict, inplace=True)

    # plot the actual values and predictions
    thinkplot.Scatter(daily.ppg, alpha=0.1, label=name)
    thinkplot.Plot(predicted.ewma)
    thinkplot.Save()
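
The prediction step converts datetime differences into float day counts by dividing by a one-day timedelta64, which is what makes the linear form inter + slope * days work. The pattern on its own:

import numpy as np
import pandas as pd

dates = pd.date_range('2013-01-01', periods=4)
start = dates[0]
one_day = np.timedelta64(1, 'D')
print((dates - start) / one_day)  # [0.0, 1.0, 2.0, 3.0]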
Example #27
    def get_holidays(self, start, end, cal="FX"):
        # TODO use Pandas CustomBusinessDays to get more calendars
        holidays_list = []

        if cal == "FX":
            # filter for Christmas & New Year's Day
            for i in range(1970, 2020):
                holidays_list.append(str(i) + "-12-25")
                holidays_list.append(str(i) + "-01-01")

        if cal == "WEEKDAY":
            bday = CustomBusinessDay(weekmask="Sat Sun")

            holidays_list = pandas.date_range(start, end, freq=bday)

        holidays_list = pandas.to_datetime(holidays_list).sort_values()

        # floor start date
        start = np.datetime64(start) - np.timedelta64(1, "D")

        # ceiling end date
        end = np.datetime64(end) + np.timedelta64(1, "D")

        holidays_list = [x for x in holidays_list if x >= start and x <= end]

        return pandas.to_datetime(holidays_list)
Example #28
    def test_timedelta_ops_scalar(self):
        # GH 6808
        base = pd.to_datetime('20130101 09:01:12.123456')
        expected_add = pd.to_datetime('20130101 09:01:22.123456')
        expected_sub = pd.to_datetime('20130101 09:01:02.123456')

        for offset in [pd.to_timedelta(10, unit='s'), timedelta(seconds=10),
                       np.timedelta64(10, 's'),
                       np.timedelta64(10000000000, 'ns'),
                       pd.offsets.Second(10)]:
            result = base + offset
            assert result == expected_add

            result = base - offset
            assert result == expected_sub

        base = pd.to_datetime('20130102 09:01:12.123456')
        expected_add = pd.to_datetime('20130103 09:01:22.123456')
        expected_sub = pd.to_datetime('20130101 09:01:02.123456')

        for offset in [pd.to_timedelta('1 day, 00:00:10'),
                       pd.to_timedelta('1 days, 00:00:10'),
                       timedelta(days=1, seconds=10),
                       np.timedelta64(1, 'D') + np.timedelta64(10, 's'),
                       pd.offsets.Day() + pd.offsets.Second(10)]:
            result = base + offset
            assert result == expected_add

            result = base - offset
            assert result == expected_sub
Example #29
    def test_timedelta(self, freq):
        index = date_range('1/1/2000', periods=50, freq=freq)

        shifted = index + timedelta(1)
        back = shifted + timedelta(-1)
        tm.assert_index_equal(index, back)

        if freq == 'D':
            expected = pd.tseries.offsets.Day(1)
            assert index.freq == expected
            assert shifted.freq == expected
            assert back.freq == expected
        else:  # freq == 'B'
            assert index.freq == pd.tseries.offsets.BusinessDay(1)
            assert shifted.freq is None
            assert back.freq == pd.tseries.offsets.BusinessDay(1)

        result = index - timedelta(1)
        expected = index + timedelta(-1)
        tm.assert_index_equal(result, expected)

        # GH4134, buggy with timedeltas
        rng = date_range('2013', '2014')
        s = Series(rng)
        result1 = rng - pd.offsets.Hour(1)
        result2 = DatetimeIndex(s - np.timedelta64(100000000))
        result3 = rng - np.timedelta64(100000000)
        result4 = DatetimeIndex(s - pd.offsets.Hour(1))
        tm.assert_index_equal(result1, result4)
        tm.assert_index_equal(result2, result3)
Example #30
    def test_is_datetimelike_array_all_nan_nat_like(self):
        arr = np.array([np.nan, pd.NaT, np.datetime64('nat')])
        assert lib.is_datetime_array(arr)
        assert lib.is_datetime64_array(arr)
        assert not lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')])
        assert not lib.is_datetime_array(arr)
        assert not lib.is_datetime64_array(arr)
        assert lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array([np.nan, pd.NaT, np.datetime64('nat'),
                        np.timedelta64('nat')])
        assert not lib.is_datetime_array(arr)
        assert not lib.is_datetime64_array(arr)
        assert not lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array([np.nan, pd.NaT])
        assert lib.is_datetime_array(arr)
        assert lib.is_datetime64_array(arr)
        assert lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array([np.nan, np.nan], dtype=object)
        assert not lib.is_datetime_array(arr)
        assert not lib.is_datetime64_array(arr)
        assert not lib.is_timedelta_or_timedelta64_array(arr)

        assert lib.is_datetime_with_singletz_array(
            np.array([pd.Timestamp('20130101', tz='US/Eastern'),
                      pd.Timestamp('20130102', tz='US/Eastern')],
                     dtype=object))
        assert not lib.is_datetime_with_singletz_array(
            np.array([pd.Timestamp('20130101', tz='US/Eastern'),
                      pd.Timestamp('20130102', tz='CET')],
                     dtype=object))
Example #31
    def BCEuropeanCallDirichlet(self, V, t, op):
        V[0] = 0
        V[-1] = self.S[-1] - op.getStrike()*np.exp(-self.getR()*t/np.timedelta64(365, 'D'))

        return V
Example #32
def test_convert_timedelta_type_non_pandas_types() -> None:
    assert bus.convert_timedelta_type(
        datetime.timedelta(3000)) == 259200000000.0
    assert bus.convert_timedelta_type(np.timedelta64(3000, 'ms')) == 3000.
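
convert_timedelta_type evidently normalizes to float milliseconds; the same two conversions can be reproduced with plain numpy arithmetic by dividing by a one-millisecond timedelta64:

import datetime
import numpy as np

ms = np.timedelta64(1, 'ms')
print(np.timedelta64(3000, 'ms') / ms)                # 3000.0
print(np.timedelta64(datetime.timedelta(3000)) / ms)  # 259200000000.0 (3000 days)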
Example #33
import numpy as np
import pandas as pd
from datetime import datetime
import pytest
import empyrical

from vectorbt import defaults
from vectorbt.records.drawdowns import Drawdowns

from tests.utils import isclose

day_dt = np.timedelta64(86400000000000)

index = pd.DatetimeIndex([
    datetime(2018, 1, 1),
    datetime(2018, 1, 2),
    datetime(2018, 1, 3),
    datetime(2018, 1, 4),
    datetime(2018, 1, 5)
])
ts = pd.DataFrame(
    {
        'a': [1, 2, 3, 4, 5],
        'b': [5, 4, 3, 2, 1],
        'c': [1, 2, 3, 2, 1]
    },
    index=index)
ret = ts.pct_change()

defaults.returns['year_freq'] = '252 days'  # same as empyrical
Example #34
def summary_data_from_transaction_data(transactions, customer_id_col, datetime_col, monetary_value_col=None, datetime_format=None,
                                       observation_period_end=None, freq='D', freq_multiplier=1):
    """
    Return summary data from transactions.

    This transforms a DataFrame of transaction data of the form:
        customer_id, datetime [, monetary_value]
    to a DataFrame of the form:
        customer_id, frequency, recency, T [, monetary_value]

    Parameters
    ----------
    transactions: :obj: DataFrame
        a Pandas DataFrame that contains the customer_id col and the datetime col.
    customer_id_col: string
        the column in transactions DataFrame that denotes the customer_id
    datetime_col:  string
        the column in transactions that denotes the datetime the purchase was made.
    monetary_value_col: string, optional
        the columns in the transactions that denotes the monetary value of the transaction.
        Optional, only needed for customer lifetime value estimation models.
    observation_period_end: datetime, optional
         a string or datetime to denote the final date of the study.
         Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
    datetime_format: string, optional
        a string that represents the timestamp format. Useful if Pandas can't understand
        the provided format.
    freq: string, optional
        Default 'D' for days, 'W' for weeks, 'M' for months... etc. Full list here:
        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects
    freq_multiplier: int, optional
        Default 1. Can be used to get exact recency and T: e.g. with freq='W'
        the row for user id_sample=1 will have recency=30 and T=39, while the
        values in the CDNOW summary differ. Exact values can be obtained with
        freq='D' and freq_multiplier=7, which leads to recency=30.43
        and T=38.86

    Returns
    -------
    :obj: DataFrame:
        customer_id, frequency, recency, T [, monetary_value]

    """
    if observation_period_end is None:
        observation_period_end = pd.to_datetime(transactions[datetime_col].max(), format=datetime_format).to_period(freq).to_timestamp()
    else:
        observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format).to_period(freq).to_timestamp()

    # label all of the repeated transactions
    repeated_transactions = _find_first_transactions(
        transactions,
        customer_id_col,
        datetime_col,
        monetary_value_col,
        datetime_format,
        observation_period_end,
        freq
    )
    # reset datetime_col to timestamp
    repeated_transactions[datetime_col] = pd.Index(repeated_transactions[datetime_col]).to_timestamp()

    # count all orders by customer.
    customers = repeated_transactions.groupby(customer_id_col, sort=False)[datetime_col].agg(['min', 'max', 'count'])

    # subtract 1 from count, as we ignore their first order.
    customers['frequency'] = customers['count'] - 1

    customers['T'] = (observation_period_end - customers['min']) / np.timedelta64(1, freq) / freq_multiplier
    customers['recency'] = (customers['max'] - customers['min']) / np.timedelta64(1, freq) / freq_multiplier

    summary_columns = ['frequency', 'recency', 'T']

    if monetary_value_col:
        # create an index of all the first purchases
        first_purchases = repeated_transactions[repeated_transactions['first']].index
        # by setting the monetary_value cells of all the first purchases to NaN,
        # those values will be excluded from the mean value calculation
        repeated_transactions.loc[first_purchases, monetary_value_col] = np.nan
        customers['monetary_value'] = repeated_transactions.groupby(customer_id_col)[monetary_value_col].mean().fillna(0)
        summary_columns.append('monetary_value')

    return customers[summary_columns].astype(float)
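
The recency/T arithmetic above is plain timedelta64 division. A toy demonstration for a single hypothetical customer under freq='D' and freq_multiplier=1:

import numpy as np
import pandas as pd

dates = pd.to_datetime(['2021-01-01', '2021-01-11', '2021-01-21'])
observation_period_end = pd.Timestamp('2021-01-31')
frequency = len(dates) - 1                                           # 2 repeat purchases
recency = (dates.max() - dates.min()) / np.timedelta64(1, 'D')       # 20.0
T = (observation_period_end - dates.min()) / np.timedelta64(1, 'D')  # 30.0
print(frequency, recency, T)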
Example #35
# from bokeh.models.ranges import FactorRange
import numpy as np
from bokeh.plotting import output_file
from datetime import date, datetime, timedelta


def nptodt(dt):
    return datetime.strftime(
        datetime.strptime(str(dt), '%Y-%m-%d').date(), '%b-%d')


output_file('bokehtrial.html')
N = 50
stdate = np.datetime64('2020-01-30')
enddate = np.datetime64(
    (datetime.utcnow() + timedelta(hours=5, minutes=30)).date())
N = int((enddate - stdate) / np.timedelta64(1, 'D'))
x = [str(x) for x in range(1, N + 1)]
dates = [nptodt(x) for x in np.arange(stdate, stdate + N)]
stdate = date(2020, 1, 30)
enddate = (datetime.utcnow() + timedelta(hours=5, minutes=30) -
           timedelta(days=1)).date()

# Confirmed
newconf = np.array([np.random.randint(N) for i in range(N)])
cumconf = np.zeros(N)
cumconf[0] = newconf[0]
for i in range(1, N):
    cumconf[i] = newconf[i] + cumconf[i - 1]
oldconf = list(cumconf - newconf)
cumconf = list(cumconf)
newconf = list(newconf)
Example #36
def moving_average(time_series, window_len):
    """Calculates the moving average of an unevenly spaced time series.

  This moving average implementation weights each value by the time it remained
  unchanged, which conceptually matches smart recording on GPS devices: a sample
  is taken when some value changes sufficiently, so before a new sample is taken
  the previous one is assumed to be more or less constant.

  The term "area" below means a sum of time-weighted values.

  This implementation follows the SMA_last algorithm proposed
  in (Eckner, 2017) (see README for citation).

  Args:
    time_series: A pandas.Series of the values to average,
                 indexed with timestamps.
    window_len: The size of the moving average window, in seconds.

  Returns:
    A numpy array of length len(time_series) containing the
    moving average values
  """
    # Re-index the time series with duration in seconds from the first value
    time_series.index = ((time_series.index - time_series.index[0]) /
                         np.timedelta64(1, 's')).astype('int')

    window_area = time_series.iloc[0] * window_len

    # It may not always be possible to construct a window of length exactly equal
    # to window_len using timestamps present in the data. To handle this, the left
    # side of the window is allowed to fall between timestamps (the right side is
    # always fixed to a timestamp in the data). Therefore we need to separately
    # compute the area of the inter-timestamp region on the left side of the
    # window so that it can be added to the window area. left_area is that value.
    left_area = window_area

    out = np.zeros(len(time_series))
    out[0] = time_series.iloc[0]

    # i is the left side of the window and j is the right
    i = 0
    for j in range(1, len(time_series)):
        # Remove the last iteration's left_area as a new right window bound may
        # change the left_area required in this iteration
        window_area -= left_area

        # Expand window to the right
        window_area += time_series.iloc[j - 1] * (time_series.index[j] -
                                                  time_series.index[j - 1])

        # Shrink window from the left if expanding to the right has created too
        # large a window. new_left_time may fall between timestamps present in the
        # data, which is fine, since that's handled by left_area.
        new_left_time = time_series.index[j] - window_len
        while time_series.index[i] < new_left_time:
            window_area -= time_series.iloc[i] * (time_series.index[i + 1] -
                                                  time_series.index[i])
            i += 1

        # Add left side inter-timestamp area to window
        left_area = time_series.iloc[max(
            0, i - 1)] * (time_series.index[i] - new_left_time)
        window_area += left_area

        out[j] = window_area / window_len

    return out
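
A small usage sketch for moving_average with hypothetical unevenly spaced samples (note that the function re-indexes its input in place, converting the timestamps to integer seconds):

import numpy as np
import pandas as pd

idx = pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 00:00:05',
                      '2020-01-01 00:00:20', '2020-01-01 00:00:30'])
series = pd.Series([1.0, 3.0, 2.0, 4.0], index=idx)
print(moving_average(series, window_len=10))  # [1.0, 1.0, 3.0, 2.0]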
Example #37
    def BCEuropeanPutDirichlet(self, V, t, op):
        V[0] = op.getStrike()*np.exp(-self.getR()*t/np.timedelta64(365, 'D'))
        V[-1] = 0

        return V
Example #38
    def getMatchingEvents(self, solve=True):
        """Return a list of dictionaries matching input parameters.

        Args:
            solve (bool):
                If set to True, then this method
                should return a list with a maximum of one event.

        Returns:
            list: List of event dictionaries, with fields:
                  - time Event time (UTC)
                  - lat Event latitude
                  - lon Event longitude
                  - depth Event depth
                  - mag Event magnitude
        """
        jpyear = str(self.jptime.year)
        jpquarter = str(QUARTERS[self.jptime.month])
        if len(jpquarter) == 1:
            jpquarter = '0' + jpquarter
        url = SEARCH_URL.replace('YEAR', jpyear)
        url = url.replace('QUARTER', jpquarter)
        req = requests.get(url)
        data = req.text
        soup = BeautifulSoup(data, features="lxml")
        select = soup.find('select')
        options = select.find_all('option')
        times = []
        lats = []
        lons = []
        depths = []
        mags = []
        values = []
        for option in options:
            if 'Data not found' in option.text:
                break
            eventstr = option.contents[0]
            timestr = re.search(TIMEPAT, eventstr).group()
            latstr = re.search(LATPAT, eventstr).group()
            lonstr = re.search(LONPAT, eventstr).group()
            depstr = re.search(DEPPAT, eventstr).group()
            magstr = re.search(MAGPAT, eventstr).group()
            lat = float(latstr.replace('N', ''))
            lon = float(lonstr.replace('E', ''))
            depth = float(depstr.replace('km', ''))
            mag = float(magstr.replace('M', ''))
            etime = datetime.strptime(timestr, TIMEFMT)
            times.append(np.datetime64(etime))
            lats.append(lat)
            lons.append(lon)
            depths.append(depth)
            mags.append(mag)
            values.append(option.get('value'))

        events = []
        if not len(times):
            return events

        times = np.array(times)
        lats = np.array(lats)
        lons = np.array(lons)
        depths = np.array(depths)
        mags = np.array(mags)
        values = np.array(values)
        distances = geodetic_distance(self.lon, self.lat, lons, lats)
        didx = distances <= self.radius
        jptime = np.datetime64(self.jptime)
        # dtimes is an array of timedelta64 differences
        dtimes = np.abs(jptime - times)
        tidx = dtimes <= np.timedelta64(int(self.dt), 's')
        etimes = times[didx & tidx]
        elats = lats[didx & tidx]
        elons = lons[didx & tidx]
        edepths = depths[didx & tidx]
        emags = mags[didx & tidx]
        evalues = values[didx & tidx]

        for etime, elat, elon, edep, emag, evalue in zip(
                etimes, elats, elons, edepths, emags, evalues):
            jtime = UTCDateTime(str(etime))
            utime = jtime - JST_OFFSET
            edict = {
                'time': utime,
                'lat': elat,
                'lon': elon,
                'depth': edep,
                'mag': emag,
                'cgi_value': evalue
            }
            events.append(edict)

        if solve and len(events) > 1:
            event = self.solveEvents(events)
            events = [event]

        return events
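
The event-time filter above compares absolute datetime64 differences against a threshold in seconds. In isolation, with a hypothetical 180 s search window:

import numpy as np

times = np.array(['2019-01-01T00:00:00', '2019-01-01T00:02:00',
                  '2019-01-01T01:00:00'], dtype='datetime64[s]')
target = np.datetime64('2019-01-01T00:01:00')
dtimes = np.abs(target - times)
print(dtimes <= np.timedelta64(180, 's'))  # [ True  True False]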
Example #39
    def test_infer_dtype_datetime(self):

        arr = np.array([Timestamp('2011-01-01'), Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        arr = np.array(
            [np.datetime64('2011-01-01'),
             np.datetime64('2011-01-01')],
            dtype=object)
        assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

        arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)])
        assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        # starts with nan
        for n in [pd.NaT, np.nan]:
            arr = np.array([n, pd.Timestamp('2011-01-02')])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

            arr = np.array([n, np.datetime64('2011-01-02')])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

            arr = np.array([n, datetime(2011, 1, 1)])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

            arr = np.array([n, pd.Timestamp('2011-01-02'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

            arr = np.array([n, np.datetime64('2011-01-02'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

            arr = np.array([n, datetime(2011, 1, 1), n])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        # different type of nat
        arr = np.array([np.timedelta64('nat'),
                        np.datetime64('2011-01-02')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([np.datetime64('2011-01-02'),
                        np.timedelta64('nat')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        # mixed datetime
        arr = np.array([datetime(2011, 1, 1), pd.Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        # should be datetime?
        arr = np.array(
            [np.datetime64('2011-01-01'),
             pd.Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

        arr = np.array(
            [pd.Timestamp('2011-01-02'),
             np.datetime64('2011-01-01')])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

        arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer'

        arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

        arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'
Example #40
t = set(activities_6)
train_data['same_time_activ_6'] = train_data.people_id.apply(lambda x: set([x]).intersection(t) == set([x]))
t = set(activities_8)
train_data['same_time_activ_8'] = train_data.people_id.apply(lambda x: set([x]).intersection(t) == set([x]))
t = set(activities_10)
train_data['same_time_activ_10'] = train_data.people_id.apply(lambda x: set([x]).intersection(t) == set([x]))
# number of selected activities per person
train_data['occur']=train_data.people_id
train_data.occur=train_data.people_id.apply(dict(train_data.people_id.value_counts()).get)
#mean of the time interval between activities 
for pep, df in train_data.groupby('people_id')['date_x']:
    df = pd.DataFrame(df)
    df.sort_values(by='date_x', ascending=False, inplace=True)
    l = sorted(set(df.date_x.values), reverse=True)
    if len(l) > 1:
        mean_time = (sum([l[i] - l[i+1] for i in range(0, len(l) - 1, 1)]) / np.timedelta64(1, 'D')) / (len(df.date_x.values) - 1)
        people.loc[people.people_id == pep, 'mean_time'] = mean_time
    else:
        people.loc[people.people_id == pep, 'mean_time'] = 0

train_data=pd.merge(train_data,people.loc[:,['people_id','mean_time']],on='people_id')

#percentage of groups that are in the test and not in the train 
test_train.loc[test_train.group_1.isin(groups)==False,'group_1'].shape[0]/test_train.shape[0]
#the first and the last activity selected
first_activitie= train_data.loc[:,['people_id','date_x','activity_category']].sort_values(by=['people_id','date_x']).drop_duplicates(['people_id'] ,keep='first')
first_activitie.rename(columns = {'activity_category':'first activity'} , inplace = True)
first_activitie.drop('date_x',axis=1,inplace=True)
last_activity = train_data.loc[:,['people_id','date_x','activity_category']].sort_values(by=['people_id','date_x']).drop_duplicates(['people_id'],keep='last')
last_activity.rename(columns = {'activity_category':'last_activity'} , inplace=True)
last_activity.drop('date_x',axis=1,inplace=True)
Example #41
    def gantt(data, monthly=True):
        """
        Make a Gantt plot, which shows the temporal data availability for each station.

        Parameters
        ----------
        data : pandas DataFrame
            A Pandas daily DataFrame with DatetimeIndex where each column corresponds to a station.
        monthly : boolean, default True
            Defines if the availability count of the data will be monthly to obtain a more fluid graph.

        Returns
        -------
        fig : plotly Figure
        """

        date_index = pd.date_range(data.index[0], data.index[-1], freq='D')
        data = data.reindex(date_index)
        periods = []
        for column in data.columns:
            series = data[column]
            if monthly:
                missing = series.isnull().groupby(
                    pd.Grouper(freq='1MS')).sum().to_frame()
                # a month with 7 or more missing days is treated as a missing month
                series_drop = missing.loc[missing[column] < 7]
                DELTA = 'M'
            else:
                series_drop = series.dropna()
                DELTA = 'D'
            if series_drop.shape[0] > 1:
                task = column
                resource = 'Available data'
                start = str(series_drop.index[0].year) + '-' + str(
                    series_drop.index[0].month) + '-' + str(
                        series_drop.index[0].day)
                finish = 0
                for i in range(len(series_drop)):
                    if i != 0 and round(
                        (series_drop.index[i] - series_drop.index[i - 1]) /
                            np.timedelta64(1, DELTA), 0) != 1:
                        finish = str(
                            series_drop.index[i - 1].year) + '-' + str(
                                series_drop.index[i - 1].month) + '-' + str(
                                    series_drop.index[i - 1].day)
                        periods.append(
                            dict(Task=task,
                                 Start=start,
                                 Finish=finish,
                                 Resource=resource))
                        start = str(series_drop.index[i].year) + '-' + str(
                            series_drop.index[i].month) + '-' + str(
                                series_drop.index[i].day)
                        finish = 0
                finish = str(series_drop.index[-1].year) + '-' + str(
                    series_drop.index[-1].month) + '-' + str(
                        series_drop.index[-1].day)
                periods.append(
                    dict(Task=task,
                         Start=start,
                         Finish=finish,
                         Resource=resource))
            else:
                print('Station {} has no months with significant data'.format(
                    column))
        periods = pd.DataFrame(periods)
        start_year = periods['Start'].apply(lambda x: int(x[:4])).min()
        finish_year = periods['Finish'].apply(lambda x: int(x[:4])).max()
        colors = {'Available data': 'rgb(0,191,255)'}
        fig = ff.create_gantt(periods,
                              colors=colors,
                              index_col='Resource',
                              show_colorbar=True,
                              showgrid_x=True,
                              showgrid_y=True,
                              group_tasks=True)

        fig.layout.xaxis.tickvals = pd.date_range('1/1/' + str(start_year),
                                                  '12/31/' +
                                                  str(finish_year + 1),
                                                  freq='2AS')
        fig.layout.xaxis.ticktext = pd.date_range('1/1/' + str(start_year),
                                                  '12/31/' +
                                                  str(finish_year + 1),
                                                  freq='2AS').year
        return fig
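
A hypothetical usage sketch for the function above (names and data are illustrative; the plot itself requires plotly's figure_factory imported as ff, as the function assumes):

import numpy as np
import pandas as pd

idx = pd.date_range('2000-01-01', '2003-12-31', freq='D')
demo = pd.DataFrame({'station_A': 1.0, 'station_B': 1.0}, index=idx)
demo.loc['2001-03-01':'2001-12-15', 'station_B'] = np.nan  # simulated outage
# fig = gantt(demo, monthly=True)  # the outage shows up as a split bar
# fig.show()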
Ejemplo n.º 42
0
    def test_infer_dtype_all_nan_nat_like(self):
        arr = np.array([np.nan, np.nan])
        assert lib.infer_dtype(arr, skipna=True) == 'floating'

        # a mix of nan and None results in 'mixed'
        arr = np.array([np.nan, np.nan, None])
        assert lib.infer_dtype(arr, skipna=True) == 'empty'
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([None, np.nan, np.nan])
        assert lib.infer_dtype(arr, skipna=True) == 'empty'
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        # pd.NaT
        arr = np.array([pd.NaT])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([pd.NaT, np.nan])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([np.nan, pd.NaT])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([np.nan, pd.NaT, np.nan])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([None, pd.NaT, None])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        # np.datetime64(nat)
        arr = np.array([np.datetime64('nat')])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

        for n in [np.nan, pd.NaT, None]:
            arr = np.array([n, np.datetime64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

            arr = np.array([pd.NaT, n, np.datetime64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

        arr = np.array([np.timedelta64('nat')], dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

        for n in [np.nan, pd.NaT, None]:
            arr = np.array([n, np.timedelta64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

            arr = np.array([pd.NaT, n, np.timedelta64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

        # datetime / timedelta mixed
        arr = np.array(
            [pd.NaT,
             np.datetime64('nat'),
             np.timedelta64('nat'), np.nan])
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([np.timedelta64('nat'),
                        np.datetime64('nat')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
Ejemplo n.º 43
0
m8_units = ["as", "ps", "ns", "us", "ms", "s", "m", "h", "D", "W", "M", "Y"]

na_vals = (
    [
        None,
        NaT,
        float("NaN"),
        complex("NaN"),
        np.nan,
        np.float64("NaN"),
        np.float32("NaN"),
        np.complex64(np.nan),
        np.complex128(np.nan),
        np.datetime64("NaT"),
        np.timedelta64("NaT"),
    ]
    + [np.datetime64("NaT", unit) for unit in m8_units]
    + [np.timedelta64("NaT", unit) for unit in m8_units]
)
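
All of the per-unit NaT constructors above register as missing values; a quick numpy-only check (np.isnat is available in numpy >= 1.13):

import numpy as np

print(np.isnat(np.datetime64('NaT', 'ms')),
      np.isnat(np.timedelta64('NaT', 'W')))  # True True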

inf_vals = [
    float("inf"),
    float("-inf"),
    complex("inf"),
    complex("-inf"),
    np.inf,
    np.NINF,
]

int_na_vals = [
Ejemplo n.º 44
0
class TestTypeInference(object):

    # Dummy class used for testing with Python objects
    class Dummy():
        pass

    def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype):
        # see pandas/conftest.py
        inferred_dtype, values = any_skipna_inferred_dtype

        # make sure the inferred dtype of the fixture is as requested
        assert inferred_dtype == lib.infer_dtype(values, skipna=True)

    @pytest.mark.parametrize('skipna', [True, False])
    def test_length_zero(self, skipna):
        result = lib.infer_dtype(np.array([], dtype='i4'), skipna=skipna)
        assert result == 'integer'

        result = lib.infer_dtype([], skipna=skipna)
        assert result == 'empty'

        # GH 18004
        arr = np.array(
            [np.array([], dtype=object),
             np.array([], dtype=object)])
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'empty'

    def test_integers(self):
        arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'integer'

        arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'mixed-integer'

        arr = np.array([1, 2, 3, 4, 5], dtype='i4')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'integer'

    def test_deprecation(self):
        # GH 24050
        arr = np.array([1, 2, 3], dtype=object)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = lib.infer_dtype(arr)  # default: skipna=None -> warn
            assert result == 'integer'

    def test_bools(self):
        arr = np.array([True, False, True, True, True], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'boolean'

        arr = np.array([np.bool_(True), np.bool_(False)], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'boolean'

        arr = np.array([True, False, True, 'foo'], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'mixed'

        arr = np.array([True, False, True], dtype=bool)
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'boolean'

        arr = np.array([True, np.nan, False], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'boolean'

        result = lib.infer_dtype(arr, skipna=False)
        assert result == 'mixed'

    def test_floats(self):
        arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'floating'

        arr = np.array([1, 2, 3, np.float64(4),
                        np.float32(5), 'foo'],
                       dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'mixed-integer'

        arr = np.array([1, 2, 3, 4, 5], dtype='f4')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'floating'

        arr = np.array([1, 2, 3, 4, 5], dtype='f8')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'floating'

    def test_decimals(self):
        # GH15690
        arr = np.array([Decimal(1), Decimal(2), Decimal(3)])
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'decimal'

        arr = np.array([1.0, 2.0, Decimal(3)])
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'mixed'

        arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)])
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'decimal'

        arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O')
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'decimal'

    # complex is compatible with nan, so skipna has no effect
    @pytest.mark.parametrize('skipna', [True, False])
    def test_complex(self, skipna):
        # gets cast to complex on array construction
        arr = np.array([1.0, 2.0, 1 + 1j])
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'complex'

        arr = np.array([1.0, 2.0, 1 + 1j], dtype='O')
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'mixed'

        # gets cast to complex on array construction
        arr = np.array([1, np.nan, 1 + 1j])
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'complex'

        arr = np.array([1.0, np.nan, 1 + 1j], dtype='O')
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'mixed'

        # complex with nans stays complex
        arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype='O')
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'complex'

        # test smaller complex dtype; will pass through _try_infer_map fastpath
        arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype=np.complex64)
        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == 'complex'

    def test_string(self):
        pass

    def test_unicode(self):
        arr = [u'a', np.nan, u'c']
        result = lib.infer_dtype(arr, skipna=False)
        assert result == 'mixed'

        arr = [u'a', np.nan, u'c']
        result = lib.infer_dtype(arr, skipna=True)
        expected = 'unicode' if PY2 else 'string'
        assert result == expected

    @pytest.mark.parametrize('dtype, missing, skipna, expected',
                             [(float, np.nan, False, 'floating'),
                              (float, np.nan, True, 'floating'),
                              (object, np.nan, False, 'floating'),
                              (object, np.nan, True, 'empty'),
                              (object, None, False, 'mixed'),
                              (object, None, True, 'empty')])
    @pytest.mark.parametrize('box', [pd.Series, np.array])
    def test_object_empty(self, box, missing, dtype, skipna, expected):
        # GH 23421
        arr = box([missing, missing], dtype=dtype)

        result = lib.infer_dtype(arr, skipna=skipna)
        assert result == expected

    def test_datetime(self):

        dates = [datetime(2012, 1, x) for x in range(1, 20)]
        index = Index(dates)
        assert index.inferred_type == 'datetime64'

    def test_infer_dtype_datetime(self):

        arr = np.array([Timestamp('2011-01-01'), Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        arr = np.array(
            [np.datetime64('2011-01-01'),
             np.datetime64('2011-01-01')],
            dtype=object)
        assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

        arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)])
        assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        # starts with nan
        for n in [pd.NaT, np.nan]:
            arr = np.array([n, pd.Timestamp('2011-01-02')])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

            arr = np.array([n, np.datetime64('2011-01-02')])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

            arr = np.array([n, datetime(2011, 1, 1)])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

            arr = np.array([n, pd.Timestamp('2011-01-02'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

            arr = np.array([n, np.datetime64('2011-01-02'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

            arr = np.array([n, datetime(2011, 1, 1), n])
            assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        # different type of nat
        arr = np.array([np.timedelta64('nat'),
                        np.datetime64('2011-01-02')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([np.datetime64('2011-01-02'),
                        np.timedelta64('nat')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        # mixed datetime
        arr = np.array([datetime(2011, 1, 1), pd.Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'datetime'

        # should be datetime?
        arr = np.array(
            [np.datetime64('2011-01-01'),
             pd.Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

        arr = np.array(
            [pd.Timestamp('2011-01-02'),
             np.datetime64('2011-01-01')])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

        arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer'

        arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

        arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')])
        assert lib.infer_dtype(arr, skipna=True) == 'mixed'

    def test_infer_dtype_timedelta(self):

        arr = np.array([pd.Timedelta('1 days'), pd.Timedelta('2 days')])
        assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

        arr = np.array([np.timedelta64(1, 'D'),
                        np.timedelta64(2, 'D')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

        arr = np.array([timedelta(1), timedelta(2)])
        assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

        # starts with nan
        for n in [pd.NaT, np.nan]:
            arr = np.array([n, Timedelta('1 days')])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

            arr = np.array([n, np.timedelta64(1, 'D')])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

            arr = np.array([n, timedelta(1)])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

            arr = np.array([n, pd.Timedelta('1 days'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

            arr = np.array([n, np.timedelta64(1, 'D'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

            arr = np.array([n, timedelta(1), n])
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

        # different type of nat
        arr = np.array([np.datetime64('nat'),
                        np.timedelta64(1, 'D')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([np.timedelta64(1, 'D'),
                        np.datetime64('nat')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

    def test_infer_dtype_period(self):
        # GH 13664
        arr = np.array(
            [pd.Period('2011-01', freq='D'),
             pd.Period('2011-02', freq='D')])
        assert lib.infer_dtype(arr, skipna=True) == 'period'

        arr = np.array(
            [pd.Period('2011-01', freq='D'),
             pd.Period('2011-02', freq='M')])
        assert lib.infer_dtype(arr, skipna=True) == 'period'

        # starts with nan
        for n in [pd.NaT, np.nan]:
            arr = np.array([n, pd.Period('2011-01', freq='D')])
            assert lib.infer_dtype(arr, skipna=True) == 'period'

            arr = np.array([n, pd.Period('2011-01', freq='D'), n])
            assert lib.infer_dtype(arr, skipna=True) == 'period'

        # different type of nat
        arr = np.array([np.datetime64('nat'),
                        pd.Period('2011-01', freq='M')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([pd.Period('2011-01', freq='M'),
                        np.datetime64('nat')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

    @pytest.mark.parametrize(
        "data",
        [[datetime(2017, 6, 12, 19, 30),
          datetime(2017, 3, 11, 1, 15)],
         [Timestamp("20170612"), Timestamp("20170311")],
         [
             Timestamp("20170612", tz='US/Eastern'),
             Timestamp("20170311", tz='US/Eastern')
         ], [date(2017, 6, 12),
             Timestamp("20170311", tz='US/Eastern')],
         [np.datetime64("2017-06-12"),
          np.datetime64("2017-03-11")],
         [np.datetime64("2017-06-12"),
          datetime(2017, 3, 11, 1, 15)]])
    def test_infer_datetimelike_array_datetime(self, data):
        assert lib.infer_datetimelike_array(data) == "datetime"

    @pytest.mark.parametrize("data", [[
        timedelta(2017, 6, 12), timedelta(2017, 3, 11)
    ], [timedelta(2017, 6, 12), date(2017, 3, 11)
        ], [np.timedelta64(2017, "D"),
            np.timedelta64(6, "s")
            ], [np.timedelta64(2017, "D"),
                timedelta(2017, 3, 11)]])
    def test_infer_datetimelike_array_timedelta(self, data):
        assert lib.infer_datetimelike_array(data) == "timedelta"

    def test_infer_datetimelike_array_date(self):
        arr = [date(2017, 6, 12), date(2017, 3, 11)]
        assert lib.infer_datetimelike_array(arr) == "date"

    @pytest.mark.parametrize(
        "data",
        [["2017-06-12", "2017-03-11"], [20170612, 20170311],
         [20170612.5, 20170311.8], [Dummy(), Dummy()],
         [Timestamp("20170612"),
          Timestamp("20170311", tz='US/Eastern')],
         [Timestamp("20170612"), 20170311],
         [timedelta(2017, 6, 12),
          Timestamp("20170311", tz='US/Eastern')]])
    def test_infer_datetimelike_array_mixed(self, data):
        assert lib.infer_datetimelike_array(data) == "mixed"

    @pytest.mark.parametrize(
        "first, expected",
        [[[None], "mixed"], [[np.nan], "mixed"], [[pd.NaT], "nat"],
         [[datetime(2017, 6, 12, 19, 30), pd.NaT], "datetime"],
         [[np.datetime64("2017-06-12"), pd.NaT], "datetime"],
         [[date(2017, 6, 12), pd.NaT], "date"],
         [[timedelta(2017, 6, 12), pd.NaT], "timedelta"],
         [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"]])
    @pytest.mark.parametrize("second", [None, np.nan])
    def test_infer_datetimelike_array_nan_nat_like(self, first, second,
                                                   expected):
        first.append(second)
        assert lib.infer_datetimelike_array(first) == expected

    def test_infer_dtype_all_nan_nat_like(self):
        arr = np.array([np.nan, np.nan])
        assert lib.infer_dtype(arr, skipna=True) == 'floating'

        # a mix of nan and None results in 'mixed'
        arr = np.array([np.nan, np.nan, None])
        assert lib.infer_dtype(arr, skipna=True) == 'empty'
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([None, np.nan, np.nan])
        assert lib.infer_dtype(arr, skipna=True) == 'empty'
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        # pd.NaT
        arr = np.array([pd.NaT])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([pd.NaT, np.nan])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([np.nan, pd.NaT])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([np.nan, pd.NaT, np.nan])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        arr = np.array([None, pd.NaT, None])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime'

        # np.datetime64(nat)
        arr = np.array([np.datetime64('nat')])
        assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

        for n in [np.nan, pd.NaT, None]:
            arr = np.array([n, np.datetime64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

            arr = np.array([pd.NaT, n, np.datetime64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

        arr = np.array([np.timedelta64('nat')], dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

        for n in [np.nan, pd.NaT, None]:
            arr = np.array([n, np.timedelta64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

            arr = np.array([pd.NaT, n, np.timedelta64('nat'), n])
            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

        # datetime / timedelta mixed
        arr = np.array(
            [pd.NaT,
             np.datetime64('nat'),
             np.timedelta64('nat'), np.nan])
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

        arr = np.array([np.timedelta64('nat'),
                        np.datetime64('nat')],
                       dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'

    def test_is_datetimelike_array_all_nan_nat_like(self):
        arr = np.array([np.nan, pd.NaT, np.datetime64('nat')])
        assert lib.is_datetime_array(arr)
        assert lib.is_datetime64_array(arr)
        assert not lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')])
        assert not lib.is_datetime_array(arr)
        assert not lib.is_datetime64_array(arr)
        assert lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array(
            [np.nan, pd.NaT,
             np.datetime64('nat'),
             np.timedelta64('nat')])
        assert not lib.is_datetime_array(arr)
        assert not lib.is_datetime64_array(arr)
        assert not lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array([np.nan, pd.NaT])
        assert lib.is_datetime_array(arr)
        assert lib.is_datetime64_array(arr)
        assert lib.is_timedelta_or_timedelta64_array(arr)

        arr = np.array([np.nan, np.nan], dtype=object)
        assert not lib.is_datetime_array(arr)
        assert not lib.is_datetime64_array(arr)
        assert not lib.is_timedelta_or_timedelta64_array(arr)

        assert lib.is_datetime_with_singletz_array(
            np.array([
                pd.Timestamp('20130101', tz='US/Eastern'),
                pd.Timestamp('20130102', tz='US/Eastern')
            ],
                     dtype=object))
        assert not lib.is_datetime_with_singletz_array(
            np.array([
                pd.Timestamp('20130101', tz='US/Eastern'),
                pd.Timestamp('20130102', tz='CET')
            ],
                     dtype=object))

    @pytest.mark.parametrize("func", [
        'is_datetime_array', 'is_datetime64_array', 'is_bool_array',
        'is_timedelta_or_timedelta64_array', 'is_date_array', 'is_time_array',
        'is_interval_array', 'is_period_array'
    ])
    def test_other_dtypes_for_array(self, func):
        func = getattr(lib, func)
        arr = np.array(['foo', 'bar'])
        assert not func(arr)

        arr = np.array([1, 2])
        assert not func(arr)

    def test_date(self):

        dates = [date(2012, 1, day) for day in range(1, 20)]
        index = Index(dates)
        assert index.inferred_type == 'date'

        dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan]
        result = lib.infer_dtype(dates, skipna=False)
        assert result == 'mixed'

        result = lib.infer_dtype(dates, skipna=True)
        assert result == 'date'

    def test_is_numeric_array(self):

        assert lib.is_float_array(np.array([1, 2.0]))
        assert lib.is_float_array(np.array([1, 2.0, np.nan]))
        assert not lib.is_float_array(np.array([1, 2]))

        assert lib.is_integer_array(np.array([1, 2]))
        assert not lib.is_integer_array(np.array([1, 2.0]))

    def test_is_string_array(self):

        assert lib.is_string_array(np.array(['foo', 'bar']))
        assert not lib.is_string_array(
            np.array(['foo', 'bar', np.nan], dtype=object), skipna=False)
        assert lib.is_string_array(np.array(['foo', 'bar', np.nan],
                                            dtype=object),
                                   skipna=True)
        assert not lib.is_string_array(np.array([1, 2]))

    def test_to_object_array_tuples(self):
        r = (5, 6)
        values = [r]
        result = lib.to_object_array_tuples(values)

        try:
            # make sure record array works
            from collections import namedtuple
            record = namedtuple('record', 'x y')
            r = record(5, 6)
            values = [r]
            result = lib.to_object_array_tuples(values)  # noqa
        except ImportError:
            pass

    def test_object(self):

        # GH 7431
        # cannot infer more than this as only a single element
        arr = np.array([None], dtype='O')
        result = lib.infer_dtype(arr, skipna=False)
        assert result == 'mixed'
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'empty'

    def test_to_object_array_width(self):
        # see gh-13320
        rows = [[1, 2, 3], [4, 5, 6]]

        expected = np.array(rows, dtype=object)
        out = lib.to_object_array(rows)
        tm.assert_numpy_array_equal(out, expected)

        expected = np.array(rows, dtype=object)
        out = lib.to_object_array(rows, min_width=1)
        tm.assert_numpy_array_equal(out, expected)

        expected = np.array([[1, 2, 3, None, None], [4, 5, 6, None, None]],
                            dtype=object)
        out = lib.to_object_array(rows, min_width=5)
        tm.assert_numpy_array_equal(out, expected)

    def test_is_period(self):
        assert lib.is_period(pd.Period('2011-01', freq='M'))
        assert not lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))
        assert not lib.is_period(pd.Timestamp('2011-01'))
        assert not lib.is_period(1)
        assert not lib.is_period(np.nan)

    def test_categorical(self):

        # GH 8974
        from pandas import Categorical, Series
        arr = Categorical(list('abc'))
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'categorical'

        result = lib.infer_dtype(Series(arr), skipna=True)
        assert result == 'categorical'

        arr = Categorical(list('abc'), categories=['cegfab'], ordered=True)
        result = lib.infer_dtype(arr, skipna=True)
        assert result == 'categorical'

        result = lib.infer_dtype(Series(arr), skipna=True)
        assert result == 'categorical'
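
For orientation: `lib` in these tests is pandas' internal pandas._libs.lib module (an assumption about the import path, which may vary across versions). A quick check mirroring the timedelta case above:

import numpy as np
from pandas._libs import lib  # internal API; assumed import path

arr = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D')], dtype=object)
print(lib.infer_dtype(arr, skipna=True))  # 'timedelta'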
Ejemplo n.º 45
0
    def dcr_coin(self):
        """
        Pulls Coinmetrics v2 API Community
            - adds coin age metric (days)
            - adds coin age metric (supply) = Supply / 21M
            - adds Bittrex early price data not included in coinmetrics from csv

            OUTPUT DATAFRAME COLUMNS:
            'date', 'blk','age_days','age_sply','btc_blk_est',
            'DailyIssuedNtv', 'DailyIssuedUSD', 'inf_pct_ann', 'S2F',
            'AdrActCnt', 'BlkCnt', 'BlkSizeByte', 'BlkSizeMeanByte',
            'CapMVRVCur', 'CapMrktCurUSD', 'CapRealUSD', 'DiffMean', 
            'FeeMeanNtv','FeeMeanUSD', 'FeeMedNtv', 'FeeMedUSD', 'FeeTotNtv', 'FeeTotUSD',
            'PriceBTC', 'PriceUSD', 'PriceRealUSD', 'SplyCur',
            'TxCnt', 'TxTfrCnt', 'TxTfrValAdjNtv', 'TxTfrValAdjUSD',
            'TxTfrValMeanNtv', 'TxTfrValMeanUSD', 'TxTfrValMedNtv',
            'TxTfrValMedUSD', 'TxTfrValNtv', 'TxTfrValUSD',
            'notes'
        """
        df = Coinmetrics_api('dcr', "2016-02-08", today).convert_to_pd()
        #Calculate coin age since launch in days
        df['age_days'] = (df[['date']] - df.loc[0, ['date']]) / np.timedelta64(
            1, 'D')
        #Calculate coin age since launch in terms of supply
        df['age_sply'] = df['SplyCur'] / 21e6
        print(
            '...adding PriceUSD and CapMrktCurUSD for $0.49 (founders, 8/9-Feb-2016)'
        )
        print('and Bittrex (10-02-2016 to 16-05-2016)...')
        #Import Early price data -->
        #   founders $0.49 for 8/9 Feb 2016
        #   Bittrex up to 16-May-2016 (saved in relative link csv)
        df_early = pd.read_csv(
            r"dcronchain\resources\data\dcr_pricedata_2016-02-08_2016-05-16.csv"
        )
        df_early['date'] = pd.to_datetime(
            df_early['date'], utc=True)  #Convert to correct datetime format
        df['notes'] = ''  # add a notes column for storing data
        for i in df_early['date']:  #swap in early price data
            #Add Early PriceUSD Data
            df.loc[df.date == i,
                   'PriceUSD'] = float(df_early.loc[df_early.date == i,
                                                    'PriceUSD'])
            #Add Early PriceBTC Data
            df.loc[df.date == i,
                   'PriceBTC'] = float(df_early.loc[df_early.date == i,
                                                    'PriceBTC'])
            #Add Early MarketCap Data
            df.loc[df.date == i,
                   'CapMrktCurUSD'] = (df.loc[df.date == i, 'PriceUSD'] *
                                       df.loc[df.date == i, 'SplyCur'])
            #Add Notes
            df.loc[df.date == i, 'notes'] = df_early.loc[df_early.date == i,
                                                         'notes']
        # Restructure final dataset
        df = df[[
            'date', 'blk', 'age_days', 'age_sply', 'btc_blk_est',
            'DailyIssuedNtv', 'DailyIssuedUSD', 'inf_pct_ann', 'S2F',
            'AdrActCnt', 'BlkCnt', 'BlkSizeByte', 'BlkSizeMeanByte',
            'CapMVRVCur', 'CapMrktCurUSD', 'CapRealUSD', 'DiffMean',
            'FeeMeanNtv', 'FeeMeanUSD', 'FeeMedNtv', 'FeeMedUSD', 'FeeTotNtv',
            'FeeTotUSD', 'PriceBTC', 'PriceUSD', 'PriceRealUSD', 'SplyCur',
            'TxCnt', 'TxTfrCnt', 'TxTfrValAdjNtv', 'TxTfrValAdjUSD',
            'TxTfrValMeanNtv', 'TxTfrValMeanUSD', 'TxTfrValMedNtv',
            'TxTfrValMedUSD', 'TxTfrValNtv', 'TxTfrValUSD', 'notes'
        ]]
        #Reformat datetime
        #df['date'] = df['date'].dt.strftime('%d-%m-%y')
        return df
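
The coin-age step above boils down to dividing a datetime difference by one day; a toy reproduction with hypothetical dates:

import numpy as np
import pandas as pd

df = pd.DataFrame({'date': pd.date_range('2016-02-08', periods=4, freq='D')})
df['age_days'] = (df['date'] - df['date'].iloc[0]) / np.timedelta64(1, 'D')
print(df['age_days'].tolist())  # [0.0, 1.0, 2.0, 3.0]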
Ejemplo n.º 46
0
import datetime as dt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

df_response = pd.read_csv('/content/drive/MyDrive/CRM/Retail_Data_Response.csv')
df_transactions = pd.read_csv('/content/drive/MyDrive/CRM/Retail_Data_Transactions.csv', parse_dates=['trans_date'])

df_response.head()

df_transactions.head()

print(df_transactions['trans_date'].min())
print(df_transactions['trans_date'].max())

campaign_date = dt.datetime(2015, 3, 17)
df_transactions['recent'] = campaign_date - df_transactions['trans_date']
df_transactions['recent'] = df_transactions['recent'] / np.timedelta64(1, 'D')  # timedelta -> float days
df_transactions.head()



## create data set with RFM variables

df_rfm = df_transactions.groupby('customer_id').agg({'recent': 'min',         # Recency
                                                     'customer_id': 'count',  # Frequency
                                                     'tran_amount': 'sum'})   # Monetary Value


df_rfm.rename(columns={'recent': 'recency', 
                       'customer_id': 'frequency', 
                       'tran_amount': 'monetary_value'}, inplace=True)
#df_rfm['ticket_size'] = df_rfm['monetary_value'] / df_rfm['frequency']
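
A side note on the recency conversion above: dividing by np.timedelta64(1, 'D') keeps fractional days, while the .dt.days accessor truncates. A small illustration:

import numpy as np
import pandas as pd

s = pd.Series(pd.to_timedelta(['1 days 12:00:00', '3 days']))
print((s / np.timedelta64(1, 'D')).tolist())  # [1.5, 3.0]  fractional days
print(s.dt.days.tolist())                     # [1, 3]      whole days only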
Ejemplo n.º 47
0
    def test_short_format_converters(self):
        def conv(v):
            return v.astype('m8[ns]')

        self.assertEqual(ct('10'), np.timedelta64(10, 'ns'))
        self.assertEqual(ct('10ns'), np.timedelta64(10, 'ns'))
        self.assertEqual(ct('100'), np.timedelta64(100, 'ns'))
        self.assertEqual(ct('100ns'), np.timedelta64(100, 'ns'))

        self.assertEqual(ct('1000'), np.timedelta64(1000, 'ns'))
        self.assertEqual(ct('1000ns'), np.timedelta64(1000, 'ns'))
        self.assertEqual(ct('1000NS'), np.timedelta64(1000, 'ns'))

        self.assertEqual(ct('10us'), np.timedelta64(10000, 'ns'))
        self.assertEqual(ct('100us'), np.timedelta64(100000, 'ns'))
        self.assertEqual(ct('1000us'), np.timedelta64(1000000, 'ns'))
        self.assertEqual(ct('1000Us'), np.timedelta64(1000000, 'ns'))
        self.assertEqual(ct('1000uS'), np.timedelta64(1000000, 'ns'))

        self.assertEqual(ct('1ms'), np.timedelta64(1000000, 'ns'))
        self.assertEqual(ct('10ms'), np.timedelta64(10000000, 'ns'))
        self.assertEqual(ct('100ms'), np.timedelta64(100000000, 'ns'))
        self.assertEqual(ct('1000ms'), np.timedelta64(1000000000, 'ns'))

        self.assertEqual(ct('-1s'), -np.timedelta64(1000000000, 'ns'))
        self.assertEqual(ct('1s'), np.timedelta64(1000000000, 'ns'))
        self.assertEqual(ct('10s'), np.timedelta64(10000000000, 'ns'))
        self.assertEqual(ct('100s'), np.timedelta64(100000000000, 'ns'))
        self.assertEqual(ct('1000s'), np.timedelta64(1000000000000, 'ns'))

        self.assertEqual(ct('1d'), conv(np.timedelta64(1, 'D')))
        self.assertEqual(ct('-1d'), -conv(np.timedelta64(1, 'D')))
        self.assertEqual(ct('1D'), conv(np.timedelta64(1, 'D')))
        self.assertEqual(ct('10D'), conv(np.timedelta64(10, 'D')))
        self.assertEqual(ct('100D'), conv(np.timedelta64(100, 'D')))
        self.assertEqual(ct('1000D'), conv(np.timedelta64(1000, 'D')))
        self.assertEqual(ct('10000D'), conv(np.timedelta64(10000, 'D')))

        # space
        self.assertEqual(ct(' 10000D '), conv(np.timedelta64(10000, 'D')))
        self.assertEqual(ct(' - 10000D '), -conv(np.timedelta64(10000, 'D')))

        # invalid
        self.assertRaises(ValueError, ct, '1foo')
        self.assertRaises(ValueError, ct, 'foo')
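
The ct helper is defined elsewhere in the source file; for comparison, a hedged sanity check that the public pd.to_timedelta parses the same short formats:

import numpy as np
import pandas as pd

assert pd.to_timedelta('10ns') == pd.Timedelta(np.timedelta64(10, 'ns'))
assert pd.to_timedelta('1d') == pd.Timedelta(np.timedelta64(1, 'D'))
assert pd.to_timedelta('-1s') == -pd.Timedelta(np.timedelta64(1, 's'))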
Ejemplo n.º 48
0
    with pytest.raises(TypeError, match=r"Expected integer or floating point"):
        da_time.interpolate_na("t", max_gap="1H", use_coordinate=False)

    with pytest.raises(ValueError, match=r"Could not convert 'huh' to timedelta64"):
        da_time.interpolate_na("t", max_gap="huh")


@requires_bottleneck
@pytest.mark.parametrize(
    "time_range_func",
    [pd.date_range, pytest.param(xr.cftime_range, marks=requires_cftime)],
)
@pytest.mark.parametrize("transform", [lambda x: x, lambda x: x.to_dataset(name="a")])
@pytest.mark.parametrize(
    "max_gap", ["3H", np.timedelta64(3, "h"), pd.to_timedelta("3H")]
)
def test_interpolate_na_max_gap_time_specifier(
    da_time, max_gap, transform, time_range_func
):
    da_time["t"] = time_range_func("2001-01-01", freq="H", periods=11)
    expected = transform(
        da_time.copy(data=[np.nan, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan, 10])
    )
    actual = transform(da_time).interpolate_na("t", max_gap=max_gap)
    assert_allclose(actual, expected)


@requires_bottleneck
@pytest.mark.parametrize(
    "coords",
Ejemplo n.º 49
0
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
'''data exploring'''

inputfile = "F:\spyder\datamining\chapter10\demo\data\water_heater.xls"
data = pd.read_excel(inputfile)

data[u'发生时间'] = pd.to_datetime(data[u'发生时间'], format='%Y%m%d%H%M%S')  # parse the occurrence-time column
data = data[data[u'水流量'] > 0]  # keep records with positive water flow
'''The time intervals computed by pandas have type np.timedelta64, not the
 standard-library timedelta, so there is no total_minutes(); divide by one
 np.timedelta64 minute to get the gap in minutes.'''
data[u'用水停顿时间间隔'] = data[u'发生时间'].diff() / np.timedelta64(1, 'm')  # pause between uses, in minutes
data = data.fillna(0)
'''step1: check maximum, minimum of each column'''
data_explore = data.describe().T
print(data_explore)
data_explore['null'] = len(data) - data_explore['count']  # number of nulls
explore = data_explore[['min', 'max', 'null']]
explore.columns = [u'最小值', u'最大值', u'空值数']  # min, max, null count
print(explore)
'''step2: Discretization and surface division'''
Ti = list(data[u'用水停顿时间间隔'])  # flatten the values to be binned into a 1-D list
timegaplist = [
    0.0, 0.1, 0.2, 0.3, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2100
]  # bin edges
cats = pd.cut(Ti, timegaplist, right=False)  # left-closed bins like [0, 0.1); by default the right endpoint would be included instead
x = pd.value_counts(cats)
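
A minimal illustration of the minute-gap computation described in the comment above, with made-up timestamps:

import numpy as np
import pandas as pd

times = pd.Series(pd.to_datetime(['2014-10-19 07:00:00',
                                  '2014-10-19 07:00:30',
                                  '2014-10-19 07:05:30']))
print((times.diff() / np.timedelta64(1, 'm')).tolist())  # [nan, 0.5, 5.0]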
Ejemplo n.º 50
0
# first day a complete immunization occurred
print('The first complete immunization in Brazil occurred on:', data_base)

# building the column with days of vaccination
df_inicial = df_inicial.assign(dias=pd.to_datetime(df_inicial['date']) -
                               data_base)
df_inicial = df_inicial.assign(
    vacinados=pd.to_numeric(df_inicial['people_fully_vaccinated']))

# converting data types
df_inicial['dias'] = df_inicial['dias'] / np.timedelta64(1, 'D')
df_inicial['dias'] = df_inicial['dias'].astype(int)
df_inicial['vacinados'] = df_inicial['vacinados'].astype(int)

## Using linear regression to forecast the data.
lin_reg = LinearRegression()
x = pd.DataFrame(df_inicial['dias'])
y = pd.DataFrame(df_inicial['vacinados'])

# converting dataframe columns into numpy arrays
x = x.iloc[:, 0].values.reshape(-1, 1)
y = y.iloc[:, 0].values.reshape(-1, 1)

# computing the score of the fitted model
reg = LinearRegression().fit(x, y)  # fitting the model on the historical data
print('The model score is = ',
Ejemplo n.º 51
0
class TestNumericArraylikeArithmeticWithDatetimeLike:

    # TODO: also check name retention
    @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series])
    @pytest.mark.parametrize(
        "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype)
    )
    def test_mul_td64arr(self, left, box_cls):
        # GH#22390
        right = np.array([1, 2, 3], dtype="m8[s]")
        right = box_cls(right)

        expected = pd.TimedeltaIndex(["10s", "40s", "90s"])
        if isinstance(left, pd.Series) or box_cls is pd.Series:
            expected = pd.Series(expected)

        result = left * right
        tm.assert_equal(result, expected)

        result = right * left
        tm.assert_equal(result, expected)

    # TODO: also check name retention
    @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series])
    @pytest.mark.parametrize(
        "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype)
    )
    def test_div_td64arr(self, left, box_cls):
        # GH#22390
        right = np.array([10, 40, 90], dtype="m8[s]")
        right = box_cls(right)

        expected = pd.TimedeltaIndex(["1s", "2s", "3s"])
        if isinstance(left, pd.Series) or box_cls is pd.Series:
            expected = pd.Series(expected)

        result = right / left
        tm.assert_equal(result, expected)

        result = right // left
        tm.assert_equal(result, expected)

        msg = "Cannot divide"
        with pytest.raises(TypeError, match=msg):
            left / right

        with pytest.raises(TypeError, match=msg):
            left // right

    # TODO: de-duplicate with test_numeric_arr_mul_tdscalar
    def test_ops_series(self):
        # regression test for GH#8813
        td = Timedelta("1 day")
        other = pd.Series([1, 2])
        expected = pd.Series(pd.to_timedelta(["1 day", "2 days"]))
        tm.assert_series_equal(expected, td * other)
        tm.assert_series_equal(expected, other * td)

    # TODO: also test non-nanosecond timedelta64 and Tick objects;
    #  see test_numeric_arr_rdiv_tdscalar for note on these failing
    @pytest.mark.parametrize(
        "scalar_td",
        [
            Timedelta(days=1),
            Timedelta(days=1).to_timedelta64(),
            Timedelta(days=1).to_pytimedelta(),
        ],
        ids=lambda x: type(x).__name__,
    )
    def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box):
        # GH#19333
        index = numeric_idx

        expected = pd.TimedeltaIndex([pd.Timedelta(days=n) for n in range(5)])

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = index * scalar_td
        tm.assert_equal(result, expected)

        commute = scalar_td * index
        tm.assert_equal(commute, expected)

    @pytest.mark.parametrize(
        "scalar_td",
        [
            Timedelta(days=1),
            Timedelta(days=1).to_timedelta64(),
            Timedelta(days=1).to_pytimedelta(),
        ],
        ids=lambda x: type(x).__name__,
    )
    def test_numeric_arr_mul_tdscalar_numexpr_path(self, scalar_td, box):
        arr = np.arange(2 * 10 ** 4).astype(np.int64)
        obj = tm.box_expected(arr, box, transpose=False)

        expected = arr.view("timedelta64[D]").astype("timedelta64[ns]")
        expected = tm.box_expected(expected, box, transpose=False)

        result = obj * scalar_td
        tm.assert_equal(result, expected)

        result = scalar_td * obj
        tm.assert_equal(result, expected)

    def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box):
        index = numeric_idx[1:3]

        expected = TimedeltaIndex(["3 Days", "36 Hours"])

        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = three_days / index
        tm.assert_equal(result, expected)

        msg = "cannot use operands with types dtype"
        with pytest.raises(TypeError, match=msg):
            index / three_days

    @pytest.mark.parametrize(
        "other",
        [
            pd.Timedelta(hours=31),
            pd.Timedelta(hours=31).to_pytimedelta(),
            pd.Timedelta(hours=31).to_timedelta64(),
            pd.Timedelta(hours=31).to_timedelta64().astype("m8[h]"),
            np.timedelta64("NaT"),
            np.timedelta64("NaT", "D"),
            pd.offsets.Minute(3),
            pd.offsets.Second(0),
        ],
    )
    def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box):
        left = tm.box_expected(numeric_idx, box)
        msg = (
            "unsupported operand type|"
            "Addition/subtraction of integers and integer-arrays|"
            "Instead of adding/subtracting|"
            "cannot use operands with types dtype|"
            "Concatenation operation is not implemented for NumPy arrays"
        )
        with pytest.raises(TypeError, match=msg):
            left + other
        with pytest.raises(TypeError, match=msg):
            other + left
        with pytest.raises(TypeError, match=msg):
            left - other
        with pytest.raises(TypeError, match=msg):
            other - left

    @pytest.mark.parametrize(
        "other",
        [
            pd.Timestamp.now().to_pydatetime(),
            pd.Timestamp.now(tz="UTC").to_pydatetime(),
            pd.Timestamp.now().to_datetime64(),
            pd.NaT,
        ],
    )
    @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning")
    def test_add_sub_datetimelike_invalid(self, numeric_idx, other, box):
        # GH#28080 numeric+datetime64 should raise; Timestamp raises
        #  NullFrequencyError instead of TypeError so is excluded.
        left = tm.box_expected(numeric_idx, box)

        msg = (
            "unsupported operand type|"
            "Cannot (add|subtract) NaT (to|from) ndarray|"
            "Addition/subtraction of integers and integer-arrays|"
            "Concatenation operation is not implemented for NumPy arrays"
        )
        with pytest.raises(TypeError, match=msg):
            left + other
        with pytest.raises(TypeError, match=msg):
            other + left
        with pytest.raises(TypeError, match=msg):
            left - other
        with pytest.raises(TypeError, match=msg):
            other - left
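
One concrete instance of the multiplication test above; the expected '10s'/'40s'/'90s' values imply the parametrized lefts hold arrays like [10, 20, 30] (an inference, since lefts is defined elsewhere):

import numpy as np
import pandas as pd

left = pd.Index([10, 20, 30])
right = np.array([1, 2, 3], dtype='m8[s]')
print(left * right)  # TimedeltaIndex: ['10s', '40s', '90s']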
Ejemplo n.º 52
0
    def test_construction(self):

        expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8')
        self.assertEqual(Timedelta(10, unit='d').value, expected)
        self.assertEqual(Timedelta(10.0, unit='d').value, expected)
        self.assertEqual(Timedelta('10 days').value, expected)
        self.assertEqual(Timedelta(days=10).value, expected)
        self.assertEqual(Timedelta(days=10.0).value, expected)

        expected += np.timedelta64(10, 's').astype('m8[ns]').view('i8')
        self.assertEqual(Timedelta('10 days 00:00:10').value, expected)
        self.assertEqual(Timedelta(days=10, seconds=10).value, expected)
        self.assertEqual(
            Timedelta(days=10, milliseconds=10 * 1000).value, expected)
        self.assertEqual(
            Timedelta(days=10, microseconds=10 * 1000 * 1000).value, expected)

        # test construction with np dtypes
        # GH 8757
        timedelta_kwargs = {'days': 'D',
                            'seconds': 's',
                            'microseconds': 'us',
                            'milliseconds': 'ms',
                            'minutes': 'm',
                            'hours': 'h',
                            'weeks': 'W'}
        npdtypes = [np.int64, np.int32, np.int16, np.float64, np.float32,
                    np.float16]
        for npdtype in npdtypes:
            for pykwarg, npkwarg in timedelta_kwargs.items():
                expected = np.timedelta64(1,
                                          npkwarg).astype('m8[ns]').view('i8')
                self.assertEqual(
                    Timedelta(**{pykwarg: npdtype(1)}).value, expected)

        # rounding cases
        self.assertEqual(Timedelta(82739999850000).value, 82739999850000)
        self.assertTrue('0 days 22:58:59.999850' in str(Timedelta(
            82739999850000)))
        self.assertEqual(Timedelta(123072001000000).value, 123072001000000)
        self.assertTrue('1 days 10:11:12.001' in str(Timedelta(
            123072001000000)))

        # string conversion with/without leading zero
        # GH 9570
        self.assertEqual(Timedelta('0:00:00'), timedelta(hours=0))
        self.assertEqual(Timedelta('00:00:00'), timedelta(hours=0))
        self.assertEqual(Timedelta('-1:00:00'), -timedelta(hours=1))
        self.assertEqual(Timedelta('-01:00:00'), -timedelta(hours=1))

        # more strings & abbrevs
        # GH 8190
        self.assertEqual(Timedelta('1 h'), timedelta(hours=1))
        self.assertEqual(Timedelta('1 hour'), timedelta(hours=1))
        self.assertEqual(Timedelta('1 hr'), timedelta(hours=1))
        self.assertEqual(Timedelta('1 hours'), timedelta(hours=1))
        self.assertEqual(Timedelta('-1 hours'), -timedelta(hours=1))
        self.assertEqual(Timedelta('1 m'), timedelta(minutes=1))
        self.assertEqual(Timedelta('1.5 m'), timedelta(seconds=90))
        self.assertEqual(Timedelta('1 minute'), timedelta(minutes=1))
        self.assertEqual(Timedelta('1 minutes'), timedelta(minutes=1))
        self.assertEqual(Timedelta('1 s'), timedelta(seconds=1))
        self.assertEqual(Timedelta('1 second'), timedelta(seconds=1))
        self.assertEqual(Timedelta('1 seconds'), timedelta(seconds=1))
        self.assertEqual(Timedelta('1 ms'), timedelta(milliseconds=1))
        self.assertEqual(Timedelta('1 milli'), timedelta(milliseconds=1))
        self.assertEqual(Timedelta('1 millisecond'), timedelta(milliseconds=1))
        self.assertEqual(Timedelta('1 us'), timedelta(microseconds=1))
        self.assertEqual(Timedelta('1 micros'), timedelta(microseconds=1))
        self.assertEqual(Timedelta('1 microsecond'), timedelta(microseconds=1))
        self.assertEqual(Timedelta('1.5 microsecond'),
                         Timedelta('00:00:00.000001500'))
        self.assertEqual(Timedelta('1 ns'), Timedelta('00:00:00.000000001'))
        self.assertEqual(Timedelta('1 nano'), Timedelta('00:00:00.000000001'))
        self.assertEqual(Timedelta('1 nanosecond'),
                         Timedelta('00:00:00.000000001'))

        # combos
        self.assertEqual(Timedelta('10 days 1 hour'),
                         timedelta(days=10, hours=1))
        self.assertEqual(Timedelta('10 days 1 h'), timedelta(days=10, hours=1))
        self.assertEqual(Timedelta('10 days 1 h 1m 1s'), timedelta(
            days=10, hours=1, minutes=1, seconds=1))
        self.assertEqual(Timedelta('-10 days 1 h 1m 1s'), -
                         timedelta(days=10, hours=1, minutes=1, seconds=1))
        self.assertEqual(Timedelta('-10 days 1 h 1m 1s'), -
                         timedelta(days=10, hours=1, minutes=1, seconds=1))
        self.assertEqual(Timedelta('-10 days 1 h 1m 1s 3us'), -
                         timedelta(days=10, hours=1, minutes=1,
                                   seconds=1, microseconds=3))
        self.assertEqual(Timedelta('-10 days 1 h 1.5m 1s 3us'), -
                         timedelta(days=10, hours=1, minutes=1,
                                   seconds=31, microseconds=3))

        # currently invalid as it has a - on the hhmmss part (only allowed on
        # the days)
        self.assertRaises(ValueError,
                          lambda: Timedelta('-10 days -1 h 1.5m 1s 3us'))

        # only leading neg signs are allowed
        self.assertRaises(ValueError,
                          lambda: Timedelta('10 days -1 h 1.5m 1s 3us'))

        # no units specified
        self.assertRaises(ValueError, lambda: Timedelta('3.1415'))

        # invalid construction
        tm.assertRaisesRegexp(ValueError, "cannot construct a Timedelta",
                              lambda: Timedelta())
        tm.assertRaisesRegexp(ValueError, "unit abbreviation w/o a number",
                              lambda: Timedelta('foo'))
        tm.assertRaisesRegexp(ValueError,
                              "cannot construct a Timedelta from the passed "
                              "arguments, allowed keywords are ",
                              lambda: Timedelta(day=10))

        # roundtripping both for string and value
        for v in ['1s', '-1s', '1us', '-1us', '1 day', '-1 day',
                  '-23:59:59.999999', '-1 days +23:59:59.999999', '-1ns',
                  '1ns', '-23:59:59.999999999']:

            td = Timedelta(v)
            self.assertEqual(Timedelta(td.value), td)

            # str does not normally display nanos
            if not td.nanoseconds:
                self.assertEqual(Timedelta(str(td)), td)
            self.assertEqual(Timedelta(td._repr_base(format='all')), td)

        # floats
        expected = np.timedelta64(
            10, 's').astype('m8[ns]').view('i8') + np.timedelta64(
                500, 'ms').astype('m8[ns]').view('i8')
        self.assertEqual(Timedelta(10.5, unit='s').value, expected)

        # offset
        self.assertEqual(to_timedelta(pd.offsets.Hour(2)),
                         Timedelta('0 days, 02:00:00'))
        self.assertEqual(Timedelta(pd.offsets.Hour(2)),
                         Timedelta('0 days, 02:00:00'))
        self.assertEqual(Timedelta(pd.offsets.Second(2)),
                         Timedelta('0 days, 00:00:02'))

        # unicode
        # GH 11995
        expected = Timedelta('1H')
        result = pd.Timedelta(u'1H')
        self.assertEqual(result, expected)
        self.assertEqual(to_timedelta(pd.offsets.Hour(2)),
                         Timedelta(u'0 days, 02:00:00'))

        self.assertRaises(ValueError, lambda: Timedelta(u'foo bar'))
Ejemplo n.º 53
0
    def test_unique_1d(self):

        def check_all(a, b, i1, i2, c, dt):
            base_msg = 'check {0} failed for type {1}'

            msg = base_msg.format('values', dt)
            v = unique(a)
            assert_array_equal(v, b, msg)

            msg = base_msg.format('return_index', dt)
            v, j = unique(a, True, False, False)
            assert_array_equal(v, b, msg)
            assert_array_equal(j, i1, msg)

            msg = base_msg.format('return_inverse', dt)
            v, j = unique(a, False, True, False)
            assert_array_equal(v, b, msg)
            assert_array_equal(j, i2, msg)

            msg = base_msg.format('return_counts', dt)
            v, j = unique(a, False, False, True)
            assert_array_equal(v, b, msg)
            assert_array_equal(j, c, msg)

            msg = base_msg.format('return_index and return_inverse', dt)
            v, j1, j2 = unique(a, True, True, False)
            assert_array_equal(v, b, msg)
            assert_array_equal(j1, i1, msg)
            assert_array_equal(j2, i2, msg)

            msg = base_msg.format('return_index and return_counts', dt)
            v, j1, j2 = unique(a, True, False, True)
            assert_array_equal(v, b, msg)
            assert_array_equal(j1, i1, msg)
            assert_array_equal(j2, c, msg)

            msg = base_msg.format('return_inverse and return_counts', dt)
            v, j1, j2 = unique(a, False, True, True)
            assert_array_equal(v, b, msg)
            assert_array_equal(j1, i2, msg)
            assert_array_equal(j2, c, msg)

            msg = base_msg.format(('return_index, return_inverse '
                                   'and return_counts'), dt)
            v, j1, j2, j3 = unique(a, True, True, True)
            assert_array_equal(v, b, msg)
            assert_array_equal(j1, i1, msg)
            assert_array_equal(j2, i2, msg)
            assert_array_equal(j3, c, msg)

        a = [5, 7, 1, 2, 1, 5, 7]*10
        b = [1, 2, 5, 7]
        i1 = [2, 3, 0, 1]
        i2 = [2, 3, 0, 1, 0, 2, 3]*10
        c = np.multiply([2, 1, 2, 2], 10)

        # test for numeric arrays
        types = []
        types.extend(np.typecodes['AllInteger'])
        types.extend(np.typecodes['AllFloat'])
        types.append('datetime64[D]')
        types.append('timedelta64[D]')
        for dt in types:
            aa = np.array(a, dt)
            bb = np.array(b, dt)
            check_all(aa, bb, i1, i2, c, dt)

        # test for object arrays
        dt = 'O'
        aa = np.empty(len(a), dt)
        aa[:] = a
        bb = np.empty(len(b), dt)
        bb[:] = b
        check_all(aa, bb, i1, i2, c, dt)

        # test for structured arrays
        dt = [('', 'i'), ('', 'i')]
        aa = np.array(list(zip(a, a)), dt)
        bb = np.array(list(zip(b, b)), dt)
        check_all(aa, bb, i1, i2, c, dt)

        # test for ticket #2799
        aa = [1. + 0.j, 1 - 1.j, 1]
        assert_array_equal(np.unique(aa), [1. - 1.j, 1. + 0.j])

        # test for ticket #4785
        a = [(1, 2), (1, 2), (2, 3)]
        unq = [1, 2, 3]
        inv = [0, 1, 0, 1, 1, 2]
        a1 = unique(a)
        assert_array_equal(a1, unq)
        a2, a2_inv = unique(a, return_inverse=True)
        assert_array_equal(a2, unq)
        assert_array_equal(a2_inv, inv)

        # test for chararrays with return_inverse (gh-5099)
        a = np.chararray(5)
        a[...] = ''
        a2, a2_inv = np.unique(a, return_inverse=True)
        assert_array_equal(a2_inv, np.zeros(5))

        # test for ticket #9137
        a = []
        a1_idx = np.unique(a, return_index=True)[1]
        a2_inv = np.unique(a, return_inverse=True)[1]
        a3_idx, a3_inv = np.unique(a, return_index=True,
                                   return_inverse=True)[1:]
        assert_equal(a1_idx.dtype, np.intp)
        assert_equal(a2_inv.dtype, np.intp)
        assert_equal(a3_idx.dtype, np.intp)
        assert_equal(a3_inv.dtype, np.intp)

        # test for ticket 2111 - float
        a = [2.0, np.nan, 1.0, np.nan]
        ua = [1.0, 2.0, np.nan]
        ua_idx = [2, 0, 1]
        ua_inv = [1, 2, 0, 2]
        ua_cnt = [1, 1, 2]
        assert_equal(np.unique(a), ua)
        assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
        assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
        assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))

        # test for ticket 2111 - complex
        a = [2.0-1j, np.nan, 1.0+1j, complex(0.0, np.nan), complex(1.0, np.nan)]
        ua = [1.0+1j, 2.0-1j, complex(0.0, np.nan)]
        ua_idx = [2, 0, 3]
        ua_inv = [1, 2, 0, 2, 2]
        ua_cnt = [1, 1, 3]
        assert_equal(np.unique(a), ua)
        assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
        assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
        assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))

        # test for ticket 2111 - datetime64
        nat = np.datetime64('nat')
        a = [np.datetime64('2020-12-26'), nat, np.datetime64('2020-12-24'), nat]
        ua = [np.datetime64('2020-12-24'), np.datetime64('2020-12-26'), nat]
        ua_idx = [2, 0, 1]
        ua_inv = [1, 2, 0, 2]
        ua_cnt = [1, 1, 2]
        assert_equal(np.unique(a), ua)
        assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
        assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
        assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))

        # test for ticket 2111 - timedelta
        nat = np.timedelta64('nat')
        a = [np.timedelta64(1, 'D'), nat, np.timedelta64(1, 'h'), nat]
        ua = [np.timedelta64(1, 'h'), np.timedelta64(1, 'D'), nat]
        ua_idx = [2, 0, 1]
        ua_inv = [1, 2, 0, 2]
        ua_cnt = [1, 1, 2]
        assert_equal(np.unique(a), ua)
        assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
        assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
        assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))

        # test for gh-19300
        all_nans = [np.nan] * 4
        ua = [np.nan]
        ua_idx = [0]
        ua_inv = [0, 0, 0, 0]
        ua_cnt = [4]
        assert_equal(np.unique(all_nans), ua)
        assert_equal(np.unique(all_nans, return_index=True), (ua, ua_idx))
        assert_equal(np.unique(all_nans, return_inverse=True), (ua, ua_inv))
        assert_equal(np.unique(all_nans, return_counts=True), (ua, ua_cnt))
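
As a quick illustration of the ordering the ticket-2111 cases above rely on, here is a minimal sketch (not part of the test suite): on the NumPy versions these tests target, np.unique collapses repeated NaN/NaT values and sorts them to the end of the result.

import numpy as np

vals, counts = np.unique([2.0, np.nan, 1.0, np.nan], return_counts=True)
print(vals)    # [ 1.  2. nan] -- a single NaN, sorted to the end
print(counts)  # [1 1 2]

nat = np.datetime64('nat')
dates = [np.datetime64('2020-12-26'), nat, np.datetime64('2020-12-24'), nat]
print(np.unique(dates))  # ['2020-12-24' '2020-12-26' 'NaT']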
Example No. 54
def stockForDate(dateToPrice, date):
    # Walk back one day at a time until a date with a quote is found,
    # e.g. falling back to Friday's close for a weekend date.
    # (Note: loops forever if dateToPrice has no date at or before `date`.)
    while date not in dateToPrice:
        date -= np.timedelta64(1, 'D')
    return dateToPrice[date]
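
A minimal usage sketch (the prices and dates below are hypothetical): asking for a Saturday falls back to the preceding Friday's quote.

import numpy as np

date_to_price = {np.datetime64('2021-01-08'): 135.2,   # Friday close
                 np.datetime64('2021-01-11'): 136.9}   # Monday close

print(stockForDate(date_to_price, np.datetime64('2021-01-09')))  # 135.2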
Example No. 55
def split_timestamp_to_dt(sample):
    # Whole seconds plus the nanosecond remainder -> one ns-precision
    # datetime64 (assumes `from datetime import datetime`).
    return np.datetime64(datetime.utcfromtimestamp(
        sample['secs'])) + np.timedelta64(sample['nano'], 'ns')
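
A usage sketch with a hypothetical record (the 'secs'/'nano' keys follow the function above); the nanosecond part would be lost by datetime.utcfromtimestamp alone.

from datetime import datetime

import numpy as np

sample = {'secs': 1600000000, 'nano': 123}
print(split_timestamp_to_dt(sample))
# 2020-09-13T12:26:40.000000123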
Example No. 56
@pytest.mark.parametrize(['calendar', 'date_args', 'expected'],
                         _CFTIME_DATETIME_UNITS_TESTS)
def test_infer_cftime_datetime_units(calendar, date_args, expected):
    date_type = _all_cftime_date_types()[calendar]
    dates = [date_type(*args) for args in date_args]
    assert expected == coding.times.infer_datetime_units(dates)


@pytest.mark.parametrize(
    ['timedeltas', 'units', 'numbers'],
    [('1D', 'days', np.int64(1)),
     (['1D', '2D', '3D'], 'days', np.array([1, 2, 3], 'int64')),
     ('1h', 'hours', np.int64(1)), ('1ms', 'milliseconds', np.int64(1)),
     ('1us', 'microseconds', np.int64(1)),
     (['NaT', '0s', '1s'], None, [np.nan, 0, 1]),
     (['30m', '60m'], 'hours', [0.5, 1.0]),
     (np.timedelta64('NaT', 'ns'), 'days', np.nan),
     (['NaT', 'NaT'], 'days', [np.nan, np.nan])])
def test_cf_timedelta(timedeltas, units, numbers):
    timedeltas = pd.to_timedelta(timedeltas, box=False)
    numbers = np.array(numbers)

    expected = numbers
    actual, _ = coding.times.encode_cf_timedelta(timedeltas, units)
    assert_array_equal(expected, actual)
    assert expected.dtype == actual.dtype

    if units is not None:
        expected = timedeltas
        actual = coding.times.decode_cf_timedelta(numbers, units)
        assert_array_equal(expected, actual)
        assert expected.dtype == actual.dtype
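
For context on the inputs, a minimal sketch of the pandas conversion the test starts from (box=False, used above, is deprecated in newer pandas in favor of .to_numpy()):

import numpy as np
import pandas as pd

tds = pd.to_timedelta(['30m', '60m']).to_numpy()  # timedelta64[ns] values
print(tds / np.timedelta64(1, 'h'))               # [0.5 1. ]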
Example No. 57
        # (Fragment: the MultiIndex tail of `_nonempty_index`; the full
        # function appears in Example No. 60 below.)
        levels = [_nonempty_index(l) for l in idx.levels]
        codes = [[0, 0] for i in idx.levels]
        try:
            return pd.MultiIndex(levels=levels, codes=codes, names=idx.names)
        except TypeError:  # older pandas versions
            return pd.MultiIndex(levels=levels, labels=codes, names=idx.names)

    raise TypeError("Don't know how to handle index of "
                    "type {0}".format(typename(type(idx))))


_simple_fake_mapping = {
    "b": np.bool_(True),
    "V": np.void(b" "),
    "M": np.datetime64("1970-01-01"),
    "m": np.timedelta64(1),
    "S": np.str_("foo"),
    "a": np.str_("foo"),
    "U": np.unicode_("foo"),
    "O": "foo",
}


def _scalar_from_dtype(dtype):
    if dtype.kind in ("i", "f", "u"):
        return dtype.type(1)
    elif dtype.kind == "c":
        return dtype.type(complex(1, 0))
    elif dtype.kind in _simple_fake_mapping:
        o = _simple_fake_mapping[dtype.kind]
        return o.astype(dtype) if dtype.kind in ("m", "M") else o
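
A usage sketch of the helper above: for the 'm' and 'M' kinds the placeholder from `_simple_fake_mapping` is cast to the requested unit, so the resulting scalar carries the dtype's resolution.

import numpy as np

print(_scalar_from_dtype(np.dtype('m8[ns]')))  # 1 nanoseconds
print(_scalar_from_dtype(np.dtype('M8[s]')))   # 1970-01-01T00:00:00
print(_scalar_from_dtype(np.dtype('f8')))      # 1.0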
Example No. 58
if not sys.warnoptions:
    warnings.simplefilter("ignore")


# ## Import raw tweet data and merge it with lagged spot price data
# 
# Data source:

# In[2]:


# Import the raw Twitter data.
df = pd.read_csv('./data-streaming-tweets.csv')
# Convert to datetime64 and shift Twitter's UTC timestamps to Eastern Time
# (New York). Note: the fixed 5-hour offset assumes EST and ignores DST.
df.date = pd.to_datetime(df.date) - np.timedelta64(5, 'h')
df.date = df.date.dt.date  # Drop the time part, keep only the date.
# Drop rows dated 01-Dec-2019 (the date found at row 3324600), since the
# sample cuts off at the end of November.
df = df[df['date'] != df.iloc[3324600]['date']]

# Import the spot price information.
SpotPrice = pd.read_csv('./ChangeDateNew.csv')
SpotPrice.date = pd.to_datetime(SpotPrice.date)
SpotPrice.date = SpotPrice.date.dt.date
# shift(-1): align each date with the next day's price change.
SpotPrice['Lag_PriceChange'] = SpotPrice.Price_Change.shift(-1)

# Merge the spot prices into the tweet data.
df = pd.merge(df, SpotPrice, how='left', on='date')

df.tail()
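
A toy sketch of the same fixed-offset trick on synthetic timestamps (hypothetical data; as noted above, subtracting a constant 5 hours ignores daylight saving, which tz-aware conversion would handle):

import numpy as np
import pandas as pd

utc = pd.to_datetime(pd.Series(['2019-11-30 03:00:00', '2019-11-30 23:30:00']))
eastern = utc - np.timedelta64(5, 'h')  # crude UTC -> EST shift
print(eastern.dt.date.tolist())
# [datetime.date(2019, 11, 29), datetime.date(2019, 11, 30)]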
Example No. 59
    def testAttributeAsDict(self):
        other_data = {}
        if pd:
            df = pd.DataFrame(
                {
                    'a': [1, 2, 3],
                    'b': [to_text('测试'), to_binary('属性'), 'c']
                },
                index=[[0, 0, 1], ['测试', '属性', '测试']])
            other_data['w'] = df.columns
            other_data['ww'] = df.index
            other_data['x'] = df['b']
            other_data['y'] = df
            other_data['z'] = [df.columns, df.index, df['a'], df]
        node4 = Node4(a=to_binary('中文'),
                      b=np.random.randint(4, size=(3, 4)),
                      c=np.datetime64(datetime.datetime.now()),
                      d=np.timedelta64(datetime.timedelta(seconds=1234)),
                      e=np.dtype('int'),
                      f={
                          'a': [True, False, False],
                          'd': [False, None]
                      },
                      h=(1234, to_text('测试'), '属性', None,
                         np.datetime64('1066-10-13'), np.timedelta64(1, 'D'),
                         np.dtype([('x', 'i4'), ('y', 'f4')])),
                      i=(slice(10), slice(0, 2), None, slice(2, 0, -1),
                         slice('a', 'b'),
                         slice(datetime.datetime.now(),
                               datetime.datetime.now())),
                      j=Node5(a='aa', b=slice(1, 100, 3)),
                      k=[Node5(a='bb', b=slice(200, -1, -4)), None],
                      l=Node6(b=3, nid=1),
                      m=Node6(b=4, nid=2),
                      n=[Node5(a='cc', b=slice(100, -2, -5)), None],
                      **other_data)

        pbs = ProtobufSerializeProvider()

        serial = node4.serialize(pbs)
        d_node4 = Node4.deserialize(pbs, serial)

        self.assertEqual(node4.a, d_node4.a)
        np.testing.assert_array_equal(node4.b, d_node4.b)
        self.assertEqual(node4.c, d_node4.c)
        self.assertEqual(node4.d, d_node4.d)
        self.assertEqual(node4.e, d_node4.e)
        self.assertEqual(node4.f, d_node4.f)
        self.assertFalse(hasattr(d_node4, 'g'))
        self.assertEqual(node4.h, d_node4.h)
        self.assertEqual(node4.i, d_node4.i)
        self.assertEqual(node4.j.a, d_node4.j.a)
        self.assertEqual(node4.j.b, d_node4.j.b)
        self.assertEqual(node4.k[0].a, d_node4.k[0].a)
        self.assertEqual(node4.k[0].b, d_node4.k[0].b)
        self.assertIsNone(d_node4.k[1])
        self.assertIsInstance(d_node4.l, Node7)
        self.assertEqual(node4.l.b, d_node4.l.b)
        self.assertIsInstance(d_node4.m, Node7)
        self.assertEqual(node4.m.b, d_node4.m.b)
        self.assertIsInstance(d_node4.n[0], Node5)
        self.assertEqual(node4.n[0].a, d_node4.n[0].a)
        self.assertEqual(node4.n[0].b, d_node4.n[0].b)
        self.assertIsNone(d_node4.n[1])
        if pd:
            pd.testing.assert_index_equal(node4.w, d_node4.w)
            pd.testing.assert_index_equal(node4.ww, d_node4.ww)
            pd.testing.assert_series_equal(node4.x, d_node4.x)
            pd.testing.assert_frame_equal(node4.y, d_node4.y)
            pd.testing.assert_index_equal(node4.z[0], d_node4.z[0])
            pd.testing.assert_index_equal(node4.z[1], d_node4.z[1])
            pd.testing.assert_series_equal(node4.z[2], d_node4.z[2])
            pd.testing.assert_frame_equal(node4.z[3], d_node4.z[3])

        with self.assertRaises(TypeError):
            node42 = Node4(j=Node6())
            node42.serialize(pbs)

        with self.assertRaises(TypeError):
            node6 = Node6(nid=0)
            node7 = Node7(nid=1, r=node6)
            node7.serialize(pbs)

        with self.assertRaises(TypeError):
            node6 = Node6(nid=0)
            node7 = Node7(nid=1, rl=[node6])
            node7.serialize(pbs)

        node61 = Node6(nid=0)
        node62 = Node6(nid=0, r=node61)
        serial = node62.serialize(pbs)
        d_node62 = Node6.deserialize(pbs, serial)
        self.assertIsInstance(d_node62.r, Node6)

        node61 = Node6(nid=0)
        node62 = Node6(nid=0, rl=[node61])
        serial = node62.serialize(pbs)
        d_node62 = Node6.deserialize(pbs, serial)
        self.assertIsInstance(d_node62.rl[0], Node6)

        jss = JsonSerializeProvider()

        serial = node4.serialize(jss)
        serial = json.loads(json.dumps(serial), object_hook=OrderedDict)
        d_node4 = Node4.deserialize(jss, serial)

        self.assertEqual(node4.a, d_node4.a)
        np.testing.assert_array_equal(node4.b, d_node4.b)
        self.assertEqual(node4.c, d_node4.c)
        self.assertEqual(node4.d, d_node4.d)
        self.assertEqual(node4.e, d_node4.e)
        self.assertEqual(node4.f, d_node4.f)
        self.assertFalse(hasattr(d_node4, 'g'))
        self.assertEqual(node4.h, d_node4.h)
        self.assertEqual(node4.i, d_node4.i)
        self.assertEqual(node4.j.a, d_node4.j.a)
        self.assertEqual(node4.k[0].a, d_node4.k[0].a)
        self.assertIsNone(d_node4.k[1])
        self.assertIsInstance(d_node4.l, Node7)
        self.assertEqual(node4.l.b, d_node4.l.b)
        self.assertIsInstance(d_node4.m, Node7)
        self.assertEqual(node4.m.b, d_node4.m.b)
        self.assertIsInstance(d_node4.n[0], Node5)
        self.assertEqual(node4.n[0].a, d_node4.n[0].a)
        self.assertEqual(node4.n[0].b, d_node4.n[0].b)
        self.assertIsNone(d_node4.n[1])
        if pd:
            pd.testing.assert_index_equal(node4.w, d_node4.w)
            pd.testing.assert_index_equal(node4.ww, d_node4.ww)
            pd.testing.assert_series_equal(node4.x, d_node4.x)
            pd.testing.assert_frame_equal(node4.y, d_node4.y)
            pd.testing.assert_index_equal(node4.z[0], d_node4.z[0])
            pd.testing.assert_index_equal(node4.z[1], d_node4.z[1])
            pd.testing.assert_series_equal(node4.z[2], d_node4.z[2])
            pd.testing.assert_frame_equal(node4.z[3], d_node4.z[3])

        with self.assertRaises(TypeError):
            node42 = Node4(j=Node6())
            node42.serialize(jss)

        with self.assertRaises(TypeError):
            node6 = Node6()
            node7 = Node7(r=node6)
            node7.serialize(jss)

        with self.assertRaises(TypeError):
            node6 = Node6(nid=0)
            node7 = Node7(nid=1, rl=[node6])
            node7.serialize(jss)

        node61 = Node6()
        node62 = Node6(r=node61)
        serial = node62.serialize(jss)
        d_node62 = Node6.deserialize(jss, serial)
        self.assertIsInstance(d_node62.r, Node6)

        node61 = Node6(nid=0)
        node62 = Node6(nid=0, rl=[node61])
        serial = node62.serialize(jss)
        d_node62 = Node6.deserialize(jss, serial)
        self.assertIsInstance(d_node62.rl[0], Node6)
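
The round-trips above work because np.datetime64 and np.timedelta64 accept the corresponding standard-library objects directly; a minimal sketch of that conversion:

import datetime
import numpy as np

td = np.timedelta64(datetime.timedelta(seconds=1234))
print(td)                                # 1234000000 microseconds
print(td == np.timedelta64(1234, 's'))   # True: comparison converts units

dt = np.datetime64(datetime.datetime(2020, 1, 1, 12, 30))
print(dt)                                # 2020-01-01T12:30:00.000000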
Example No. 60
def _nonempty_index(idx):
    typ = type(idx)
    if typ is pd.RangeIndex:
        return pd.RangeIndex(2, name=idx.name)
    elif typ in _numeric_index_types:
        return typ([1, 2], name=idx.name)
    elif typ is pd.Index:
        return pd.Index(["a", "b"], name=idx.name)
    elif typ is pd.DatetimeIndex:
        start = "1970-01-01"
        # Need a non-monotonic decreasing index to avoid issues with
        # partial string indexing see https://github.com/dask/dask/issues/2389
        # and https://github.com/pandas-dev/pandas/issues/16515
        # This doesn't mean `_meta_nonempty` should ever rely on
        # `self.monotonic_increasing` or `self.monotonic_decreasing`
        try:
            return pd.date_range(start=start,
                                 periods=2,
                                 freq=idx.freq,
                                 tz=idx.tz,
                                 name=idx.name)
        except ValueError:  # older pandas versions
            data = [start, "1970-01-02"] if idx.freq is None else None
            return pd.DatetimeIndex(data,
                                    start=start,
                                    periods=2,
                                    freq=idx.freq,
                                    tz=idx.tz,
                                    name=idx.name)
    elif typ is pd.PeriodIndex:
        return pd.period_range(start="1970-01-01",
                               periods=2,
                               freq=idx.freq,
                               name=idx.name)
    elif typ is pd.TimedeltaIndex:
        start = np.timedelta64(1, "D")
        try:
            return pd.timedelta_range(start=start,
                                      periods=2,
                                      freq=idx.freq,
                                      name=idx.name)
        except ValueError:  # older pandas versions
            # Adding the integer 1 advances the timedelta64 by one day.
            data = [start, start + 1] if idx.freq is None else None
            return pd.TimedeltaIndex(data,
                                     start=start,
                                     periods=2,
                                     freq=idx.freq,
                                     name=idx.name)
    elif typ is pd.CategoricalIndex:
        if len(idx.categories) == 0:
            data = pd.Categorical(_nonempty_index(idx.categories),
                                  ordered=idx.ordered)
        else:
            data = pd.Categorical.from_codes([-1, 0],
                                             categories=idx.categories,
                                             ordered=idx.ordered)
        return pd.CategoricalIndex(data, name=idx.name)
    elif typ is pd.MultiIndex:
        levels = [_nonempty_index(l) for l in idx.levels]
        codes = [[0, 0] for i in idx.levels]
        try:
            return pd.MultiIndex(levels=levels, codes=codes, names=idx.names)
        except TypeError:  # older pandas versions
            return pd.MultiIndex(levels=levels, labels=codes, names=idx.names)

    raise TypeError("Don't know how to handle index of "
                    "type {0}".format(typename(type(idx))))