Example #1
0
    def get_date_trend(self, mode_date):
        """
        Build a date-trend series by resampling the raw observations.

        :param mode_date: date granularity; observations are merged down to
                          this unit. 0-day, 1-week, 2-month, 3-Quarter.
                          (default 2)
        :return: dict with ``axisLabels`` (resampled dates as "YYYY-MM-DD"
                 strings), ``legendLabels`` (column names) and ``seriesData``
                 (one ``{"name": ..., "data": [...]}`` entry per column).
        """
        axisLabels = self.oriDate[:]
        # One {value: 1} marker per observation; summing these during the
        # resample yields a per-period count for each distinct value (column).
        pointVals = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]

        # Numeric mode -> pandas resample rule string.
        rule_mode = {"0": "D", "1": "W", "2": "M", "3": "Q"}

        df = DataFrame(pointVals, index=axisLabels)
        # NOTE(review): resample(rule, how="sum") is the legacy (pre-0.18)
        # pandas API; newer pandas requires .resample(rule).sum() — confirm
        # the pinned pandas version before upgrading.
        df = df.resample(rule_mode[str(mode_date)], how="sum")
        df = df.fillna(0)

        """各项总和"""
        # (commented out) grand total across all columns:
        # cols_name = []
        # for name, col in df.iteritems():
        #     cols_name.append(name)
        # df['SUM'] = 0
        # for i in xrange(len(cols_name)):
        #     df['SUM'] += df[cols_name[i]]

        """宿舍比重"""
        # (commented out) dorm share of the total; only computed when a
        # 'dorm' column exists, otherwise 0:
        # df['PER_DORM'] = df['dorm']/df['SUM'] if 'dorm' in df else 0

        # Pull the resampled dates back out of the index as the axis labels.
        axisLabels = map(lambda x: x.strftime("%Y-%m-%d"), df.index.tolist())
        seriesData = []
        legendLabels = []
        for colName, col in df.iteritems():
            # NaN cells (periods where a value never occurred) become 0.0.
            legendLabels.append(colName)
            data = map(lambda x: 0.0 if isnan(x) else float(x), col.tolist())
            seriesData.append({"name": colName, "data": data})

        json_dateTrend = {"axisLabels": axisLabels, "legendLabels": legendLabels, "seriesData": seriesData}
        return json_dateTrend
Example #2
0
    def fill_old(self, df, year=None):
        """
        Takes an age, sex profile (per capita transfers) in df
        to fill year 'year' or all years if year is None.

        :param df: DataFrame (or anything DataFrame() accepts) indexed by
                   age/sex with one column per transfer type.
        :param year: target year; when None, every year in
                     ``self.index_sets["year"]`` is filled instead.
        """
        # Normalize the input to a DataFrame exactly once (the original
        # repeated this same conversion in the else-branch below).
        if isinstance(df, DataFrame):
            df1 = df
        else:
            df1 = DataFrame(df)

        # Register any column not seen before.
        for col_name in df1.columns:
            if col_name not in self._types:
                self.new_type(col_name)

        if year is None:
            # Fan out to one call per known year.
            # NOTE(review): delegates to self.fill, not self.fill_old —
            # confirm that is the intended entry point.
            for yr in sorted(self.index_sets["year"]):
                self.fill(df, year=yr)
        else:
            yr = year
            # Re-key each column on (age, sex, year) and merge it in.
            for col_name, column in df1.iteritems():
                column = column.reset_index()
                column["year"] = yr
                column = column.set_index(["age", "sex", "year"])
                self.update(column)
Example #3
0
def ac_time(userID, startDate, endDate):
    """
    Access-control (door card) time-distribution stats for one user.

    Looks at the hourly card-swipe distribution between startDate and
    endDate and returns:
      count_early -- dorm swipes in the 06:00-07:00 slot (early risers)
      count_night -- total swipes between 23:00 and 05:59
    Both are -1 when the underlying query reported an error.
    """
    from GetJson_ACPeriodCate import GetJson_ACPeriodCate

    json_ACPeriodCate = GetJson_ACPeriodCate(userID, 2, startDate, endDate)

    if "errMsg" in json_ACPeriodCate:
        return {"count_early": -1, "count_night": -1}

    timeDistri = json_ACPeriodCate["json_timeDistribution"]
    dict_vals = {}
    for item in timeDistri["seriesData"]:
        dict_vals[item["name"]] = item["data"]

    # One row per hour of the day (0..23), one column per location.
    df = DataFrame(dict_vals, index=range(24))

    # Row-wise total across all locations.
    df["SUM"] = 0
    for col, vals in df.iteritems():
        if col == "SUM":
            # Skip the accumulator itself; `continue` (not `break`) so any
            # column that happens to sort after "SUM" is still counted.
            continue
        df["SUM"] += vals

    count_early = df.loc[6]["dorm"] if "dorm" in df else 0  # dorm swipes at 06:00
    # 23:00 plus 00:00-05:59. .loc label slicing is inclusive, so the slice
    # must be 0:5 — the original 0:6 also counted the 06:00 hour, which
    # contradicts the documented 23:00-05:00 night window (and overlaps
    # count_early).
    count_night = sum(df.loc[0:5]["SUM"].tolist()) + df.loc[23]["SUM"]

    return {"count_early": count_early, "count_night": count_night}
Example #4
0
    def get_time_distribution(self):
        """
        Count observations per hour of the day.

        :return: dict with ``axisLabels`` (24 labels "i点~(i+1)点"),
                 ``legendLabels`` (distinct value names) and ``seriesData``
                 (one ``{"name": ..., "data": [...24 ints...]}`` per value).
        """
        dates = self.oriDate[:]
        # One {value: 1} marker per observation; merging these dicts counts
        # occurrences of each distinct value inside an hour bucket.
        values = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]

        # Hour boundaries and their display labels.
        periods = []
        axisLabels = []
        for i in xrange(24):
            periods.append(time(i))
            axisLabels.append(str(i) + u"点~" + str((i + 1) % 24) + u"点")

        # Boundary list -> list of [start, end) ranges; the last range wraps
        # past midnight (23:00 -> 00:00).
        periodRanges = []
        for i in xrange(len(periods)):
            periodRange = [periods[i], periods[(i + 1) % len(periods)]]
            periodRanges.append(periodRange)

        lTimes = map(lambda d: d.time(), dates)  # keep only the time of day
        vals = []  # one accumulator dict per hour bucket
        for i in xrange(len(periods)):
            vals.append({})

        # Accumulate each observation into its hour bucket.
        # BUG FIX: the original merged into vals[j + 1], which shifted every
        # observation one bucket late; it also never counted the 23:00-24:00
        # hour, because the wrap-around range [23:00, 00:00) can never
        # satisfy start <= t < end with time-of-day comparisons.
        for i in xrange(len(lTimes)):
            for j in xrange(len(periodRanges)):
                start, end = periodRanges[j]
                if (start <= lTimes[i] < end) or (start > end and lTimes[i] >= start):
                    vals[j] = helpers.mergeDict(vals[j], values[i])
                    break  # ranges are disjoint; no need to keep scanning

        df = DataFrame(vals)

        seriesData = []
        legendLabels = []
        for colName, col in df.iteritems():
            # NaN cells (hours where a value never occurred) become 0.
            legendLabels.append(colName)
            data = map(lambda x: 0 if isnan(x) else int(x), col.tolist())
            seriesData.append({"name": colName, "data": data})

        json_timeDistribution = {"axisLabels": axisLabels, "legendLabels": legendLabels, "seriesData": seriesData}
        return json_timeDistribution
Example #5
0
class FrameParser(Parser):
    """Parse a JSON payload into ``self.obj`` (a DataFrame), dispatching on
    ``self.orient``, then optionally convert columns/dates in place."""

    # Default JSON layout, and the keys required by the "split" orient.
    _default_orient = "columns"
    _split_keys = ("columns", "index", "data")

    def _parse_numpy(self):
        """Decode ``self.json`` with the numpy-backed ``loads`` path."""

        json = self.json
        orient = self.orient

        if orient == "columns":
            args = loads(json, dtype=None, numpy=True, labelled=True, precise_float=self.precise_float)
            if args:
                # labelled loads returns (values, row-labels, col-labels) in
                # column orientation; transpose the values and swap the label
                # arrays so DataFrame(*args) sees (data, index, columns).
                args = (args[0].T, args[2], args[1])
            self.obj = DataFrame(*args)
        elif orient == "split":
            decoded = loads(json, dtype=None, numpy=True, precise_float=self.precise_float)
            # Keys may decode as unicode/bytes; force plain str so they can
            # be used as DataFrame keyword arguments.
            decoded = dict((str(k), v) for k, v in compat.iteritems(decoded))
            self.check_keys_split(decoded)
            self.obj = DataFrame(**decoded)
        elif orient == "values":
            self.obj = DataFrame(loads(json, dtype=None, numpy=True, precise_float=self.precise_float))
        else:
            self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True, precise_float=self.precise_float))

    def _parse_no_numpy(self):
        """Decode ``self.json`` with the plain-python ``loads`` path."""

        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(loads(json, precise_float=self.precise_float), dtype=None)
        elif orient == "split":
            decoded = dict((str(k), v) for k, v in compat.iteritems(loads(json, precise_float=self.precise_float)))
            self.check_keys_split(decoded)
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            # index-oriented JSON decodes transposed; flip it back.
            self.obj = DataFrame(loads(json, precise_float=self.precise_float), dtype=None).T
        else:
            self.obj = DataFrame(loads(json, precise_float=self.precise_float), dtype=None)

    def _process_converter(self, f, filt=None):
        """ take a conversion function and possibly recreate the frame

        ``f(col, c)`` must return ``(new_data, converted)``; columns for
        which ``filt(col, c)`` is falsy are passed through untouched. The
        frame is only rebuilt when at least one column actually converted.
        """

        if filt is None:
            filt = lambda col, c: True

        needs_new_obj = False
        new_obj = dict()
        # Key the working dict by position (not name) so duplicate column
        # names survive; the original labels are restored below.
        for i, (col, c) in enumerate(self.obj.iteritems()):
            if filt(col, c):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:

            # possibly handle dup columns
            new_obj = DataFrame(new_obj, index=self.obj.index)
            new_obj.columns = self.obj.columns
            self.obj = new_obj

    def _try_convert_types(self):
        """Convert dates first (when enabled), then try a generic dtype
        conversion on every column."""
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(lambda col, c: self._try_convert_data(col, c, convert_dates=False))

    def _try_convert_dates(self):
        """Attempt date parsing on columns selected by ``self.convert_dates``
        and/or the default date-like column-name heuristics."""
        if self.obj is None:
            return

        # our columns to parse
        convert_dates = self.convert_dates
        if convert_dates is True:
            convert_dates = []
        convert_dates = set(convert_dates)

        def is_ok(col):
            """ return if this col is ok to try for a date parse """
            if not isinstance(col, compat.string_types):
                return False

            # Heuristic: common date-ish column-name suffixes/names.
            if (
                col.endswith("_at")
                or col.endswith("_time")
                or col.lower() == "modified"
                or col.lower() == "date"
                or col.lower() == "datetime"
            ):
                return True
            return False

        self._process_converter(
            lambda col, c: self._try_convert_to_date(c),
            lambda col, c: ((self.keep_default_dates and is_ok(col)) or col in convert_dates),
        )
Example #6
0
class TestHashing(tm.TestCase):
    """Determinism and behaviour tests for hash_pandas_object / hash_array."""

    _multiprocess_can_split_ = True

    def setUp(self):
        # One column per major dtype family the hasher must support.
        self.df = DataFrame(
            {
                "i32": np.array([1, 2, 3] * 3, dtype="int32"),
                "f32": np.array([None, 2.5, 3.5] * 3, dtype="float32"),
                "cat": Series(["a", "b", "c"] * 3).astype("category"),
                "obj": Series(["d", "e", "f"] * 3),
                "bool": np.array([True, False, True] * 3),
                "dt": Series(pd.date_range("20130101", periods=9)),
                "dt_tz": Series(pd.date_range("20130101", periods=9, tz="US/Eastern")),
                "td": Series(pd.timedelta_range("2000", periods=9)),
            }
        )

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(["foo", "bar", "baz"]))
        expected = Series(
            np.array([3600424527151052760, 1374399572096150070, 477881037637427054], dtype="uint64"),
            index=["foo", "bar", "baz"],
        )
        tm.assert_series_equal(result, expected)

    def test_hash_array(self):
        # hashing the same ndarray twice must be deterministic
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def check_equal(self, obj, **kwargs):
        """Hash *obj* twice with the same kwargs (then again without
        ``index``) and require identical results each time."""
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop("index", None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            self.assertFalse((a == b).all())

    def test_hash_pandas_object(self):
        # a representative spread of Series/Index/DataFrame inputs
        for obj in [
            Series([1, 2, 3]),
            Series([1.0, 1.5, 3.2]),
            Series([1.0, 1.5, np.nan]),
            Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
            Series(["a", "b", "c"]),
            Series(["a", np.nan, "c"]),
            Series([True, False, True]),
            Index([1, 2, 3]),
            Index([True, False, True]),
            DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
            tm.makeMissingDataframe(),
            tm.makeMixedDataFrame(),
            tm.makeTimeDataFrame(),
            tm.makeTimeSeries(),
            tm.makeTimedeltaIndex(),
            Series([1, 2, 3], index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])),
        ]:
            self.check_equal(obj)
            self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self):
        # every dtype column of the fixture frame hashes deterministically
        for name, s in self.df.iteritems():
            self.check_equal(s)
            self.check_not_equal_with_index(s)

    def test_hash_pandas_empty_object(self):
        for obj in [Series([], dtype="float64"), Series([], dtype="object"), Index([])]:
            self.check_equal(obj)

            # these are by-definition the same with
            # or w/o the index as the data is empty

    def test_errors(self):
        # unsupported objects must raise TypeError
        for obj in [pd.Timestamp("20130101"), tm.makePanel()]:

            def f():
                # BUG FIX: the original called hash_pandas_object(f) --
                # hashing the closure itself -- so the loop objects were
                # never actually exercised.
                hash_pandas_object(obj)

            self.assertRaises(TypeError, f)

    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list("abc"))
        a = hash_pandas_object(obj, hash_key="9876543210123456")
        b = hash_pandas_object(obj, hash_key="9876543210123465")
        self.assertTrue((a != b).all())

    def test_invalid_key(self):
        # this only matters for object dtypes
        def f():
            hash_pandas_object(Series(list("abc")), hash_key="foo")

        self.assertRaises(ValueError, f)

    def test_mixed(self):
        # mixed objects
        obj = Series(["1", 2, 3])
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)

        # mixed are actually equal when stringified
        a = hash_pandas_object(obj)
        b = hash_pandas_object(Series(list("123")))
        self.assert_series_equal(a, b)

    def test_alread_encoded(self):
        # if already encoded then ok

        obj = Series(list("abc")).str.encode("utf8")
        self.check_equal(obj)

    def test_alternate_encoding(self):
        # a non-default encoding must still hash deterministically
        obj = Series(list("abc"))
        self.check_equal(obj, encoding="ascii")

    def test_long_strings(self):
        # very long strings should not degrade into collisions
        obj = Index(tm.rands_array(nchars=10000, size=100))
        self.check_equal(obj)
Example #7
0
class TestHashing(tm.TestCase):
    """Determinism, unsupported-input and collision tests for
    hash_pandas_object / hash_array."""

    _multiprocess_can_split_ = True

    def setUp(self):
        # One column per major dtype family the hasher must support.
        self.df = DataFrame(
            {
                "i32": np.array([1, 2, 3] * 3, dtype="int32"),
                "f32": np.array([None, 2.5, 3.5] * 3, dtype="float32"),
                "cat": Series(["a", "b", "c"] * 3).astype("category"),
                "obj": Series(["d", "e", "f"] * 3),
                "bool": np.array([True, False, True] * 3),
                "dt": Series(pd.date_range("20130101", periods=9)),
                "dt_tz": Series(pd.date_range("20130101", periods=9, tz="US/Eastern")),
                "td": Series(pd.timedelta_range("2000", periods=9)),
            }
        )

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(["foo", "bar", "baz"]))
        expected = Series(
            np.array([3600424527151052760, 1374399572096150070, 477881037637427054], dtype="uint64"),
            index=["foo", "bar", "baz"],
        )
        tm.assert_series_equal(result, expected)

    def test_hash_array(self):
        # hashing the same ndarray twice must be deterministic
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def check_equal(self, obj, **kwargs):
        """Hash *obj* twice with the same kwargs (then again without
        ``index``) and require identical results each time."""
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop("index", None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            self.assertFalse((a == b).all())

    def test_hash_pandas_object(self):
        # a representative spread of Series/Index/DataFrame inputs
        for obj in [
            Series([1, 2, 3]),
            Series([1.0, 1.5, 3.2]),
            Series([1.0, 1.5, np.nan]),
            Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
            Series(["a", "b", "c"]),
            Series(["a", np.nan, "c"]),
            Series(["a", None, "c"]),
            Series([True, False, True]),
            Index([1, 2, 3]),
            Index([True, False, True]),
            DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
            tm.makeMissingDataframe(),
            tm.makeMixedDataFrame(),
            tm.makeTimeDataFrame(),
            tm.makeTimeSeries(),
            tm.makeTimedeltaIndex(),
        ]:
            self.check_equal(obj)
            self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self):
        # every dtype column of the fixture frame hashes deterministically
        for name, s in self.df.iteritems():
            self.check_equal(s)
            self.check_not_equal_with_index(s)

    def test_hash_pandas_empty_object(self):
        for obj in [Series([], dtype="float64"), Series([], dtype="object"), Index([])]:
            self.check_equal(obj)

            # these are by-definition the same with
            # or w/o the index as the data is empty

    def test_errors(self):
        # unsupported objects must raise TypeError
        for obj in [pd.Timestamp("20130101"), tm.makePanel()]:

            def f():
                # BUG FIX: the original called hash_pandas_object(f) --
                # hashing the closure itself -- so the loop objects were
                # never actually exercised.
                hash_pandas_object(obj)

            self.assertRaises(TypeError, f)

    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list("abc"))
        a = hash_pandas_object(obj, hash_key="9876543210123456")
        b = hash_pandas_object(obj, hash_key="9876543210123465")
        self.assertTrue((a != b).all())

    def test_invalid_key(self):
        # this only matters for object dtypes
        def f():
            hash_pandas_object(Series(list("abc")), hash_key="foo")

        self.assertRaises(ValueError, f)

    def test_unsupported_objects(self):

        # mixed objects are not supported
        obj = Series(["1", 2, 3])

        def f():
            hash_pandas_object(obj)

        self.assertRaises(TypeError, f)

        # MultiIndex are represented as tuples
        obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]))

        def f():
            hash_pandas_object(obj)

        self.assertRaises(TypeError, f)

    def test_alread_encoded(self):
        # if already encoded then ok

        obj = Series(list("abc")).str.encode("utf8")
        self.check_equal(obj)

    def test_alternate_encoding(self):
        # a non-default encoding must still hash deterministically
        obj = Series(list("abc"))
        self.check_equal(obj, encoding="ascii")

    def test_same_len_hash_collisions(self):
        # random same-length strings (odd then even lengths) must not collide
        for l in range(8):
            length = 2 ** (l + 8) + 1
            s = tm.rands_array(length, 2)
            result = hash_array(s, "utf8")
            self.assertFalse(result[0] == result[1])

        for l in range(8):
            length = 2 ** (l + 8)
            s = tm.rands_array(length, 2)
            result = hash_array(s, "utf8")
            self.assertFalse(result[0] == result[1])

    def test_hash_collisions(self):

        # hash collisions are bad
        # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        L = [
            "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9",  # noqa
            "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe",
        ]  # noqa

        # these should be different!
        result1 = hash_array(np.asarray(L[0:1], dtype=object), "utf8")
        expected1 = np.array([14963968704024874985], dtype=np.uint64)
        self.assert_numpy_array_equal(result1, expected1)

        result2 = hash_array(np.asarray(L[1:2], dtype=object), "utf8")
        expected2 = np.array([16428432627716348016], dtype=np.uint64)
        self.assert_numpy_array_equal(result2, expected2)

        result = hash_array(np.asarray(L, dtype=object), "utf8")
        self.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0))
0
class FrameParser(Parser):
    """Parse a JSON payload into ``self.obj`` (a DataFrame), dispatching on
    ``self.orient``, then optionally parse date-like axes and columns."""

    # Default JSON layout.
    _default_orient = "columns"

    def _parse(self):
        """Decode ``self.json`` into ``self.obj``; tries the numpy-backed
        decoder first (when ``self.numpy``) and falls back to the plain
        python path if that raises ValueError."""

        json = self.json
        dtype = self.dtype
        orient = self.orient
        numpy = self.numpy

        if numpy:
            try:
                if orient == "columns":
                    args = loads(json, dtype=dtype, numpy=True, labelled=True)
                    if args:
                        # labelled loads returns (values, row-labels,
                        # col-labels) in column orientation; transpose the
                        # values and swap the label arrays so that
                        # DataFrame(*args) sees (data, index, columns).
                        args = (args[0].T, args[2], args[1])
                    self.obj = DataFrame(*args)
                elif orient == "split":
                    decoded = loads(json, dtype=dtype, numpy=True)
                    # Force plain-str keys so they work as keyword arguments.
                    decoded = dict((str(k), v) for k, v in decoded.iteritems())
                    self.obj = DataFrame(**decoded)
                elif orient == "values":
                    self.obj = DataFrame(loads(json, dtype=dtype, numpy=True))
                else:
                    self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True, labelled=True))
            except ValueError:
                # numpy decoding could not handle this payload; fall back to
                # the pure-python path below.
                numpy = False

        if not numpy:
            if orient == "columns":
                self.obj = DataFrame(loads(json), dtype=dtype)
            elif orient == "split":
                decoded = dict((str(k), v) for k, v in loads(json).iteritems())
                self.obj = DataFrame(dtype=dtype, **decoded)
            elif orient == "index":
                # index-oriented JSON decodes transposed; flip it back.
                self.obj = DataFrame(loads(json), dtype=dtype).T
            else:
                self.obj = DataFrame(loads(json), dtype=dtype)

    def _convert_axes(self):
        """ try to convert axes if they are datelike """
        # The axis that holds the original labels depends on the orient.
        if self.orient == "columns":
            axis = "index"
        elif self.orient == "index":
            axis = "columns"
        else:
            return

        try:
            a = getattr(self.obj, axis)
            setattr(self.obj, axis, self._try_parse_to_date(a))
        except Exception:
            # Best-effort: leave the axis untouched if date parsing fails.
            # (Was a bare ``except:``, which also swallowed SystemExit and
            # KeyboardInterrupt.)
            pass

    def _try_parse_dates(self):
        """Attempt date parsing on columns selected by ``self.parse_dates``
        and/or the default date-like column-name heuristics."""
        if self.obj is None:
            return

        # our columns to parse
        parse_dates = self.parse_dates
        if parse_dates is True:
            parse_dates = []
        parse_dates = set(parse_dates)

        def is_ok(col):
            """ return if this col is ok to try for a date parse """
            if not isinstance(col, basestring):
                return False

            # Heuristic: common date-ish column-name suffixes/names.
            if (
                col.endswith("_at")
                or col.endswith("_time")
                or col.lower() == "modified"
                or col.lower() == "date"
                or col.lower() == "datetime"
            ):
                return True
            return False

        for col, c in self.obj.iteritems():
            if (self.keep_default_dates and is_ok(col)) or col in parse_dates:
                self.obj[col] = self._try_parse_to_date(c)