    def check_str_query_method(self, parser, engine):
        tm.skip_if_no_ne(engine)
        df = DataFrame(randn(10, 1), columns=['b'])
        df['strings'] = Series(list('aabbccddee'))
        expect = df[df.strings == 'a']

        if parser != 'pandas':
            col = 'strings'
            lst = '"a"'

            lhs = [col] * 2 + [lst] * 2
            rhs = lhs[::-1]

            eq, ne = '==', '!='
            ops = 2 * ([eq] + [ne])

            for lhs, op, rhs in zip(lhs, ops, rhs):
                ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs)
                assertRaises(NotImplementedError, df.query, ex, engine=engine,
                             parser=parser, local_dict={'strings': df.strings})
        else:
            res = df.query('"a" == strings', engine=engine, parser=parser)
            assert_frame_equal(res, expect)

            res = df.query('strings == "a"', engine=engine, parser=parser)
            assert_frame_equal(res, expect)
            assert_frame_equal(res, df[df.strings.isin(['a'])])

            expect = df[df.strings != 'a']
            res = df.query('strings != "a"', engine=engine, parser=parser)
            assert_frame_equal(res, expect)

            res = df.query('"a" != strings', engine=engine, parser=parser)
            assert_frame_equal(res, expect)
            assert_frame_equal(res, df[~df.strings.isin(['a'])])
Example #2
    def test_query_with_partially_named_multiindex(self, parser, engine):
        skip_if_no_pandas_parser(parser)
        a = np.random.choice(['red', 'green'], size=10)
        b = np.arange(10)
        index = MultiIndex.from_arrays([a, b])
        index.names = [None, 'rating']
        df = DataFrame(np.random.randn(10, 2), index=index)
        res = df.query('rating == 1', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values('rating').values, index=index,
                     name='rating')
        exp = df[ind == 1]
        assert_frame_equal(res, exp)

        res = df.query('rating != 1', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values('rating').values, index=index,
                     name='rating')
        exp = df[ind != 1]
        assert_frame_equal(res, exp)

        res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values(0).values, index=index)
        exp = df[ind == "red"]
        assert_frame_equal(res, exp)

        res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values(0).values, index=index)
        exp = df[ind != "red"]
        assert_frame_equal(res, exp)
Example #3
    def test_nested_scope(self):
        from pandas.core.computation.ops import UndefinedVariableError
        engine = self.engine
        parser = self.parser
        # smoke test
        x = 1  # noqa
        result = pd.eval('x + 1', engine=engine, parser=parser)
        assert result == 2

        df = DataFrame(np.random.randn(5, 3))
        df2 = DataFrame(np.random.randn(5, 3))

        # don't have the pandas parser
        with pytest.raises(SyntaxError):
            df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)

        with pytest.raises(UndefinedVariableError):
            df.query('(df>0) & (df2>0)', engine=engine, parser=parser)

        expected = df[(df > 0) & (df2 > 0)]
        result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine,
                         parser=parser)
        assert_frame_equal(expected, result)

        expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
        result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]',
                         engine=engine, parser=parser)
        assert_frame_equal(expected, result)
Example #4
    def test_nested_scope(self):
        engine = self.engine
        parser = self.parser

        skip_if_no_pandas_parser(parser)

        df = DataFrame(np.random.randn(5, 3))
        df2 = DataFrame(np.random.randn(5, 3))
        expected = df[(df > 0) & (df2 > 0)]

        result = df.query('(@df > 0) & (@df2 > 0)', engine=engine,
                          parser=parser)
        assert_frame_equal(result, expected)

        result = pd.eval('df[df > 0 and df2 > 0]', engine=engine,
                         parser=parser)
        assert_frame_equal(result, expected)

        result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]',
                         engine=engine, parser=parser)
        expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
        assert_frame_equal(result, expected)

        result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser)
        expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)
        assert_frame_equal(result, expected)
Example #5
    def test_index_resolvers_come_after_columns_with_the_same_name(self):
        n = 1  # noqa
        a = np.r_[20:101:20]

        df = DataFrame({'index': a, 'b': np.random.randn(a.size)})
        df.index.name = 'index'
        result = df.query('index > 5', engine=self.engine, parser=self.parser)
        expected = df[df['index'] > 5]
        assert_frame_equal(result, expected)

        df = DataFrame({'index': a,
                        'b': np.random.randn(a.size)})
        result = df.query('ilevel_0 > 5', engine=self.engine,
                          parser=self.parser)
        expected = df.loc[df.index[df.index > 5]]
        assert_frame_equal(result, expected)

        df = DataFrame({'a': a, 'b': np.random.randn(a.size)})
        df.index.name = 'a'
        result = df.query('a > 5', engine=self.engine, parser=self.parser)
        expected = df[df.a > 5]
        assert_frame_equal(result, expected)

        result = df.query('index > 5', engine=self.engine, parser=self.parser)
        expected = df.loc[df.index[df.index > 5]]
        assert_frame_equal(result, expected)
Example #6
 def test_query_undefined_local(self):
     from pandas.computation.ops import UndefinedVariableError
     engine, parser = self.engine, self.parser
     skip_if_no_pandas_parser(parser)
     df = DataFrame(np.random.rand(10, 2), columns=list('ab'))
     with tm.assertRaisesRegexp(UndefinedVariableError,
                                "local variable 'c' is not defined"):
         df.query('a == @c', engine=engine, parser=parser)
Example #7
    def test_nested_raises_on_local_self_reference(self):
        from pandas.core.computation.ops import UndefinedVariableError

        df = DataFrame(np.random.randn(5, 3))

        # can't reference ourself b/c we're a local so @ is necessary
        with pytest.raises(UndefinedVariableError):
            df.query('df > 0', engine=self.engine, parser=self.parser)
Example #8
    def test_query(self):
        engine, parser = self.engine, self.parser
        df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])

        assert_frame_equal(df.query('a < b', engine=engine, parser=parser),
                           df[df.a < df.b])
        assert_frame_equal(df.query('a + b > b * c', engine=engine,
                                    parser=parser),
                           df[df.a + df.b > df.b * df.c])
Example #9
def update_progress(
        status: dict,
        settings: dict,
        progress: pd.DataFrame,
        queue: pd.DataFrame,
        passengers: pd.DataFrame,
        **kwargs
) -> pd.DataFrame:
    """
    :param status:
        The current status of the simulation
    :param settings:
        Configuration settings for the current trial
    :param progress:
        The current progress data frame for the snapshot, which will be
        replaced by the one returned
    :param queue:
        The queue data frame for the trial
    :param passengers:
        The passengers data frame for the trial
    """

    row = dict(
        progress=status['progress'],
        time=status['time']
    )

    aisle_vacancies = []

    for queue_index in range(queue.shape[0]):
        queue_item = queue.loc[queue_index]
        passenger_index = queue_item['passenger']

        if queue_item['aisle'] >= 0:
            aisle_vacancies.append(0 if passenger_index is None else 1)

        if passenger_index is not None:
            key = 'p_{}'.format(passenger_index)
            row[key] = 'Q:{}'.format(queue_item['aisle'])

    seated = passengers.query('delay_interchange > 0')
    for passenger_index, passenger in seated.iterrows():
        key = 'p_{}'.format(passenger_index)
        row[key] = '{}:{}'.format(
            'O' if passenger['seated'] else 'I',
            passenger['aisle']
        )

    seated = passengers.query('seated and delay_interchange == 0')
    for passenger_index, passenger in seated.iterrows():
        key = 'p_{}'.format(passenger_index)
        row[key] = 'S:{}'.format(passenger['aisle'])

    row['aisle_density'] = sum(aisle_vacancies)/len(aisle_vacancies)

    return progress.append(row, ignore_index=True)
Example #10
    def test_query_doesnt_pickup_local(self):
        from pandas.core.computation.ops import UndefinedVariableError

        engine, parser = self.engine, self.parser
        n = m = 10
        df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc'))

        # we don't pick up the local 'sin'
        with pytest.raises(UndefinedVariableError):
            df.query('sin > 5', engine=engine, parser=parser)
Example #11
    def test_query_undefined_local(self):
        from pandas.core.computation.ops import UndefinedVariableError
        engine, parser = self.engine, self.parser
        skip_if_no_pandas_parser(parser)

        df = DataFrame(np.random.rand(10, 2), columns=list('ab'))
        msg = "local variable 'c' is not defined"

        with pytest.raises(UndefinedVariableError, match=msg):
            df.query('a == @c', engine=engine, parser=parser)
Example #12
 def test_date_index_query_with_NaT_duplicates(self):
     engine, parser = self.engine, self.parser
     n = 10
     df = DataFrame(np.random.randn(n, 3))
     df['dates1'] = date_range('1/1/2012', periods=n)
     df['dates3'] = date_range('1/1/2014', periods=n)
     df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
     df.set_index('dates1', inplace=True, drop=True)
     with pytest.raises(NotImplementedError):
         df.query('index < 20130101 < dates3', engine=engine, parser=parser)
Example #13
    def test_query_builtin(self):
        from pandas.core.computation.engines import NumExprClobberingError
        engine, parser = self.engine, self.parser

        n = m = 10
        df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc'))

        df.index.name = 'sin'
        msg = 'Variables in expression.+'
        with pytest.raises(NumExprClobberingError, match=msg):
            df.query('sin > 5', engine=engine, parser=parser)
Example #14
    def test_object_array_eq_ne(self, parser, engine):
        df = DataFrame({'a': list('aaaabbbbcccc'),
                        'b': list('aabbccddeeff'),
                        'c': np.random.randint(5, size=12),
                        'd': np.random.randint(9, size=12)})
        res = df.query('a == b', parser=parser, engine=engine)
        exp = df[df.a == df.b]
        assert_frame_equal(res, exp)

        res = df.query('a != b', parser=parser, engine=engine)
        exp = df[df.a != df.b]
        assert_frame_equal(res, exp)
Example #15
    def test_date_query_with_non_date(self):
        engine, parser = self.engine, self.parser

        n = 10
        df = DataFrame({'dates': date_range('1/1/2012', periods=n),
                        'nondate': np.arange(n)})

        ops = '==', '!=', '<', '>', '<=', '>='

        for op in ops:
            with tm.assertRaises(TypeError):
                df.query('dates %s nondate' % op, parser=parser, engine=engine)
Example #16
    def test_local_syntax(self):
        skip_if_no_pandas_parser(self.parser)

        engine, parser = self.engine, self.parser
        df = DataFrame(np.random.randn(100, 10), columns=list('abcdefghij'))
        b = 1
        expect = df[df.a < b]
        result = df.query('a < @b', engine=engine, parser=parser)
        assert_frame_equal(result, expect)

        expect = df[df.a < df.b]
        result = df.query('a < b', engine=engine, parser=parser)
        assert_frame_equal(result, expect)
Example #17
    def test_query_index_with_name(self):
        engine, parser = self.engine, self.parser
        df = DataFrame(np.random.randint(10, size=(10, 3)),
                       index=Index(range(10), name='blob'),
                       columns=['a', 'b', 'c'])
        res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser)
        expec = df[(df.index < 5) & (df.a < df.b)]
        assert_frame_equal(res, expec)

        res = df.query('blob < b', engine=engine, parser=parser)
        expec = df[df.index < df.b]

        assert_frame_equal(res, expec)
Example #18
    def test_query_index_without_name(self):
        engine, parser = self.engine, self.parser
        df = DataFrame(np.random.randint(10, size=(10, 3)),
                       index=range(10), columns=['a', 'b', 'c'])

        # "index" should refer to the index
        res = df.query('index < b', engine=engine, parser=parser)
        expec = df[df.index < df.b]
        assert_frame_equal(res, expec)

        # test against a scalar
        res = df.query('index < 5', engine=engine, parser=parser)
        expec = df[df.index < 5]
        assert_frame_equal(res, expec)
Example #19
    def test_local_variable_with_in(self):
        engine, parser = self.engine, self.parser
        skip_if_no_pandas_parser(parser)
        a = Series(np.random.randint(3, size=15), name='a')
        b = Series(np.random.randint(10, size=15), name='b')
        df = DataFrame({'a': a, 'b': b})

        expected = df.loc[(df.b - 1).isin(a)]
        result = df.query('b - 1 in a', engine=engine, parser=parser)
        assert_frame_equal(expected, result)

        b = Series(np.random.randint(10, size=15), name='b')
        expected = df.loc[(b - 1).isin(a)]
        result = df.query('@b - 1 in a', engine=engine, parser=parser)
        assert_frame_equal(expected, result)
Example #20
def QA_fetch_stock_min_adv(code, start, end, type_='1min', if_drop_index=False, collections=QA_Setting.client.quantaxis.stock_min):
    'Fetch stock minute-bar data'
    if type_ in ['1min', '1m']:
        type_ = '1min'
    elif type_ in ['5min', '5m']:
        type_ = '5min'
    elif type_ in ['15min', '15m']:
        type_ = '15min'
    elif type_ in ['30min', '30m']:
        type_ = '30min'
    elif type_ in ['60min', '60m']:
        type_ = '60min'
    __data = []
    for item in collections.find({
        'code': str(code), "time_stamp": {
            "$gte": QA_util_time_stamp(start),
            "$lte": QA_util_time_stamp(end)
        }, 'type': type_
    }):

        __data.append([str(item['code']), float(item['open']), float(item['high']), float(
            item['low']), float(item['close']), float(item['vol']), item['datetime'], item['time_stamp'], item['date']])

    __data = DataFrame(__data, columns=[
        'code', 'open', 'high', 'low', 'close', 'volume', 'datetime', 'time_stamp', 'date'])

    __data['datetime'] = pd.to_datetime(__data['datetime'])
    return QA_DataStruct_Stock_min(__data.query('volume>1').set_index(['datetime', 'code'], drop=if_drop_index))
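A minimal usage sketch for the fetcher above; the stock code, date range and bar type are illustrative values, and the call assumes a reachable QUANTAXIS MongoDB collection as configured in QA_Setting.

# Hypothetical call: fetch 5-minute bars for one code over two days and get
# back a QA_DataStruct_Stock_min indexed by (datetime, code).
bars = QA_fetch_stock_min_adv('000001', '2019-01-02', '2019-01-03', type_='5min')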
Example #21
    def from_contract_description(cls, contracts: pd.DataFrame, position, premium, option_type=None,
                                  strike_price=None, underlying_asset=None, expiry=None, quantity=1):
        queries = []

        if option_type is not None:
            if option_type == OptionType.Call:
                queries.append("Right=='C'")
            elif option_type == OptionType.Put:
                queries.append("Right=='P'")

        if strike_price is not None:
            queries.append("Strike==" + str(strike_price))

        if underlying_asset is not None:
            queries.append("Symbol=='{}'".format(underlying_asset))

        if expiry is not None:
            queries.append("Expiry=='{}'".format(expiry))  # 'Expiry' is an assumed column name

        query = None
        for q in queries:
            if query is None:
                query = q
            else:
                query = query + " and " + q

        selected_contract = contracts.query(query)
        if selected_contract.shape[0] > 1:
            raise ValueError()
        else:
            return cls.from_ConId(contracts, selected_contract.index[0], position, premium, quantity)
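For reference, the accumulation loop above simply joins the collected clauses with " and "; a short sketch of the resulting expression, with illustrative values:

# Equivalent to the loop above (clause values are made up).
queries = ["Right=='C'", "Strike==310.0", "Symbol=='SPY'"]
query = " and ".join(queries)
# query == "Right=='C' and Strike==310.0 and Symbol=='SPY'"
# contracts.query(query) then narrows the contracts frame to the single match.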
Example #22
    def open(self, slug):
        columns = json.loads(MyBucket.get('{}-columns'.format(slug)).data)
        fields = columns
        if self.get_argument('fields', None):
            fields = self.get_argument('fields').split(',')

        self.write_message({'type': 'columns', 'data': fields})

        filters = [i[0] for i in self.request.arguments.iteritems()
                   if len(i[0].split('filter__')) > 1]

        df = DataFrame(MyBucket.get(slug).data, columns=fields)
        if len(filters) >= 1:
            for f in filters:
                df = df.query(df_generate(df, self.get_argument, f))

        ca = None
        for e in MyAdminBucket.get('element').data:
            if e['slug'] == slug:
                ca = e['categories']

        categories = []
        for i in df.to_dict(outtype='records'):
            if ca:
                categories.append(i[ca])
            self.write_message({'type': 'data', 'data': i})

        self.write_message({'type': 'categories', 'data': categories})
        self.write_message({'type': 'close'})
Example #23
 def test_query_with_nested_special_character(self, parser, engine):
     skip_if_no_pandas_parser(parser)
     df = DataFrame({'a': ['a', 'b', 'test & test'],
                     'b': [1, 2, 3]})
     res = df.query('a == "test & test"', parser=parser, engine=engine)
     expec = df[df.a == 'test & test']
     assert_frame_equal(res, expec)
Example #24
    def post(self, slug):
        columns = json.loads(MyBucket.get('{}-columns'.format(slug)).data)
        fields = columns
        if self.get_argument('fields', None):
            fields = self.get_argument('fields').split(',')

        filters = [i[0] for i in self.request.arguments.iteritems()
                   if len(i[0].split('filter__')) > 1]

        fields_json = json.dumps(fields)
        filters_json = json.dumps({f: self.get_argument(f) for f in filters})
        if MyCache.get(str(slug)) and\
                MyCache.get('{}-columns'.format(slug)) == fields_json and\
                MyCache.get('{}-filters'.format(slug)) == filters_json:
            self.write(MyCache.get(str(slug)))
            self.finish()
            return

        MyCache.set('{}-columns'.format(slug), fields_json)
        MyCache.set('{}-filters'.format(slug), filters_json)

        df = DataFrame(MyBucket.get(slug).data, columns=fields)
        if len(filters) >= 1:
            for f in filters:
                df = df.query(df_generate(df, self.get_argument, f))
        convert = df.to_dict(outtype='records')

        write = json.dumps({'columns': fields, 'json': convert})
        MyCache.set(str(slug), write)
        self.write(write)
        self.finish()
Example #25
 def test_query_single_element_booleans(self, parser, engine):
     columns = 'bid', 'bidsize', 'ask', 'asksize'
     data = np.random.randint(2, size=(1, len(columns))).astype(bool)
     df = DataFrame(data, columns=columns)
     res = df.query('bid & ask', engine=engine, parser=parser)
     expected = df[df.bid & df.ask]
     assert_frame_equal(res, expected)
Example #26
    def test_date_query_with_non_date(self):
        engine, parser = self.engine, self.parser

        n = 10
        df = DataFrame({'dates': date_range('1/1/2012', periods=n),
                        'nondate': np.arange(n)})

        result = df.query('dates == nondate', parser=parser, engine=engine)
        assert len(result) == 0

        result = df.query('dates != nondate', parser=parser, engine=engine)
        assert_frame_equal(result, df)

        for op in ['<', '>', '<=', '>=']:
            with pytest.raises(TypeError):
                df.query('dates %s nondate' % op, parser=parser, engine=engine)
Example #27
 def f(self, f):
     """
     Filter trades based on conditions
     f
         Any valid pandas dataframe query
     """
     df = DataFrame(self._trades)
     return df.query(f)
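A small sketch of what a valid query string for the filter above looks like; the trade columns are made up for illustration.

import pandas as pd

trades = pd.DataFrame({'symbol': ['AAA', 'BBB', 'AAA'],
                       'qty': [100, 50, 200],
                       'price': [10.5, 20.0, 9.8]})
# Any expression accepted by DataFrame.query works as the filter argument:
trades.query("symbol == 'AAA' and qty > 150")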
Example #28
 def f(self, query):
     """
     Filter data based on query
     query
         A valid pandas dataframe query
     """
     df = DataFrame(self._cash)
     return df.query(query)
Example #29
def create_trackway_info(
        trackway_name: str,
        trackway_df: pd.DataFrame,
        **kwargs
) -> dict:
    """
    Creates information about the trackway
    """

    pes_df = trackway_df.query('is_pes == True')
    manus_df = trackway_df.query('is_pes == False')

    return dict(
        trackway=trackway_name,
        pes_count=len(pes_df),
        manus_count=len(manus_df)
    )
Example #30
 def test_at_inside_string(self):
     engine, parser = self.engine, self.parser
     skip_if_no_pandas_parser(parser)
     c = 1  # noqa
     df = DataFrame({'a': ['a', 'a', 'b', 'b', '@c', '@c']})
     result = df.query('a == "@c"', engine=engine, parser=parser)
     expected = df[df.a == "@c"]
     assert_frame_equal(result, expected)
Example #31
    def test_query_scope(self):
        from pandas.core.computation.ops import UndefinedVariableError
        engine, parser = self.engine, self.parser
        skip_if_no_pandas_parser(parser)

        df = DataFrame(np.random.randn(20, 2), columns=list('ab'))

        a, b = 1, 2  # noqa
        res = df.query('a > b', engine=engine, parser=parser)
        expected = df[df.a > df.b]
        assert_frame_equal(res, expected)

        res = df.query('@a > b', engine=engine, parser=parser)
        expected = df[a > df.b]
        assert_frame_equal(res, expected)

        # no local variable c
        with pytest.raises(UndefinedVariableError):
            df.query('@a > b > @c', engine=engine, parser=parser)

        # no column named 'c'
        with pytest.raises(UndefinedVariableError):
            df.query('@a > b > c', engine=engine, parser=parser)
Example #32
    def _calc_confusion_matrix_terminology(self, user_merged: pd.DataFrame, cutoff: int = None):
        if self.relevant_threshold is None:
            relevant_threshold = user_merged['score_truth'].mean()
        else:
            relevant_threshold = self.relevant_threshold

        if cutoff:
            # We consider as 'not_predicted' also those excluded from cutoff other than those
            # not effectively retrieved (score_pred is nan)
            actually_predicted = user_merged.query('score_pred.notna()', engine='python')[:cutoff]
            not_predicted = user_merged.query('score_pred.notna()', engine='python')[cutoff:]
            if not user_merged.query('score_pred.isna()', engine='python').empty:
                not_predicted = pd.concat([not_predicted, user_merged.query('score_pred.isna()', engine='python')])
        else:
            actually_predicted = user_merged.query('score_pred.notna()', engine='python')
            not_predicted = user_merged.query('score_pred.isna()', engine='python')

        tp = len(actually_predicted.query('score_truth >= @relevant_threshold'))
        fp = len(actually_predicted.query('(score_truth < @relevant_threshold) or (score_truth.isna())', engine='python'))
        tn = len(not_predicted.query('score_truth < @relevant_threshold'))
        fn = len(not_predicted.query('score_truth >= @relevant_threshold'))

        return tp, fp, tn, fn
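A usage sketch for the four counts returned above; the precision/recall/F1 formulas are the standard definitions, not part of the original class.

# Illustrative counts in place of a real (tp, fp, tn, fn) result.
tp, fp, tn, fn = 40, 10, 35, 15
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0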
Example #33
    def test_query_with_string_columns(self, parser, engine):
        df = DataFrame({
            "a": list("aaaabbbbcccc"),
            "b": list("aabbccddeeff"),
            "c": np.random.randint(5, size=12),
            "d": np.random.randint(9, size=12),
        })
        if parser == "pandas":
            res = df.query("a in b", parser=parser, engine=engine)
            expec = df[df.a.isin(df.b)]
            tm.assert_frame_equal(res, expec)

            res = df.query("a in b and c < d", parser=parser, engine=engine)
            expec = df[df.a.isin(df.b) & (df.c < df.d)]
            tm.assert_frame_equal(res, expec)
        else:
            msg = r"'(Not)?In' nodes are not implemented"
            with pytest.raises(NotImplementedError, match=msg):
                df.query("a in b", parser=parser, engine=engine)

            msg = r"'BoolOp' nodes are not implemented"
            with pytest.raises(NotImplementedError, match=msg):
                df.query("a in b and c < d", parser=parser, engine=engine)
Example #34
    def test_str_list_query_method(self, parser, engine):
        df = DataFrame(np.random.randn(10, 1), columns=['b'])
        df['strings'] = Series(list('aabbccddee'))
        expect = df[df.strings.isin(['a', 'b'])]

        if parser != 'pandas':
            col = 'strings'
            lst = '["a", "b"]'

            lhs = [col] * 2 + [lst] * 2
            rhs = lhs[::-1]

            eq, ne = '==', '!='
            ops = 2 * ([eq] + [ne])

            for lhs, op, rhs in zip(lhs, ops, rhs):
                ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs)
                with pytest.raises(NotImplementedError):
                    df.query(ex, engine=engine, parser=parser)
        else:
            res = df.query('strings == ["a", "b"]', engine=engine,
                           parser=parser)
            assert_frame_equal(res, expect)

            res = df.query('["a", "b"] == strings', engine=engine,
                           parser=parser)
            assert_frame_equal(res, expect)

            expect = df[~df.strings.isin(['a', 'b'])]

            res = df.query('strings != ["a", "b"]', engine=engine,
                           parser=parser)
            assert_frame_equal(res, expect)

            res = df.query('["a", "b"] != strings', engine=engine,
                           parser=parser)
            assert_frame_equal(res, expect)
Example #35
    def test_query_scope(self):
        engine, parser = self.engine, self.parser
        skip_if_no_pandas_parser(parser)

        df = DataFrame(np.random.randn(20, 2), columns=list("ab"))

        a, b = 1, 2  # noqa:F841
        res = df.query("a > b", engine=engine, parser=parser)
        expected = df[df.a > df.b]
        tm.assert_frame_equal(res, expected)

        res = df.query("@a > b", engine=engine, parser=parser)
        expected = df[a > df.b]
        tm.assert_frame_equal(res, expected)

        # no local variable c
        with pytest.raises(UndefinedVariableError,
                           match="local variable 'c' is not defined"):
            df.query("@a > b > @c", engine=engine, parser=parser)

        # no column named 'c'
        with pytest.raises(UndefinedVariableError,
                           match="name 'c' is not defined"):
            df.query("@a > b > c", engine=engine, parser=parser)
Example #36
    def test_str_query_method(self, parser, engine):
        df = DataFrame(np.random.randn(10, 1), columns=['b'])
        df['strings'] = Series(list('aabbccddee'))
        expect = df[df.strings == 'a']

        if parser != 'pandas':
            col = 'strings'
            lst = '"a"'

            lhs = [col] * 2 + [lst] * 2
            rhs = lhs[::-1]

            eq, ne = '==', '!='
            ops = 2 * ([eq] + [ne])

            for lhs, op, rhs in zip(lhs, ops, rhs):
                ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs)
                msg = r"'(Not)?In' nodes are not implemented"
                with pytest.raises(NotImplementedError, match=msg):
                    df.query(ex, engine=engine, parser=parser,
                             local_dict={'strings': df.strings})
        else:
            res = df.query('"a" == strings', engine=engine, parser=parser)
            assert_frame_equal(res, expect)

            res = df.query('strings == "a"', engine=engine, parser=parser)
            assert_frame_equal(res, expect)
            assert_frame_equal(res, df[df.strings.isin(['a'])])

            expect = df[df.strings != 'a']
            res = df.query('strings != "a"', engine=engine, parser=parser)
            assert_frame_equal(res, expect)

            res = df.query('"a" != strings', engine=engine, parser=parser)
            assert_frame_equal(res, expect)
            assert_frame_equal(res, df[~df.strings.isin(['a'])])
Example #37
def select(dataframe: pd.DataFrame, query: str, **where: str) -> pd.DataFrame:
    query = ' '.join(query.format(**where).splitlines())
    return dataframe.query(query).dropna(axis=1, how='all').dropna(axis=0,
                                                                   how='all')
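A minimal sketch of how the **where placeholders feed the query template above; the frame and column names are illustrative.

import pandas as pd

df = pd.DataFrame({'age': [25, 41, 37], 'city': ['Oslo', 'Bergen', 'Oslo']})
# str.format substitutes the keyword values verbatim into the template,
# so the expression below becomes "age > 30 and city == 'Oslo'".
select(df, "age > {min_age} and city == '{city}'", min_age="30", city="Oslo")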
Example #38
def __trading_energy_generator(df: pd.DataFrame,
                               date: date,
                               duid_id: str,
                               power_field: str = "generated") -> pd.DataFrame:
    return_cols = []

    t_start = datetime(date.year,
                       date.month,
                       date.day,
                       0,
                       5,
                       tzinfo=NetworkNEM.get_fixed_offset())

    # 48 trading intervals in the day
    # (could be better with groupby function)
    for TI in range(48):
        # t_i initial timestamp of trading_interval, t_f = final timestamp of trading interval
        t_i = t_start + timedelta(0, 1800 * TI)
        t_f = t_start + timedelta(0, 1800 * (TI + 1))

        _query = f"'{t_i}' <= trading_interval <= '{t_f}' and facility_code == '{duid_id}'"

        d_ti = df.query(_query)

        energy_value = None
        trading_interval = None

        # rooftop 30m intervals - AEMO rooftop is going to go in a separate network
        # so this won't be required
        if (d_ti.fueltech_id.all()
                == "solar_rooftop") and (d_ti[power_field].count() == 1):
            energy_value = d_ti[power_field].sum() / 2
            # ooofff - this delta comes back off as part of NEM offset
            trading_interval = d_ti.index[0] + timedelta(minutes=5)
        # interpolate if it isn't padded out
        elif d_ti[power_field].count() != 7:
            index_interpolated = pd.date_range(start=t_i,
                                               end=t_f,
                                               freq="5min",
                                               tz=NetworkNEM.get_timezone())

            d_ti = d_ti.reset_index()
            d_ti = d_ti.set_index("trading_interval")
            d_ti = d_ti.reindex(index_interpolated)
            d_ti["facility_code"] = duid_id
            d_ti[power_field] = d_ti[power_field].replace(np.NaN, 0)

            if d_ti[power_field].count() != 7:
                logger.warn("Interpolated frame didn't match generated count")

        try:
            if d_ti.fueltech_id.all() != "solar_rooftop":
                energy_value = __trapezium_integration(d_ti, power_field)
                trading_interval = d_ti.index[-2]
        except ValueError as e:
            logger.error("Error with {} at {} {}: {}".format(
                duid_id, t_i, t_f, e))

        if not d_ti.index.empty:
            return_cols.append({
                "trading_interval": trading_interval,
                "network_id": "NEM",
                "facility_code": duid_id,
                "eoi_quantity": energy_value,
            })

    return return_cols
Example #39
def get_normalized_policy_shifts_and_current_policy_all_countries(
        policy_data_countries: pd.DataFrame,
        past_parameters: pd.DataFrame) -> (dict, dict):
    """
    Computes the normalized policy shifts and the current policy in each area of the world except the US
    (done in a separate function)
    :param policy_data_countries: processed dataframe with the MECE policies implemented per area for every day
    :param past_parameters: past parameters file used for policy shift generation (specifically computation of gamma(t)
    values in the process)
    :return: a tuple of two dictionaries, {policy: normalized_shift_float_international} and {area: current_policy}
    """
    dict_current_policy = {}
    policy_list = future_policies
    policy_data_countries["country_cl"] = policy_data_countries[
        "country"].apply(lambda x: x.replace(",", "").strip().lower())
    past_parameters_copy = deepcopy(past_parameters)
    past_parameters_copy["Country"] = past_parameters_copy["Country"].apply(
        lambda x: str(x).replace(",", "").strip().lower())
    params_countries = past_parameters_copy["Country"]
    params_countries = set(params_countries)
    policy_data_countries_bis = policy_data_countries.query(
        "country_cl in @params_countries")
    countries_upper_set = set(policy_data_countries[
        policy_data_countries.country != "US"]["country"])
    # countries_in_oxford_and_params = params_countries.intersection(countries_upper_set)
    for country in countries_upper_set:
        dict_current_policy[(country, "None")] = list(
            compress(
                policy_list,
                (policy_data_countries.query("country == @country")
                 [policy_data_countries.query("country == @country")["date"] ==
                  policy_data_countries.query("country == @country").date.max(
                  )][policy_list] == 1).values.flatten().tolist(),
            ))[0]
    countries_common = sorted([x.lower() for x in countries_upper_set])
    pastparam_tuples_in_oxford = past_parameters_copy[
        (past_parameters_copy.Country.isin(countries_common))
        & (past_parameters_copy.Province != "None")].reset_index(drop=True)
    pastparam_tuples_in_oxford["tuple_name"] = list(
        zip(pastparam_tuples_in_oxford.Country,
            pastparam_tuples_in_oxford.Province))
    for tuple in pastparam_tuples_in_oxford.tuple_name.unique():
        country, province = tuple
        country = country[0].upper() + country[1:]
        dict_current_policy[(country,
                             province)] = dict_current_policy[(country,
                                                               "None")]

    countries_set = set(policy_data_countries["country_cl"])

    params_dic = {}
    countries_set = countries_set.intersection(params_countries)
    for country in countries_set:
        params_dic[country] = past_parameters_copy.query(
            "Country == @country")[[
                "Data Start Date", "Median Day of Action", "Rate of Action"
            ]].iloc[0]

    policy_data_countries_bis["Gamma"] = [
        gamma_t(day, country, params_dic)
        for day, country in zip(policy_data_countries_bis["date"],
                                policy_data_countries_bis["country_cl"])
    ]
    n_measures = policy_data_countries_bis.iloc[:, 3:-2].shape[1]
    dict_normalized_policy_gamma = {
        policy_data_countries_bis.columns[3 + i]: policy_data_countries_bis[
            policy_data_countries_bis.iloc[:, 3 + i] == 1].iloc[:, -1].mean()
        for i in range(n_measures)
    }
    normalize_val = dict_normalized_policy_gamma[policy_list[0]]
    for policy in dict_normalized_policy_gamma.keys():
        dict_normalized_policy_gamma[policy] = (
            dict_normalized_policy_gamma[policy] / normalize_val)

    return dict_normalized_policy_gamma, dict_current_policy
Example #40
def remove_forfeits(df: pd.DataFrame) -> pd.DataFrame:
    return df.query("score != 'Forfeit'")
Example #41
def _split_ber_by_year_of_construction(
    df: pd.DataFrame,
    condition: str,
) -> pd.DataFrame:

    return df.query(condition).drop_duplicates()
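A sketch of the kind of condition string the helper above expects; the column name year_of_construction is assumed for illustration.

import pandas as pd

ber = pd.DataFrame({'year_of_construction': [1971, 1985, 1985, 2004]})
# Keep deduplicated rows built in the 1980s (pandas query supports chained comparisons):
_split_ber_by_year_of_construction(ber, "1980 <= year_of_construction < 1990")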
Example #42
db = client['spam_database']
collection = db.spam_clean2
dataframe_spam_clean = DataFrame(list(collection.find()))

# In[3]:

len(dataframe_spam_clean)

# In[4]:

len(dataframe_ham_clean)

# In[5]:

dataframe_spam_clean.query('ContentType_body == ["text/html"]')[[
    'Subject', 'ContentType_body', 'body', 'body_text_normalize'
]]

# In[6]:

from pandas import DataFrame
from pymongo import MongoClient

client = MongoClient('mongodb://192.168.67.90:27017')
client.database_names()
db = client['spam_database']
collection = db.spam_clean
dataframe_spam_clean = DataFrame(list(collection.find()))

# In[7]:
Example #43
    def test_query_with_unnamed_multiindex(self, parser, engine):
        skip_if_no_pandas_parser(parser)
        a = np.random.choice(["red", "green"], size=10)
        b = np.random.choice(["eggs", "ham"], size=10)
        index = MultiIndex.from_arrays([a, b])
        df = DataFrame(np.random.randn(10, 2), index=index)
        ind = Series(df.index.get_level_values(0).values, index=index)

        res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
        res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
        exp = df[ind == "red"]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        # inequality
        res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
        res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine)
        exp = df[ind != "red"]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        # list equality (really just set membership)
        res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine)
        res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine)
        exp = df[ind.isin(["red"])]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine)
        res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine)
        exp = df[~ind.isin(["red"])]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        # in/not in ops
        res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine)
        res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine)
        exp = df[ind.isin(["red"])]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        res1 = df.query('["red"] not in ilevel_0',
                        parser=parser,
                        engine=engine)
        res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine)
        exp = df[~ind.isin(["red"])]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        # ## LEVEL 1
        ind = Series(df.index.get_level_values(1).values, index=index)
        res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine)
        res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine)
        exp = df[ind == "eggs"]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        # inequality
        res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine)
        res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine)
        exp = df[ind != "eggs"]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        # list equality (really just set membership)
        res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine)
        res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine)
        exp = df[ind.isin(["eggs"])]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine)
        res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine)
        exp = df[~ind.isin(["eggs"])]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        # in/not in ops
        res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine)
        res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine)
        exp = df[ind.isin(["eggs"])]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)

        res1 = df.query('["eggs"] not in ilevel_1',
                        parser=parser,
                        engine=engine)
        res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine)
        exp = df[~ind.isin(["eggs"])]
        tm.assert_frame_equal(res1, exp)
        tm.assert_frame_equal(res2, exp)
Example #44
    def get_context_data(self, **kwargs):
        context = super(StayDetail, self).get_context_data(**kwargs)
        stays = self.stays

        hotel_1_id = int(kwargs['hotel_1_id'])
        hotel_2_id = int(kwargs.get('hotel_2_id', 0))
        check_in_2 = self.kwargs.get('check_in_2')

        query = 'hotel_1_id == @hotel_1_id'

        max_switch_count = stays['switch_count'].max()

        if max_switch_count > 0:  # pragma: no cover
            query = query + \
                '& ((hotel_2_id == @hotel_2_id & check_in_2 == @check_in_2) \
                     | (@hotel_2_id == 0 & switch_count == 0))'

        stay = stays.query(query)

        check_out_1 = check_in_2 = datetime.strptime(
            stay['check_out_1'].values[0], '%Y-%m-%d')

        stay = stay.to_dict('records')[0]

        facilities = HotelbedsFacility.objects.all().iterator()

        facilities = [{
            'code': facility.code,
            'group': facility.group,
            'description': facility.description,
        } for facility in facilities]

        facilities = DataFrame(facilities)

        hotel = Hotel.objects.get(hotel_id=hotel_1_id)
        hotels = [hotel]

        try:  # pragma: no cover
            hotel_facilities = DataFrame(hotel.facilities)
            hotel_facilities.query('available == True', inplace=True)
            hotel_facilities = merge(hotel_facilities,
                                     facilities,
                                     on=['code', 'group'])

            hotel_facility_lists = [hotel_facilities.to_dict('records')]
        except Exception:
            hotel_facility_lists = []

        rooms = [
            HotelbedsRoom.objects.get(code=stay['room_type_1']).description
        ]
        boards = [HotelbedsBoard.objects.get(code=stay['board_1']).description]
        if hotel_2_id > 0:  # pragma: no cover
            hotel = Hotel.objects.get(hotel_id=hotel_2_id)
            hotels.append(hotel)

            try:  # pragma: no cover
                hotel_facilities = DataFrame(hotel.facilities)
                hotel_facilities.query('available == True', inplace=True)
                hotel_facilities = merge(hotel_facilities,
                                         facilities,
                                         on=['code', 'group'])

                hotel_facility_lists.append(
                    hotel_facilities.to_dict('records'))
            except Exception:
                pass

            rooms.append(
                HotelbedsRoom.objects.get(
                    code=stay['room_type_2']).description)
            boards.append(
                HotelbedsBoard.objects.get(code=stay['board_2']).description)

        # Stored separately for passing to JS
        galleria_images = self.parse_hotel_images(hotels)

        try:  # pragma: no cover
            tripadvisor_reviews = [
                tripadvisor.get_tripadvisor_review(
                    hotel.tripadvisor.tripadvisor) for hotel in hotels
            ]
        except AttributeError:
            tripadvisor_reviews = []

        facilities = {}
        for idx, hotel in enumerate(hotels):
            lst = []
            try:
                lst = hotel_facility_lists[idx]
            except IndexError:
                pass
            facilities[hotel.hotel_id] = lst

        context.update({
            'stay': stay,
            'hotels': hotels,
            'galleria_images': galleria_images,
            'rooms': rooms,
            'boards': boards,
            'facilities': facilities,
            'check_out_1': check_out_1,
            'check_in_2': check_in_2,
            'tripadvisor_reviews': tripadvisor_reviews,
            'blocked_countries': settings.BLOCKED_COUNTRIES
        })

        return context
Example #45
    def _extract_process_step_statistics(self, process_steps: pd.DataFrame, machine_utilization: pd.DataFrame, data_transfers: pd.DataFrame) -> pd.DataFrame:
        data: pd.DataFrame = pd.DataFrame({PROCESS_STEP_NAME: [],
                                           PROCESS_STEP_START: [],
                                           PROCESS_STEP_END: [],
                                           PROCESS_STEP_ABS_DUR: [],
                                           PROCESS_STEP_REL_DUR: [],
                                           STEP_STATS_SUM_ABS_DURATION: [],
                                           STEP_STATS_SUM_REL_DURATION: [],
                                           STEP_STATS_AVG_CPU: [],
                                           STEP_STATS_AVG_ABS_MEM: [],
                                           STEP_STATS_AVG_REL_MEM: [],
                                           STEP_STATS_MAX_ABS_MEM: [],
                                           STEP_STATS_MAX_REL_MEM: [],
                                           STEP_STATS_TOT_ABS_IDLE_TIME: [],
                                           STEP_STATS_TOT_REL_IDLE_TIME: [],
                                           STEP_STATS_AVG_ABS_IDLE_TIME: [],
                                           STEP_STATS_AVG_REL_IDLE_TIME: [],
                                           STEP_STATS_MAX_ABS_IDLE_TIME: [],
                                           STEP_STATS_MAX_REL_IDLE_TIME: [],
                                           STEP_STATS_TOT_ABS_BUSY_TIME: [],
                                           STEP_STATS_TOT_REL_BUSY_TIME: [],
                                           STEP_STATS_AVG_ABS_BUSY_TIME: [],
                                           STEP_STATS_AVG_REL_BUSY_TIME: [],
                                           STEP_STATS_MAX_ABS_BUSY_TIME: [],
                                           STEP_STATS_MAX_REL_BUSY_TIME: []})

        processor_step_stats: pd.DataFrame = pd.DataFrame({PROCESS_STEP_NAME: [],
                                                           PROCESS_STEP_START: [],
                                                           PROCESS_STEP_END: [],
                                                           PROCESS_STEP_ABS_DUR: [],
                                                           PROCESS_STEP_REL_DUR: [],
                                                           PROCESSOR_STEP_STATS_PROC_NAME: [],
                                                           PROCESSOR_STEP_STATS_TOT_ABS_IDLE_TIME: [],
                                                           PROCESSOR_STEP_STATS_TOT_REL_IDLE_TIME: [],
                                                           PROCESSOR_STEP_STATS_TOT_ABS_BUSY_TIME: [],
                                                           PROCESSOR_STEP_STATS_TOT_REL_BUSY_TIME: [],
                                                           PROCESSOR_STEP_STATS_MAX_ABS_MEM: [],
                                                           PROCESSOR_STEP_STATS_MAX_REL_MEM: []})

        num_processors: int = len(self._events)
        for i in range(process_steps.shape[0]):
            step_name: str = process_steps.at[i, PROCESS_STEP_NAME]
            start: float = process_steps.at[i, PROCESS_STEP_START]
            end: float = process_steps.at[i, PROCESS_STEP_END]
            duration: float = process_steps.at[i, PROCESS_STEP_ABS_DUR]
            summed_duration: float = duration * num_processors

            step_machine_utilization: pd.DataFrame = machine_utilization.query(f"{MACHINE_UTILIZATION_TIME} >= {start} & {MACHINE_UTILIZATION_TIME} <= {end}")
            # step_data_transfer: pd.DataFrame = data_transfers.query(f"{DATA_TRANSFER_START} <= {end} & {DATA_TRANSFER_END} >= {start}")

            machine_avg: pd.DataFrame = step_machine_utilization.mean()
            machine_max: pd.DataFrame = step_machine_utilization.max()

            processor_step_stats = self._processor_stats_in(process_steps.iloc[i], processor_step_stats)
            p_stats: pd.DataFrame = processor_step_stats.query(f"{PROCESS_STEP_NAME} == '{step_name}'")
            p_stats_sum: pd.DataFrame = p_stats.sum()
            p_stats_max: pd.DataFrame = p_stats.max()

            data = data.append({PROCESS_STEP_NAME: step_name,
                                PROCESS_STEP_START: start,
                                PROCESS_STEP_END: end,
                                PROCESS_STEP_ABS_DUR: duration,
                                PROCESS_STEP_REL_DUR: process_steps.at[i, PROCESS_STEP_REL_DUR],
                                STEP_STATS_SUM_ABS_DURATION: summed_duration,
                                STEP_STATS_SUM_REL_DURATION: summed_duration / (self._process_duration * num_processors),
                                STEP_STATS_AVG_CPU: machine_avg[MACHINE_UTILIZATION_AVG_CPU],
                                STEP_STATS_AVG_ABS_MEM: machine_avg[MACHINE_UTILIZATION_AVG_ABS_MEM],
                                STEP_STATS_AVG_REL_MEM: machine_avg[MACHINE_UTILIZATION_AVG_REL_MEM],
                                STEP_STATS_MAX_ABS_MEM: machine_max[MACHINE_UTILIZATION_MAX_ABS_MEM],
                                STEP_STATS_MAX_REL_MEM: machine_max[MACHINE_UTILIZATION_MAX_REL_MEM],
                                STEP_STATS_TOT_ABS_IDLE_TIME: p_stats_sum[PROCESSOR_STEP_STATS_TOT_ABS_IDLE_TIME],
                                STEP_STATS_TOT_REL_IDLE_TIME: p_stats_sum[PROCESSOR_STEP_STATS_TOT_ABS_IDLE_TIME] / summed_duration,
                                STEP_STATS_AVG_ABS_IDLE_TIME: p_stats_sum[PROCESSOR_STEP_STATS_TOT_ABS_IDLE_TIME] / num_processors,
                                STEP_STATS_AVG_REL_IDLE_TIME: (p_stats_sum[PROCESSOR_STEP_STATS_TOT_ABS_IDLE_TIME] / num_processors) / duration,
                                STEP_STATS_MAX_ABS_IDLE_TIME: p_stats_max[PROCESSOR_STEP_STATS_TOT_ABS_IDLE_TIME],
                                STEP_STATS_MAX_REL_IDLE_TIME: p_stats_max[PROCESSOR_STEP_STATS_TOT_ABS_IDLE_TIME] / duration,
                                STEP_STATS_TOT_ABS_BUSY_TIME: p_stats_sum[PROCESSOR_STEP_STATS_TOT_ABS_BUSY_TIME],
                                STEP_STATS_TOT_REL_BUSY_TIME: p_stats_sum[PROCESSOR_STEP_STATS_TOT_ABS_BUSY_TIME] / summed_duration,
                                STEP_STATS_AVG_ABS_BUSY_TIME: p_stats_sum[PROCESSOR_STEP_STATS_TOT_ABS_BUSY_TIME] / num_processors,
                                STEP_STATS_AVG_REL_BUSY_TIME: (p_stats_sum[PROCESSOR_STEP_STATS_TOT_ABS_BUSY_TIME] / num_processors) / duration,
                                STEP_STATS_MAX_ABS_BUSY_TIME: p_stats_max[PROCESSOR_STEP_STATS_TOT_ABS_BUSY_TIME],
                                STEP_STATS_MAX_REL_BUSY_TIME: p_stats_max[PROCESSOR_STEP_STATS_TOT_ABS_BUSY_TIME] / duration}, ignore_index=True)

        data.to_csv("process-step-statistics.csv", sep=";", index_label="index")
        processor_step_stats.to_csv("processor-step-statistics.csv", sep=";", index_label="index")
        return data
Example #46
def report(
    df: pd.DataFrame,
    id_col: str = "Compound_Id",
    columns: List[str] = ["Compound_Id", "Smiles"],
    title: str = "Cluster Report",
    intro: str = "Large clusters first, similar clusters together.",
):
    """Write a HTML report. `Cluster_No` and `IsRepr` have to be present in the DataFrame.
    In the current setting, the largest clusters are at the top of the report,
    with similar clusters (determind by the chemical similarities of the representative structures)
    are grouped together.
    Writes the report to disk as `Clusters.html`.
    Used in `projects/paint3_anal/190328_cpd_clustering.ipynb`.

    Arguments:
        df: The input DataFrame containing the structures as Smiles.
        id_col: The name of the column to use for identity. Default is `Compound_Id`.
        columns: List of columns to include.
        title: The report title.
        intro: Some text used for introduction of the report.
    """
    def add_cluster(cl_no, sim_to=None):
        if sim_to is None:
            sim_to = ""
            html.append("<hr>")
        else:
            sim_to = f"(similar to {sim_to})"
        mf_cl = mf.MolFrame(df.query("Cluster_No == @cl_no")[columns])
        mf_cl = mf_cl.add_mols()
        html.append(
            f"<br><h2>Cluster {cl_no} ({len(mf_cl.data)} Members)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;{sim_to}</h2><br>"
        )
        grid = mfv.html_grid(mf_cl.data, id_col="Compound_Id")
        html.append(grid)

    if id_col not in columns:
        columns = [id_col] + columns
    if "Smiles" not in columns:
        columns.append("Smiles")
    df_repr = df.query("IsRepr == 'Yes'").reset_index().drop("index", axis=1)
    chem_sim = {}
    for idx, rec0 in df_repr.iterrows():
        for _, rec1 in df_repr.iloc[idx + 1:].iterrows():
            cl0 = rec0["Cluster_No"]
            cl1 = rec1["Cluster_No"]
            sim = mf.chem_sim(rec0["Smiles"], rec1["Smiles"])
            chem_sim[(cl0, cl1)] = sim
            chem_sim[(cl1, cl0)] = sim

    cl_sizes = (df[["Cluster_No", "Compound_Id"
                    ]].groupby(by="Cluster_No").count().reset_index().rename(
                        columns={"Compound_Id": "Size"}))
    cl_sizes = cl_sizes.sort_values("Size", ascending=False)
    cl_order = {x: True for x in cl_sizes["Cluster_No"].values}

    html = [f"<h1>{title}</h1><br>{intro}<br><br>"]
    while len(cl_order) > 0:
        cl_no = list(cl_order.keys())[0]
        add_cluster(cl_no)
        cl_order.pop(cl_no)
        to_remove = []
        for sim_cl in cl_order:
            if chem_sim[(cl_no, sim_cl)] > 0.45:
                add_cluster(sim_cl, cl_no)
                to_remove.append(sim_cl)
        for x in to_remove:
            cl_order.pop(x)

    mfht.write(mfht.page("\n".join(html)), "Clusters.html")
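A hypothetical invocation of the report builder above; the input frame must already carry the Smiles, Cluster_No and IsRepr columns described in the docstring.

# clustered_df is assumed to hold the clustered compounds.
report(clustered_df, id_col="Compound_Id",
       columns=["Compound_Id", "Smiles", "Cluster_No"],
       title="Cluster Report")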
Example #47
def generate_causal_graph(place_change_events: DataFrame,
                          transition_events: DataFrame, time_per_step: float):
    g = nx.DiGraph()  # Nodes are occasions and edges leading in their prehensions

    # Add the initial state for each node as an occasion with no past
    initial_occasions = place_change_events.query('tstep == 0')
    for occ in initial_occasions.itertuples():
        g.add_node(Occasion(int(occ.num), occ.name,
                            occ.time))  # unit, state, time

    # Visit each transition and identify i) its output node and its 2 input nodes
    for trans in transition_events.itertuples():
        # row has: tstep, time, name, unit, neighbour & count

        # TODO: IS IT SAFE TO IGNORE THIS?
        # assert trans.count == 1  # Statistically likely to happen as simulations get more complex or are undersampled. Consider what to do if this occurs --Rob

        # Create new occasion in graph for this transition
        # output_state = trans.name[1]  # ab -> b
        prefix, input_state, output_state = expand_transition_name(
            trans.name)  # strings
        if math.isnan(trans.unit):
            print(f"*** {trans.unit} {output_state} {trans.time}")
            continue
        output_occasion = Occasion(int(trans.unit), output_state, trans.time)
        g.add_node(output_occasion)

        def choose_best_upstream_occasion(target_unit, target_state_name,
                                          source_time):
            query = f"num=={target_unit} & name=='{target_state_name}' & time<{source_time}"
            last_transition_time = place_change_events.query(
                query)['time'].max()
            if math.isnan(last_transition_time):
                #  Try including the source time
                query = f"num=={target_unit} & name=='{target_state_name}' & time=={source_time}"
                last_transition_time = place_change_events.query(
                    query)['time'].min()
                if math.isnan(last_transition_time):
                    #  Try including the step after
                    query = f"num=={target_unit} & name=='{target_state_name}' & time<={source_time + time_per_step}"
                    last_transition_time = place_change_events.query(
                        query)['time'].min()
            return Occasion(target_unit, target_state_name,
                            last_transition_time)

        # Determine local input node from same unit
        # state_name = trans.name[0]  # ab -> a
        local_input_occasion = choose_best_upstream_occasion(
            trans.unit, input_state, trans.time)
        g.add_edge(local_input_occasion, output_occasion)

        # Determine input node from neighbour
        # state_name = trans.name[1]  # ab -> b
        neighbour_input_occasion = choose_best_upstream_occasion(
            trans.neighbour, output_state, trans.time)
        g.add_edge(neighbour_input_occasion, output_occasion)

        # Determine input node from neighbour2 if set
        if not math.isnan(trans.neighbour2):
            # state_name = trans.name[1]  # ab -> b  # neighbour2 assumed pulling state forward (like neighbour)
            neighbour2_input_occasion = choose_best_upstream_occasion(
                trans.neighbour2, output_state, trans.time)
            g.add_edge(neighbour2_input_occasion, output_occasion)

    return g
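
A hedged sketch of the time-based fallback inside choose_best_upstream_occasion, using only pandas on a toy place_change_events table (values are illustrative):
import math

import pandas as pd

place_change_events = pd.DataFrame({
    "num": [0, 0, 0],
    "name": ["a", "a", "b"],
    "time": [0.0, 1.0, 2.0],
})

def last_state_time(unit, state, source_time, time_per_step=1.0):
    # Look strictly before source_time, then at it, then up to one step after.
    for op, t in (("<", source_time), ("==", source_time),
                  ("<=", source_time + time_per_step)):
        query = f"num=={unit} & name=='{state}' & time{op}{t}"
        hits = place_change_events.query(query)["time"]
        found = hits.max() if op == "<" else hits.min()
        if not math.isnan(found):
            return found
    return float("nan")

print(last_state_time(0, "a", 1.0))  # 0.0 (last change strictly before t=1.0)
print(last_state_time(0, "b", 1.0))  # 2.0 (falls back to one step after)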
Exemple #48
0
def get_vinbigdata_dicts(
    imgdir: Path,
    train_df: pd.DataFrame,
    train_data_type: str = "original",
    use_cache: bool = True,
    debug: bool = True,
    target_indices: Optional[np.ndarray] = None,
):
    debug_str = f"_debug{int(debug)}"
    train_data_type_str = f"_{train_data_type}"
    cache_path = Path(
        ".") / f"dataset_dicts_cache{train_data_type_str}{debug_str}.pkl"
    if not use_cache or not cache_path.exists():
        print("Creating data...")
        train_meta = pd.read_csv(imgdir / "train_meta.csv")
        if debug:
            train_meta = train_meta.iloc[:500]  # For debug....

        # Load 1 image to get image size.
        image_id = train_meta.loc[0, "image_id"]
        image_path = str(imgdir / "train" / f"{image_id}.png")
        image = cv2.imread(image_path)
        resized_height, resized_width, ch = image.shape
        print(f"image shape: {image.shape}")

        dataset_dicts = []
        for index, train_meta_row in tqdm(train_meta.iterrows(),
                                          total=len(train_meta)):
            record = {}

            image_id, height, width = train_meta_row.values
            filename = str(imgdir / "train" / f"{image_id}.png")
            record["file_name"] = filename
            record["image_id"] = image_id
            record["height"] = resized_height
            record["width"] = resized_width
            objs = []
            for index2, row in train_df.query(
                    "image_id == @image_id").iterrows():
                # print(row)
                # print(row["class_name"])
                # class_name = row["class_name"]
                class_id = row["class_id"]
                if class_id == 14:
                    # It is "No finding"
                    # This annotator does not find anything, skip.
                    pass
                else:
                    # bbox_original = [int(row["x_min"]), int(row["y_min"]), int(row["x_max"]), int(row["y_max"])]
                    h_ratio = resized_height / height
                    w_ratio = resized_width / width
                    bbox_resized = [
                        int(row["x_min"]) * w_ratio,
                        int(row["y_min"]) * h_ratio,
                        int(row["x_max"]) * w_ratio,
                        int(row["y_max"]) * h_ratio,
                    ]
                    obj = {
                        "bbox": bbox_resized,
                        "bbox_mode": BoxMode.XYXY_ABS,
                        "category_id": class_id,
                    }
                    objs.append(obj)
            record["annotations"] = objs
            dataset_dicts.append(record)
        with open(cache_path, mode="wb") as f:
            pickle.dump(dataset_dicts, f)

    print(f"Load from cache {cache_path}")
    with open(cache_path, mode="rb") as f:
        dataset_dicts = pickle.load(f)
    if target_indices is not None:
        dataset_dicts = [dataset_dicts[i] for i in target_indices]
    return dataset_dicts
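
A hedged sketch of the two idioms above: the `@image_id` local-variable query and the bounding-box rescaling (all values are illustrative, not taken from the dataset):
import pandas as pd

train_df = pd.DataFrame({
    "image_id": ["img_0", "img_0", "img_1"],
    "class_id": [3, 14, 0],
    "x_min": [10, 0, 5], "y_min": [20, 0, 5],
    "x_max": [110, 0, 50], "y_max": [220, 0, 60],
})
image_id = "img_0"
# `@image_id` refers to the local Python variable inside the query string.
rows = train_df.query("image_id == @image_id")

resized_height, resized_width = 512, 512
height, width = 1024, 2048  # assumed original size for this image
h_ratio, w_ratio = resized_height / height, resized_width / width
for _, row in rows.query("class_id != 14").iterrows():  # 14 is "No finding"
    bbox = [row["x_min"] * w_ratio, row["y_min"] * h_ratio,
            row["x_max"] * w_ratio, row["y_max"] * h_ratio]
    print(bbox)  # [2.5, 10.0, 27.5, 110.0]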
def parse_raw_zooniverse_file(
        raw_zooniverse_classifications: pd.DataFrame) -> pd.DataFrame:
    filtered_raw_zooniverse = raw_zooniverse_classifications.query(
        'workflow_name == "Transcribe Words" and workflow_version == 3.7'
    ).copy()

    def clean_text_values(txt: str):
        txt = txt.replace('null', 'None')
        txt = ast.literal_eval(txt)
        if type(txt) is dict:  # for subject_data
            txt = [*txt.values()]
            txt = txt[0]
        return txt

    filtered_raw_zooniverse.loc[:, 'annotations'] = filtered_raw_zooniverse[
        'annotations'].apply(clean_text_values)
    filtered_raw_zooniverse.loc[:, 'subject_data'] = filtered_raw_zooniverse[
        'subject_data'].apply(clean_text_values)

    parsed_zooniverse_classifications = pd.DataFrame()
    parsed_zooniverse_classifications['id'] = filtered_raw_zooniverse[
        'subject_data'].apply(
            lambda annotation: annotation['image_of_boxed_letter'].replace(
                'wordbox-', '').replace('.jpg', '').replace('label-', ''))

    def parse_subject(s):
        barcode = s['barcode'].split('-')[0]  # in case the file name includes "-label"
        image_name = s['image_of_boxed_letter']
        col_names = [
            'barcode', 'block', 'paragraph', 'word', 'gcv_identification',
            'image_location'
        ]
        result = pd.Series([
            barcode,
            int(s['block_no']),
            int(s['paragraph_no']),
            int(s['word_no']), s['#GCV_identification'], image_name
        ],
                           index=col_names)
        return result

    parsed_subjects = filtered_raw_zooniverse['subject_data'].apply(
        parse_subject)
    parsed_zooniverse_classifications = pd.concat(
        [parsed_zooniverse_classifications, parsed_subjects], axis=1)
    parsed_zooniverse_classifications['handwritten'] = filtered_raw_zooniverse[
        'annotations'].apply(
            lambda annotation: annotation[0]['value'] == 'handwritten')
    parsed_zooniverse_classifications[
        'human_transcription'] = filtered_raw_zooniverse['annotations'].apply(
            lambda annotation: annotation[1]['value'])
    parsed_zooniverse_classifications[
        'unclear'] = parsed_zooniverse_classifications[
            'human_transcription'].apply(
                lambda transcription: '[unclear]' in transcription and
                '[/unclear]' in transcription)
    parsed_zooniverse_classifications['human_transcription'] = \
        parsed_zooniverse_classifications['human_transcription'] \
            .apply(lambda transcription: transcription.replace('[unclear][/unclear]', ''))

    parsed_zooniverse_classifications[
        'seen_count'] = parsed_zooniverse_classifications.groupby(
            'id')['block'].transform(len)
    parsed_zooniverse_classifications['confidence'] = 1.0
    parsed_zooniverse_classifications['status'] = 'In Progress'
    return parsed_zooniverse_classifications
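
A hedged sketch of what clean_text_values does to the raw Zooniverse export strings (the input string below is illustrative):
import ast

raw = '{"12345": {"barcode": "ABC-label", "block_no": "1", "word_no": "2", "retired": null}}'
txt = raw.replace('null', 'None')   # turn the JSON-like string into a Python literal
txt = ast.literal_eval(txt)         # -> dict
if type(txt) is dict:               # for subject_data: keep only the inner record
    txt = [*txt.values()]
    txt = txt[0]
print(txt["barcode"])  # ABC-label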
Exemple #50
0
 def test_query_syntax_error(self):
     engine, parser = self.engine, self.parser
     df = DataFrame({"i": range(10), "+": range(3, 13), "r": range(4, 14)})
     msg = "invalid syntax"
     with pytest.raises(SyntaxError, match=msg):
         df.query("i - +", engine=engine, parser=parser)
Exemple #51
0
from pandas import DataFrame
from Data import grade_dic
from print_df import print_df

# DataFrame
df = DataFrame(grade_dic, index=['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽'])

# Select rows with multiple conditions
# `or` condition
all_index = df.query('국어 < 50 or 영어 < 40')

# Print the result
print_df(all_index)
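
An equivalent boolean-mask form of the `or` query above (a hedged alternative, reusing df and print_df from this snippet):
mask_rows = df[(df['국어'] < 50) | (df['영어'] < 40)]
print_df(mask_rows)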
def preprocess_data(X: pd.DataFrame,
                    add_province_columns: bool = False,
                    drop_first_day: bool = False) -> pd.DataFrame:
    """
    Preprocess data for use in StemPoissonRegressor by adding columns with the
    previous day's information and, optionally, columns for all provinces as
    predictors.

    Args:
        X (pd.DataFrame): Dataframe with columns province, date, active_cases,
            percent_susceptible
        add_province_columns (bool, optional): Whether active_cases and
            percent_susceptible should be added as columns for each province.
            Defaults to False.
        drop_first_day (bool, optional): Whether to drop the first day of each
            province (which has no t-1 information). Defaults to False.

    Returns:
        pd.DataFrame: Preprocessed dataframe with columns for all provinces
    """
    # Add columns with log transformation
    X = X.assign(
        log_active_cases=lambda x: np.log(x["active_cases"] + 1),
        log_percent_susceptible=lambda x: np.log(x["percent_susceptible"]),
    )

    # Add columns for previous day information
    previous_day = (X.groupby("province").shift(periods=1, axis=0).loc[:, [
        "active_cases",
        "percent_susceptible",
        "log_active_cases",
        "log_percent_susceptible",
    ]])
    X = X.assign(
        active_cases_yesterday=previous_day["active_cases"],
        percent_susceptible_yesterday=previous_day["percent_susceptible"],
        log_active_cases_yesterday=previous_day["log_active_cases"],
        log_percent_susceptible_yesterday=previous_day[
            "log_percent_susceptible"],
    )

    # Add previous day columns for each province
    X_new = X.copy()
    if add_province_columns:
        provinces = X_new["province"].unique()
        for province in provinces:
            # Get province data rows and duplicate n times for concat column wise
            prov_data = X.query("province == @province").loc[:, [
                "active_cases_yesterday",
                "percent_susceptible_yesterday",
                "log_active_cases_yesterday",
                "log_percent_susceptible_yesterday",
                "active_cases",
                "percent_susceptible",
                "log_active_cases",
                "log_percent_susceptible",
            ]]
            prov_data = pd.concat([prov_data] * len(provinces),
                                  ignore_index=True)

            # Append name of province to each column name
            for col in prov_data.columns:
                prov_data.rename(columns={col: f"{province}_{col}"},
                                 inplace=True)

            X_new = pd.concat([X_new, prov_data], axis=1)

    # Drop first days missing t-1 information
    if drop_first_day:
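        # NaN != NaN, so this self-comparison keeps only rows where active_cases_yesterday is not NaN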
        X_new = X_new.query("active_cases_yesterday == active_cases_yesterday")

    X_new.reset_index(drop=True, inplace=True)

    return X_new
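
A minimal usage sketch for preprocess_data as defined above, assuming toy data with the required columns (values are illustrative):
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "province": ["ON", "ON", "ON", "QC", "QC", "QC"],
    "date": list(pd.date_range("2021-01-01", periods=3)) * 2,
    "active_cases": [10, 12, 15, 5, 6, 8],
    "percent_susceptible": [0.99, 0.98, 0.97, 0.995, 0.99, 0.985],
})
out = preprocess_data(toy, add_province_columns=False, drop_first_day=True)
# The first day of each province is dropped because its *_yesterday columns are NaN.
print(out[["province", "active_cases", "active_cases_yesterday"]])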
Exemple #53
0
    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        bads = [r",", r"%", "nan"]
        str_cols = ["County", "FileNumber", "ProviderName"]
        df = self._clean_cols(df, bads, str_cols)
        df["location_name"] = df["County"].str.title()

        # Create new columns
        df["ICU Census"] = df["Adult ICU Census"] + df["Pediatric ICU Census"]
        df["ICU Capacity"] = (df["Total AdultICU Capacity"] +
                              df["Total PediatricICU Capacity"])
        df["Available ICU"] = df["Available Adult ICU"] + df[
            "Available Pediatric ICU"]

        # Rename appropriate columns
        crename = {
            "Adult ICU Census":
            CMU(category="adult_icu_beds_in_use",
                measurement="current",
                unit="beds"),
            "Available Adult ICU":
            CMU(category="adult_icu_beds_available",
                measurement="current",
                unit="beds"),
            "Total AdultICU Capacity":
            CMU(category="adult_icu_beds_capacity",
                measurement="current",
                unit="beds"),
            "Pediatric ICU Census":
            CMU(category="pediatric_icu_beds_in_use",
                measurement="current",
                unit="beds"),
            "Available Pediatric ICU":
            CMU(
                category="pediatric_icu_beds_available",
                measurement="current",
                unit="beds",
            ),
            "Total PediatricICU Capacity":
            CMU(
                category="pediatric_icu_beds_capacity",
                measurement="current",
                unit="beds",
            ),
            "ICU Census":
            CMU(category="icu_beds_in_use", measurement="current",
                unit="beds"),
            "ICU Capacity":
            CMU(category="icu_beds_capacity",
                measurement="current",
                unit="beds"),
            "Available ICU":
            CMU(category="icu_beds_available",
                measurement="current",
                unit="beds"),
        }

        # Drop grand total and melt
        out = (df.query("location_name != 'Grand Total'").melt(
            id_vars=["location_name"], value_vars=crename.keys()).dropna())
        out["value"] = pd.to_numeric(out["value"])
        out = out.groupby(["location_name", "variable"]).sum().reset_index()
        out.loc[out["location_name"] == "Desoto", "location_name"] = "DeSoto"

        # Extract category information and add other context
        out = self.extract_CMU(out, crename)
        out["dt"] = self._retrieve_dt("US/Eastern")
        out["vintage"] = self._retrieve_vintage()
        self.clean_desoto(out)
        return out.loc[:, self.out_cols]
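
A hedged sketch of the query-then-melt reshaping used in normalize, with plain column names in place of the CMU variables (values are illustrative):
import pandas as pd

df = pd.DataFrame({
    "location_name": ["Alachua", "Baker", "Grand Total"],
    "ICU Census": [30, 5, 35],
    "ICU Capacity": [40, 8, 48],
})
out = (df.query("location_name != 'Grand Total'")
         .melt(id_vars=["location_name"],
               value_vars=["ICU Census", "ICU Capacity"])
         .dropna())
out["value"] = pd.to_numeric(out["value"])
out = out.groupby(["location_name", "variable"]).sum().reset_index()
print(out)  # one row per county and variable, grand total removed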
 def test_query_with_nested_special_character(self, parser, engine):
     skip_if_no_pandas_parser(parser)
     df = DataFrame({'a': ['a', 'b', 'test & test'], 'b': [1, 2, 3]})
     res = df.query('a == "test & test"', parser=parser, engine=engine)
     expec = df[df.a == 'test & test']
     assert_frame_equal(res, expec)
Exemple #55
0
def py_query(
    data: pd.DataFrame,
    query,
    *,
    use_pd_query=False,
    allow_empty_result=False,
    setup_code='',
    globals=None,
    return_selected_data=True,
):
    """
    Alternative: pd.DataFrame.query:
        supports a subset of this function, but is faster

    >>> df = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
    >>> df
       a  b
    0  1  2
    1  3  4
    >>> py_query(df, 'a == 1')
       a  b
    0  1  2
    >>> py_query(df, 'a == 1', use_pd_query=True)
       a  b
    0  1  2
    >>> py_query(df, 'int(a) == 1')
       a  b
    0  1  2
    >>> py_query(df, ['int(a) == 1', 'b == 2'])
       a  b
    0  1  2
    >>> py_query(df, ['index == 1'])  # get second row
       a  b
    1  3  4
    >>> py_query(df, ['index == 1'], use_pd_query=True)
       a  b
    1  3  4

    To access column names that aren't valid python identifiers (e.g. the name
    contains a whitespace), you have to use the kwargs dictionary:
    >>> df = pd.DataFrame([{'a b': 1, 'b': 2}, {'a b': 3, 'b': 4}])
    >>> py_query(df, 'kwargs["a b"] == 1')
       a b  b
    0    1  2

    When you need a package function, you have to specify it in the globals
    dict. e.g.:
    >>> import numpy as np
    >>> df = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
    >>> py_query(df, 'np.equal(a, 1)', globals={'np': np})
       a  b
    0  1  2

    Args:
        data: pandas.DataFrame
        query: str or list of str. If a list of str, the strings are joined
            by a logical `and` into a single str. See the doctests for
            examples. Note: use `index` to access the index.
        use_pd_query: pandas.DataFrame.query is much faster, but more limited.
        allow_empty_result: Whether an empty selection is allowed instead of
            raising an AssertionError.
        setup_code: Legacy argument, superseded by the globals argument.
            Additional code that runs before the query conditions, e.g. for
            additional imports.
        globals: Specify some global names, e.g. for imports. See the
            doctests for usage.
        return_selected_data: Whether to return the selected rows of the data
            or the boolean selection mask.

    Returns:
        data[selection] if return_selected_data else selection

    """
    if query is False:
        return data

    if query in [[], tuple(), '']:
        return data

    if isinstance(query, (list, tuple)):
        if len(query) == 1:
            query, = query
        else:
            query = ') and ('.join(query)
            query = f'({query})'
    else:
        assert isinstance(query, str)

    if use_pd_query is True:
        return data.query(query)
    elif use_pd_query == 'try':
        try:
            return data.query(query)
        except Exception:
            pass
    else:
        assert use_pd_query is False, use_pd_query

    keywords = ['index'] + list(data)

    def is_valid_variable_name(name):
        import ast
        # https://stackoverflow.com/a/36331242/5766934
        try:
            ast.parse('{} = None'.format(name))
            return True
        except (SyntaxError, ValueError, TypeError):
            return False

    keywords = [k
                for k in keywords if is_valid_variable_name(k)] + ['**kwargs']

    d = {}
    code = f"""
def func({', '.join(keywords)}):
    {setup_code}
    try:
        return {query}
    except Exception:
        raise Exception('See above error message. Locals are:', locals())
"""

    if globals is None:
        globals = {}
    else:
        globals = globals.copy()
    try:
        exec(code, globals, d)
        func = d['func']
    except Exception as e:
        raise Exception(code) from e

    selection = data.apply(lambda row: func(row.name, **row), axis=1)
    assert allow_empty_result or selection.sum() > 0, f'Empty result for query: {query}'
    if return_selected_data:
        return data[selection]
    else:
        return selection
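
A hedged usage note for py_query as defined above: with return_selected_data=False it returns the boolean selection mask instead of the filtered frame, which can be reused for indexing:
import pandas as pd

df = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
mask = py_query(df, 'a == 1', return_selected_data=False)
print(mask.tolist())  # [True, False]
print(df[mask])       # same rows as py_query(df, 'a == 1')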
Exemple #56
0
 def test_query_with_nested_special_character(self, parser, engine):
     skip_if_no_pandas_parser(parser)
     df = DataFrame({"a": ["a", "b", "test & test"], "b": [1, 2, 3]})
     res = df.query('a == "test & test"', parser=parser, engine=engine)
     expec = df[df.a == "test & test"]
     tm.assert_frame_equal(res, expec)
    def fit(
        self,
        X: pd.DataFrame,
        Y: pd.DataFrame,
    ):
        """
        Fit one Poisson regression model for new cases using active_cases and percent_susceptible at time t-1,
        and another model for removed cases using active_cases at time t-1.

        Args:
            X (pd.DataFrame): Dataframe for given region of predictor variables containing columns date, province, active_cases, percent_susceptible,
                              and all columns for provinces for {province_name}_active_cases_yesterday, {province_name}_percent_susceptible_yesterday,
                              as well as all log features
            Y (pd.DataFrame): Dataframe for given region of response variables containing columns date, province, cases, removed
        """
        self.X_original = X.copy()
        self.Y_original = Y.copy()
        self.provinces = X["province"].unique()

        # Fit model for each province
        self.X_cases = {}
        self.Y_cases = {}
        self.X_removed = {}
        self.Y_removed = {}
        self.poisson_gam_cases = {}
        self.poisson_gam_removed = {}

        for province in self.provinces:
            # Remove extra columns for given province in form {province}_column_name
            cols_drop = X.filter(regex=province, axis=1).columns
            X_province = X.query(f"province == '{province}'").drop(cols_drop,
                                                                   axis=1)
            Y_province = Y.query(f"province == '{province}'")

            # Store case dataframe used to train model for each province
            self.X_cases[province] = X_province.filter(
                regex=
                r"(log_active_cases_yesterday|log_percent_susceptible_yesterday)"
            )
            self.Y_cases[province] = Y_province["cases"]

            # Add terms for each province I_t-1 and Z_t-1. Either splines or linear terms
            if self.use_splines:
                terms = s(0, lam=self.lam_main) + s(1, lam=self.lam_main)
                for i in range(1, len(self.provinces)):
                    terms += s(i * 2, lam=self.lam_other) + s(
                        i * 2 + 1, lam=self.lam_other)
            else:
                terms = l(0, lam=self.lam_main) + l(1, lam=self.lam_other)
                for i in range(1, len(self.provinces)):
                    terms += l(i * 2, lam=self.lam_other) + l(
                        i * 2 + 1, lam=self.lam_other)

            # Fit cases model for province
            cases_model = PoissonGAM(terms, verbose=self.verbose)
            cases_model.fit(self.X_cases[province], self.Y_cases[province])
            self.poisson_gam_cases[province] = cases_model

            # Store remove dataframe used to train model for each province
            self.X_removed[province] = X_province.filter(
                regex=r"log_active_cases_yesterday")
            self.Y_removed[province] = Y_province["removed"]

            # Add terms for each province I_t-1
            terms = l(0, lam=self.lam_main)
            for i in range(1, len(self.provinces)):
                terms += l(i, lam=self.lam_other)

            # Fit removed model for each province
            removed_model = PoissonGAM(terms, verbose=self.verbose)
            removed_model.fit(self.X_removed[province],
                              self.Y_removed[province])
            self.poisson_gam_removed[province] = removed_model

        return
    def fit(
        self,
        X: pd.DataFrame,
        Y: pd.DataFrame,
    ):
        """
        Fit one Poisson regression model for new cases using active_cases and percent_susceptible at time t-1,
        and another model for removed cases using active_cases at time t-1.

        Args:
            X (pd.DataFrame): Dataframe for given region of predictor variables containing columns date, active_cases, percent_susceptible
            Y (pd.DataFrame): Dataframe for given region of response variables containing columns cases, removed
        """
        # Remove days in data that are after the latest twitter data given
        if self.twitter_data is not None:
            remove_date = self.twitter_data["date"].max()
        else:
            remove_date = X["date"].max()

        X = X.query("date <= @remove_date")
        Y = Y.query("date <= @remove_date")

        self.X_original = X.copy()
        self.Y_original = Y.copy()

        # Separate data for each model
        self.X_cases = X[[
            "date", "log_active_cases_yesterday",
            "log_percent_susceptible_yesterday"
        ]].copy()
        self.Y_cases = Y["cases"]
        self.X_removed = X[["date", "log_active_cases_yesterday"]].copy()
        self.Y_removed = Y["removed"]

        # Preprocess twitter data by shifting it by twitter_offset days so each row contains the twitter data from twitter_offset days ago
        if self.twitter_data is not None:
            twitter_shifted = self.twitter_data.drop(
                ["date", "province"],
                axis=1).shift(periods=self.twitter_offset, axis=0)
            twitter_shifted.columns = [
                f"{col}_shifted" for col in twitter_shifted.columns
            ]
            twitter_shifted = twitter_shifted.assign(
                date=self.twitter_data["date"])

            # Add twitter data to use in both cases and removed models
            self.X_cases = self.X_cases.merge(twitter_shifted,
                                              how="left",
                                              on=["date"])
            self.X_removed = self.X_removed.merge(twitter_shifted,
                                                  how="left",
                                                  on=["date"])

        # Drop date columns not used anymore
        self.X_cases = self.X_cases.drop("date", axis=1)
        self.X_removed = self.X_removed.drop("date", axis=1)

        # Setup terms for covid19 data to use in GLM
        term = s if self.use_spline else l
        terms_cases = term(0, lam=self.lam) + term(1, lam=self.lam)
        terms_removed = term(0, lam=self.lam)

        # Add terms for twitter data (only when twitter data was provided)
        if self.twitter_data is not None:
            twitter_cols = self.twitter_data.columns.drop(["date", "province"])
            for i in range(0, len(twitter_cols)):
                terms_cases = terms_cases + term(i + 2, lam=self.lam)
                terms_removed = terms_removed + term(i + 1, lam=self.lam)

        # Model new cases data using infections and percentage susceptible at time t-1
        self.poisson_gam_cases = PoissonGAM(terms_cases, verbose=self.verbose)
        self.poisson_gam_cases.fit(self.X_cases, self.Y_cases)

        # Model removed cases using infections at time t-1
        self.poisson_gam_removed = PoissonGAM(terms_removed,
                                              verbose=self.verbose)
        self.poisson_gam_removed.fit(self.X_removed, self.Y_removed)

        return
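
A hedged sketch of how the pygam terms are composed above (assuming pygam is installed; the feature counts are illustrative):
from pygam import PoissonGAM, l, s

lam = 0.6
use_spline = True
term = s if use_spline else l

# One term per predictor column: two covid features plus, here, three twitter features.
n_twitter_cols = 3
terms_cases = term(0, lam=lam) + term(1, lam=lam)
for i in range(n_twitter_cols):
    terms_cases = terms_cases + term(i + 2, lam=lam)

model = PoissonGAM(terms_cases, verbose=False)  # fit later with model.fit(X, y) on matching columns
print(terms_cases)  # the composed term list, e.g. s(0) + s(1) + s(2) + s(3) + s(4)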
Exemple #59
0
def data(ws, mongodb, slug):
    if not ws:
        abort(400, 'Expected WebSocket request.')

    DW = DataWarehouse()

    element = mongodb['element'].find_one({'slug': slug})

    element['page_limit'] = 50
    if request.GET.get('limit', True) is False:
        element['page_limit'] = 9999999999

    data = DW.get(element.get('cube'))
    columns = data.get('columns') or []

    fields = columns
    if request.GET.get('fields', None):
        fields = request.GET.get('fields').split(',')

    cube_last_update = mongodb['cube'].find_one({'slug': element.get('cube')})
    ws.send(json.dumps({'type': 'last_update',
                        'data': str(cube_last_update.get('lastupdate', ''))}))

    ws.send(json.dumps({'type': 'columns', 'data': fields}))

    filters = [i[0] for i in request.GET.iteritems()
               if len(i[0].split('filter__')) > 1]

    if element['type'] == 'grid':
        page = int(request.GET.get('page', 1))
        page_start = 0
        page_end = element['page_limit']
        if page >= 2:
            page_end = element['page_limit'] * page
            page_start = page_end - element['page_limit']
    else:
        page_start = None
        page_end = None

    df = DataFrame(data.get('data') or {}, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            s = f.split('__')
            field = s[1]
            operator = s[2]
            value = request.GET.get(f)
            if operator == 'like':
                df = df[df[field].str.contains(value)]
            elif operator == 'regex':
                df = DataFrameSearchColumn(df, field, value, operator)
            else:
                df = df.query(df_generate(df, value, f))

    groupby = []
    if request.GET.get('groupby', None):
        groupby = request.GET.get('groupby').split(',')
    if len(groupby) >= 1:
        df = DataFrame(df.groupby(groupby).grouper.get_group_levels())

    if request.GET.get('orderby',
                       element.get('orderby', None)) and request.GET.get(
            'orderby', element.get('orderby', None)) in fields:

        orderby = request.GET.get('orderby', element.get('orderby', ''))
        if type(orderby) == str:
            orderby = orderby.split(',')
        orderby__order = request.GET.get('orderby__order',
                                         element.get('orderby__order', ''))
        if type(orderby__order) == str:
            orderby__order = orderby__order.split(',')
        ind = 0
        for orde in orderby__order:
            if orde == '0':
                orderby__order[ind] = False
            else:
                orderby__order[ind] = True
            ind += 1
        df = df.sort(orderby, ascending=orderby__order)

    ws.send(json.dumps({'type': 'max_page', 'data': len(df)}))

    # CLEAN MEMORY
    del filters, fields, columns
    gc.collect()
    categories = []
    for i in df.to_dict(outtype='records')[page_start:page_end]:
        if element.get('categories', None):
            categories.append(i[element.get('categories')])
        ws.send(json.dumps({'type': 'data', 'data': i}))

    # CLEAN MEMORY
    del df
    gc.collect()

    ws.send(json.dumps({'type': 'categories', 'data': categories}))
    ws.send(json.dumps({'type': 'close'}))

    # CLEAN MEMORY
    del categories
    gc.collect()
 def _get_covered(self, pred: pd.DataFrame):
     catalog = self.catalog
     return set(pred.query('to_id in @catalog')['to_id'])
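
A hedged sketch of the `in @local_variable` query syntax used in _get_covered (toy values):
import pandas as pd

pred = pd.DataFrame({"to_id": [1, 2, 3, 4]})
catalog = {2, 4}
covered = set(pred.query("to_id in @catalog")["to_id"])
print(covered)  # {2, 4}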