def post(self, dataset_key, optional_q):
    # The body contains a query rather than a dataset to store.
    if optional_q:
        q_dict = self.q_json_to_dict(decoded_body(self.request))
        if q_dict is not None:
            self.query(dataset_key, q_dict)
        return

    t0 = time.time()
    self.operation = "store"

    # Replacing an existing dataset, drop the old one first.
    if dataset_key in self.dataset_cache:
        self.stats.inc("replace_count")
        del self.dataset_cache[dataset_key]

    content_type = self.content_type()
    input_data = decoded_body(self.request)
    if content_type == CONTENT_TYPE_CSV:
        durations_until_eviction = self.dataset_cache.ensure_free(len(input_data))
        qf = QFrame.from_csv(input_data, column_types=self.dtypes(),
                             stand_in_columns=self.stand_in_columns())
    else:
        # This is a waste of CPU cycles, first the JSON decoder decodes all strings
        # from UTF-8 then we immediately encode them back into UTF-8. Couldn't
        # find an easy solution to this though.
        durations_until_eviction = self.dataset_cache.ensure_free(len(input_data) / 2)
        data = json.loads(input_data, cls=UTF8JSONDecoder)
        qf = QFrame.from_dicts(data, stand_in_columns=self.stand_in_columns())

    self.dataset_cache[dataset_key] = qf
    self.set_status(ResponseCode.CREATED)
    self.stats.inc("size_evict_count", count=len(durations_until_eviction))
    self.stats.inc("store_count")
    self.stats.append("store_row_counts", len(qf))
    self.stats.append("store_durations", time.time() - t0)
    self.stats.extend("durations_until_eviction", durations_until_eviction)
    self.write("")
def test_large_frame_msgpack(large_frame):
    # NOTE: This implementation does not exist but once did as an experiment.
    #       This test is left as a reference and reminder.
    with timeit('to_msgpack'):
        msgpack_string = large_frame.to_msgpack()

    with timeit('from_msgpack'):
        QFrame.from_msgpack(msgpack_string)
def test_sub_select(data, engine):
    frame = QFrame.from_csv(data)
    result = frame.query({'where': ['in', 'bar', {'where': ['==', 'foo', 2]}]},
                         filter_engine=engine)
    assert_rows(result, [1, 2])
def test_unicode_content_from_dicts():
    data = [{'foo': 'aaa', 'bar': u'Iñtërnâtiônàližætiøn'},
            {'foo': 'bbb', 'bar': u'räksmörgås'.encode(encoding='utf-8')}]
    input_frame = QFrame.from_dicts(data)
    frame = input_frame.query({'where': ["==", "bar", u"'räksmörgås'"]})
    assert_rows(frame, ['bbb'])
def test_count_without_aggregation(basic_frame):
    expected = QFrame.from_csv("""
count
3""")
    frame = basic_frame.query({'select': [['count']]})
    assert frame.to_csv() == expected.to_csv()
def test_max_without_aggregation(basic_frame):
    expected = QFrame.from_csv("""
baz
9""")
    frame = basic_frame.query({'select': [['max', 'baz']]})
    assert frame.to_csv() == expected.to_csv()
def frame_with_zero():
    data = """
foo,bar
1,0
1,11"""
    return QFrame.from_csv(data)
def test_sub_select_in_column_missing_in_sub_select(engine):
    frame = QFrame.from_csv("""foo,bar
1,aa""")

    with pytest.raises(MalformedQueryException):
        frame.query({'where': ['in', 'bar', {'select': ['foo'], 'where': ['==', 'foo', 2]}]},
                    filter_engine=engine)
def string_frame():
    data = """foo,bar
1,abcd
2,defg
3,ghij
4,gxyj"""
    return QFrame.from_csv(data)
def subselect_frame():
    data = """
foo,bar
1,10
1,15
5,50"""
    return QFrame.from_csv(data)
def basic_frame():
    data = """
foo,bar,baz,qux
bbb,1.25,5,qqq
aaa,3.25,7,qqq
ccc,,9,www"""
    return QFrame.from_csv(data)
def bitwise_frame():
    data = """foo,bar,baz
1,1.5,abc
2,1.5,def
3,1.5,ghi
4,1.5,ijk
5,1.5,lmn"""
    return QFrame.from_csv(data)
def calculation_frame():
    data = """
foo,bar
1,10
1,11
2,20
3,30
3,33"""
    return QFrame.from_csv(data)
def test_unicode_content_from_csv():
    data = u"""foo,bar
aaa,Iñtërnâtiônàližætiøn
bbb,räksmörgås
ccc,"""
    input_frame = QFrame.from_csv(data)
    frame = input_frame.query({'where': ["==", "bar", u"'räksmörgås'"]})
    assert_rows(frame, ['bbb'])
def test_basic_count_aggregation(basic_frame):
    expected = QFrame.from_csv("""
qux,baz
qqq,2
www,1""")
    frame = basic_frame.query({
        'select': ['qux', ['count', 'baz']],
        'group_by': ['qux']})
    assert frame.to_csv() == expected.to_csv()
def test_basic_sum_aggregation(basic_frame):
    expected = QFrame.from_csv("""
qux,baz
www,9
qqq,12""")
    frame = basic_frame.query({
        'select': ['qux', ['sum', 'baz']],
        'group_by': ['qux'],
        'order_by': ['baz']})
    assert frame.to_csv() == expected.to_csv()
def test_large_frame_csv(large_frame):
    with timeit('to_csv'):
        csv_string = large_frame.to_csv()

    with timeit('from_csv'):
        QFrame.from_csv(csv_string)
def large_frame():
    d = 1000000 * [{'aaa': 123456789,
                    'bbb': 'abcdefghijklmnopqrvwxyz',
                    'ccc': 1.23456789}]
    return QFrame.from_dicts(d)
def test_enum_from_dicts(enum_frame):
    cat_frame = QFrame.from_dicts(enum_frame.to_dicts(), column_types={'foo': 'category'})
    frame = QFrame.from_dicts(enum_frame.to_dicts())
    assert cat_frame.byte_size() < frame.byte_size()
def test_enum_size(enum_frame, enum_data):
    # Space savings should be possible using categoricals
    # when multiple rows containing the same value exist.
    frame = QFrame.from_csv(enum_data)
    assert enum_frame.byte_size() < frame.byte_size()
def enum_frame(enum_data):
    return QFrame.from_csv(enum_data, column_types={'foo': 'category'})