def test_csv_roaring_import(self):
    """Round-trip a small CSV through a fast (roaring) import, then clear it."""
    client = self.get_client()
    csv_text = u"""
        10, 7
        10, 5
        2, 3
        7, 1
    """
    field = self.index.field("importfield-fast")
    client.ensure_field(field)
    client.import_field(field, csv_column_reader(StringIO(csv_text)),
                        fast_import=True)

    response = client.query(self.index.batch_query(
        field.row(2),
        field.row(7),
        field.row(10),
    ))
    self.assertEqual(3, len(response.results))
    expected = [3, 1, 5]
    self.assertEqual(expected,
                     [res.row.columns[0] for res in response.results])

    # A clearing import of the same data should leave every row empty.
    client.import_field(field, csv_column_reader(StringIO(csv_text)),
                        fast_import=True, clear=True)
    response = client.query(self.index.batch_query(
        field.row(2),
        field.row(7),
        field.row(10),
    ))
    self.assertEqual(3, len(response.results))
    for res in response.results:
        self.assertEqual([], res.row.columns)
def test_csvbititerator_customtimefunc(self):
    """Timestamps are parsed with a user-supplied time function."""
    class UtcTzinfo(datetime.tzinfo):
        # Minimal fixed-offset UTC tzinfo (pre-dates datetime.timezone.utc usage here).
        ZERO = datetime.timedelta(0)

        def utcoffset(self, dt):
            return UtcTzinfo.ZERO

        def dst(self, dt):
            return UtcTzinfo.ZERO

        def tzname(self, dt):
            return "UTC"

    def timefunc_utcstr(timeval):
        # Parse an ISO-8601-style string and convert it to a UTC epoch second.
        parsed = datetime.datetime.strptime(timeval, '%Y-%m-%dT%H:%M:%S')
        parsed = parsed.replace(tzinfo=UtcTzinfo())
        return calendar.timegm(parsed.timetuple())

    reader = csv_column_reader(StringIO(u"""
        1,10,1991-09-02T06:33:20
        5,20,1991-09-02T06:35:00
        3,41,1991-09-02T06:36:25
        10,10485760,1991-09-02T06:36:25
    """), timefunc=timefunc_utcstr)
    rows = list(reader)
    self.assertEqual(len(rows), 4)
    expected = [
        (1, 10, 683793200),
        (5, 20, 683793300),
        (3, 41, 683793385),
        (10, 10485760, 683793385),
    ]
    for got, want in zip(rows, expected):
        self.assertEqual(got, want)
def test_csv_roaring_import_time_field(self):
    """Fast import into a time-quantum field, range-query it, then clear it."""
    client = self.get_client()
    csv_text = u"""
        10, 7, 1542199376
        10, 5, 1483273800
        2, 3, 1520268300
        7, 1, 1330965900
    """
    field = self.index.field("importfield-fast-time",
                             time_quantum=TimeQuantum.YEAR_MONTH_DAY_HOUR)
    client.ensure_field(field)
    client.import_field(field, csv_column_reader(StringIO(csv_text)),
                        fast_import=True)

    response = client.query(self.index.batch_query(
        field.row(2),
        field.row(7),
        field.row(10),
    ))
    self.assertEqual(3, len(response.results))
    self.assertEqual([3, 1, 5],
                     [res.row.columns[0] for res in response.results])

    # Range query over a window covering only the two newest timestamps.
    # NOTE(review): `datetime(...)` is called as a class here, while the
    # custom-timefunc test uses `datetime.datetime` as a module — confirm the
    # file's imports make both spellings valid.
    start = datetime(2016, 1, 1, 0, 0)
    end = datetime(2019, 1, 1, 0, 0, 0)
    response = client.query(field.range(10, start, end))
    self.assertEqual([5, 7], response.result.row.columns)

    # A clearing import of the same data should leave every row empty.
    client.import_field(field, csv_column_reader(StringIO(csv_text)),
                        fast_import=True, clear=True)
    response = client.query(self.index.batch_query(
        field.row(2),
        field.row(7),
        field.row(10),
    ))
    self.assertEqual(3, len(response.results))
    for res in response.results:
        self.assertEqual([], res.row.columns)
def test_csv_import2(self):
    """Guards against text-encoding errors during import on Python 2.x."""
    csv_text = u"""
        1,10,683793200
        5,20,683793300
        3,41,683793385
        10,10485760,683793385
    """
    client = self.get_client()
    schema = client.schema()
    field = schema.index(self.index.name).field(
        "importfield", time_quantum=TimeQuantum.YEAR_MONTH_DAY_HOUR)
    client.sync_schema(schema)
    client.import_field(field, csv_column_reader(StringIO(csv_text)))
def test_invalid_input(self):
    """Malformed CSV lines raise PilosaError when the reader is consumed."""
    bad_lines = (
        u"155",         # fewer than two columns
        u"a5,155",      # row ID is not an integer
        u"155,a5",      # column ID is not an integer
        u"155,255,a5",  # timestamp is not an integer
    )
    for line in bad_lines:
        self.assertRaises(PilosaError, list, csv_column_reader(StringIO(line)))
def test_csv_column_reader_row_key_column_id(self):
    """Reader yields Columns when rows are keyed and columns are integer IDs."""
    reader = csv_column_reader(StringIO(u"""
        one,10,683793200
        five,20,683793300
        three,41,683793385
        ten,10485760,683793385
    """), formatfunc=csv_row_key_column_id)
    expected = [
        Column(row_key="one", column_id=10, timestamp=683793200),
        Column(row_key="five", column_id=20, timestamp=683793300),
        Column(row_key="three", column_id=41, timestamp=683793385),
        Column(row_key="ten", column_id=10485760, timestamp=683793385),
    ]
    self.assertEqual(expected, list(reader))
def test_csv_column_reader_row_id_column_key(self):
    """Reader yields Columns when rows are integer IDs and columns are keyed."""
    reader = csv_column_reader(StringIO(u"""
        1,ten,683793200
        5,twenty,683793300
        3,forty-one,683793385
        10,a-big-number,683793385
    """), formatfunc=csv_row_id_column_key)
    expected = [
        Column(row_id=1, column_key="ten", timestamp=683793200),
        Column(row_id=5, column_key="twenty", timestamp=683793300),
        Column(row_id=3, column_key="forty-one", timestamp=683793385),
        Column(row_id=10, column_key="a-big-number", timestamp=683793385),
    ]
    self.assertEqual(expected, list(reader))
def test_csv_field_value_column_key(self):
    """Reader yields FieldValues for keyed columns with integer values."""
    reader = csv_column_reader(StringIO(u"""
        ten,1
        twenty,5
        forty-one,3
        a-big-number,10
    """), formatfunc=csv_column_key_value)
    expected = [
        FieldValue(column_key="ten", value=1),
        FieldValue(column_key="twenty", value=5),
        FieldValue(column_key="forty-one", value=3),
        FieldValue(column_key="a-big-number", value=10),
    ]
    self.assertEqual(expected, list(reader))
def test_csv_import_time_field(self):
    """Import into a time-quantum field and read each row's column back."""
    csv_text = u"""
        1,10,683793200
        5,20,683793300
        3,41,683793385
        10,10485760,683793385
    """
    client = self.get_client()
    schema = client.schema()
    field = schema.index(self.index.name).field(
        "importfield", time_quantum=TimeQuantum.YEAR_MONTH_DAY_HOUR)
    client.sync_schema(schema)
    client.import_field(field, csv_column_reader(StringIO(csv_text)))

    response = client.query(self.index.batch_query(
        field.row(1), field.row(5), field.row(3), field.row(10)))
    self.assertEqual([10, 20, 41, 10485760],
                     [res.row.columns[0] for res in response.results])
def test_csv_column_reader_row_key_column_key(self):
    """Reader yields Columns when both rows and columns are keyed."""
    reader = csv_column_reader(StringIO(u"""
        one,ten,683793200
        five,twenty,683793300
        three,forty-one,683793385
        ten,a-big-number,683793385
    """), formatfunc=csv_row_key_column_key)
    expected = [
        Column(row_key="one", column_key="ten", timestamp=683793200),
        Column(row_key="five", column_key="twenty", timestamp=683793300),
        Column(row_key="three", column_key="forty-one", timestamp=683793385),
        Column(row_key="ten", column_key="a-big-number", timestamp=683793385),
    ]
    self.assertEqual(expected, list(reader))
def test_csv_import_row_keys_manual_address(self):
    """Row-keyed import works against a client with a manual address."""
    client = self.get_client_manual_address()
    csv_text = u"""
        ten, 7
        ten, 5
        two, 3
        seven, 1
    """
    field = self.index.field("importfield-keys", keys=True)
    client.ensure_field(field)
    client.import_field(field,
                       csv_column_reader(StringIO(csv_text),
                                         formatfunc=csv_row_key_column_id))
    response = client.query(self.index.batch_query(
        field.row("two"),
        field.row("seven"),
        field.row("ten"),
    ))
    self.assertEqual(3, len(response.results))
    self.assertEqual([3, 1, 5],
                     [res.row.columns[0] for res in response.results])
def test_csvbititerator(self):
    """batch_columns groups columns into shard-keyed batches of a given size."""
    reader = csv_column_reader(StringIO(u"""
        1,10,683793200
        5,20,683793300
        3,41,683793385
        10,10485760,683793385
    """))
    groups = list(batch_columns(reader, 2))
    # Expected (shard, batch length): two columns fill the first shard-0
    # batch, the third spills into a second shard-0 batch, and the large
    # column ID lands in shard 10.
    expected = [(0, 2), (0, 1), (10, 1)]
    self.assertEqual(len(expected), len(groups))
    for (shard, batch), (want_shard, want_len) in zip(groups, expected):
        self.assertEqual(want_shard, shard)
        self.assertEqual(want_len, len(list(batch)))
def test_csv_column_reader_row_id_column_id(self):
    """Same grouping behavior when an explicit shard width is passed.

    NOTE(review): despite the name, this exercises ``batch_columns`` with
    ``DEFAULT_SHARD_WIDTH``, mirroring ``test_csvbititerator``.
    """
    from pilosa.client import DEFAULT_SHARD_WIDTH
    reader = csv_column_reader(StringIO(u"""
        1,10,683793200
        5,20,683793300
        3,41,683793385
        10,10485760,683793385
    """))
    groups = list(batch_columns(reader, 2, DEFAULT_SHARD_WIDTH))
    # Expected (shard, batch length) — same distribution as the
    # default-width case.
    expected = [(0, 2), (0, 1), (10, 1)]
    self.assertEqual(len(expected), len(groups))
    for (shard, batch), (want_shard, want_len) in zip(groups, expected):
        self.assertEqual(want_shard, shard)
        self.assertEqual(want_len, len(list(batch)))
# Create the schema: a "repository" index with a time-quantum stargazer
# field and a plain language field.
client = pilosa.Client()
schema = client.schema()
repository = schema.index("repository")
stargazer = repository.field("stargazer",
                             time_quantum=pilosa.TimeQuantum.YEAR_MONTH_DAY)
language = repository.field("language")
client.sync_schema(schema)


def time_func(s):
    """Convert a 'YYYY-MM-DDTHH:MM' string to an epoch-seconds integer."""
    return int(time.mktime(time.strptime(s, "%Y-%m-%dT%H:%M")))


# Load the data into the stargazer field.
with open("stargazer.csv") as f:
    stargazer_reader = csv_column_reader(f, timefunc=time_func)
    client.import_field(stargazer, stargazer_reader)

# Load the data into the language field.
# NOTE(review): csv_row_id_column_id is passed positionally; every other call
# in this codebase passes format functions via formatfunc= — confirm this
# positional argument lands on the intended parameter.
with open("language.csv") as f:
    language_reader = csv_column_reader(f, csv_row_id_column_id)
    client.import_field(language, language_reader)

# Query 1: which repositories did user 14 star?
response = client.query(stargazer.row(14))
print("User 14 starred: ", response.result.row.columns)

# Query 2: what are the top 5 programming languages in the sample data?