def test_partition(self): a = Babe().pull(string=self.s, format='csv') a = a.partition(field='date') d = {} a.push(stream_dict=d, format="csv") self.assertEquals(d['2012-04-04'].getvalue(), 'date,name,value\n2012-04-04,John,1\n2012-04-04,Luke,2\n') self.assertEquals(d['2012-04-05'].getvalue(), 'date,name,value\n2012-04-05,John,1\n')
def test_gz(self): a = Babe().pull(stream=StringIO(self.s), format='csv', name='Test') a.push(filename='test.csv.gz') b = Babe().pull(filename='test.csv.gz') buf = StringIO() b.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.s)
def test_replace(self): a = Babe().pull(filename='tests/test.csv', name='Test').typedetect() a = a.mapTo(lambda row : [row.foo+1, row.bar*2], fields=['a','b']) buf = StringIO() a.push(stream=buf, format='csv') s = """a,b\n2,4\n4,8\n""" self.assertEquals(buf.getvalue(), s)
def test_twitter(self): a = Babe().pull_twitter() a = a.filterColumns(keep_fields= ["author_name", "author_id", "author_screen_name", "created_at", "hashtags", "text", "in_reply_to_status_id_str"]) a = a.typedetect() buf = StringIO() a.push(stream=buf, format='csv')
def test_partition(self): a = Babe().pull(string=self.s, format="csv") a = a.partition(field="date") d = {} a.push(stream_dict=d, format="csv") self.assertEquals(d["2012-04-04"].getvalue(), "date,name,value\n2012-04-04,John,1\n2012-04-04,Luke,2\n") self.assertEquals(d["2012-04-05"].getvalue(), "date,name,value\n2012-04-05,John,1\n")
def test_partition_s3(self): a = Babe().pull(stream=StringIO(self.s), format='csv') a = a.partition(field='date') a.push(protocol="s3", bucket="florian-test", format="csv", filename_template='foobar/$date.csv.gz')
def test_filter_values(self): a = Babe().pull(stream=StringIO('a,b\n1,2\n3,4\n1,4\n'), format="csv").typedetect() a = a.filter_values(a=3, b=4) buf = StringIO() a.push(stream=buf, format="csv") self.assertEquals(buf.getvalue(), "a,b\n3,4\n")
def test_filter2(self): a = Babe().pull(stream=StringIO('a,b\n1,2\n3,4\n1,4\n'), format="csv").typedetect() a = a.filterColumns(remove_fields=['a']) buf = StringIO() a.push(stream=buf, format="csv") self.assertEquals(buf.getvalue(), "b\n2\n4\n4\n")
def test_multi(self): a = Babe() a = a.pull(stream=StringIO(self.s), format='csv').pull(stream=StringIO(self.s), format='csv') buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.s + self.s)
def test_gz(self): a = Babe().pull(stream=StringIO(self.s), format='csv', name='Test') a.push(filename='test.csv.gz') b = Babe().pull(filename='test.csv.gz') buf = StringIO() b.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.s)
def test_bulk(self): a = Babe().pull(stream=StringIO(self.s), format="csv") a = a.typedetect() a = a.bulkMapTo(lambda list: [[sum([r.a for r in list])]] * len(list), bulk_size=2, insert_fields=["b"]) buf = StringIO() a.push(stream=buf, format="csv") self.assertEquals(buf.getvalue(), self.s2)
def test_s3(self): s = "a,b\n1,a\n3,b\n" filename = 'tests/test_bq.csv' a = Babe().pull(string=s, format='csv', name='Test') a.push(filename=filename, format='csv', delimiter='\t', quotechar='|', encoding='utf8', bucket='bertrandtest', protocol='gs') b = Babe() b.push_bigquery(filename=filename, bucket='bertrandtest', project_id='bigquery-testing-1098', dataset_id='ladata', table_name='tests', schema=[ { "name": "entier", "type": "INTEGER", "mode": "REQUIRED" }, { "name": "string", "type": "STRING", "mode": "REQUIRED" } ])
def test_replace(self): a = Babe().pull(filename='tests/test.csv', name='Test').typedetect() a = a.mapTo(lambda row: [row.foo + 1, row.bar * 2], fields=['a', 'b']) buf = StringIO() a.push(stream=buf, format='csv') s = """a,b\n2,4\n4,8\n""" self.assertEquals(buf.getvalue(), s)
def test_tuple(self): a = Babe().pull(stream=StringIO("a,b\n1,2:3\n4,5:6\n"), format="csv") a = a.flatMap( lambda row: [row._replace(b=i) for i in row.b.split(':')]) buf = StringIO() a.push(stream=buf, format="csv") self.assertEquals(buf.getvalue(), "a,b\n1,2\n1,3\n4,5\n4,6\n")
def test_min(self): a = Babe().pull(stream=StringIO('a,b\n1,2\n3,4\n1,4\n'), format="csv").typedetect() a = a.minN(column='a', n=2) buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), 'a,b\n1,2\n1,4\n')
def test_transpose(self): a = Babe().pull(stream=StringIO(self.s), format='csv', primary_key='city').transpose() buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.s2)
def test_rename(self): a = Babe().pull(stream=StringIO('a,b\n1,2\n3,4\n1,4\n'), format="csv").typedetect() a = a.rename(a="c") buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), 'c,b\n1,2\n3,4\n1,4\n')
def test_filter(self): a = Babe().pull(stream=StringIO('a,b\n1,2\n3,4\n1,4\n'), format="csv").typedetect() a = a.filter(function=lambda x: x.a == 3) buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), 'a,b\n3,4\n')
def test_multi2(self): a = Babe() a = a.pull(stream=StringIO(self.s), format='csv').pull(string=self.s, format='csv') a = a.merge_substreams() buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.s2)
def test_windowMap(self): a = Babe().pull(stream=StringIO('a\n1\n2\n3\n4\n5\n6\n7\n'), format="csv").typedetect() a = a.windowMap( 3, lambda rows: rows[-1]._make([sum([row.a for row in rows])])) buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), 'a\n1\n3\n6\n9\n12\n15\n18\n')
def test_load_partition(self): start_time = '2012-04-23 11:00' end_time = '2012-04-23 12:00' a = Babe().pull_kontagent(start_time, end_time, sample_mode=True) a = a.head(n=10) d = {} a.push(stream_dict=d, format='csv') self.assertEquals(list(d.keys()), ['2012-04-23_11'])
def test_multi2(self): a = Babe() a = a.pull(stream=StringIO(self.s), format='csv').pull(string=self.s, format='csv') a = a.merge_substreams() buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.s2)
def test_zip(self): babe = Babe() a = babe.pull(stream=StringIO(self.s), format="csv") a.push(filename='tests/test.zip') b = Babe().pull(filename='tests/test.zip') buf = StringIO() b.push(stream=buf) self.assertEquals(buf.getvalue(), self.s)
def test_load(self): start_time = '2012-04-23 11:00' end_time = '2012-04-23 12:00' a = Babe().pull_kontagent(start_time, end_time, sample_mode=True) buf = StringIO() a = a.head(n=10) a.push(stream=buf, format='csv') print buf.getvalue()
def test_zip(self): babe = Babe() a = babe.pull(stream=StringIO(self.s), format="csv") a.push(filename='tests/test.zip') b = Babe().pull(filename='tests/test.zip') buf = StringIO() b.push(stream=buf) self.assertEquals(buf.getvalue(), self.s)
def test_vectorwise(self): a = Babe().pull(stream=StringIO(self.s), format='csv') a = a.typedetect() a.push_sql(table='test_table', database_kind='vectorwise', database='pybabe_test', drop_table = True, create_table=True) b = Babe().pull_sql(database_kind='vectorwise', database='pybabe_test', table='test_table') buf = StringIO() b.push(stream=buf, format='csv', delimiter=',') self.assertEquals(buf.getvalue(), self.s)
def test_buzzdata(self): a = Babe().pull(protocol='buzzdata', dataroom='best-city-contest-worldwide-cost-of-living-index', uuid='aINAPyLGur4y37yAyCM7w3', username='******', format='xls') a = a.head(2) buf = StringIO() a.push(stream=buf, format='csv')
def test_load(self): start_time = '2012-04-23 11:00' end_time = '2012-04-23 12:00' a = Babe().pull_kontagent(start_time, end_time, sample_mode=True) buf = StringIO() a = a.head(n=10) a.push(stream=buf, format='csv') print buf.getvalue()
def test_pushpull(self): a = Babe().pull(stream=StringIO(self.s2), format='csv', primary_key='rown') a = a.typedetect() a.push_mongo(db='pybabe_test',collection='test_pushpull', drop_collection=True) b = Babe().pull_mongo(db="pybabe_test", fields=['rown', 'f', 's'], collection='test_pushpull') buf = StringIO() b.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.s2)
def test_sqldump(self): a = Babe().pull(stream=StringIO(self.s), format='sql', table='foobar', fields=['id', 'number', 'title', 'datetime']) buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.s2)
def test_groupAll(self): a = Babe().pull(stream=StringIO('a,b\n1,2\n3,4\n1,4\n'), format="csv").typedetect() a = a.groupAll(reducer=lambda rows: (max([row.b for row in rows]), ), fields=['max']) buf = StringIO() a.push(stream=buf, format="csv") self.assertEquals(buf.getvalue(), "max\n4\n")
def test_load_partition(self): start_time = "2012-04-23 11:00" end_time = "2012-04-23 12:00" a = Babe().pull_kontagent(start_time, end_time, sample_mode=True) a = a.head(n=10) d = {} a.push(stream_dict=d, format="csv") self.assertEquals(list(d.keys()), ["2012-04-23_11"])
def test_groupby(self): a = Babe().pull(stream=StringIO('a,b\n1,2\n3,4\n1,4\n'), format="csv").typedetect() a = a.groupBy(key="a", reducer=lambda key, rows: (key, sum([row.b for row in rows]))) buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), "a,b\n1,6\n3,4\n")
def test_s3(self): s = "a,b\n1,2\n3,4\n" a = Babe().pull(string=s, format='csv', name='Test') a.push(filename='test3.csv', bucket='florian-test', protocol="s3") b = Babe().pull(filename='test3.csv', name='Test', bucket='florian-test', protocol="s3") self.assertEquals(b.to_string(), s)
def test_s3(self): s = "a,b\n1,2\n3,4\n" a = Babe().pull(string=s, format='csv', name='Test') a.push(filename='test_gs.csv', bucket='bertrandtest', delimiter="\t", protocol="gs")
def test_s3_glob2(self): s = "a,b\n1,2\n3,4\n" buf1 = StringIO(s) a = Babe().pull(stream=buf1, format='csv', name='Test') a.push(filename='foofoobar/test_glob_4.csv', bucket='florian-test', protocol="s3") b = Babe().pull(filename='foofoobar/test_glob_?.csv', name='Test', bucket='florian-test', protocol="s3") buf = StringIO() b.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), s)
def test_bulk(self): a = Babe().pull(stream=StringIO(self.s), format="csv") a = a.typedetect() a = a.bulkMapTo(lambda list: [[sum([r.a for r in list])]] * len(list), bulk_size=2, insert_fields=["b"]) buf = StringIO() a.push(stream=buf, format="csv") self.assertEquals(buf.getvalue(), self.s2)
def test_user_agent(self): a = Babe().pull(stream=StringIO(self.s), format="csv") buf = StringIO() a = a.user_agent(field="useragent", output_os="os", output_browser="browser", output_browser_version="browser_version") a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.s2)
def test_twitter(self): a = Babe().pull_twitter() a = a.filterColumns(keep_fields=[ "author_name", "author_id", "author_screen_name", "created_at", "hashtags", "text", "in_reply_to_status_id_str" ]) a = a.typedetect() buf = StringIO() a.push(stream=buf, format='csv')
def test_join(self): a = Babe().pull(stream=StringIO(self.s1), format='csv') a = a.join(join_stream=Babe().pull(stream=StringIO(self.s2), format='csv'), key='country', join_key='country_code') buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.sjoined)
def test_insert(self): a = Babe().pull(filename='tests/test.csv', name='Test').typedetect() a = a.mapTo(lambda row : row.foo+1, insert_fields=['fooplus']) buf = StringIO() a.push(stream=buf, format='csv') s = """foo,bar,f,d,fooplus 1,2,3.2,2010/10/02,2 3,4,1.2,2011/02/02,4 """ self.assertEquals(buf.getvalue(), s)
def test_insert(self): a = Babe().pull(filename='tests/test.csv', name='Test').typedetect() a = a.mapTo(lambda row: row.foo + 1, insert_fields=['fooplus']) buf = StringIO() a.push(stream=buf, format='csv') s = """foo,bar,f,d,fooplus 1,2,3.2,2010/10/02,2 3,4,1.2,2011/02/02,4 """ self.assertEquals(buf.getvalue(), s)
def test_join_none(self): a = Babe().pull(stream=StringIO(self.s1), format='csv') a = a.join(join_stream=Babe().pull(stream=StringIO(self.s2_bis), format='csv'), key='country', join_key='country_code', on_error=Babe.ON_ERROR_NONE) buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.sjoined_bis)
def test_tuple(self): a = Babe().pull(filename='tests/test.csv', name='Test').typedetect() a = a.mapTo(lambda obj : obj._replace(foo=obj.foo + 1)) buf = StringIO() a.push(stream=buf, format='csv') s = """foo,bar,f,d 2,2,3.2,2010/10/02 4,4,1.2,2011/02/02 """ self.assertEquals(buf.getvalue(), s)
def test_tuple(self): a = Babe().pull(filename='tests/test.csv', name='Test').typedetect() a = a.mapTo(lambda obj: obj._replace(foo=obj.foo + 1)) buf = StringIO() a.push(stream=buf, format='csv') s = """foo,bar,f,d 2,2,3.2,2010/10/02 4,4,1.2,2011/02/02 """ self.assertEquals(buf.getvalue(), s)
def test_partition(self): a = Babe().pull(stream=StringIO(self.s), format='csv') a = a.partition(field='date') d = {} a.push(stream_dict=d, format="csv") self.assertEquals( d['2012-04-04'].getvalue(), 'date,name,value\n2012-04-04,John,1\n2012-04-04,Luke,2\n') self.assertEquals(d['2012-04-05'].getvalue(), 'date,name,value\n2012-04-05,John,1\n')
def test_buzzdata(self): a = Babe().pull( protocol='buzzdata', dataroom='best-city-contest-worldwide-cost-of-living-index', uuid='aINAPyLGur4y37yAyCM7w3', username='******', format='xls') a = a.head(2) buf = StringIO() a.push(stream=buf, format='csv')
def test_parse(self): a = Babe().pull(stream=StringIO(self.s), format='csv') buf = StringIO() a = a.parse_time(field="time", output_time="time", output_date="date", output_hour="hour", input_timezone="CET", output_timezone="GMT") a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.s2)
def wordcount(): a = Babe().pull(protocol='http', host='www.ietf.org', filename='rfc/rfc1149.txt') a = a.flatMap(lambda row: [(w, 1) for w in re.findall('\w+', row.text)], columns=['word', 'count']) a = a.groupBy(key='word', reducer=lambda word, rows: (word, sum([row.count for row in rows]))) a = a.maxN(column='count', n=10) a.push(stream=sys.stdout, format='csv')
def test_s3(self): s = "a,b\n1,2\n3,4\n" a = Babe().pull(string=s, format='csv', name='Test') a.push(filename='test3.csv', bucket='florian-test', protocol="s3") b = Babe().pull(filename='test3.csv', name='Test', bucket='florian-test', protocol="s3") self.assertEquals(b.to_string(), s)
def test_pushsqlite_partition(self): a = Babe().pull(stream=StringIO(self.s), format='csv') a = a.typedetect() a.push_sql(table='test_table', database_kind='sqlite', database='test.sqlite', drop_table = True, create_table=True) a = Babe().pull(stream=StringIO(self.s2), format='csv') a = a.typedetect() a = a.partition(field='id') a.push_sql(table='test_table', database_kind='sqlite', database='test.sqlite', delete_partition=True) b = Babe().pull_sql(database_kind='sqlite', database='test.sqlite', table='test_table') b = b.sort(field="id") buf = StringIO() b.push(stream=buf, format='csv', delimiter=',') self.assertEquals(buf.getvalue(), self.sr)
def test_csv_escape(self): s = """a\tb\tc 1\tab\t{\\"hello, buzz\\"} 2\tcd\t """ class Dialect(csv.Dialect): lineterminator = '\n' delimiter = ',' doublequote = False escapechar = '\\' quoting = csv.QUOTE_MINIMAL quotechar = '|' b = Babe() b = b.pull(string=s, format='csv', name='Test') b.push(filename='tests/files/test3.csv', dialect=Dialect)
def test_memo(self): tmpfile = NamedTemporaryFile() tmpfile.write(self.s) tmpfile.flush() a = Babe().pull(filename=tmpfile.name, memoize=True, format="csv") buf = StringIO() a.push(stream=buf, format="csv") self.assertEquals(buf.getvalue(), self.s) #os.remove(tmpfile.name) tmpfile.close() self.assertFalse(os.path.exists(tmpfile.name)) b = Babe().pull(filename=tmpfile.name, memoize=True, format="csv") buf2 = StringIO() b.push(stream=buf2, format="csv") self.assertEquals(buf2.getvalue(), self.s) c = Babe().pull(filename=tmpfile.name, memoize=False, format="csv") buf3 = StringIO() self.assertRaises(IOError, lambda : c.push(stream=buf3, format="csv"))
def test_html(self): a = Babe().pull(stream=StringIO(self.s), format="csv") buf = StringIO() a.push(stream=buf, format="html") print buf.getvalue()
def test_partition_s3(self): a = Babe().pull(string=self.s, format="csv") a = a.partition(field="date") a.push(protocol="s3", bucket="florian-test", format="csv", filename_template="foobar/$date.csv.gz")
def test_null(self): a = Babe().pull(stream=StringIO(self.s), format='csv', null_value="NULL") buf = StringIO() a = a.push(stream=buf, format="csv") self.assertEquals(buf.getvalue(), self.s2)
def test_gz(self): a = Babe().pull(string=self.s, format="csv", name="Test") a.push(filename="tests/files/test.csv.gz") b = Babe().pull(filename="tests/files/test.csv.gz") self.assertEquals(b.to_string(), self.s)
def test_gs_load_from_kontagent(self): # export 1 full day bucket = 'bertrandtest' game = 'wordox' day = '20151021' hour = '14' table_name = '{}_{}'.format(game, day) filename = '{}.csv'.format(table_name + hour) result = time.strptime(day + ' ' + hour, '%Y%m%d %H') start_time = datetime(result.tm_year, result.tm_mon, result.tm_mday, result.tm_hour) end_time = start_time + timedelta(hours=1) a = Babe() a = a.pull_kontagent(start_time=start_time, sample_mode=False, end_time=end_time, KT_APPID='869fb4a24faa4c61b702ea137cbe16ad', discard_names=["PointSend"]) a = a.mapTo(decode_data, insert_fields=["decoded_data"]) a = a.filterColumns(keep_fields=v1) a = a.filter(lambda row: uid_type_check(row) is True) a.push(filename=filename, format='csv', delimiter='\t', quotechar='|', encoding='utf8', bucket=bucket, protocol='gs') a.push_bigquery(filename=filename, bucket=bucket, project_id='bigquery-testing-1098', dataset_id='ladata', table_name=table_name, schema=[ { "name": "date", "type": "STRING", "mode": "REQUIRED" }, { "name": "hour", "type": "INTEGER", "mode": "REQUIRED" }, { "name": "time", "type": "TIMESTAMP", "mode": "REQUIRED" }, { "name": "name", "type": "STRING", "mode": "REQUIRED" }, { "name": "uid", "type": "INTEGER" }, { "name": "st1", "type": "STRING" }, { "name": "st2", "type": "STRING" }, { "name": "st3", "type": "STRING" }, { "name": "channel_type", "type": "STRING" }, { "name": "value", "type": "INTEGER" }, { "name": "level", "type": "INTEGER" }, { "name": "recipients", "type": "STRING" }, { "name": "tracking_data", "type": "STRING" }, { "name": "data", "type": "STRING" } ], job_id='{}_{}'.format(start_time, end_time), num_retries=5)
def test_zip(self): a = Babe().pull(string=self.s, format="csv") a.push(filename='tests/files/test.zip') b = Babe().pull(filename='tests/files/test.zip') self.assertEquals(b.to_string(), self.s)
def test_join_none(self): a = Babe().pull(stream=StringIO(self.s1), format='csv') a = a.join(join_stream=Babe().pull(stream=StringIO(self.s2_bis), format='csv'), key='country', join_key='country_code', on_error=Babe.ON_ERROR_NONE) buf = StringIO() a.push(stream=buf, format='csv') self.assertEquals(buf.getvalue(), self.sjoined_bis)