def test_html(self):
    """Render a CSV-sourced stream with format="html" and compare the markup."""
    # NOTE(review): the single spaces between tags in the expected literal
    # may be collapsed line breaks -- confirm against the HTML writer output.
    expected_html = """<h2></h2><table> <tr><th>a</th><th>b</th></tr> <tr><td>1</td><td>2</td></tr> </table> """
    stream = Babe().pull(string=self.s, format="csv")
    rendered = stream.to_string(format="html")
    self.assertEqual(rendered, expected_html)
def test_http(self):
    """Pull remote/files/test.csv over HTTP from localhost and check its CSV dump."""
    stream = Babe().pull(
        protocol='http',
        host='localhost',
        name='Test',
        filename='remote/files/test.csv',
        port=self.port,
    )
    self.assertEqual(
        stream.to_string(),
        'foo,bar,f,d\n1,2,3.2,2010/10/02\n3,4,1.2,2011/02/02\n',
    )
def test_join_none(self):
    """Join self.s1 with self.s2_bis (country -> country_code) using ON_ERROR_NONE.

    The joined stream, dumped as a string, must equal self.sjoined_bis.
    """
    left = Babe().pull(string=self.s1, format='csv')
    right = Babe().pull(string=self.s2_bis, format='csv')
    joined = left.join(
        join_stream=right,
        key='country',
        join_key='country_code',
        on_error=Babe.ON_ERROR_NONE,
    )
    self.assertEqual(joined.to_string(), self.sjoined_bis)
def test_replace(self):
    """Replace every row of tests/test.csv by [foo + 1, bar * 2] under fields a, b."""
    stream = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    stream = stream.mapTo(
        lambda row: [row.foo + 1, row.bar * 2],
        fields=['a', 'b'],
    )
    out = StringIO()
    stream.push(stream=out, format='csv')
    expected = """a,b\n2,4\n4,8\n"""
    self.assertEqual(out.getvalue(), expected)
def test_multi2(self):
    """Chain two pulls of self.s (stream, then string) and merge the substreams.

    The merged CSV output must equal self.s2.
    """
    source = Babe()
    source = source.pull(stream=StringIO(self.s), format='csv')
    source = source.pull(string=self.s, format='csv')
    merged = source.merge_substreams()
    out = StringIO()
    merged.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), self.s2)
def test_twitter(self):
    """Pull from Twitter, keep a fixed set of columns, detect types, dump as CSV.

    No assertion is made on the output; the test only exercises the pipeline.
    """
    kept_columns = [
        "author_name",
        "author_id",
        "author_screen_name",
        "created_at",
        "hashtags",
        "text",
        "in_reply_to_status_id_str",
    ]
    tweets = Babe().pull_twitter()
    tweets = tweets.filterColumns(keep_fields=kept_columns)
    tweets = tweets.typedetect()
    sink = StringIO()
    tweets.push(stream=sink, format='csv')
def test_partition(self):
    """Partition self.s on the date field and check the CSV written per date."""
    stream = Babe().pull(string=self.s, format='csv')
    stream = stream.partition(field='date')
    # push() fills this dict with one buffer per partition value.
    outputs = {}
    stream.push(stream_dict=outputs, format="csv")
    self.assertEqual(
        outputs['2012-04-04'].getvalue(),
        'date,name,value\n2012-04-04,John,1\n2012-04-04,Luke,2\n',
    )
    self.assertEqual(
        outputs['2012-04-05'].getvalue(),
        'date,name,value\n2012-04-05,John,1\n',
    )
def test_user_agent(self):
    """Run user_agent() on the useragent field and compare the dump with self.s2.

    The outputs are requested under the names os, browser and browser_version.
    """
    stream = Babe().pull(string=self.s, format="csv")
    stream = stream.user_agent(
        field="useragent",
        output_os="os",
        output_browser="browser",
        output_browser_version="browser_version",
    )
    self.assertEqual(stream.to_string(), self.s2)
def test_partition(self):
    """Partition self.s on the date field and check the CSV written per date."""
    stream = Babe().pull(string=self.s, format="csv")
    stream = stream.partition(field="date")
    # push() fills this dict with one buffer per partition value.
    outputs = {}
    stream.push(stream_dict=outputs, format="csv")
    self.assertEqual(
        outputs["2012-04-04"].getvalue(),
        "date,name,value\n2012-04-04,John,1\n2012-04-04,Luke,2\n",
    )
    self.assertEqual(
        outputs["2012-04-05"].getvalue(),
        "date,name,value\n2012-04-05,John,1\n",
    )
def test_bulk(self):
    """Insert, per bulk of two rows, a column b holding the sum of a over the bulk.

    The resulting dump must equal self.s2.
    """
    stream = Babe().pull(stream=StringIO(self.s), format="csv")
    stream = stream.typedetect()
    # The parameter is called `rows` so the builtin `list` is not shadowed.
    stream = stream.bulkMapTo(
        lambda rows: [[sum([r.a for r in rows])]] * len(rows),
        bulk_size=2,
        insert_fields=["b"],
    )
    self.assertEqual(stream.to_string(), self.s2)
def test_zip(self):
    """Write self.s to tests/test.zip, read the archive back and compare."""
    writer = Babe().pull(stream=StringIO(self.s), format="csv")
    writer.push(filename='tests/test.zip')
    reader = Babe().pull(filename='tests/test.zip')
    out = StringIO()
    reader.push(stream=out)
    self.assertEqual(out.getvalue(), self.s)
def test_load(self):
    """Pull one hour of Kontagent logs in sample mode and print the first 10 rows.

    No assertion is made; the CSV dump is only printed.
    """
    start_time = '2012-04-23 11:00'
    end_time = '2012-04-23 12:00'
    stream = Babe().pull_kontagent(start_time, end_time, sample_mode=True)
    out = StringIO()
    stream = stream.head(n=10)
    stream.push(stream=out, format='csv')
    # Parenthesised single-argument form: same output on Python 2, and valid
    # syntax on Python 3.
    print(out.getvalue())
def test_buzzdata(self):
    """Pull an xls dataset through the buzzdata protocol and dump two rows as CSV.

    No assertion is made on the output.
    """
    pull_options = dict(
        protocol='buzzdata',
        dataroom='best-city-contest-worldwide-cost-of-living-index',
        uuid='aINAPyLGur4y37yAyCM7w3',
        username='******',
        format='xls',
    )
    dataset = Babe().pull(**pull_options)
    dataset = dataset.head(2)
    sink = StringIO()
    dataset.push(stream=sink, format='csv')
def test_groupAll(self):
    """Reduce the whole stream to a single row holding the maximum of column b."""
    source = StringIO('a,b\n1,2\n3,4\n1,4\n')
    stream = Babe().pull(stream=source, format="csv").typedetect()
    stream = stream.groupAll(
        reducer=lambda rows: (max([row.b for row in rows]), ),
        fields=['max'],
    )
    out = StringIO()
    stream.push(stream=out, format="csv")
    self.assertEqual(out.getvalue(), "max\n4\n")
def test_sort(self):
    """Sort 10001 (k, v=-k) rows on v and check that the first row is 10000,-10000."""
    babe = Babe()
    lines = ['k,v'] + ['%u,%u' % (i, -i) for i in xrange(0, 10001)]
    data = '\n'.join(lines)
    stream = babe.pull(string=data, name='test', format='csv')
    stream = stream.typedetect()
    stream = stream.sort(field='v')
    stream = stream.head(n=1)
    self.assertEqual(stream.to_string(), 'k,v\n10000,-10000\n')
def test_pull_process(self):
    """Pull the output of `/bin/ls -1 .` as a one-column CSV, write tests/ls.csv.

    No assertion is made on the written file.
    """
    listing_command = ['/bin/ls', '-1', '.']
    listing = Babe().pull(
        command=listing_command,
        source='ls',
        fields=['filename'],
        format="csv",
        encoding='utf8',
    )
    listing.push(filename='tests/ls.csv')
def test_windowMap(self):
    """Map a window of size 3 to the sum of column a over the window's rows.

    The expected output starts with 1, 3, 6 (partial windows) and then holds
    the sum of the last three values for every following row.
    """
    source = StringIO('a\n1\n2\n3\n4\n5\n6\n7\n')
    stream = Babe().pull(stream=source, format="csv").typedetect()
    stream = stream.windowMap(
        3,
        lambda rows: rows[-1]._make([sum([row.a for row in rows])]),
    )
    out = StringIO()
    stream.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), 'a\n1\n3\n6\n9\n12\n15\n18\n')
def test_sqldump(self):
    """Pull table foobar from the SQL dump in self.s and compare the CSV with self.s2."""
    stream = Babe().pull(
        stream=StringIO(self.s),
        format='sql',
        table='foobar',
        fields=['id', 'number', 'title', 'datetime'],
    )
    out = StringIO()
    stream.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), self.s2)
def test_load_partition(self):
    """Push one sampled Kontagent hour into a stream dict and check its single key."""
    start_time = '2012-04-23 11:00'
    end_time = '2012-04-23 12:00'
    stream = Babe().pull_kontagent(start_time, end_time, sample_mode=True)
    stream = stream.head(n=10)
    # push() fills this dict with one entry per partition.
    outputs = {}
    stream.push(stream_dict=outputs, format='csv')
    self.assertEqual(list(outputs.keys()), ['2012-04-23_11'])
def test_sort(self):
    """Sort 10001 (k, v=-k) rows on v and check that the first row is 10000,-10000."""
    babe = Babe()
    lines = ["k,v"] + ["%u,%u" % (i, -i) for i in xrange(0, 10001)]
    data = "\n".join(lines)
    stream = babe.pull(string=data, name="test", format="csv")
    stream = stream.typedetect()
    stream = stream.sort(field="v")
    stream = stream.head(n=1)
    self.assertEqual(stream.to_string(), "k,v\n10000,-10000\n")
def test_mail(self):
    """Pull two CSV tables from in-memory streams and mail them in the body.

    No assertion is made; the test only exercises mail().
    """
    first_table = StringIO(self.s1)
    tables = Babe().pull(stream=first_table, source="Table 1", format='csv')
    second_table = StringIO(self.s2)
    tables = tables.pull(stream=second_table, source="Table 2", format='csv')
    tables.mail(
        subject="Test",
        recipients="*****@*****.**",
        in_body=True,
    )
def test_insert(self):
    """Append a fooplus column equal to foo + 1 to every row of tests/test.csv."""
    stream = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    stream = stream.mapTo(lambda row: row.foo + 1, insert_fields=['fooplus'])
    # Rows are newline-terminated, as in the other CSV expectations of the
    # suite; explicit escapes keep the separators visible.
    expected = (
        "foo,bar,f,d,fooplus\n"
        "1,2,3.2,2010/10/02,2\n"
        "3,4,1.2,2011/02/02,4\n"
    )
    self.assertEqual(stream.to_string(), expected)
def test_tuple(self):
    """Map each row of tests/test.csv to a copy of itself with foo incremented."""
    stream = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    stream = stream.mapTo(lambda obj: obj._replace(foo=obj.foo + 1))
    # Rows are newline-terminated, as in the other CSV expectations of the
    # suite; explicit escapes keep the separators visible.
    expected = (
        "foo,bar,f,d\n"
        "2,2,3.2,2010/10/02\n"
        "4,4,1.2,2011/02/02\n"
    )
    self.assertEqual(stream.to_string(), expected)
def test_sort(self):
    """Sort 10001 (k, v=-k) rows on v and check that the first row is 10000,-10000."""
    babe = Babe()
    lines = ['k,v'] + ['%u,%u' % (i, -i) for i in xrange(0, 10001)]
    data = '\n'.join(lines)
    stream = babe.pull(string=data, name='test', format='csv')
    stream = stream.typedetect()
    stream = stream.sort(field='v')
    stream = stream.head(n=1)
    self.assertEqual(stream.to_string(), 'k,v\n10000,-10000\n')
def test_load_partition(self):
    """Push one sampled Kontagent hour into a stream dict and check its single key."""
    start_time = "2012-04-23 11:00"
    end_time = "2012-04-23 12:00"
    stream = Babe().pull_kontagent(start_time, end_time, sample_mode=True)
    stream = stream.head(n=10)
    # push() fills this dict with one entry per partition.
    outputs = {}
    stream.push(stream_dict=outputs, format="csv")
    self.assertEqual(list(outputs.keys()), ["2012-04-23_11"])
def test_ftp(self):
    """Upload tests/test.csv to the FTP server on localhost and read it back.

    The CSV read back must equal test_csv_content.
    """
    babe = Babe()
    local = babe.pull(filename='tests/test.csv', name='Test')
    local.push(
        filename='test.csv',
        protocol='ftp',
        user=self.user,
        password=self.password,
        host='localhost',
        port=self.port,
        protocol_early_check=False,
    )
    remote = babe.pull(
        filename='test.csv',
        name='Test',
        protocol='ftp',
        user=self.user,
        password=self.password,
        host='localhost',
        port=self.port,
    )
    out = StringIO()
    remote.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), test_csv_content)
def test_s3(self):
    """Push a small tab-delimited CSV to bucket bertrandtest as test_gs.csv.

    No assertion is made on the upload.
    """
    # NOTE(review): the test is named test_s3 but pushes with protocol="gs"
    # -- confirm which backend is meant to be covered here.
    payload = "a,b\n1,2\n3,4\n"
    upload = Babe().pull(string=payload, format='csv', name='Test')
    upload.push(
        filename='test_gs.csv',
        bucket='bertrandtest',
        delimiter="\t",
        protocol="gs",
    )
def test_groupby(self):
    """Group rows on column a and reduce each group to (key, sum of b)."""
    source = StringIO('a,b\n1,2\n3,4\n1,4\n')
    stream = Babe().pull(stream=source, format="csv").typedetect()
    stream = stream.groupBy(
        key="a",
        reducer=lambda key, rows: (key, sum([row.b for row in rows])),
    )
    out = StringIO()
    stream.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), "a,b\n1,6\n3,4\n")
def test_parse(self):
    """Run parse_time() on the time field (CET -> GMT) and compare with self.s2.

    The outputs are requested under the names time, date and hour.
    """
    stream = Babe().pull(stream=StringIO(self.s), format='csv')
    stream = stream.parse_time(
        field="time",
        output_time="time",
        output_date="date",
        output_hour="hour",
        input_timezone="CET",
        output_timezone="GMT",
    )
    self.assertEqual(stream.to_string(), self.s2)
def test_pull_push(self):
    """Chain map, multimap, sort and groupkey on tests/test.csv, write tests/test2.csv.

    No assertion is made on the written file.
    """
    column_transforms = {
        'bar': lambda x: x + 1,
        'f': lambda f: f / 2,
    }
    pipeline = Babe().pull('tests/test.csv', name='Test')
    pipeline = pipeline.typedetect()
    pipeline = pipeline.map('foo', lambda x: -x)
    pipeline = pipeline.multimap(column_transforms)
    pipeline = pipeline.sort('foo')
    pipeline = pipeline.groupkey('foo', int.__add__, 0, keepOriginal=True)
    pipeline.push(filename='tests/test2.csv')
def test_user_agent(self):
    """Run user_agent() on the useragent field and compare the CSV with self.s2.

    The outputs are requested under the names os, browser and browser_version.
    """
    stream = Babe().pull(stream=StringIO(self.s), format="csv")
    out = StringIO()
    stream = stream.user_agent(
        field="useragent",
        output_os="os",
        output_browser="browser",
        output_browser_version="browser_version",
    )
    stream.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), self.s2)
def test_buzzdata(self):
    """Pull an xls dataset through the buzzdata protocol and render two rows.

    No assertion is made on the rendered string.
    """
    pull_options = dict(
        protocol='buzzdata',
        dataroom='best-city-contest-worldwide-cost-of-living-index',
        uuid='aINAPyLGur4y37yAyCM7w3',
        username='******',
        format='xls',
    )
    dataset = Babe().pull(**pull_options)
    dataset = dataset.head(2)
    dataset.to_string()
def test_tuple(self):
    """Map each row of tests/test.csv to a copy of itself with foo incremented."""
    stream = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    stream = stream.mapTo(lambda obj: obj._replace(foo=obj.foo + 1))
    out = StringIO()
    stream.push(stream=out, format='csv')
    # Rows are newline-terminated, as in the other CSV expectations of the
    # suite; explicit escapes keep the separators visible.
    expected = (
        "foo,bar,f,d\n"
        "2,2,3.2,2010/10/02\n"
        "4,4,1.2,2011/02/02\n"
    )
    self.assertEqual(out.getvalue(), expected)
def test_partition(self):
    """Partition self.s on the date field and check the CSV written per date."""
    stream = Babe().pull(stream=StringIO(self.s), format='csv')
    stream = stream.partition(field='date')
    # push() fills this dict with one buffer per partition value.
    outputs = {}
    stream.push(stream_dict=outputs, format="csv")
    self.assertEqual(
        outputs['2012-04-04'].getvalue(),
        'date,name,value\n2012-04-04,John,1\n2012-04-04,Luke,2\n',
    )
    self.assertEqual(
        outputs['2012-04-05'].getvalue(),
        'date,name,value\n2012-04-05,John,1\n',
    )
def test_mail(self):
    """Pull two CSV tables from strings and mail them in the message body.

    No assertion is made; the test only exercises mail().
    """
    tables = Babe().pull(string=self.s1, source="Table 1", format='csv')
    tables = tables.pull(string=self.s2, source="Table 2", format='csv')
    mail_options = dict(
        subject="Test",
        recipients="*****@*****.**",
        in_body=True,
    )
    tables.mail(**mail_options)
def test_insert(self):
    """Append a fooplus column equal to foo + 1 to every row of tests/test.csv."""
    stream = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    stream = stream.mapTo(lambda row: row.foo + 1, insert_fields=['fooplus'])
    out = StringIO()
    stream.push(stream=out, format='csv')
    # Rows are newline-terminated, as in the other CSV expectations of the
    # suite; explicit escapes keep the separators visible.
    expected = (
        "foo,bar,f,d,fooplus\n"
        "1,2,3.2,2010/10/02,2\n"
        "3,4,1.2,2011/02/02,4\n"
    )
    self.assertEqual(out.getvalue(), expected)
def test_buzzdata(self):
    """Pull an xls dataset through the buzzdata protocol and render two rows.

    No assertion is made on the rendered string.
    """
    pull_options = {
        "protocol": "buzzdata",
        "dataroom": "best-city-contest-worldwide-cost-of-living-index",
        "uuid": "aINAPyLGur4y37yAyCM7w3",
        "username": "******",
        "format": "xls",
    }
    dataset = Babe().pull(**pull_options)
    dataset = dataset.head(2)
    dataset.to_string()
def test_csv_read_write_2_default_delimiter_to_string_bug(self):
    """Write a CSV string to tests/files/test4.csv and check the file content.

    No delimiter is passed to push(); the file must equal the input string.
    """
    # Rows are newline-terminated, as in the other CSV expectations of the
    # suite; explicit escapes keep the separators visible.
    content = (
        "foo,bar,f,d\n"
        "1,2,3.2,2010/10/02\n"
        "3,4,1.2,2011/02/02\n"
    )
    stream = Babe().pull(string=content, format='csv', name='Test')
    stream.push(filename='tests/files/test4.csv')
    with open('tests/files/test4.csv') as written:
        self.assertEqual(written.read(), content)
def test_sortdiskbased(self):
    """Sort 100001 (k, v=-k) rows on v with sort_diskbased(nsize=10000).

    The first row of the sorted stream must be 100000,-100000.
    """
    babe = Babe()
    lines = ['k,v'] + ['%u,%u' % (i, -i) for i in xrange(0, 100001)]
    data = '\n'.join(lines)
    stream = babe.pull(stream=StringIO(data), name='test', format='csv')
    stream = stream.typedetect()
    stream = stream.sort_diskbased(field='v', nsize=10000)
    stream = stream.head(n=1)
    out = StringIO()
    # The return value of push() is not needed; only the buffer is checked.
    stream.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), 'k,v\n100000,-100000\n')
def test_sortdiskbased(self):
    """Sort 100001 (k, v=-k) rows on v with sort_diskbased(nsize=10000).

    The first row of the sorted stream must be 100000,-100000.
    """
    babe = Babe()
    lines = ['k,v'] + ['%u,%u' % (i, -i) for i in xrange(0, 100001)]
    data = '\n'.join(lines)
    stream = babe.pull(stream=StringIO(data), name='test', format='csv')
    stream = stream.typedetect()
    stream = stream.sort_diskbased(field='v', nsize=10000)
    stream = stream.head(n=1)
    out = StringIO()
    # The return value of push() is not needed; only the buffer is checked.
    stream.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), 'k,v\n100000,-100000\n')
def test_tuple(self):
    """Map each row of tests/test.csv to a copy of itself with foo incremented."""
    stream = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    stream = stream.mapTo(lambda obj: obj._replace(foo=obj.foo + 1))
    out = StringIO()
    stream.push(stream=out, format='csv')
    # Rows are newline-terminated, as in the other CSV expectations of the
    # suite; explicit escapes keep the separators visible.
    expected = (
        "foo,bar,f,d\n"
        "2,2,3.2,2010/10/02\n"
        "4,4,1.2,2011/02/02\n"
    )
    self.assertEqual(out.getvalue(), expected)
def test_insert(self):
    """Append a fooplus column equal to foo + 1 to every row of tests/test.csv."""
    stream = Babe().pull(filename='tests/test.csv', name='Test').typedetect()
    stream = stream.mapTo(lambda row: row.foo + 1, insert_fields=['fooplus'])
    out = StringIO()
    stream.push(stream=out, format='csv')
    # Rows are newline-terminated, as in the other CSV expectations of the
    # suite; explicit escapes keep the separators visible.
    expected = (
        "foo,bar,f,d,fooplus\n"
        "1,2,3.2,2010/10/02,2\n"
        "3,4,1.2,2011/02/02,4\n"
    )
    self.assertEqual(out.getvalue(), expected)
def test_csv_read_write(self):
    """Round-trip a tab-delimited CSV string through tests/files/test2.csv."""
    # Tab-separated fields, newline-terminated rows.
    content = (
        "foo\tbar\tf\td\n"
        "1\t2\t3.2\t2010/10/02\n"
        "3\t4\t1.2\t2011/02/02\n"
    )
    stream = Babe().pull(string=content, format='csv', name='Test',
                         delimiter='\t')
    stream.push(filename='tests/files/test2.csv', delimiter='\t')
    with open('tests/files/test2.csv') as written:
        self.assertEqual(written.read(), content)
def test_ftpzip(self):
    """Push tests/test.csv over FTP to localhost, compressed into test.zip.

    No assertion is made on the upload.
    """
    ftp_target = dict(
        filename='test.csv',
        compress='test.zip',
        protocol='ftp',
        user=self.user,
        password=self.password,
        host='localhost',
        port=self.port,
        protocol_early_check=False,
    )
    local = Babe().pull(filename='tests/test.csv', name='Test')
    local.push(**ftp_target)
def test_parse(self):
    """Run parse_time() on the time field (CET -> GMT) and compare with self.s2.

    The outputs are requested under the names time, date and hour.
    """
    stream = Babe().pull(stream=StringIO(self.s), format='csv')
    out = StringIO()
    stream = stream.parse_time(
        field="time",
        output_time="time",
        output_date="date",
        output_hour="hour",
        input_timezone="CET",
        output_timezone="GMT",
    )
    stream.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), self.s2)
def test_http(self):
    """Pull remote/test.csv over HTTP from localhost and check its CSV dump."""
    stream = Babe().pull(
        protocol='http',
        host='localhost',
        name='Test',
        filename='remote/test.csv',
        port=self.port,
    )
    out = StringIO()
    stream.push(stream=out, format='csv')
    self.assertEqual(
        out.getvalue(),
        'foo,bar,f,d\n1,2,3.2,2010/10/02\n3,4,1.2,2011/02/02\n',
    )
def pull_kontagent(nostream, start_time, end_time, sample_mode=False,
                   discard_names=None, **kwargs):
    """
    Generate a stream of Kontagent log rows, one sub-stream per hour.

    For every hour returned by enumerate_period_per_hour(start_time,
    end_time, referent_timezone) the generator fetches the list of log
    files, downloads them through the local file cache and yields:

      - a header (kt_msg) whose partition holds the "date" and "hour",
      - every row returned by process_file() for each downloaded file,
      - a StreamFooter.

    Hours for which the listing is "No files available" are skipped.

    Usage:
        a = Babe().pull_kontagent(start_time, end_time, sample_mode=True)

    nostream : not used by this function
    start_time : hour of the first stream to get
    end_time : hour of the last stream to get
    sample_mode : when true, only the first file of each hour is processed
    discard_names : optional iterable, turned into a set and handed to
        process_file()

    Settings read through Babe.get_config_with_env (section "kontagent"):
    timezone (optional, default utc): the timezone used to interpret the period
    KT_USER : user id
    KT_APPID : id of the app
    KT_PASS : password of the user
    KT_FILECACHE : local copy of kontagent files.

    Version 1.1
    """
    referent_timezone = Babe.get_config_with_env(
        "kontagent", "timezone", kwargs, "utc")
    kt_user = Babe.get_config_with_env("kontagent", "KT_USER", kwargs)
    kt_pass = Babe.get_config_with_env("kontagent", "KT_PASS", kwargs)
    # NOTE(review): unlike the other settings, kwargs is not forwarded for
    # KT_FILECACHE -- confirm this is intentional.
    kt_filecache = Babe.get_config_with_env(
        section='kontagent', key='KT_FILECACHE')
    discard_names = set(discard_names) if discard_names else set()
    if not os.path.exists(kt_filecache):
        os.makedirs(kt_filecache)
    kt_appid = Babe.get_config_with_env("kontagent", "KT_APPID", kwargs)

    for hour in enumerate_period_per_hour(
            start_time, end_time, referent_timezone):
        url = get_url(hour, kt_user, kt_pass, kt_appid)
        # NOTE(review): get_url() receives the credentials; if it embeds them
        # in the URL they end up in the log -- confirm.
        log.info("Kontagent: retrieving list: %s" % url)

        # Close the HTTP response even when read() fails.
        response = urllib.urlopen(url)
        try:
            listing = response.read()
        finally:
            response.close()
        if listing == "No files available":
            continue

        file_urls = json.loads(listing)
        if sample_mode and file_urls:
            # Sample mode: just process the first file.
            file_urls = file_urls[:1]

        # Download the files of this hour with 8 workers; the pool is closed
        # even if one download raises.
        pool = Pool(8)
        try:
            downloaded_files = pool.map(
                lambda file_url: read_url_with_cache(
                    file_url, kt_user, kt_pass, kt_filecache),
                file_urls)
        finally:
            pool.close()

        yield kt_msg.replace(partition=[
            ("date", datetime.date(hour.year, hour.month, hour.day)),
            ("hour", hour.hour)])

        # One gzip process at a time. The finally clause releases its pipes
        # and reaps it even when the consumer stops iterating early (the
        # generator is then closed at a yield).
        for path in downloaded_files:
            gunzip = Popen(['gzip', '-d', '-c', path],
                           stdin=PIPE, stdout=PIPE)
            try:
                for row in process_file(hour, gunzip.stdout, discard_names):
                    yield row
            finally:
                gunzip.stdin.close()
                gunzip.stdout.close()
                gunzip.wait()

        yield StreamFooter()
def test_ftpzip(self):
    """Push tests/files/test.csv over FTP to localhost, compressed into test.zip.

    No assertion is made on the upload.
    """
    ftp_target = dict(
        filename='test.csv',
        compress='test.zip',
        protocol='ftp',
        user=self.user,
        password=self.password,
        host='localhost',
        port=self.port,
        protocol_early_check=False,
    )
    local = Babe().pull(filename='tests/files/test.csv', name='Test')
    local.push(**ftp_target)
def test_log(self):
    """Log tests/test.csv to one buffer while pushing it as CSV to another.

    Both buffers must hold the same expected text.
    """
    log_sink = StringIO()
    csv_sink = StringIO()
    stream = Babe().pull('tests/test.csv', name='Test')
    stream = stream.log(stream=log_sink)
    stream.push(stream=csv_sink, format='csv')
    # NOTE(review): the single spaces in this literal may be collapsed
    # delimiters / line breaks -- confirm against the writer's real output.
    expected = """foo bar f d 1 2 3.2 2010/10/02 3 4 1.2 2011/02/02 """
    self.assertEqual(expected, log_sink.getvalue())
    self.assertEqual(expected, csv_sink.getvalue())
def test_split(self):
    """Split field b on ':' so that a row with two values becomes two rows."""
    # Rows are newline-terminated, as in the other CSV fixtures of the suite;
    # explicit escapes keep the separators visible.
    data = (
        "a,b\n"
        "1,3:4\n"
        "2,7\n"
    )
    expected = (
        "a,b\n"
        "1,3\n"
        "1,4\n"
        "2,7\n"
    )
    stream = Babe().pull(string=data, format='csv', name='Test')
    stream = stream.split(field='b', separator=':')
    self.assertEqual(stream.to_string(), expected)
def test_log(self):
    """Log tests/test.csv to one buffer while pushing it as CSV to another.

    Both buffers must hold the same comma-delimited text.
    """
    log_sink = StringIO()
    csv_sink = StringIO()
    stream = Babe().pull(filename='tests/test.csv', source='Test')
    stream = stream.log(logfile=log_sink)
    stream.push(stream=csv_sink, format='csv')
    # Rows are newline-terminated, as in the other CSV expectations of the
    # suite; explicit escapes keep the separators visible.
    expected = (
        "foo,bar,f,d\n"
        "1,2,3.2,2010/10/02\n"
        "3,4,1.2,2011/02/02\n"
    )
    self.assertEqual(expected, log_sink.getvalue())
    self.assertEqual(expected, csv_sink.getvalue())
def test_gz(self):
    """Write self.s to test.csv.gz, read the file back and compare the CSV."""
    writer = Babe().pull(stream=StringIO(self.s), format='csv', name='Test')
    writer.push(filename='test.csv.gz')
    reader = Babe().pull(filename='test.csv.gz')
    out = StringIO()
    reader.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), self.s)
def test_split(self):
    """Split field b on ':' so that a row with two values becomes two rows."""
    # Rows are newline-terminated, as in the other CSV fixtures of the suite;
    # explicit escapes keep the separators visible.
    source = StringIO(
        "a,b\n"
        "1,3:4\n"
        "2,7\n"
    )
    expected = (
        "a,b\n"
        "1,3\n"
        "1,4\n"
        "2,7\n"
    )
    stream = Babe().pull(stream=source, format='csv', name='Test')
    stream = stream.split(field='b', separator=':')
    out = StringIO()
    stream.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), expected)
def test_pushpull(self):
    """Push self.s2 into a MongoDB collection and pull it back as CSV.

    The collection pybabe_test.test_pushpull is dropped on push; the fields
    rown, f and s are pulled back and must reproduce self.s2.
    """
    writer = Babe().pull(stream=StringIO(self.s2), format='csv',
                         primary_key='rown')
    writer = writer.typedetect()
    writer.push_mongo(db='pybabe_test', collection='test_pushpull',
                      drop_collection=True)
    reader = Babe().pull_mongo(db="pybabe_test", fields=['rown', 'f', 's'],
                               collection='test_pushpull')
    out = StringIO()
    reader.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), self.s2)
def test_vectorwise(self):
    """Push self.s into a vectorwise table and pull it back as CSV.

    Table test_table of database pybabe_test is dropped and re-created on
    push; the comma-delimited CSV pulled back must equal self.s.
    """
    writer = Babe().pull(stream=StringIO(self.s), format='csv')
    writer = writer.typedetect()
    writer.push_sql(
        table='test_table',
        database_kind='vectorwise',
        database='pybabe_test',
        drop_table=True,
        create_table=True,
    )
    reader = Babe().pull_sql(
        database_kind='vectorwise',
        database='pybabe_test',
        table='test_table',
    )
    out = StringIO()
    reader.push(stream=out, format='csv', delimiter=',')
    self.assertEqual(out.getvalue(), self.s)
def test_vectorwise(self):
    """Push self.s into a vectorwise table and pull it back as a string.

    Table test_table of database pybabe_test is dropped and re-created on
    push; the dump of the pulled stream must equal self.s.
    """
    writer = Babe().pull(string=self.s, format='csv')
    writer = writer.typedetect()
    writer.push_sql(
        table='test_table',
        database_kind='vectorwise',
        database='pybabe_test',
        drop_table=True,
        create_table=True,
    )
    reader = Babe().pull_sql(
        database_kind='vectorwise',
        database='pybabe_test',
        table='test_table',
    )
    self.assertEqual(reader.to_string(), self.s)
def test_s3(self):
    """Push a small CSV to bucket florian-test over s3 and read it back."""
    content = "a,b\n1,2\n3,4\n"
    writer = Babe().pull(stream=StringIO(content), format='csv', name='Test')
    writer.push(filename='test3.csv', bucket='florian-test', protocol="s3")
    reader = Babe().pull(filename='test3.csv', name='Test',
                         bucket='florian-test', protocol="s3")
    out = StringIO()
    reader.push(stream=out, format='csv')
    self.assertEqual(out.getvalue(), content)