def test_tuple(self):
    """Check that flatMap can fan one row out into several rows.

    Column ``b`` of the input CSV holds colon-separated values. The
    mapping function returns one copy of the row per piece, built with
    ``row._replace``, and the CSV pushed to an in-memory buffer is
    expected to contain one line per piece.
    """
    source = StringIO("a,b\n1,2:3\n4,5:6\n")
    a = Babe().pull(stream=source, format="csv")
    # One output row per colon-separated piece of ``b``; ``a`` is untouched.
    a = a.flatMap(
        lambda row: [row._replace(b=i) for i in row.b.split(':')])
    buf = StringIO()
    a.push(stream=buf, format="csv")
    # assertEqual rather than the deprecated assertEquals alias, which
    # no longer exists on unittest.TestCase in Python 3.12+.
    self.assertEqual(buf.getvalue(), "a,b\n1,2\n1,3\n4,5\n4,6\n")
def wordcount():
    """Write a CSV word-count table for RFC 1149 to standard output.

    Pipeline, in order:
      1. pull ``rfc/rfc1149.txt`` from ``www.ietf.org`` over HTTP;
      2. flatMap every row's ``text`` into ``(word, 1)`` pairs under the
         columns ``word`` and ``count``;
      3. groupBy ``word``, reducing each group to ``(word, total)``;
      4. maxN on ``count`` with ``n=10`` (presumably keeps the ten rows
         with the largest counts -- confirm against the Babe API);
      5. push the result to ``sys.stdout`` as CSV.
    """
    a = Babe().pull(protocol='http', host='www.ietf.org',
                    filename='rfc/rfc1149.txt')
    # Raw string: a plain '\w+' literal relies on an invalid escape
    # sequence, which newer interpreters warn about.
    a = a.flatMap(
        lambda row: [(w, 1) for w in re.findall(r'\w+', row.text)],
        columns=['word', 'count'])
    # Sum the per-occurrence counts of each word without building a
    # temporary list.
    a = a.groupBy(
        key='word',
        reducer=lambda word, rows: (word, sum(row.count for row in rows)))
    a = a.maxN(column='count', n=10)
    a.push(stream=sys.stdout, format='csv')
def test_tuple(self):
    """Check flatMap row fan-out, comparing against ``to_string()``.

    Column ``b`` of the input CSV holds colon-separated values. The
    mapping function returns one copy of the row per piece, built with
    ``row._replace``, and the string returned by ``to_string()`` is
    expected to contain one CSV line per piece.
    """
    source = StringIO("a,b\n1,2:3\n4,5:6\n")
    a = Babe().pull(stream=source, format="csv")
    # One output row per colon-separated piece of ``b``; ``a`` is untouched.
    a = a.flatMap(
        lambda row: [row._replace(b=i) for i in row.b.split(':')])
    # assertEqual rather than the deprecated assertEquals alias, which
    # no longer exists on unittest.TestCase in Python 3.12+.
    self.assertEqual(a.to_string(), "a,b\n1,2\n1,3\n4,5\n4,6\n")