    doublequote = False
    escapechar = '\\'
    quoting = csv.QUOTE_MINIMAL
    quotechar = '"'


def log(stream, logfile=None):
    if not logfile:
        logstream = sys.stderr
        do_close = False
    elif isinstance(logfile, basestring):
        logstream = open(logfile, 'wb')
        do_close = True
    else:
        logstream = logfile
        do_close = False
    for row in stream:
        if isinstance(row, StreamHeader):
            writer = csv.writer(logstream, log_dialect)
            writer.writerow(row.fields)
        elif isinstance(row, StreamMeta):
            pass
        else:
            writer.writerow(list(row))
        yield row
    if do_close:
        logstream.close()

BabeBase.register("log", log)
            if k == pk:
                reducer.row(elt)
            else:
                if pk is not None:
                    eg = reducer.end_group(metainfo.t)
                    if isinstance(eg, list):
                        for e in eg:
                            yield e
                    else:
                        yield eg
                pk = k
                reducer.begin_group(k)
                reducer.row(elt)

BabeBase.register('groupBy', group)
BabeBase.register('group', group)


def group_all(stream, reducer, typename=None, fields=None):
    """Group all keys.

    'reducer' can either be a function or a reducer object:
    - if a function, reducer(t, rows) is called with all the rows as a parameter;
    - if an object, reducer.begin_group(), reducer.row() and reducer.end_group() are called.
    """
    reducer = build_reducer(reducer)
    reducer.begin_group(None)
    for elt in stream:
        if isinstance(elt, StreamHeader):
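
# The docstring above names two reducer flavours. A minimal sketch of the
# object flavour under the protocol it describes (the class name, the summed
# 'value' field and the two-field result tuple are illustrative assumptions,
# not library code):

class SumReducer(object):
    """Accumulate a per-group sum of one attribute."""
    def begin_group(self, key):
        self.key = key
        self.total = 0

    def row(self, row):
        self.total += row.value            # assumes rows expose a 'value' field

    def end_group(self, t):
        # t is the group's namedtuple factory, as passed by group()/group_all()
        return t(self.key, self.total)     # assumes t takes exactly (key, total)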
""" Deduplicate a stream If columns is specified only apply the deduplication on the specified columns Otherwise apply the deduplication over all values. """ for row in stream: if isinstance(row, StreamHeader): metainfo = row if fields: indexes = [metainfo.fields.index(c) for c in fields] else: indexes = None s = set() yield row elif isinstance(row, StreamMeta): yield row else: if indexes: l = list(row) v = tuple([l[i] for i in indexes]) else: v = row if v in s: pass else: yield row s.add(v) BabeBase.register('dedup', dedup)
def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """Pull objects from mongo as rows."""
    k = kwargs.copy()
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(**dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        yield StreamFooter()

BabeBase.registerFinalMethod("push_mongo", push_mongo)
BabeBase.register("pull_mongo", pull_mongo)
                fields = add_fields
            else:
                fields = [field for field in join_header.fields if field != join_key]
            header = row.insert(typename=None, fields=fields)
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            k = getattr(row, key)
            if k in d:
                dd = row._asdict()
                jrow = d[k]
                for field in fields:
                    dd[field] = getattr(jrow, field)
                yield header.t(**dd)
            else:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("join", row, "No matching value for key")
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise Exception("No matching value for key %s" % k)
                elif on_error == BabeBase.ON_ERROR_NONE:
                    dd = row._asdict()
                    for f in fields:
                        dd[f] = None
                    yield header.t(**dd)
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass

BabeBase.register("join", join)
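
# The lookup at the heart of join() above, without the stream protocol: the
# joined stream is indexed by its key, then each main row is widened with the
# matching row's fields (all names below are made up for illustration):

users = {"u1": {"country": "FR"}, "u2": {"country": "US"}}     # d: key -> joined row
events = [{"user": "u1", "page": "/home"}, {"user": "u3", "page": "/buy"}]
for event in events:
    match = users.get(event["user"])
    event["country"] = match["country"] if match else None     # ON_ERROR_NONE behaviour
    print(event)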
                p = Popen(c, stdin=PIPE, stdout=None, stderr=None)
                tmpfifo = TempFifo()
                import_query = db_params['import_query'] % (tmpfifo.filename, table_name)
                p.stdin.write(import_query)
                p.stdin.flush()
                writestream = tmpfifo.open_write()
            elif 'load_command' in db_params:
                load_command = [Template(s).substitute(table=table_name, database=database)
                                for s in db_params['load_command']]
                print load_command
                pp = Popen(load_command, stdin=PIPE, stdout=None, stderr=None)
                writestream = pp.stdin
            else:
                raise Exception("Missing load_command or import_query in db_kind spec")
            writer = UnicodeCSVWriter(writestream, dialect=sql_dialect(), encoding="utf-8")
            # writer = csv.writer(writestream, dialect=sql_dialect())
        elif isinstance(row, StreamFooter):
            if "import_query" in db_params:
                tmpfifo.close()
                p.stdin.close()
                p.wait()
            elif 'load_command' in db_params:
                pp.stdin.close()
                pp.wait()
        else:
            writer.writerow(row)

BabeBase.register('pull_sql', pull_sql)
BabeBase.registerFinalMethod('push_sql', push_sql)
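
# The load_command entries above are string.Template patterns: $table and
# $database are substituted before the process is spawned. Stdlib illustration
# with a made-up command line:

from string import Template

load_command = ["mysql", "$database", "-e",
                "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE $table"]
print([Template(s).substitute(table="events", database="analytics")
       for s in load_command])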
        page_token = response.get('pageToken', None)
        query_complete = response.get('jobComplete', False)
        if query_complete:
            if not metainfo:
                fields = [f['name'] for f in response['schema']['fields']]
                typename = kwargs.get('typename', 'BigQuery')
                metainfo = StreamHeader(**dict(kwargs, typename=typename, fields=fields))
                yield metainfo
            for row in response['rows']:
                yield metainfo.t(*[field['v'] for field in row['f']])
            if page_token is None:
                # The query is done and there are no more results to read.
                yield StreamFooter()
                break
        response = bigquery.jobs().getQueryResults(
            pageToken=page_token, timeoutMs=timeout, **job_ref
        ).execute(num_retries=num_retries)

BabeBase.register('pull_bigquery', pull_bigquery)
BabeBase.registerFinalMethod('push_bigquery', push_bigquery)
    header = None
    footer = None
    for row in stream:
        if isinstance(row, StreamHeader):
            if header is None:
                header = row.replace(partition=partition)
                yield header
            else:
                if not equals_types(header.t, row.t):
                    raise Exception('Header types do not match')
        elif isinstance(row, StreamFooter):
            footer = row
        else:
            yield row
    if footer:
        yield footer

BabeBase.register('merge_substreams', merge_substreams)


def partition(stream, field):
    """Create one substream per distinct value of 'field'."""
    beginning = False
    last_value = None
    header = None
    for row in stream:
        if isinstance(row, StreamHeader):
            beginning = True
            header = row
        elif isinstance(row, StreamFooter):
            if beginning:
                beginning = False
                continue  # Empty partition: emit neither header nor footer
            yield row
def do_detect(s):
    global http_detect
    if not http_detect:
        from httpagentparser import detect
        http_detect = detect
    return http_detect(s)


def user_agent(stream, field, output_os=None, output_browser=None,
               output_browser_version=None):
    for row in stream:
        if isinstance(row, StreamHeader):
            header = row.insert(typename=None,
                                fields=filter(lambda x: x is not None,
                                              [output_os, output_browser, output_browser_version]))
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            useragent = getattr(row, field)
            o = do_detect(useragent)
            d = []
            if output_os:
                # On a device such as an "iPad", the OS is in "flavor" and the device name in "dist"
                d.append(o['os']['name'] if 'os' in o else o['dist']['name'] if 'dist' in o else None)
            if output_browser:
                d.append(o['browser']['name'] if 'browser' in o else None)
            if output_browser_version:
                d.append(o['browser']['version'] if 'browser' in o and 'version' in o['browser'] else None)
            yield header.t(*(row + tuple(d)))

BabeBase.register("user_agent", user_agent)
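
# The extraction above only relies on the 'os'/'dist'/'browser' keys of the
# dict returned by httpagentparser.detect(). With a hand-written dict of that
# shape (not a real detect() call), the same conditional lookups give:

o = {'os': {'name': 'Linux'}, 'browser': {'name': 'Firefox', 'version': '45.0'}}
print(o['os']['name'] if 'os' in o else o['dist']['name'] if 'dist' in o else None)       # Linux
print(o['browser']['name'] if 'browser' in o else None)                                   # Firefox
print(o['browser']['version'] if 'browser' in o and 'version' in o['browser'] else None)  # 45.0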
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
    else:
        api = tweepy.API()

    # If the authentication was successful, you should see the name of the
    # account printed out:
    # print api.me().name
    # If the application settings are set for "Read and Write" then this line
    # should tweet out the message to your account's timeline. The
    # "Read and Write" setting is on https://dev.twitter.com/apps
    # api.update_status('Updating using OAuth authentication via Tweepy!')

    metainfo = None
    if consumer_key:
        statuses = api.user_timeline(include_entities=True)
    else:
        statuses = api.public_timeline(include_entities=True)
    for u in statuses:
        flatten_status(u)
        if not metainfo:
            names = build_status_names(u)
            metainfo = StreamHeader(typename="Status", fields=names)
            yield metainfo
        u.__class__.__iter__ = lambda s: iter([getattr(s, key) for key in names])
        yield u
    yield StreamFooter()

BabeBase.register('pull_twitter', pull_twitter)
        if isinstance(row, StreamHeader):
            header = row
        elif isinstance(row, StreamFooter):
            # The output header is: group fields + (other fields * each pivot value)
            other_fields = [f for f in header.fields if f not in group and f != pivot]
            other_fields_k = map(StreamHeader.keynormalize, other_fields)
            fields = group + [f + "-" + str(v) for v in pivot_values.list for f in other_fields]
            newheader = header.replace(fields=fields)
            yield newheader
            for _, row_dict in groups.iteritems():
                # Create a line per group
                mrow = row_dict.itervalues().next()
                group_cols = [getattr(mrow, col) for col in group_n]
                for v in pivot_values:
                    if v in row_dict:
                        mrow = row_dict[v]
                        group_cols.extend([getattr(mrow, col) for col in other_fields_k])
                    else:
                        group_cols.extend([None for col in other_fields])
                yield group_cols
            yield row
        else:
            kgroup = ""
            for f in group_n:
                kgroup = kgroup + str(getattr(row, f))
            groups[kgroup][getattr(row, pivot)] = row
            pivot_values.add(getattr(row, pivot))

BabeBase.register("pivot", pivot)
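
# Sketch of the header transformation performed by pivot() above, on made-up
# data with group=['country'], pivot='month' and one other field 'visits':
#
#   input fields : country, month, visits
#   pivot values : jan, feb
#   output fields: country, visits-jan, visits-feb
#
# Each output line carries one group's values, with None wherever a pivot
# value never occurred for that group.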
from collections import *


class Bunch:
    def __init__(self, dictionary):
        self.__dict__ = dictionary


def iterate(stream, function, insert_fields=None, typename=None):
    metainfo = None
    for row in stream:
        if isinstance(row, StreamHeader):
            metainfo = row
            if insert_fields is not None:
                metainfo = metainfo.insert(typename=typename, fields=insert_fields)
            yield metainfo
        elif isinstance(row, StreamMeta):
            yield row
        else:
            d = row._asdict()
            # values = tuple(row)
            if insert_fields is not None:
                for field in insert_fields:
                    d[field] = None
            # values = metainfo.t._make(values)
            result = function(Bunch(d))
            yield metainfo.t._make(d.values())
            # yield metainfo.t._make([result.__dict__[key] for key in metainfo.t._fields])

BabeBase.register("iterate", iterate)
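
# Bunch simply exposes a dict's keys as attributes, so the function handed to
# iterate() can read and mutate fields naturally; the wrapped dict is updated
# in place, which is why iterate() can rebuild the row from d.values().
# Standalone example with made-up fields:

d = {"price": 10, "quantity": 3, "total": None}
b = Bunch(d)
b.total = b.price * b.quantity
print(d["total"])   # 30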
def geoip_country_code(stream, field="ip", country_code="country_code",
                       ignore_error=False, geoip_file=None):
    """Add a 'country_code' field computed from the IP address in 'field'."""
    gic = get_gic()
    for r in stream:
        if isinstance(r, StreamHeader):
            header = r.insert(typename=None, fields=[country_code])
            yield header
        elif isinstance(r, StreamMeta):
            yield r
        else:
            ip = getattr(r, field)
            try:
                cc = gic.country_code_by_addr(ip)
            except Exception as e:
                if ignore_error:
                    cc = None
                else:
                    raise e
            yield header.t(*(r + (cc,)))

# TODO: full region parsing
BabeBase.register("geoip_country_code", geoip_country_code)
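
# The row-widening idiom used above: the incoming namedtuple is concatenated
# with the new value and rebuilt through the widened header's factory. Stdlib
# illustration with plain namedtuples:

from collections import namedtuple

Row = namedtuple('Row', ['ip'])
Wide = namedtuple('Wide', ['ip', 'country_code'])
r = Row('8.8.8.8')
print(Wide(*(r + ('US',))))   # Wide(ip='8.8.8.8', country_code='US')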
            return False
        else:
            return True


def minmaxN(stream, column, n, max=True):
    "Keep the n rows maximizing the value of 'column', for each substream"
    itt = iter(stream)
    while True:
        elt = itt.next()
        if not isinstance(elt, StreamHeader):
            raise Exception("Missing metainfo")
        yield elt
        g = Guard()
        it = itertools.takewhile(g.filter, itt)
        f = heapq.nlargest if max else heapq.nsmallest
        for elt in f(n, it, key=lambda row: getattr(row, column)):
            yield elt
        yield g.footer


def maxN(stream, column, n):
    for k in minmaxN(stream, column, n, max=True):
        yield k


def minN(stream, column, n):
    for k in minmaxN(stream, column, n, max=False):
        yield k

BabeBase.register('maxN', maxN)
BabeBase.register('minN', minN)
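
# minmaxN() leans on heapq.nlargest / heapq.nsmallest with a key function; the
# same call on plain namedtuples, for reference:

import heapq
from collections import namedtuple

Hit = namedtuple('Hit', ['url', 'count'])
rows = [Hit('/a', 3), Hit('/b', 10), Hit('/c', 7)]
print(heapq.nlargest(2, rows, key=lambda row: row.count))   # /b (10) then /c (7)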
        for row in stream:
            if isinstance(row, StreamHeader):
                metainfo = row.augment(typename=typename, fields=[])
                yield metainfo
            elif isinstance(row, StreamMeta):
                yield row
            else:
                yield metainfo.t._make(list(function(row)))
    else:
        for row in stream:
            if isinstance(row, StreamMeta):
                yield row
            else:
                yield function(row)

BabeBase.register("mapTo", mapTo)


def bulkMapTo(stream, function, bulk_size, insert_fields=None, fields=None):
    header = None
    buf = []
    for row in stream:
        if isinstance(row, StreamHeader):
            if insert_fields:
                header = row.insert(typename=None, fields=insert_fields)
            elif fields:
                header = row.insert(typename=None, fields=fields)
            else:
                header = row
            yield header
        elif isinstance(row, StreamFooter) or len(buf) == bulk_size - 1:
            if not isinstance(row, StreamFooter):
            d[t] = int(v)
        elif g.group('float'):
            d[t] = float(v)
        else:
            try:
                d[t] = parse_datetime(v)
            except ValueError:
                try:
                    d[t] = parse_date(v)
                except ValueError:
                    pass
    if len(d) > 0:
        return elt._replace(**d)
    else:
        return elt

BabeBase.register("typedetect", typedetect)


def primary_key_detect(stream, max=None):
    d = deque()
    it = iter(stream)
    for linecount, row in enumerate(it):
        d.append(row)
        if isinstance(row, StreamHeader):
            metainfo = row
            values = [set() for k in metainfo.fields]
            keys = set(xrange(0, len(metainfo.fields)))
        elif isinstance(row, StreamMeta):
            pass
        else:
            for idx, val in enumerate(row):
import itertools


def sort(stream, field, reverse=False):
    buf = []
    for elt in stream:
        if isinstance(elt, StreamHeader):
            yield elt
        elif isinstance(elt, StreamFooter):
            buf.sort(key=lambda obj: getattr(obj, field), reverse=reverse)
            for row in buf:
                yield row
            yield elt
        else:
            buf.append(elt)

BabeBase.register('sort', sort)


def sort_diskbased(stream, field, nsize=100000):
    buf = []
    files = []
    count = 0
    t = None

    def iter_on_file(f):
        try:
            while True:
                (key, v) = cPickle.load(f)
                yield (key, t._make(v))
        except EOFError:
            f.close()

    for elt in stream:
        if isinstance(elt, StreamHeader):
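
# sort_diskbased() above spills sorted runs to temporary files with cPickle and
# reads them back through iter_on_file(). Finishing such a sort means k-way
# merging the per-file iterators; a stdlib-only sketch of that merge step
# (in-memory lists stand in for the pickled runs, and the library's own merge
# code is not shown in this excerpt):

import heapq

runs = [[(1, 'a'), (4, 'd')], [(2, 'b'), (3, 'c')]]   # each run already sorted by key
for key, value in heapq.merge(*runs):
    print((key, value))                               # (1,'a') (2,'b') (3,'c') (4,'d')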
                if output_date:
                    date = datetime.date(time_value_ext.year, time_value_ext.month,
                                         time_value_ext.day)
                    d[output_date] = date
                if output_hour:
                    d[output_hour] = time_value_ext.hour
                yield header.t(**d)
            except Exception as e:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("parse_time", row, e)
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise e
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass
                elif on_error == BabeBase.ON_ERROR_NONE:
                    d = row._asdict()
                    for k in [output_time, output_date, output_hour]:
                        if k:
                            d[k] = None
                    yield header.t(**d)

BabeBase.register("parse_time", stream_parse_datetime)

if __name__ == "__main__":
    print(parse_date('2011/04/01'))
    print(parse_date('01 June 2009'))
    print(parse_datetime('2011/04/01 03:43'))
    print(parse_datetime('2011/04/01 3pm45'))