def get_bucket(kwargs):
    from boto.s3.connection import S3Connection
    key_id = BabeBase.get_config_with_env('s3', 'AWS_ACCESS_KEY_ID', kwargs)
    access_key = BabeBase.get_config_with_env('s3', 'AWS_SECRET_ACCESS_KEY',
                                              kwargs)
    conn = S3Connection(key_id, access_key)
    bucket = conn.get_bucket(kwargs['bucket'])
    return bucket
def pull(filename_remote, **kwargs):
    bucket = get_bucket(kwargs)
    cache = BabeBase.get_config("s3", "cache", default=False)
    fail_on_empty = kwargs.get("fail_on_empty", True)
    if cache:
        default_cache_dir = "/tmp/pybabe-s3-cache-%s" % os.getenv('USER')
        cache_dir = BabeBase.get_config("s3", "cache_dir",
                                        default=default_cache_dir)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
    keys = get_keys(bucket, filename_remote, fail_on_empty=fail_on_empty)
    files = []
    for key in keys:
        logging.info("S3 Load: %s", key)
        if cache:
            # Cache locally under basename + ETag, so an unchanged S3
            # object is only downloaded once.
            f = os.path.join(cache_dir, os.path.basename(key.name) + "-"
                             + key.etag.replace('"', ''))
            if os.path.exists(f):
                files.append(open(f, "r"))
            else:
                # Download to a temporary name, then rename, so a partial
                # download never pollutes the cache.
                key.get_contents_to_filename(f + ".tmp", cb=progress_call_back)
                os.rename(f + ".tmp", f)
                files.append(open(f, "r"))
        else:
            files.append(ReadLineWrapper(key))
    return files
def __init__(self, **kwargs):
    # Default size limit: 5 GB (1 << 20 is 1MB, 1 << 30 is 1GB).
    self.size_limit = kwargs.get('size_limit', 5 << 30)
    self.cache_directories = []
    cache = BabeBase.get_config("s3", "cache", default=False)
    if cache:
        default_cache_dir = "/tmp/pybabe-s3-cache-%s" % os.getenv('USER')
        cache_dir = BabeBase.get_config("s3", "cache_dir",
                                        default=default_cache_dir)
        self.cache_directories.append(cache_dir)
    self.cache_directories.append(BabeBase.get_config_with_env(
        section='kontagent', key='KT_FILECACHE',
        default='/tmp/kontagent-cache'))
def stream_parse_datetime(stream, field, input_timezone, output_timezone,
                          output_date=None, output_time=None,
                          output_hour=None, on_error=BabeBase.ON_ERROR_WARN):
    input_tz = timezone(input_timezone)
    output_tz = timezone(output_timezone)
    header = None
    for row in stream:
        if isinstance(row, StreamHeader):
            added_fields = [f for f in [output_time, output_date, output_hour]
                            if f and f not in row.fields]
            if added_fields:
                header = row.insert(None, added_fields)
            else:
                header = row
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            try:
                time_value = input_tz.localize(
                    parse_datetime(getattr(row, field)))
                time_value_ext = time_value.astimezone(output_tz)
                d = row._asdict()
                if output_time:
                    d[output_time] = time_value_ext
                if output_date:
                    date = datetime.date(time_value_ext.year,
                                         time_value_ext.month,
                                         time_value_ext.day)
                    d[output_date] = date
                if output_hour:
                    d[output_hour] = time_value_ext.hour
                yield header.t(**d)
            except Exception as e:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("parse_time", row, e)
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise e
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass
                elif on_error == BabeBase.ON_ERROR_NONE:
                    d = row._asdict()
                    for k in [output_time, output_date, output_hour]:
                        if k:
                            d[k] = None
                    yield header.t(**d)
def join(stream, join_stream, key, join_key,
         add_fields=None, on_error=BabeBase.ON_ERROR_WARN):
    # Build an in-memory index of the join stream, keeping the first row
    # seen for each key value.
    d = {}
    join_header = None
    for row in join_stream:
        if isinstance(row, StreamHeader):
            join_header = row
        elif isinstance(row, StreamFooter):
            break
        else:
            k = getattr(row, join_key)
            if k not in d:
                d[k] = row
    for row in stream:
        if isinstance(row, StreamHeader):
            if add_fields:
                fields = add_fields
            else:
                fields = [field for field in join_header.fields
                          if field != join_key]
            header = row.insert(typename=None, fields=fields)
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            k = getattr(row, key)
            if k in d:
                dd = row._asdict()
                jrow = d[k]
                for field in fields:
                    dd[field] = getattr(jrow, field)
                yield header.t(**dd)
            else:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("join", row, "No matching value for key")
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise Exception("No matching value for key %s" % k)
                elif on_error == BabeBase.ON_ERROR_NONE:
                    dd = row._asdict()
                    for f in fields:
                        dd[f] = None
                    yield header.t(**dd)
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass
def get_gic():
    global gic
    if gic is None:
        # Look for a GeoIP database in the usual install locations.
        if os.path.exists('/usr/share/GeoIP/GeoIP.dat'):
            default = "/usr/share/GeoIP/GeoIP.dat"
        elif os.path.exists("/usr/local/share/GeoIP/GeoLiteCity.dat"):
            default = "/usr/local/share/GeoIP/GeoLiteCity.dat"
        elif os.path.exists("/usr/local/var/lib/GeoLiteCity.dat"):
            default = "/usr/local/var/lib/GeoLiteCity.dat"
        else:
            default = None
        filename = BabeBase.get_config_with_env('geoip', 'GEOIP_FILE', {},
                                                default)
        from pygeoip import GeoIP
        gic = GeoIP(filename)
    return gic
            return False
        else:
            return True
BabeBase.register("join", join)
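# Hypothetical usage sketch (not from the source): BabeBase.register is
# assumed to expose 'join' as a chainable method on Babe pipelines; the
# file and field names below are illustrative only.
from pybabe import Babe

b = Babe()
orders = b.pull(filename='orders.csv')
users = b.pull(filename='users.csv')
# Attach every users.csv field except 'id' to each order row, matching
# orders.user_id against users.id; unmatched rows warn by default.
orders.join(users, 'user_id', 'id').push(filename='orders_enriched.csv')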
BabeBase.register("parse_time", stream_parse_datetime)

if __name__ == "__main__":
    print(parse_date('2011/04/01'))
    print(parse_date('01 June 2009'))
    print(parse_datetime('2011/04/01 03:43'))
    print(parse_datetime('2011/04/01 3pm45'))
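# Hypothetical usage sketch (not from the source): converts a
# 'created_at' column from UTC to US/Pacific and derives 'day' and
# 'hour' columns; the file and field names are illustrative only.
from pybabe import Babe

b = Babe()
b.pull(filename='events.csv') \
 .parse_time('created_at', 'UTC', 'US/Pacific',
             output_date='day', output_hour='hour') \
 .push(filename='events_local.csv')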
from base import StreamHeader, BabeBase, StreamFooter


def valuenormalize(cell):
    return cell.value


def read(format, stream, kwargs):
    import xlrd
    wb = xlrd.open_workbook(file_contents=stream.read(),
                            encoding_override=kwargs.get('encoding', None))
    ws = wb.sheet_by_index(0)
    nrows = ws.nrows
    fields = kwargs.get('fields', None)
    if not fields:
        b = 1
        fields = [cell.value for cell in ws.row(0)]
    else:
        b = 0
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for i in xrange(b, nrows):
        cells = ws.row(i)
        yield metainfo.t._make(map(valuenormalize, cells))
    yield StreamFooter()

BabeBase.addPullPlugin('xls', ['xls'], read, need_seek=False)
def compress(compress_outstream, inputfile_filename, inarchive_filename):
    f = open(compress_outstream, 'w')
    p = Popen(['gzip', '-c', inputfile_filename], stdout=f)
    p.communicate()
    f.close()


def get_content_list(compress_instream, filename):
    if not hasattr(compress_instream, 'fileno'):
        # gzip needs a real file descriptor: spool the stream to a
        # temporary file first.
        tf = tempfile.NamedTemporaryFile()
        tf.write(compress_instream.read())
        tf.flush()
        p = Popen(['gzip', '-d', '-c', tf.name], stdin=None, stdout=PIPE)
    else:
        tf = None
        p = Popen(['gzip', '-d', '-c'], stdin=compress_instream, stdout=PIPE)
    f = os.path.splitext(os.path.basename(filename))[0] if filename else None
    return ((tf, p.stdout), [f])


def uncompress(handle, name):
    return handle[1]

BabeBase.addCompressPushPlugin('gz', ['gz'], compress)
BabeBase.addCompressPullPlugin('gz', ['gz'], get_content_list, uncompress,
                               need_seek=False)
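# Hypothetical usage sketch (not from the source): once the 'gz' plugins
# are registered, a '.csv.gz' file is transparently decompressed on pull
# and recompressed on push; file names are illustrative only.
from pybabe import Babe

b = Babe()
b.pull(filename='data.csv.gz').push(filename='copy.csv.gz')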
def build_host(kwargs):
    host = kwargs['host']
    if 'port' in kwargs:
        host = host + ':' + str(kwargs['port'])
    if 'user' in kwargs:
        host = kwargs['user'] + ':' + kwargs['password'] + '@' + host
    return host


def push(filename_topush, filename_remote, **kwargs):
    host = build_host(kwargs)
    f = open(filename_topush, 'rb')
    urllib2.urlopen(url='%s://%s/%s' % (kwargs['protocol'], host,
                                        filename_remote), data=f)
    f.close()


def pull(filename_remote, **kwargs):
    host = build_host(kwargs)
    url = '%s://%s/%s' % (kwargs['protocol'], host, filename_remote)
    return urllib2.urlopen(url)

BabeBase.addProtocolPushPlugin('http', push, None)
BabeBase.addProtocolPullPlugin('http', pull)
BabeBase.addProtocolPushPlugin('https', push, None)
BabeBase.addProtocolPullPlugin('https', pull)
                    d[t] = int(v)
                elif g.group('float'):
                    d[t] = float(v)
                else:
                    try:
                        d[t] = parse_datetime(v)
                    except ValueError:
                        try:
                            d[t] = parse_date(v)
                        except ValueError:
                            pass
        if len(d) > 0:
            return elt._replace(**d)
        else:
            return elt

BabeBase.register("typedetect", typedetect)


def primary_key_detect(stream, max=None):
    d = deque()
    it = iter(stream)
    for linecount, row in enumerate(it):
        d.append(row)
        if isinstance(row, StreamHeader):
            metainfo = row
            values = [set() for k in metainfo.fields]
            keys = set(xrange(0, len(metainfo.fields)))
        elif isinstance(row, StreamMeta):
            pass
        else:
            for idx, val in enumerate(row):
        for row in stream:
            if isinstance(row, StreamHeader):
                metainfo = row.augment(typename=typename, fields=[])
                yield metainfo
            elif isinstance(row, StreamMeta):
                yield row
            else:
                yield metainfo.t._make(list(function(row)))
    else:
        for row in stream:
            if isinstance(row, StreamMeta):
                yield row
            else:
                yield function(row)

BabeBase.register("mapTo", mapTo)


def bulkMapTo(stream, function, bulk_size, insert_fields=None, fields=None):
    header = None
    buf = []
    for row in stream:
        if isinstance(row, StreamHeader):
            if insert_fields:
                header = row.insert(typename=None, fields=insert_fields)
            elif fields:
                header = row.insert(typename=None, fields=fields)
            else:
                header = row
            yield header
        elif isinstance(row, StreamFooter) or len(buf) == bulk_size - 1:
            if not isinstance(row, StreamFooter):
import itertools


def sort(stream, field, reverse=False):
    buf = []
    for elt in stream:
        if isinstance(elt, StreamHeader):
            yield elt
        elif isinstance(elt, StreamFooter):
            buf.sort(key=lambda obj: getattr(obj, field), reverse=reverse)
            for row in buf:
                yield row
            yield elt
        else:
            buf.append(elt)

BabeBase.register('sort', sort)


def sort_diskbased(stream, field, nsize=100000):
    buf = []
    files = []
    count = 0
    t = None

    def iter_on_file(f):
        try:
            while True:
                (key, v) = cPickle.load(f)
                yield (key, t._make(v))
        except EOFError:
            f.close()

    for elt in stream:
        if isinstance(elt, StreamHeader):
    service = get_storage()
    req = service.objects().get(
        bucket=kwargs['bucket'], object=filename_remote)
    try:
        req.execute()
        return True
    except HttpError as e:
        if e.resp.status == 404:
            return False
        else:
            raise


def pull(filename_remote, **kwargs):
    from StringIO import StringIO
    service = get_storage()
    logging.info("GS Load: %s", filename_remote)
    # objects().get() only returns metadata; get_media() downloads the
    # object content itself.
    req = service.objects().get_media(
        object=filename_remote, bucket=kwargs['bucket'])
    return [StringIO(req.execute())]

BabeBase.addProtocolPushPlugin('gs', push, None, check_exists)
BabeBase.addProtocolPullPlugin('gs', pull)
def write(format, header, instream, outfile, encoding, **kwargs):
    if not encoding:
        encoding = "utf-8"
    outfile.write("<h2>")
    outfile.write(header.get_stream_name())
    outfile.write("</h2>")
    if header.description:
        outfile.write("<p><i>")
        outfile.write(header.description)
        outfile.write("</i></p>")
    outfile.write('<table>\n<tr>')
    for field in header.fields:
        outfile.write("<th>")
        outfile.write(write_value(field, encoding))
        outfile.write("</th>")
    outfile.write("</tr>\n")
    for row in instream:
        if isinstance(row, StreamFooter):
            outfile.write("</table>\n")
            break
        else:
            outfile.write("<tr>")
            for cell in row:
                outfile.write("<td>")
                outfile.write(write_value(cell, encoding))
                outfile.write("</td>")
            outfile.write("</tr>\n")

BabeBase.addPushPlugin('html', ['html', 'htm'], write)
            if k == pk:
                reducer.row(elt)
            else:
                if pk is not None:
                    eg = reducer.end_group(metainfo.t)
                    if isinstance(eg, list):
                        for e in eg:
                            yield e
                    else:
                        yield eg
                pk = k
                reducer.begin_group(k)
                reducer.row(elt)

BabeBase.register('groupBy', group)
BabeBase.register('group', group)


def group_all(stream, reducer, typename=None, fields=None):
    """Group all keys.

    reducer can either be a function or a reducer object:
    - if a function, reducer(t, rows) is called with all the rows as a
      parameter;
    - if an object, reducer.begin_group(), reducer.row() and
      reducer.end_group() are called.
    """
    reducer = build_reducer(reducer)
    reducer.begin_group(None)
    for elt in stream:
        if isinstance(elt, StreamHeader):
    fields = kwargs['fields']
    table = kwargs['table']
    header = StreamHeader(fields=fields, table=table)
    yield header
    prefix = "INSERT INTO `%s` VALUES " % table
    try:
        for line in stream:
            # Only parse INSERT statements for the requested table.
            if not line.startswith(prefix):
                continue
            pos = len(prefix)
            while pos < len(line):
                (elts, pos) = parse_tuple(pos, line)
                yield header.t(*elts)
                if line[pos] == ',':
                    pos = pos + 1
                    continue
                elif line[pos] == ';':
                    break
                else:
                    raise Exception("ParseError pos %u " % pos)
    except TypeError as e:
        print(len(elts), elts)
        raise e
    yield StreamFooter()

BabeBase.addPullPlugin("sql", ["sql"], pull)

if __name__ == "__main__":
    for line in sys.stdin:
        print(parse_tuple(0, line))
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
    else:
        api = tweepy.API()
    # If the authentication was successful, api.me().name is the name of
    # the account. With "Read and Write" application settings
    # (https://dev.twitter.com/apps), api.update_status(...) would tweet
    # to the account's timeline.
    metainfo = None
    if consumer_key:
        statuses = api.user_timeline(include_entities=True)
    else:
        statuses = api.public_timeline(include_entities=True)
    for u in statuses:
        flatten_status(u)
        if not metainfo:
            names = build_status_names(u)
            metainfo = StreamHeader(typename="Status", fields=names)
            yield metainfo
            # Make Status objects iterable in field order so they can be
            # yielded as rows.
            u.__class__.__iter__ = lambda s: iter(
                [getattr(s, key) for key in names])
        yield u
    yield StreamFooter()

BabeBase.register('pull_twitter', pull_twitter)
def geoip_country_code(stream, field="ip", country_code="country_code",
                       ignore_error=False, geoip_file=None):
    """Add a 'country_code' field from the IP address in field 'field'."""
    gic = get_gic()
    for r in stream:
        if isinstance(r, StreamHeader):
            header = r.insert(typename=None, fields=[country_code])
            yield header
        elif isinstance(r, StreamMeta):
            yield r
        else:
            ip = getattr(r, field)
            try:
                cc = gic.country_code_by_addr(ip)
            except Exception as e:
                if ignore_error:
                    cc = None
                else:
                    raise e
            yield header.t(*(r + (cc,)))

# TODO: full region parsing
BabeBase.register("geoip_country_code", geoip_country_code)
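# Hypothetical usage sketch (not from the source): adds a country-code
# column derived from an 'ip' column, ignoring unresolvable addresses;
# file names are illustrative only.
from pybabe import Babe

b = Babe()
b.pull(filename='access_log.csv') \
 .geoip_country_code(field='ip', ignore_error=True) \
 .push(filename='access_log_geo.csv')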
from collections import *


class Bunch:
    def __init__(self, dictionary):
        self.__dict__ = dictionary


def iterate(stream, function, insert_fields=None, typename=None):
    metainfo = None
    for row in stream:
        if isinstance(row, StreamHeader):
            metainfo = row
            if insert_fields is not None:
                metainfo = metainfo.insert(typename=typename,
                                           fields=insert_fields)
            yield metainfo
        elif isinstance(row, StreamMeta):
            yield row
        else:
            d = row._asdict()
            if insert_fields is not None:
                for field in insert_fields:
                    d[field] = None
            # The function mutates the Bunch in place; its __dict__ is the
            # same ordered dict that backs the output row.
            function(Bunch(d))
            yield metainfo.t._make(d.values())

BabeBase.register("iterate", iterate)
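# Hypothetical usage sketch (not from the source): the callback gets each
# row as a mutable Bunch, and assigning to a field listed in
# insert_fields fills the None placeholder; names are illustrative only.
from pybabe import Babe


def add_total(row):
    row.total = row.price * row.quantity

b = Babe()
b.pull(filename='lines.csv').typedetect() \
 .iterate(add_total, insert_fields=['total']) \
 .push(filename='lines_with_total.csv')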
    it = ws.iter_rows()
    fields = kwargs.get('fields', None)
    if not fields:
        fields = [cell.internal_value for cell in it.next()]
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for row in it:
        # Stop at the first fully empty row.
        nrow = map(valuenormalize, row)
        if not any(nrow):
            break
        yield metainfo.t._make(nrow)
    yield StreamFooter()


def write(format, metainfo, instream, outfile, encoding, **kwargs):
    from openpyxl import Workbook
    wb = Workbook(optimized_write=True)
    ws = wb.create_sheet()
    ws.append(metainfo.fields)
    for k in instream:
        if isinstance(k, StreamFooter):
            break
        else:
            ws.append(list(k))
    wb.save(outfile)

BabeBase.addPullPlugin('xlsx', ['xlsx'], read, need_seek=True)
BabeBase.addPushPlugin('xlsx', ['xlsx'], write)
    delimiter = ','
    doublequote = False
    escapechar = '\\'
    quoting = csv.QUOTE_MINIMAL
    quotechar = '"'


def push(format, metainfo, instream, outfile, encoding, delimiter=None,
         **kwargs):
    if not encoding:
        encoding = "utf8"
    dialect = kwargs.get('dialect', default_dialect)
    if delimiter:
        dialect.delimiter = delimiter
    writer = UnicodeCSVWriter(outfile, dialect=dialect, encoding=encoding)
    writer.writerow(metainfo.fields)
    for k in instream:
        if isinstance(k, StreamFooter):
            break
        else:
            writer.writerow(k)

BabeBase.addPullPlugin('csv', ['csv', 'tsv', 'txt'], pull)
BabeBase.addPushPlugin('csv', ['csv', 'tsv', 'txt'], push)
import codecs
from base import StreamHeader, BabeBase, StreamFooter


def pull(format, stream, kwargs):
    stream = codecs.getreader(kwargs.get('encoding', 'utf8'))(stream)
    fields = kwargs.get('fields', ['text'])
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for line in stream:
        yield metainfo.t._make([line])
    yield StreamFooter()


def push(format, metainfo, instream, outfile, encoding, **kwargs):
    outstream = codecs.getwriter(kwargs.get('encoding', 'utf8'))(outfile)
    for row in instream:
        if isinstance(row, StreamFooter):
            break
        else:
            for cell in row:
                outstream.write(cell)
    outstream.flush()

BabeBase.addPullPlugin('txt', ['txt'], pull)
BabeBase.addPushPlugin('txt', ['txt'], push)
from base import BabeBase
import urllib2
import urllib
import json


def pull_buzz(stream, username, dataroom, uuid, **kwargs):
    url = 'https://buzzdata.com/api/%s/%s/%s/download_request' % (
        username, dataroom, uuid)
    if 'api_key' in kwargs:
        api_key = kwargs['api_key']
    elif BabeBase.get_config('buzzdata', 'api_key'):
        api_key = BabeBase.get_config('buzzdata', 'api_key')
    else:
        raise Exception('Missing api_key')
    data = urllib.urlencode([('api_key', api_key)])
    drequest = urllib2.urlopen(url, data).read()
    obj = json.loads(drequest)
    download_url = obj['download_request']['url']
    return urllib2.urlopen(download_url)

BabeBase.addProtocolPullPlugin('buzzdata', pull_buzz)
def do_detect(s):
    global http_detect
    if not http_detect:
        # Lazy import: httpagentparser is only loaded when user_agent is
        # actually used.
        from httpagentparser import detect
        http_detect = detect
    return http_detect(s)


def user_agent(stream, field, output_os=None, output_browser=None,
               output_browser_version=None):
    for row in stream:
        if isinstance(row, StreamHeader):
            header = row.insert(
                typename=None,
                fields=filter(lambda x: x is not None,
                              [output_os, output_browser,
                               output_browser_version]))
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            useragent = getattr(row, field)
            o = do_detect(useragent)
            d = []
            if output_os:
                # On devices such as the iPad, the OS is in "flavor" and
                # the device name is in "dist".
                d.append(o['os']['name'] if 'os' in o
                         else o['dist']['name'] if 'dist' in o else None)
            if output_browser:
                d.append(o['browser']['name'] if 'browser' in o else None)
            if output_browser_version:
                d.append(o['browser']['version']
                         if 'browser' in o and 'version' in o['browser']
                         else None)
            yield header.t(*(row + tuple(d)))

BabeBase.register("user_agent", user_agent)
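# Hypothetical usage sketch (not from the source): splits a raw
# user-agent column into OS, browser and browser-version columns; the
# file and field names are illustrative only.
from pybabe import Babe

b = Babe()
b.pull(filename='hits.csv') \
 .user_agent('ua', output_os='os', output_browser='browser',
             output_browser_version='browser_version') \
 .push(filename='hits_parsed.csv')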
def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """Pull objects from mongo as rows."""
    k = kwargs.copy()
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(**dict(kwargs, typename=typename,
                                           fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        yield StreamFooter()

BabeBase.registerFinalMethod("push_mongo", push_mongo)
BabeBase.register("pull_mongo", pull_mongo)
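# Hypothetical usage sketch (not from the source): pulls the documents
# matching a Mongo query as a row stream; the database, collection and
# query below are illustrative only.
from pybabe import Babe

b = Babe()
b.pull_mongo(db='analytics', collection='events',
             spec={'type': 'click'}) \
 .push(filename='clicks.csv')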
                p = Popen(c, stdin=PIPE, stdout=None, stderr=None)
                tmpfifo = TempFifo()
                import_query = db_params['import_query'] % (tmpfifo.filename,
                                                            table_name)
                p.stdin.write(import_query)
                p.stdin.flush()
                writestream = tmpfifo.open_write()
            elif 'load_command' in db_params:
                load_command = [Template(s).substitute(table=table_name,
                                                       database=database)
                                for s in db_params['load_command']]
                print(load_command)
                pp = Popen(load_command, stdin=PIPE, stdout=None, stderr=None)
                writestream = pp.stdin
            else:
                raise Exception(
                    "Missing load_command or import_query in db_kind spec")
            writer = UnicodeCSVWriter(writestream, dialect=sql_dialect(),
                                      encoding="utf-8")
        elif isinstance(row, StreamFooter):
            if "import_query" in db_params:
                tmpfifo.close()
                p.stdin.close()
                p.wait()
            elif 'load_command' in db_params:
                pp.stdin.close()
                pp.wait()
        else:
            writer.writerow(row)

BabeBase.register('pull_sql', pull_sql)
BabeBase.registerFinalMethod('push_sql', push_sql)
def mail(stream, subject, recipients, in_body=False, in_body_row_limit=None,
         attach_formats="csv", **kwargs):
    """Format a stream into a mail and send it.

    recipients: list of recipient mail addresses
    in_body: format the content (in HTML) in the mail body
    in_body_row_limit: maximum number of rows in the body
    attach_formats: file format(s) to use for attachments
    """
    smtp_server = BabeBase.get_config('smtp', 'server', kwargs)
    smtp_port = BabeBase.get_config('smtp', 'port', kwargs)
    smtp_tls = BabeBase.get_config('smtp', 'tls', kwargs, False)
    smtp_login = BabeBase.get_config('smtp', 'login', kwargs)
    smtp_password = BabeBase.get_config('smtp', 'password', kwargs)
    author = BabeBase.get_config('smtp', 'author', kwargs)
    formats = []
    if in_body:
        formats.append("html")
    if attach_formats:
        if isinstance(attach_formats, basestring):
            formats.append(attach_formats)
        else:
            formats.extend(attach_formats)
    if isinstance(recipients, basestring):
        recipients = [recipients]
    # Duplicate the stream once per output format.
    babes = stream.tee(len(formats))
    if in_body and in_body_row_limit:
        babes[0] = babes[0].head(in_body_row_limit, all_streams=True)
    buffer_dicts = []
    for format, babe in izip(formats, babes):
        d = ordered_dict()
        babe.push(stream_dict=d, format=format)
        buffer_dicts.append((format, d))
    msg = MIMEMultipart()
    msg['Subject'] = subject
    msg['From'] = author
    msg['To'] = ', '.join(recipients)
    for format, d in buffer_dicts:
        if format == "html":
            buf = StringIO()
            buf.write('<html><body>\n')
            for filename in d:
                buf.write(d[filename].getvalue())
                buf.write('\n')
            buf.write('\n</body></html>')
            att = MIMEText(buf.getvalue(), "html")
            msg.attach(att)
        else:
            for filename in d:
                c = d[filename].getvalue()
                (maintype, subtype) = BabeBase.getMimeType(format)
                att = MIMEBase(maintype, subtype)
                att.set_payload(c)
                encoders.encode_base64(att)
                att.add_header('Content-Disposition', 'attachment',
                               filename=filename + "." + format)
                msg.attach(att)
    s = smtplib.SMTP(smtp_server, smtp_port)
    s.ehlo()
    if smtp_tls:
        s.starttls()
        s.ehlo()
    s.login(smtp_login, smtp_password)
    s.sendmail(author, recipients, msg.as_string())
    s.quit()

BabeBase.registerFinalMethod('mail', mail)
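# Hypothetical usage sketch (not from the source): mails a stream both
# inline (first 20 rows rendered as HTML) and as a CSV attachment. SMTP
# settings come from the 'smtp' config section; the addresses and file
# name are illustrative only.
from pybabe import Babe

b = Babe()
b.pull(filename='report.csv').mail(
    subject='Daily report',
    recipients=['team@example.com'],
    in_body=True, in_body_row_limit=20,
    attach_formats='csv')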
    doublequote = False
    escapechar = '\\'
    quoting = csv.QUOTE_MINIMAL
    quotechar = '"'


def log(stream, logfile=None):
    if not logfile:
        logstream = sys.stderr
        do_close = False
    elif isinstance(logfile, basestring):
        logstream = open(logfile, 'wb')
        do_close = True
    else:
        logstream = logfile
        do_close = False
    for row in stream:
        if isinstance(row, StreamHeader):
            writer = csv.writer(logstream, log_dialect)
            writer.writerow(row.fields)
        elif isinstance(row, StreamMeta):
            pass
        else:
            writer.writerow(list(row))
        yield row
    if do_close:
        logstream.close()

BabeBase.register("log", log)
        if isinstance(row, StreamHeader):
            if header is None:
                header = row.replace(partition=partition)
                yield header
            else:
                if not equals_types(header.t, row.t):
                    raise Exception('Header types do not match')
        elif isinstance(row, StreamFooter):
            footer = row
        else:
            yield row
    if footer:
        yield footer

BabeBase.register('merge_substreams', merge_substreams)


def partition(stream, field):
    """Create a substream per distinct value of 'field'."""
    beginning = False
    last_value = None
    header = None
    for row in stream:
        if isinstance(row, StreamHeader):
            beginning = True
            header = row
        elif isinstance(row, StreamFooter):
            if beginning:
                beginning = False
                continue  # Empty partition: emit neither header nor footer.
        if isinstance(row, StreamHeader):
            header = row
        elif isinstance(row, StreamFooter):
            # The output header is: group fields + (other fields x each
            # pivot value).
            other_fields = [f for f in header.fields
                            if f not in group and f != pivot]
            other_fields_k = map(StreamHeader.keynormalize, other_fields)
            fields = group + [f + "-" + str(v)
                              for v in pivot_values.list
                              for f in other_fields]
            newheader = header.replace(fields=fields)
            yield newheader
            # Emit one line per group.
            for _, row_dict in groups.iteritems():
                mrow = row_dict.itervalues().next()
                group_cols = [getattr(mrow, col) for col in group_n]
                for v in pivot_values:
                    if v in row_dict:
                        mrow = row_dict[v]
                        group_cols.extend([getattr(mrow, col)
                                           for col in other_fields_k])
                    else:
                        group_cols.extend([None for col in other_fields])
                yield group_cols
            yield row
        else:
            kgroup = ""
            for f in group_n:
                kgroup = kgroup + str(getattr(row, f))
            groups[kgroup][getattr(row, pivot)] = row
            pivot_values.add(getattr(row, pivot))

BabeBase.register("pivot", pivot)
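# Hypothetical usage sketch (not from the source): assuming pivot's
# signature is pivot(stream, group, pivot), this spreads the monthly
# measures into one column per month value, one row per country; the
# names are illustrative only.
from pybabe import Babe

b = Babe()
b.pull(filename='sales.csv') \
 .pivot(group=['country'], pivot='month') \
 .push(filename='sales_by_month.csv')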
def minmaxN(stream, column, n, max=True):
    "Keep the n rows maximizing value for 'column' for each stream"
    itt = iter(stream)
    while True:
        elt = itt.next()
        if not isinstance(elt, StreamHeader):
            raise Exception("Missing metainfo")
        yield elt
        g = Guard()
        it = itertools.takewhile(g.filter, itt)
        f = heapq.nlargest if max else heapq.nsmallest
        for elt in f(n, it, key=lambda row: getattr(row, column)):
            yield elt
        yield g.footer


def maxN(stream, column, n):
    for k in minmaxN(stream, column, n, max=True):
        yield k


def minN(stream, column, n):
    for k in minmaxN(stream, column, n, max=False):
        yield k

BabeBase.register('maxN', maxN)
BabeBase.register('minN', minN)
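# Hypothetical usage sketch (not from the source): keeps the ten rows
# with the highest 'score' in each substream; the file and field names
# are illustrative only.
from pybabe import Babe

b = Babe()
b.pull(filename='players.csv').typedetect() \
 .maxN('score', 10) \
 .push(filename='top10.csv')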