def get_bucket(kwargs):
    from boto.s3.connection import S3Connection
    key_id = BabeBase.get_config_with_env('s3', 'AWS_ACCESS_KEY_ID', kwargs)
    access_key = BabeBase.get_config_with_env('s3', 'AWS_SECRET_ACCESS_KEY', kwargs)
    conn = S3Connection(key_id, access_key)
    bucket = conn.get_bucket(kwargs['bucket'])
    return bucket
Example #2
def get_bucket(kwargs):
    from boto.s3.connection import S3Connection
    key_id = BabeBase.get_config_with_env('s3', 'AWS_ACCESS_KEY_ID', kwargs)
    access_key = BabeBase.get_config_with_env('s3', 'AWS_SECRET_ACCESS_KEY', kwargs)
    conn = S3Connection(key_id, access_key)
    bucket = conn.get_bucket(kwargs['bucket'])
    return bucket
Example #3
def pull(filename_remote, **kwargs):
    bucket = get_bucket(kwargs)
    cache = BabeBase.get_config("s3", "cache", default=False)
    fail_on_empty = kwargs.get("fail_on_empty", True)
    if cache:
        default_cache_dir = "/tmp/pybabe-s3-cache-%s" % os.getenv('USER')
        cache_dir = BabeBase.get_config("s3", "cache_dir", default=default_cache_dir)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
    keys = get_keys(bucket, filename_remote, fail_on_empty=fail_on_empty)
    files = []
    for key in keys:
        logging.info("S3 Load: %s", key)
        if cache:
            f = os.path.join(cache_dir,
                             os.path.basename(key.name) + "-" + key.etag.replace('"', ''))
            if os.path.exists(f):
                files.append(open(f, "r"))
            else:
                key.get_contents_to_filename(f + ".tmp", cb=progress_call_back)
                os.rename(f + ".tmp", f)
                files.append(open(f, "r"))
        else:
            files.append(ReadLineWrapper(key))
    return files
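
A usage sketch for the S3 pull() above. The bucket name and key are placeholders, AWS credentials are assumed to be available through the 's3' config section or environment variables, and the returned handles are assumed to be file-like (cached local files or ReadLineWrapper objects):

files = pull('logs/2012-01-01.csv', bucket='my-example-bucket', fail_on_empty=False)
for f in files:
    print(f.readline())  # read each remote object as needed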
Example #4
def pull(filename_remote, **kwargs):
    bucket = get_bucket(kwargs)
    cache = BabeBase.get_config("s3", "cache", default=False)
    fail_on_empty = kwargs.get("fail_on_empty", True)
    if cache:
        default_cache_dir = "/tmp/pybabe-s3-cache-%s" % os.getenv('USER')
        cache_dir = BabeBase.get_config("s3",
                                        "cache_dir",
                                        default=default_cache_dir)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
    keys = get_keys(bucket, filename_remote, fail_on_empty=fail_on_empty)
    files = []
    for key in keys:
        logging.info("S3 Load: %s", key)
        if cache:
            f = os.path.join(
                cache_dir,
                os.path.basename(key.name) + "-" + key.etag.replace('"', ''))
            if os.path.exists(f):
                files.append(open(f, "r"))
            else:
                key.get_contents_to_filename(f + ".tmp")
                os.rename(f + ".tmp", f)
                files.append(open(f, "r"))
        else:
            files.append(ReadLineWrapper(key))
    return files
Example #5
 def __init__(self, **kwargs):
     self.size_limit = kwargs.get('size_limit',5<<30) #1<<20 (1MB), 1<<30 (1GB)
     self.cache_directories = []
     cache = BabeBase.get_config("s3", "cache", default=False)
     if cache:
         default_cache_dir = "/tmp/pybabe-s3-cache-%s" % os.getenv('USER')
         cache_dir = BabeBase.get_config("s3", "cache_dir", default=default_cache_dir)
         self.cache_directories.append(cache_dir)
     self.cache_directories.append(BabeBase.get_config_with_env(section='kontagent', key='KT_FILECACHE', default='/tmp/kontagent-cache'))
Example #6
def stream_parse_datetime(stream,
                          field,
                          input_timezone,
                          output_timezone,
                          output_date=None,
                          output_time=None,
                          output_hour=None,
                          on_error=BabeBase.ON_ERROR_WARN):
    input_tz = timezone(input_timezone)
    output_tz = timezone(output_timezone)
    header = None
    for row in stream:
        if isinstance(row, StreamHeader):
            added_fields = [
                f for f in [output_time, output_date, output_hour]
                if f and not f in row.fields
            ]
            if added_fields:
                header = row.insert(None, added_fields)
            else:
                header = row
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            try:
                time_value = input_tz.localize(
                    parse_datetime(getattr(row, field)))
                time_value_ext = time_value.astimezone(output_tz)
                d = row._asdict()
                if output_time:
                    d[output_time] = time_value_ext
                if output_date:
                    date = datetime.date(time_value_ext.year,
                                         time_value_ext.month,
                                         time_value_ext.day)
                    d[output_date] = date
                if output_hour:
                    d[output_hour] = time_value_ext.hour
                yield header.t(**d)
            except Exception as e:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("parse_time", row, e)
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise e
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass
                elif on_error == BabeBase.ON_ERROR_NONE:
                    d = row._asdict()
                    for k in [output_time, output_date, output_hour]:
                        if k:
                            d[k] = None
                    yield header.t(**d)
Example #7
def pull_buzz(stream, username, dataroom, uuid, **kwargs):
    url = "https://buzzdata.com/api/%s/%s/%s/download_request" % (username, dataroom, uuid)
    if "api_key" in kwargs:
        api_key = kwargs["api_key"]
    elif BabeBase.get_config("buzzdata", "api_key"):
        api_key = BabeBase.get_config("buzzdata", "api_key")
    else:
        raise Exception("Missing api_key")
    data = urllib.urlencode([("api_key", api_key)])
    drequest = urllib2.urlopen(url, data).read()
    obj = json.loads(drequest)
    download_url = obj["download_request"]["url"]
    return urllib2.urlopen(download_url)
Example #8
def pull_buzz(stream, username, dataroom, uuid, **kwargs):
    url = 'https://buzzdata.com/api/%s/%s/%s/download_request' % (username, dataroom, uuid)
    if 'api_key' in kwargs:
        api_key = kwargs['api_key']
    elif BabeBase.get_config('buzzdata', 'api_key'):
        api_key = BabeBase.get_config('buzzdata', 'api_key')
    else:
        raise Exception('Missing api_key')
    data = urllib.urlencode([('api_key', api_key)])
    drequest = urllib2.urlopen(url, data).read()
    obj = json.loads(drequest)
    download_url = obj['download_request']['url']
    return urllib2.urlopen(download_url)
Example #9
def join(stream,
         join_stream,
         key,
         join_key,
         add_fields=None,
         on_error=BabeBase.ON_ERROR_WARN):
    d = {}
    join_header = None
    for row in join_stream:
        if isinstance(row, StreamHeader):
            join_header = row
        elif isinstance(row, StreamFooter):
            break
        else:
            k = getattr(row, join_key)
            if not k in d:
                d[k] = row

    for row in stream:
        if isinstance(row, StreamHeader):
            if add_fields:
                fields = add_fields
            else:
                fields = [
                    field for field in join_header.fields if field != join_key
                ]
            header = row.insert(typename=None, fields=fields)
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            k = getattr(row, key)
            if k in d:
                dd = row._asdict()
                jrow = d[k]
                for field in fields:
                    dd[field] = getattr(jrow, field)
                yield header.t(**dd)
            else:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("join", row,
                                      "Not matching value for key")
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise Exception("No matching value for key %s" % k)
                elif on_error == BabeBase.ON_ERROR_NONE:
                    dd = row._asdict()
                    for f in fields:
                        dd[f] = None
                    yield header.t(**dd)
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass
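
A small usage sketch for the join() generator above, built on two tiny in-memory streams. It assumes StreamHeader and StreamFooter from the base module used throughout these examples; stream contents and field names are purely illustrative:

from base import StreamHeader, StreamFooter

def users():
    h = StreamHeader(fields=['uid', 'name'])
    yield h
    yield h.t(1, 'alice')
    yield h.t(2, 'bob')
    yield StreamFooter()

def countries():
    h = StreamHeader(fields=['uid', 'country'])
    yield h
    yield h.t(1, 'FR')
    yield StreamFooter()

# Each row of users() gains a 'country' column looked up by 'uid';
# uid 2 has no match and is handled according to on_error (default: warn).
for row in join(users(), countries(), 'uid', 'uid'):
    print(row)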
Example #10
def pull_buzz(stream, username, dataroom, uuid, **kwargs):
    url = 'https://buzzdata.com/api/%s/%s/%s/download_request' % (
        username, dataroom, uuid)
    if 'api_key' in kwargs:
        api_key = kwargs['api_key']
    elif BabeBase.get_config('buzzdata', 'api_key'):
        api_key = BabeBase.get_config('buzzdata', 'api_key')
    else:
        raise Exception('Missing api_key')
    data = urllib.urlencode([('api_key', api_key)])
    drequest = urllib2.urlopen(url, data).read()
    obj = json.loads(drequest)
    download_url = obj['download_request']['url']
    return urllib2.urlopen(download_url)
Example #11
def stream_parse_datetime(stream,
                          field,
                          input_timezone,
                          output_timezone,
                          output_date=None,
                          output_time=None,
                          output_hour=None,
                          on_error=BabeBase.ON_ERROR_WARN):
    input_tz = timezone(input_timezone)
    output_tz = timezone(output_timezone)
    header = None
    for row in stream:
        if isinstance(row, StreamHeader):
            added_fields = [f for f in [output_time, output_date, output_hour]
                            if f and f not in row.fields]
            if added_fields:
                header = row.insert(None, added_fields)
            else:
                header = row
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            try:
                time_value = input_tz.localize(parse_datetime(getattr(row, field)))
                time_value_ext = time_value.astimezone(output_tz)
                d = row._asdict()
                if output_time:
                    d[output_time] = time_value_ext
                if output_date:
                    date = datetime.date(time_value_ext.year,
                                         time_value_ext.month,
                                         time_value_ext.day)
                    d[output_date] = date
                if output_hour:
                    d[output_hour] = time_value_ext.hour
                yield header.t(**d)
            except Exception as e:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("parse_time", row, e)
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise e
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass
                elif on_error == BabeBase.ON_ERROR_NONE:
                    d = row._asdict()
                    for k in [output_time, output_date, output_hour]:
                        if k:
                            d[k] = None
                    yield header.t(**d)
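
A usage sketch for stream_parse_datetime() above (registered as "parse_time" elsewhere in these examples). pytz-style timezone names are assumed, as implied by the timezone() calls, and the field names are illustrative:

from base import StreamHeader, StreamFooter

def events():
    h = StreamHeader(fields=['ts', 'user'])
    yield h
    yield h.t('2011/04/01 03:43', 'alice')
    yield StreamFooter()

# Localize 'ts' from UTC, convert to US/Pacific, and add derived columns.
out = stream_parse_datetime(events(), 'ts', 'UTC', 'US/Pacific',
                            output_date='day', output_time='local_ts',
                            output_hour='hour')
for row in out:
    print(row)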
Example #12
def join(stream, join_stream, key,join_key, add_fields=None, on_error=BabeBase.ON_ERROR_WARN):
	d = {}
	join_header = None
	for row in join_stream:
		if isinstance(row, StreamHeader):
			join_header = row
		elif isinstance(row, StreamFooter): 
			break 
		else: 
			k = getattr(row, join_key)
			if not k in d: 
				d[k] = row

	for row in stream: 
		if isinstance(row, StreamHeader):
			if add_fields:
				fields = add_fields
			else:
				fields = [field for field in join_header.fields if field != join_key]
			header = row.insert(typename=None, fields=fields)
			yield header
		elif isinstance(row, StreamMeta):
			yield row
		else: 
			k = getattr(row, key)
			if k in d: 
				dd = row._asdict()
				jrow = d[k]
				for field in fields: 
					dd[field] = getattr(jrow, field)
				yield header.t(**dd)
			else: 
				if on_error == BabeBase.ON_ERROR_WARN: 
					BabeBase.log_warn("join", row, "Not matching value for key")
				elif on_error == BabeBase.ON_ERROR_FAIL:
					raise Exception("No matching value for key %s" % k)
				elif on_error == BabeBase.ON_ERROR_NONE:
					dd = row._asdict()
					for f in fields:
						dd[f] = None
					yield header.t(**dd) 
				elif on_error == BabeBase.ON_ERROR_SKIP:
					pass
Example #13
def get_gic(): 
    global gic 
    if gic == None: 
        if os.path.exists('/usr/share/GeoIP/GeoIP.dat'): 
            default = "/usr/share/GeoIP/GeoIP.dat"
        elif os.path.exists("/usr/local/share/GeoIP/GeoLiteCity.dat"):
            default = "/usr/local/share/GeoIP/GeoLiteCity.dat"
        elif os.path.exists("/usr/local/var/lib/GeoLiteCity.dat"):
            default = "/usr/local/var/lib/GeoLiteCity.dat" 
        else:
            default = None
        filename = BabeBase.get_config_with_env('geoip', 'GEOIP_FILE', {}, default)
        from pygeoip import GeoIP
        gic = GeoIP(filename)
    return gic 
Example #14
def get_gic():
    global gic
    if gic == None:
        if os.path.exists('/usr/share/GeoIP/GeoIP.dat'):
            default = "/usr/share/GeoIP/GeoIP.dat"
        elif os.path.exists("/usr/local/share/GeoIP/GeoLiteCity.dat"):
            default = "/usr/local/share/GeoIP/GeoLiteCity.dat"
        elif os.path.exists("/usr/local/var/lib/GeoLiteCity.dat"):
            default = "/usr/local/var/lib/GeoLiteCity.dat"
        else:
            default = None
        filename = BabeBase.get_config_with_env('geoip', 'GEOIP_FILE', {},
                                                default)
        from pygeoip import GeoIP
        gic = GeoIP(filename)
    return gic
Example #15
            elif 'load_command' in db_params:
                load_command = [
                    Template(s).substitute(table=table_name, database=database)
                    for s in db_params['load_command']
                ]
                print load_command
                pp = Popen(load_command, stdin=PIPE, stdout=None, stderr=None)
                writestream = pp.stdin
            else:
                raise Exception(
                    "Missing load_command or import_query in db_kind spec")

            writer = UnicodeCSVWriter(writestream,
                                      dialect=sql_dialect(),
                                      encoding="utf-8")
            #writer = csv.writer(writestream, dialect=sql_dialect())
        elif isinstance(row, StreamFooter):
            if "import_query" in db_params:
                tmpfifo.close()
                p.stdin.close()
                p.wait()
            elif 'load_command' in db_params:
                pp.stdin.close()
                pp.wait()
        else:
            writer.writerow(row)


BabeBase.register('pull_sql', pull_sql)
BabeBase.registerFinalMethod('push_sql', push_sql)
Example #16
        return {}


def user_agent(stream, field, output_os=None, output_browser=None, output_browser_version=None):
    for row in stream:
        if isinstance(row, StreamHeader):
            header = row.insert(typename=None,
                                fields=filter(lambda x: x is not None,
                                              [output_os, output_browser, output_browser_version]))
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            useragent = getattr(row, field)
            o = do_detect(useragent)
            d = []
            if output_os:
                d.append(o['os']['name']
                         if 'os' in o
                         else o['dist']['name'] if 'dist' in o else None)
            if output_browser:
                d.append(o['browser']['name'] if 'browser' in o else None)
            if output_browser_version:
                d.append(o['browser']['version']
                         if 'browser' in o and 'version' in o['browser']
                         else None)
            yield header.t(*(row + tuple(d)))

BabeBase.register("user_agent", user_agent)
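
A usage sketch for user_agent() above, assuming the httpagentparser package used by do_detect() is installed; the user-agent string and output field names are illustrative:

from base import StreamHeader, StreamFooter

def hits():
    h = StreamHeader(fields=['ua'])
    yield h
    yield h.t('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36')
    yield StreamFooter()

# Adds 'os' and 'browser' columns derived from the 'ua' field.
for row in user_agent(hits(), 'ua', output_os='os', output_browser='browser'):
    print(row)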
Example #17
                if output_date:
                    date = datetime.date(time_value_ext.year,
                                         time_value_ext.month,
                                         time_value_ext.day)
                    d[output_date] = date
                if output_hour:
                    d[output_hour] = time_value_ext.hour
                yield header.t(**d)
            except Exception as e:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("parse_time", row, e)
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise e
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass
                elif on_error == BabeBase.ON_ERROR_NONE:
                    d = row._asdict()
                    for k in [output_time, output_date, output_hour]:
                        if k:
                            d[k] = None
                    yield header.t(**d)


BabeBase.register("parse_time", stream_parse_datetime)

if __name__ == "__main__":
    print(parse_date('2011/04/01'))
    print(parse_date('01 June 2009'))
    print(parse_datetime('2011/04/01 03:43'))
    print(parse_datetime('2011/04/01 3pm45'))
Example #18
            return False
        else:
            return True

def minmaxN(stream, column, n, max=True):
    "Keep the n rows maximizing value for 'column' for each stream"
    itt = iter(stream)
    while True:
        elt = itt.next()
        if not isinstance(elt, StreamHeader):
            raise Exception("Missing metainfo")
        yield elt         
        g = Guard()
        it = itertools.takewhile(g.filter, itt)
        f = heapq.nlargest if max else heapq.nsmallest 
        for elt in f(n, it, key=lambda row : getattr(row, column)):
            yield elt
        yield g.footer
            
def maxN(stream, column, n):
    for k in minmaxN(stream, column, n, max=True):
        yield k

def minN(stream, column, n):
    for k in minmaxN(stream, column, n, max=False):
        yield k 
    
BabeBase.register('maxN', maxN)
BabeBase.register('minN', minN)
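
A usage sketch for maxN() above. It assumes the Guard helper referenced by minmaxN() is defined in the same module (it is not shown in this excerpt) and that rows arrive between a StreamHeader and a StreamFooter:

from base import StreamHeader, StreamFooter

def scores():
    h = StreamHeader(fields=['name', 'score'])
    yield h
    for name, score in [('a', 3), ('b', 9), ('c', 5)]:
        yield h.t(name, score)
    yield StreamFooter()

# Keep the two rows with the largest 'score'.
for elt in maxN(scores(), 'score', 2):
    print(elt)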

    
Example #19
			else:
				fields = [field for field in join_header.fields if field != join_key]
			header = row.insert(typename=None, fields=fields)
			yield header
		elif isinstance(row, StreamMeta):
			yield row
		else: 
			k = getattr(row, key)
			if k in d: 
				dd = row._asdict()
				jrow = d[k]
				for field in fields: 
					dd[field] = getattr(jrow, field)
				yield header.t(**dd)
			else: 
				if on_error == BabeBase.ON_ERROR_WARN: 
					BabeBase.log_warn("join", row, "Not matching value for key")
				elif on_error == BabeBase.ON_ERROR_FAIL:
					raise Exception("No matching value for key %s" % k)
				elif on_error == BabeBase.ON_ERROR_NONE:
					dd = row._asdict()
					for f in fields:
						dd[f] = None
					yield header.t(**dd) 
				elif on_error == BabeBase.ON_ERROR_SKIP:
					pass



BabeBase.register("join", join)
Example #20
                if output_date:
                    date = datetime.date(time_value_ext.year,
                                         time_value_ext.month,
                                         time_value_ext.day)
                    d[output_date] = date
                if output_hour:
                    d[output_hour] = time_value_ext.hour
                yield header.t(**d)
            except Exception as e:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("parse_time", row, e)
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise e
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass
                elif on_error == BabeBase.ON_ERROR_NONE:
                    d = row._asdict()
                    for k in [output_time, output_date, output_hour]:
                        if k:
                            d[k] = None
                    yield header.t(**d)

BabeBase.register("parse_time", stream_parse_datetime)


if __name__ == "__main__":
    print(parse_date('2011/04/01'))
    print(parse_date('01 June 2009'))
    print(parse_datetime('2011/04/01 03:43'))
    print(parse_datetime('2011/04/01 3pm45'))
Example #21
            buf.write('<html><body>\n')
            for filename in d:
                buf.write(d[filename].getvalue())
                buf.write('\n')
            buf.write('\n</body></html>')
            att = MIMEText(buf.getvalue(), "html")
            msg.attach(att)
        else:
            for filename in d:
                c = d[filename].getvalue()
                (maintype, subtype) = BabeBase.getMimeType(format)
                att = MIMEBase(maintype, subtype)
                att.set_payload(c)
                encoders.encode_base64(att)
                att.add_header('Content-Disposition',
                               'attachment',
                               filename=filename + "." + format)
                msg.attach(att)

    s = smtplib.SMTP(smtp_server, smtp_port)
    s.ehlo()
    if smtp_tls:
        s.starttls()
        s.ehlo()
    s.login(smtp_login, smtp_password)
    s.sendmail(author, recipients, msg.as_string())
    s.quit()


BabeBase.registerFinalMethod('mail', mail)
Example #22

def sort(stream, field, reverse=False):
    buf = []
    for elt in stream:
        if isinstance(elt, StreamHeader):
            yield elt
        elif isinstance(elt, StreamFooter):
            buf.sort(key=lambda obj: getattr(obj, field), reverse=reverse)
            for row in buf:
                yield row
            yield elt
        else:
            buf.append(elt)

BabeBase.register('sort', sort)
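
A usage sketch for sort() above: rows are buffered until the StreamFooter arrives, then re-emitted ordered by the given field. The in-memory stream and field names are illustrative:

from base import StreamHeader, StreamFooter

def people():
    h = StreamHeader(fields=['name', 'age'])
    yield h
    yield h.t('alice', 42)
    yield h.t('bob', 7)
    yield StreamFooter()

for elt in sort(people(), 'age'):
    print(elt)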


def sort_diskbased(stream, field, nsize=100000):
    buf = []
    files = []
    count = 0
    t = None

    def iter_on_file(f):
        try:
            while True:
                (key, v) = cPickle.load(f)
                yield (key, t._make(v))
        except EOFError:
            f.close()
Example #23

from base import StreamHeader, BabeBase, StreamFooter


def valuenormalize(cell):
    return cell.value


def read(format, stream, kwargs):
    import xlrd
    wb = xlrd.open_workbook(file_contents=stream.read(), encoding_override=kwargs.get('encoding', None))
    ws = wb.sheet_by_index(0)
    nrows = ws.nrows
    fields = kwargs.get('fields', None)
    if not fields:
        b = 1
        fields = [cell.value for cell in ws.row(0)]
    else:
        b = 0
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for i in xrange(b, nrows):
        cells = ws.row(i)
        yield metainfo.t._make(map(valuenormalize, cells))
    yield StreamFooter()

BabeBase.addPullPlugin('xls', ['xls'], read, need_seek=False)
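
The xls reader above is normally dispatched through the plugin registry, but it can also be sketched as a direct call; the filename is a placeholder and xlrd is assumed to be installed:

with open('report.xls', 'rb') as f:
    for row in read('xls', f, {}):
        print(row)  # a StreamHeader, then one namedtuple per sheet row, then a StreamFooter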
Example #24
def compress(compress_outstream, inputfile_filename, inarchive_filename):
    f = open(compress_outstream, 'w')
    p = Popen(['gzip', '-c', inputfile_filename], stdout=f)
    p.communicate()
    f.close()


def get_content_list(compress_instream, filename):
    if not hasattr(compress_instream, 'fileno'):
        tf = tempfile.NamedTemporaryFile()
        tf.write(compress_instream.read())
        tf.flush()
        p = Popen(['gzip', '-d', '-c', tf.name], stdin=None, stdout=PIPE)
    else:
        tf = None
        p = Popen(['gzip', '-d', '-c'], stdin=compress_instream, stdout=PIPE)
    f = os.path.splitext(os.path.basename(filename))[0] if filename else None
    return ((tf, p.stdout), [f])


def uncompress(handle, name):
    return handle[1]


BabeBase.addCompressPushPlugin('gz', ['gz'], compress)
BabeBase.addCompressPullPlugin('gz', ['gz'],
                               get_content_list,
                               uncompress,
                               need_seek=False)
Example #25
def build_host(kwargs):
    host = kwargs['host']
    if 'port' in kwargs:
        host = host + ':' + str(kwargs['port'])
    if 'user' in kwargs:
        host = kwargs['user'] + ':' + kwargs['password'] + '@' + host
    return host


def push(filename_topush, filename_remote, **kwargs):
    host = build_host(kwargs)
    f = open(filename_topush, 'rb')
    urllib2.urlopen(url='%s://%s/%s' %
                    (kwargs['protocol'], host, filename_remote),
                    data=f)
    f.close()


def pull(filename_remote, **kwargs):
    host = build_host(kwargs)
    url = '%s://%s/%s' % (kwargs['protocol'], host, filename_remote)
    return urllib2.urlopen(url)


BabeBase.addProtocolPushPlugin('http', push, None)
BabeBase.addProtocolPullPlugin('http', pull)
BabeBase.addProtocolPushPlugin('https', push, None)
BabeBase.addProtocolPullPlugin('https', pull)
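
A usage sketch for the HTTP pull() above; host, port and path are placeholders. pull() simply returns the urllib2 response object for the built URL:

resp = pull('data/export.csv', protocol='http', host='example.com', port=8080)
print(resp.read())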
Example #26

from base import BabeBase
import urllib2
import urllib
import json


def pull_buzz(stream, username, dataroom, uuid, **kwargs):
    url = 'https://buzzdata.com/api/%s/%s/%s/download_request' % (username, dataroom, uuid)
    if 'api_key' in kwargs:
        api_key = kwargs['api_key']
    elif BabeBase.get_config('buzzdata', 'api_key'):
        api_key = BabeBase.get_config('buzzdata', 'api_key')
    else:
        raise Exception('Missing api_key')
    data = urllib.urlencode([('api_key', api_key)])
    drequest = urllib2.urlopen(url, data).read()
    obj = json.loads(drequest)
    download_url = obj['download_request']['url']
    return urllib2.urlopen(download_url)

BabeBase.addProtocolPullPlugin('buzzdata', pull_buzz)
Example #27
                    d[t] = int(v)
                elif g.group('float'):
                    d[t] = float(v)
            else:
                try: 
                    d[t] = parse_datetime(v)
                except ValueError: 
                    try: 
                        d[t] = parse_date(v)
                    except ValueError:
                        pass
        if len(d) > 0:
            return elt._replace(**d)
        else:
            return elt
BabeBase.register("typedetect", typedetect)


def primary_key_detect(stream, max=None): 
    d = deque()
    it = iter(stream)
    for linecount, row in enumerate(it):
        d.append(row)
        if isinstance(row,StreamHeader): 
            metainfo = row
            values = [set() for k in metainfo.fields]
            keys = set(xrange(0,len(metainfo.fields)))
        elif isinstance(row, StreamMeta):
            pass
        else:
            for idx, val in enumerate(row):
Example #28
        for row in stream:
            if isinstance(row, StreamHeader):
                metainfo = row.augment(typename=typename, fields=[])
                yield metainfo
            elif isinstance(row, StreamMeta):
                yield row 
            else:
                yield metainfo.t._make(list(function(row)))
    else:
        for row in stream:
            if isinstance(row, StreamMeta):
                yield row
            else: 
                yield function(row)
    
BabeBase.register("mapTo", mapTo)

def bulkMapTo(stream, function, bulk_size, insert_fields = None, fields = None): 
    header = None
    buf = []
    for row in stream: 
        if isinstance(row, StreamHeader): 
            if insert_fields: 
                header = row.insert(typename=None, fields=insert_fields)
            elif fields: 
                header = row.insert(typename=None, fields=fields)
            else:
                header = row
            yield header
        elif isinstance(row, StreamFooter) or len(buf) == bulk_size - 1:
            if not isinstance(row, StreamFooter): 
Example #29
import itertools

def sort(stream, field, reverse=False):
    buf = []
    for elt in stream:
        if isinstance(elt, StreamHeader):
            yield elt
        elif isinstance(elt, StreamFooter):
            buf.sort(key=lambda obj: getattr(obj, field), reverse=reverse)
            for row in buf:
                yield row
            yield elt
        else:
            buf.append(elt)
        
BabeBase.register('sort', sort)        

def sort_diskbased(stream, field, nsize=100000):
    buf = []
    files = []
    count = 0 
    t = None
    def iter_on_file(f):
        try:
            while True:
                (key, v) = cPickle.load(f)
                yield (key, t._make(v))
        except EOFError:
            f.close()
    for elt in stream: 
        if isinstance(elt, StreamHeader):
Example #30
    service = get_storage()
    req = service.objects().get(
        bucket=kwargs['bucket'],
        object=filename_remote)
    try:
        req.execute()
        return True
    except HttpError as e:
        if e.resp.status == 404:
            return False
        else:
            raise


def pull(filename_remote, **kwargs):
    service = get_storage()
    req = service.objects().get(
        object=filename_remote,
        bucket=kwargs['bucket'])
    resp = req.execute()
    print(resp)
    files = []
    for key in keys:
        logging.info("S3 Load: %s", key)
        files.append(ReadLineWrapper(key))
    return files


BabeBase.addProtocolPushPlugin('gs', push, None, check_exists)
BabeBase.addProtocolPullPlugin('gs', pull)
Example #31
def write(format, header, instream, outfile, encoding, **kwargs):
    if not encoding:
        encoding = "utf-8"
    outfile.write("<h2>")
    outfile.write(header.get_stream_name())
    outfile.write("</h2>")
    if header.description:
        outfile.write("<p><i>")
        outfile.write(header.description)
        outfile.write("</i></p>")
    outfile.write('<table>\n<tr>')
    for field in header.fields:
        outfile.write("<th>")
        outfile.write(write_value(field, encoding))
        outfile.write("</th>")
    outfile.write("</tr>\n")
    for row in instream:
        if isinstance(row, StreamFooter):
            outfile.write("</table>\n")
            break
        else:
            outfile.write("<tr>")
            for cell in row:
                outfile.write("<td>")
                outfile.write(write_value(cell, encoding))
                outfile.write("</td>")
            outfile.write("</tr>\n")


BabeBase.addPushPlugin('html', ['html', 'htm'], write)
Example #32
            if k == pk:
                reducer.row(elt)
            else:
                if pk is not None:
                    eg = reducer.end_group(metainfo.t)
                    if isinstance(eg, list):
                        for e in eg:
                            yield e
                    else:
                        yield eg
                pk = k
                reducer.begin_group(k)
                reducer.row(elt)


BabeBase.register('groupBy', group)
BabeBase.register('group', group)


def group_all(stream, reducer, typename=None, fields=None):
    """
    Group all keys.
    reducer can either be a function or a reducer object:
    if a function, reducer(t, rows) will be called with all the rows as a parameter;
    if an object, reducer.begin_group(), reducer.row() and reducer.end_group() will be called.
    """
    reducer = build_reducer(reducer)
    reducer.begin_group(None)
    for elt in stream:
        if isinstance(elt, StreamHeader):
Example #33
    fields = kwargs['fields']
    table = kwargs['table']
    header = StreamHeader(fields=fields, table=table)
    yield header
    prefix = "INSERT INTO `%s` VALUES " % table
    try:
        for line in stream:
            if not line.startswith(prefix):
                continue
            pos = len(prefix)
            while pos < len(line):
                (elts, pos) = parse_tuple(pos, line)
                yield header.t(*elts)
                if line[pos] == ',':
                    pos = pos + 1
                    continue
                elif line[pos] == ';':
                    break
                else:
                    raise Exception("ParseError pos %u " % pos)
    except TypeError as e:
        print len(elts), elts
        raise e
    yield StreamFooter()

BabeBase.addPullPlugin("sql", ["sql"], pull)

if __name__ == "__main__":
    for line in sys.stdin:
        print parse_tuple(0, line)
Example #34
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
    else:
        api = tweepy.API()

    # If the authentication was successful, you should
    # see the name of the account print out
    #print api.me().name

    # If the application settings are set for "Read and Write" then
    # this line should tweet out the message to your account's
    # timeline. The "Read and Write" setting is on https://dev.twitter.com/apps
    #api.update_status('Updating using OAuth authentication via Tweepy!')
    metainfo = None
    if consumer_key:
        statuses = api.user_timeline(include_entities=True)
    else:
        statuses = api.public_timeline(include_entities=True)
    for u in statuses:
        flatten_status(u)
        if not metainfo:
            names = build_status_names(u)
            metainfo = StreamHeader(typename="Status", fields=names)
            yield metainfo
        u.__class__.__iter__ = lambda s: iter([getattr(s, key) for key in names])
        yield u
    yield StreamFooter()

BabeBase.register('pull_twitter', pull_twitter)
Example #35

def geoip_country_code(stream, field="ip", country_code="country_code", ignore_error=False, geoip_file=None):
    """
    Add a 'country_code' field derived from the IP address in the given field.
    """
    gic = get_gic()
    for r in stream:
        if isinstance(r, StreamHeader):
            header = r.insert(typename=None, fields=[country_code])
            yield header 
        elif isinstance(r, StreamMeta):
            yield r
        else:
            ip = getattr(r, field)
            try: 
                cc = gic.country_code_by_addr(ip)
            except Exception as e:
                if ignore_error:
                    cc = None
                    pass
                else:
                    raise e
            yield header.t(*(r + (cc,)))

## TODO : full region parsing

BabeBase.register("geoip_country_code", geoip_country_code)
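
A usage sketch for geoip_country_code() above. It assumes a GeoIP database exists at one of the default paths probed by get_gic() (or is pointed to by the GEOIP_FILE setting) and that pygeoip is installed; the IP value is illustrative:

from base import StreamHeader, StreamFooter

def visits():
    h = StreamHeader(fields=['ip'])
    yield h
    yield h.t('8.8.8.8')
    yield StreamFooter()

# Appends a 'country_code' column; lookup errors yield None because ignore_error=True.
for row in geoip_country_code(visits(), ignore_error=True):
    print(row)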


Example #36
from base import StreamHeader, BabeBase, StreamFooter


def valuenormalize(cell):
    return cell.value


def read(format, stream, kwargs):
    import xlrd
    wb = xlrd.open_workbook(file_contents=stream.read(),
                            encoding_override=kwargs.get('encoding', None))
    ws = wb.sheet_by_index(0)
    nrows = ws.nrows
    fields = kwargs.get('fields', None)
    if not fields:
        b = 1
        fields = [cell.value for cell in ws.row(0)]
    else:
        b = 0
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for i in xrange(b, nrows):
        cells = ws.row(i)
        yield metainfo.t._make(map(valuenormalize, cells))
    yield StreamFooter()


BabeBase.addPullPlugin('xls', ['xls'], read, need_seek=False)
Example #37
from collections import *

class Bunch:
  def __init__(self, dictionary):
    self.__dict__ = dictionary

def iterate(stream, function, insert_fields=None, typename=None):
  metainfo = None
  for row in stream:
    if isinstance(row, StreamHeader):
      metainfo = row
      if insert_fields is not None:
        metainfo = metainfo.insert(typename=typename, fields=insert_fields)
      yield metainfo
    elif isinstance(row, StreamMeta):
      yield row
    else:
      d = row._asdict()
#      values = tuple(row)
      if insert_fields is not None:
        for field in insert_fields:
          d[field] = None
#      values = metainfo.t._make(values)
      result = function(Bunch(d))
      yield metainfo.t._make(d.values())
#      yield metainfo.t._make([result.__dict__[key] for key in metainfo.t._fields])

BabeBase.register("iterate", iterate)


Example #38
    fields = kwargs.get('fields', None)
    if not fields:
        fields = [cell.internal_value for cell in it.next()]
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    # it brings a new method: iter_rows()
    for row in it:
        ## stop at the first row with "none"
        nrow = map(valuenormalize, row)
        if not any(nrow):
            break
        yield metainfo.t._make(nrow)
    yield StreamFooter()


def write(format, metainfo, instream, outfile, encoding, **kwargs):
    from openpyxl import Workbook
    wb = Workbook(optimized_write=True)
    ws = wb.create_sheet()
    ws.append(metainfo.fields)
    for k in instream:
        if isinstance(k, StreamFooter):
            break
        else:
            ws.append(list(k))
    wb.save(outfile)


BabeBase.addPullPlugin('xlsx', ['xlsx'], read, need_seek=True)
BabeBase.addPushPlugin('xlsx', ['xlsx'], write)
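
A direct-call sketch for the xlsx write() plugin above, assuming an openpyxl version that still accepts the optimized_write flag; the output path and fields are placeholders:

from base import StreamHeader, StreamFooter

h = StreamHeader(fields=['a', 'b'])
rows = iter([h.t(1, 2), h.t(3, 4), StreamFooter()])
with open('out.xlsx', 'wb') as out:
    write('xlsx', h, rows, out, None)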
Example #39
        api = tweepy.API(auth)
    else:
        api = tweepy.API()

    # If the authentication was successful, you should
    # see the name of the account print out
    #print api.me().name

    # If the application settings are set for "Read and Write" then
    # this line should tweet out the message to your account's
    # timeline. The "Read and Write" setting is on https://dev.twitter.com/apps
    #api.update_status('Updating using OAuth authentication via Tweepy!')
    metainfo = None
    if consumer_key:
        statuses = api.user_timeline(include_entities=True)
    else:
        statuses = api.public_timeline(include_entities=True)
    for u in statuses:
        flatten_status(u)
        if not metainfo:
            names = build_status_names(u)
            metainfo = StreamHeader(typename="Status", fields=names)
            yield metainfo
        u.__class__.__iter__ = lambda s: iter(
            [getattr(s, key) for key in names])
        yield u
    yield StreamFooter()


BabeBase.register('pull_twitter', pull_twitter)
Example #40
    it = ws.iter_rows()
    fields = kwargs.get('fields', None)
    if not fields:
        fields = [cell.internal_value for cell in it.next()]
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
     # it brings a new method: iter_rows()
    for row in it:
        ## stop at the first row with "none"
        nrow = map(valuenormalize, row)
        if not any(nrow):
            break
        yield metainfo.t._make(nrow)
    yield StreamFooter()


def write(format, metainfo, instream, outfile, encoding, **kwargs):
    from openpyxl import Workbook
    wb = Workbook(optimized_write=True)
    ws = wb.create_sheet()
    ws.append(metainfo.fields)
    for k in instream:
        if isinstance(k, StreamFooter):
            break
        else:
            ws.append(list(k))
    wb.save(outfile)

BabeBase.addPullPlugin('xlsx', ['xlsx'], read, need_seek=True)
BabeBase.addPushPlugin('xlsx', ['xlsx'], write)
Example #41
    delimiter = ','
    doublequote = False
    escapechar = '\\'
    quoting = csv.QUOTE_MINIMAL
    quotechar = '"'


def push(format,
         metainfo,
         instream,
         outfile,
         encoding,
         delimiter=None,
         **kwargs):
    if not encoding:
        encoding = "utf8"
    dialect = kwargs.get('dialect', default_dialect)
    if delimiter:
        dialect.delimiter = delimiter
    writer = UnicodeCSVWriter(outfile, dialect=dialect, encoding=encoding)
    writer.writerow(metainfo.fields)
    for k in instream:
        if isinstance(k, StreamFooter):
            break
        else:
            writer.writerow(k)


BabeBase.addPullPlugin('csv', ['csv', 'tsv', 'txt'], pull)
BabeBase.addPushPlugin('csv', ['csv', 'tsv', 'txt'], push)
Example #42
import codecs
from base import StreamHeader, BabeBase, StreamFooter

def pull(format, stream, kwargs):    
    stream = codecs.getreader(kwargs.get('encoding', 'utf8'))(stream)

    fields = kwargs.get('fields', ['text'])
    
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo 
    
    for line in stream:
        yield metainfo.t._make([line])
    yield StreamFooter()

def push(format, metainfo, instream, outfile, encoding, **kwargs):
    outstream = codecs.getwriter(kwargs.get('encoding', 'utf8'))(outfile)
    for row in instream:
        if isinstance(row, StreamFooter):
            break
        else:
            for cell in row: 
                outstream.write(cell)
    outstream.flush()

BabeBase.addPullPlugin('txt', ['txt'], pull)
BabeBase.addPushPlugin('txt', ['txt'], push)
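
A direct-call sketch for the txt pull()/push() plugins above; file names are placeholders and the default utf8 encoding is assumed:

with open('notes.txt', 'rb') as f:
    rows = list(pull('txt', f, {}))  # header, one single-field row per line, footer

with open('copy.txt', 'wb') as out:
    push('txt', rows[0], iter(rows[1:]), out, None)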
Example #43
from base import BabeBase
import urllib2
import urllib
import json


def pull_buzz(stream, username, dataroom, uuid, **kwargs):
    url = 'https://buzzdata.com/api/%s/%s/%s/download_request' % (
        username, dataroom, uuid)
    if 'api_key' in kwargs:
        api_key = kwargs['api_key']
    elif BabeBase.get_config('buzzdata', 'api_key'):
        api_key = BabeBase.get_config('buzzdata', 'api_key')
    else:
        raise Exception('Missing api_key')
    data = urllib.urlencode([('api_key', api_key)])
    drequest = urllib2.urlopen(url, data).read()
    obj = json.loads(drequest)
    download_url = obj['download_request']['url']
    return urllib2.urlopen(download_url)


BabeBase.addProtocolPullPlugin('buzzdata', pull_buzz)
Example #44
                    d[t] = float(v)
            else:
                try:
                    d[t] = parse_datetime(v)
                except ValueError:
                    try:
                        d[t] = parse_date(v)
                    except ValueError:
                        pass
        if len(d) > 0:
            return elt._replace(**d)
        else:
            return elt


BabeBase.register("typedetect", typedetect)


def primary_key_detect(stream, max=None):
    d = deque()
    it = iter(stream)
    for linecount, row in enumerate(it):
        d.append(row)
        if isinstance(row, StreamHeader):
            metainfo = row
            values = [set() for k in metainfo.fields]
            keys = set(xrange(0, len(metainfo.fields)))
        elif isinstance(row, StreamMeta):
            pass
        else:
            for idx, val in enumerate(row):
Example #45

def sort(stream, field, reverse=False):
    buf = []
    for elt in stream:
        if isinstance(elt, StreamHeader):
            yield elt
        elif isinstance(elt, StreamFooter):
            buf.sort(key=lambda obj: getattr(obj, field), reverse=reverse)
            for row in buf:
                yield row
            yield elt
        else:
            buf.append(elt)

BabeBase.register('sort', sort)


def sort_diskbased(stream, field, nsize=100000):
    buf = []
    files = []
    count = 0
    t = None

    def iter_on_file(f):
        try:
            while True:
                (key, v) = cPickle.load(f)
                yield (key, t._make(v))
        except EOFError:
            f.close()
Example #46
                       field="ip",
                       country_code="country_code",
                       ignore_error=False,
                       geoip_file=None):
    """"
Add a 'country_code' field derived from the IP address in the given field.
    """
    gic = get_gic()
    for r in stream:
        if isinstance(r, StreamHeader):
            header = r.insert(typename=None, fields=[country_code])
            yield header
        elif isinstance(r, StreamMeta):
            yield r
        else:
            ip = getattr(r, field)
            try:
                cc = gic.country_code_by_addr(ip)
            except Exception as e:
                if ignore_error:
                    cc = None
                    pass
                else:
                    raise e
            yield header.t(*(r + (cc, )))


## TODO : full region parsing

BabeBase.register("geoip_country_code", geoip_country_code)
Example #47
def do_detect(s):
    global http_detect
    if not http_detect:
        from httpagentparser import detect
        http_detect = detect
    return http_detect(s)


def user_agent(stream, field, output_os=None, output_browser=None, output_browser_version=None):
    for row in stream:
        if isinstance(row, StreamHeader):
            header = row.insert(typename=None, fields=filter(lambda x: x is not None, [output_os, output_browser, output_browser_version]))
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            useragent = getattr(row, field)
            o = do_detect(useragent)
            d = []
            if output_os:
                ## On devices such as the iPad, the os is in "flavor" and the device name in "dist"
                d.append(o['os']['name'] if 'os' in o else o['dist']['name'] if 'dist' in o else None)
            if output_browser:
                d.append(o['browser']['name'] if 'browser' in o else None)
            if output_browser_version:
                d.append(o['browser']['version'] if 'browser' in o and 'version' in o['browser'] else None)
            yield header.t(*(row + tuple(d)))

BabeBase.register("user_agent", user_agent)
Example #48

def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """
    Pull objects from mongo as rows
    """
    k = kwargs.copy()
    if 'fields' in k:
        del k['fields']
    if 'typename'in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(**dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        yield StreamFooter()

BabeBase.registerFinalMethod("push_mongo", push_mongo)
BabeBase.register("pull_mongo", pull_mongo)
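
A usage sketch for pull_mongo() above. It assumes a MongoDB instance reachable with pymongo's Connection defaults; database, collection, query and field names are placeholders:

# The first argument is an (unused) upstream stream, as in the definition above.
for row in pull_mongo(None, 'analytics', 'events',
                      spec={'kind': 'click'}, fields=['kind', 'ts']):
    print(row)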
Example #49
            if k == pk:
                reducer.row(elt)
            else:
                if pk is not None:
                    eg = reducer.end_group(metainfo.t)
                    if isinstance(eg, list):
                        for e in eg:
                            yield e
                    else:
                        yield eg
                pk = k
                reducer.begin_group(k)
                reducer.row(elt)


BabeBase.register('groupBy', group)
BabeBase.register('group', group)


def group_all(stream, reducer, typename=None, fields=None):
    """
    Group all keys.
    reducer can either be a function or a reducer object:
    if a function, reducer(t, rows) will be called with all the rows as a parameter;
    if an object, reducer.begin_group(), reducer.row() and reducer.end_group() will be called.
    """
    reducer = build_reducer(reducer)
    reducer.begin_group(None)
    for elt in stream:
        if isinstance(elt, StreamHeader):
Example #50
                p = Popen(c, stdin=PIPE, stdout=None, stderr=None)
                tmpfifo = TempFifo()
                import_query = db_params['import_query'] % (tmpfifo.filename, table_name)
                p.stdin.write(import_query)
                p.stdin.flush()
                writestream = tmpfifo.open_write()
            elif 'load_command' in db_params:
                load_command = [Template(s).substitute(table=table_name, database=database) for s in db_params['load_command']]
                print load_command
                pp = Popen(load_command, stdin=PIPE, stdout=None, stderr=None)
                writestream = pp.stdin
            else:
                raise Exception("Missing load_command or import_query in db_kind spec")

            writer = UnicodeCSVWriter(writestream, dialect=sql_dialect(), encoding="utf-8")
            #writer = csv.writer(writestream, dialect=sql_dialect())
        elif isinstance(row, StreamFooter):
            if "import_query" in db_params:
                tmpfifo.close()
                p.stdin.close()
                p.wait()
            elif 'load_command' in db_params:
                pp.stdin.close()
                pp.wait()
        else:
            writer.writerow(row)


BabeBase.register('pull_sql', pull_sql)
BabeBase.registerFinalMethod('push_sql', push_sql)
Example #51
        for row in stream:
            if isinstance(row, StreamHeader):
                metainfo = row.augment(typename=typename, fields=[])
                yield metainfo
            elif isinstance(row, StreamMeta):
                yield row
            else:
                yield metainfo.t._make(list(function(row)))
    else:
        for row in stream:
            if isinstance(row, StreamMeta):
                yield row
            else:
                yield function(row)

BabeBase.register("mapTo", mapTo)


def bulkMapTo(stream, function, bulk_size, insert_fields=None, fields=None):
    header = None
    buf = []
    for row in stream:
        if isinstance(row, StreamHeader):
            if insert_fields:
                header = row.insert(typename=None, fields=insert_fields)
            elif fields:
                header = row.insert(typename=None, fields=fields)
            else:
                header = row
            yield header
        elif isinstance(row, StreamFooter) or len(buf) == bulk_size - 1:
Example #52
def mail(stream,
         subject,
         recipients,
         in_body=False,
         in_body_row_limit=None,
         attach_formats="csv",
         **kwargs):
    """Format a stream in a mail and send it. 
    recipients: list of recipient mail addresses
    in_body: format the content (in HTML & text) in the mail body
    in_body_row_limit: maximum number of rows in the body
    attach_formats: file format(s) to use for attachments
    """

    smtp_server = BabeBase.get_config('smtp', 'server', kwargs)
    smtp_port = BabeBase.get_config('smtp', 'port', kwargs)
    smtp_tls = BabeBase.get_config('smtp', 'tls', kwargs, False)
    smtp_login = BabeBase.get_config('smtp', 'login', kwargs)
    smtp_password = BabeBase.get_config('smtp', 'password', kwargs)
    author = BabeBase.get_config('smtp', 'author', kwargs)

    formats = []
    if in_body:
        formats.append("html")
    if attach_formats:
        if isinstance(attach_formats, basestring):
            formats.append(attach_formats)
        else:
            formats.extend(attach_formats)
    if isinstance(recipients, basestring):
        recipients = [recipients]

    babes = stream.tee(len(formats))
    if in_body and in_body_row_limit:
        babes[0] = babes[0].head(in_body_row_limit, all_streams=True)

    buffer_dicts = []
    for format, babe in izip(formats, babes):
        d = ordered_dict()
        babe.push(stream_dict=d, format=format)
        buffer_dicts.append((format, d))

    msg = MIMEMultipart()
    msg['Subject'] = subject
    msg['From'] = author
    msg['To'] = ', '.join(recipients)

    for format, d in buffer_dicts:
        if format == "html":
            buf = StringIO()
            buf.write('<html><body>\n')
            for filename in d:
                buf.write(d[filename].getvalue())
                buf.write('\n')
            buf.write('\n</body></html>')
            att = MIMEText(buf.getvalue(), "html")
            msg.attach(att)
        else:
            for filename in d:
                c = d[filename].getvalue()
                (maintype, subtype) = BabeBase.getMimeType(format)
                att = MIMEBase(maintype, subtype)
                att.set_payload(c)
                encoders.encode_base64(att)
                att.add_header('Content-Disposition',
                               'attachment',
                               filename=filename + "." + format)
                msg.attach(att)

    s = smtplib.SMTP(smtp_server, smtp_port)
    s.ehlo()
    if smtp_tls:
        s.starttls()
        s.ehlo()
    s.login(smtp_login, smtp_password)
    s.sendmail(author, recipients, msg.as_string())
    s.quit()
Example #53
    doublequote = False
    escapechar = '\\'
    quoting = csv.QUOTE_MINIMAL
    quotechar = '"'


def log(stream, logfile=None):
    if not logfile:
        logstream = sys.stderr
        do_close = False
    elif isinstance(logfile, basestring):
        logstream = open(logfile, 'wb')
        do_close = True
    else:
        logstream = logfile
        do_close = False
    for row in stream:
        if isinstance(row, StreamHeader):
            writer = csv.writer(logstream, log_dialect)
            writer.writerow(row.fields)
        elif isinstance(row, StreamMeta):
            pass
        else:
            writer.writerow(list(row))
        yield row
    if do_close:
        logstream.close()


BabeBase.register("log", log)
Example #54
        if isinstance(row, StreamHeader):
            if header == None:
                header = row.replace(partition=partition)
                yield header
            else:
                if not equals_types(header.t, row.t):
                    raise Exception('Header types do not match')
        elif isinstance(row, StreamFooter):
            footer = row
        else:
            yield row
    if footer:
        yield footer


BabeBase.register('merge_substreams', merge_substreams)


def partition(stream, field):
    """Create substream per different value of 'column'"""
    beginning = False
    last_value = None
    header = None
    for row in stream:
        if isinstance(row, StreamHeader):
            beginning = True
            header = row
        elif isinstance(row, StreamFooter):
            if beginning == True:
                beginning = False
                continue  # Empty partition: Emit neither header nor footer
Example #55
		if isinstance(row, StreamHeader): 
			header = row
		elif isinstance(row, StreamFooter):
			# HEADER IS : GROUP + (OTHER FIELDS * EACH VALUE
			other_fields =  [f for f in header.fields if not f in group and not f == pivot]
			other_fields_k = map(StreamHeader.keynormalize, other_fields)
			fields = group + [f + "-" + str(v) 
				for v in pivot_values.list for f in other_fields]					
			newheader = header.replace(fields=fields) 
			yield newheader
			for _, row_dict in groups.iteritems(): 
				## Create a line per group
				mrow = row_dict.itervalues().next()
				group_cols = [getattr(mrow, col) for col in group_n]
				for v in pivot_values:
					if v in row_dict:
						mrow = row_dict[v]
						group_cols.extend([getattr(mrow, col) for col in other_fields_k])
					else:
						group_cols.extend([None for col in other_fields])
				yield group_cols
			yield row 
		else:
			kgroup = ""
			for f in group_n:
				kgroup = kgroup + str(getattr(row, f))
			groups[kgroup][getattr(row, pivot)] = row
			pivot_values.add(getattr(row, pivot))

BabeBase.register("pivot", pivot)
Example #56
	table = kwargs['table']
	header = StreamHeader(fields=fields, table=table)
	yield header 
	prefix = "INSERT INTO `%s` VALUES " % table 
	try: 
		for line in stream: 
			if not line.startswith(prefix):
				continue
			pos = len(prefix)
			while pos < len(line):
				(elts, pos) = parse_tuple(pos, line)
				yield header.t(*elts)
				if line[pos] == ',':
					pos = pos+1
					continue
				elif line[pos] == ';':
					break
				else:
					raise Exception("ParseError pos %u " % pos)
	except TypeError, e:
		print len(elts), elts 
		raise e
	yield StreamFooter()

BabeBase.addPullPlugin("sql", ["sql"], pull)

if __name__ == "__main__": 
	for line in sys.stdin:
		print parse_tuple(0, line)

Example #57

def minmaxN(stream, column, n, max=True):
    "Keep the n rows maximizing value for 'column' for each stream"
    itt = iter(stream)
    while True:
        elt = itt.next()
        if not isinstance(elt, StreamHeader):
            raise Exception("Missing metainfo")
        yield elt
        g = Guard()
        it = itertools.takewhile(g.filter, itt)
        f = heapq.nlargest if max else heapq.nsmallest
        for elt in f(n, it, key=lambda row: getattr(row, column)):
            yield elt
        yield g.footer


def maxN(stream, column, n):
    for k in minmaxN(stream, column, n, max=True):
        yield k


def minN(stream, column, n):
    for k in minmaxN(stream, column, n, max=False):
        yield k


BabeBase.register('maxN', maxN)
BabeBase.register('minN', minN)