Example #1
                    d[t] = float(v)
            else:
                try:
                    d[t] = parse_datetime(v)
                except ValueError:
                    try:
                        d[t] = parse_date(v)
                    except ValueError:
                        pass
        if len(d) > 0:
            return elt._replace(**d)
        else:
            return elt


BabeBase.register("typedetect", typedetect)


def primary_key_detect(stream, max=None):
    d = deque()
    it = iter(stream)
    for linecount, row in enumerate(it):
        d.append(row)
        if isinstance(row, StreamHeader):
            metainfo = row
            values = [set() for k in metainfo.fields]
            keys = set(xrange(0, len(metainfo.fields)))
        elif isinstance(row, StreamMeta):
            pass
        else:
            for idx, val in enumerate(row):
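
The try/except cascade at the top of this example falls back from numeric conversion to datetime and then date parsing until one succeeds. A self-contained sketch of the same idea for a single string value (detect_value and the date formats are illustrative, not part of babe):

import datetime

def detect_value(v):
    """Best-effort conversion of a string to int, float, datetime, or the original string."""
    for conv in (int, float):
        try:
            return conv(v)
        except ValueError:
            pass
    for fmt in ("%Y/%m/%d %H:%M", "%Y/%m/%d"):
        try:
            return datetime.datetime.strptime(v, fmt)
        except ValueError:
            pass
    return v  # fall back to the original string

print(detect_value("42"), detect_value("3.14"), detect_value("2011/04/01"))
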
Example #2
                       field="ip",
                       country_code="country_code",
                       ignore_error=False,
                       geoip_file=None):
    """"
Add a 'country_code' field from IP address in field "IP"
    """
    gic = get_gic()
    for r in stream:
        if isinstance(r, StreamHeader):
            header = r.insert(typename=None, fields=[country_code])
            yield header
        elif isinstance(r, StreamMeta):
            yield r
        else:
            ip = getattr(r, field)
            try:
                cc = gic.country_code_by_addr(ip)
            except Exception as e:
                if ignore_error:
                    cc = None
                else:
                    raise e
            yield header.t(*(r + (cc, )))


## TODO : full region parsing

BabeBase.register("geoip_country_code", geoip_country_code)
Example #3
			else:
				fields = [field for field in join_header.fields if field != join_key]
			header = row.insert(typename=None, fields=fields)
			yield header
		elif isinstance(row, StreamMeta):
			yield row
		else: 
			k = getattr(row, key)
			if k in d: 
				dd = row._asdict()
				jrow = d[k]
				for field in fields: 
					dd[field] = getattr(jrow, field)
				yield header.t(**dd)
			else: 
				if on_error == BabeBase.ON_ERROR_WARN: 
					BabeBase.log_warn("join", row, "Not matching value for key")
				elif on_error == BabeBase.ON_ERROR_FAIL:
					raise Exception("No matching value for key %s" % k)
				elif on_error == BabeBase.ON_ERROR_NONE:
					dd = row._asdict()
					for f in fields:
						dd[f] = None
					yield header.t(**dd) 
				elif on_error == BabeBase.ON_ERROR_SKIP:
					pass



BabeBase.register("join", join)
Example #4
                if output_date:
                    date = datetime.date(time_value_ext.year,
                                         time_value_ext.month,
                                         time_value_ext.day)
                    d[output_date] = date
                if output_hour:
                    d[output_hour] = time_value_ext.hour
                yield header.t(**d)
            except Exception as e:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("parse_time", row, e)
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise e
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass
                elif on_error == BabeBase.ON_ERROR_NONE:
                    d = row._asdict()
                    for k in [output_time, output_date, output_hour]:
                        if k:
                            d[k] = None
                    yield header.t(**d)


BabeBase.register("parse_time", stream_parse_datetime)

if __name__ == "__main__":
    print parse_date('2011/04/01')
    print parse_date('01 June 2009')
    print parse_datetime('2011/04/01 03:43')
    print parse_datetime('2011/04/01 3pm45')
Example #5

def sort(stream, field, reverse=False):
    buf = []
    for elt in stream:
        if isinstance(elt, StreamHeader):
            yield elt
        elif isinstance(elt, StreamFooter):
            buf.sort(key=lambda obj: getattr(obj, field), reverse=reverse)
            for row in buf:
                yield row
            yield elt
        else:
            buf.append(elt)

BabeBase.register('sort', sort)


def sort_diskbased(stream, field, nsize=100000):
    buf = []
    files = []
    count = 0
    t = None

    def iter_on_file(f):
        try:
            while True:
                (key, v) = cPickle.load(f)
                yield (key, t._make(v))
        except EOFError:
            f.close()
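
sort buffers the whole substream in memory before emitting it in order, which is why sort_diskbased exists: it spills sorted chunks to disk with cPickle and merges them back lazily. A compact sketch of that external-sort idea (external_sort is illustrative; babe's version also has to handle the stream header and footer):

import heapq
import pickle
import tempfile

def external_sort(items, chunk_size=1000):
    # Sort fixed-size chunks in memory, spill each chunk to a temporary pickle
    # file, then lazily merge the already-sorted chunks.
    chunks = []

    def iter_chunk(f):
        try:
            while True:
                yield pickle.load(f)
        except EOFError:
            f.close()

    def spill(buf):
        buf.sort()
        f = tempfile.TemporaryFile()
        for item in buf:
            pickle.dump(item, f)
        f.seek(0)
        chunks.append(iter_chunk(f))

    buf = []
    for item in items:
        buf.append(item)
        if len(buf) >= chunk_size:
            spill(buf)
            buf = []
    if buf:
        spill(buf)
    return heapq.merge(*chunks)

print(list(external_sort([5, 1, 4, 2, 3], chunk_size=2)))
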
Example #6
    doublequote = False
    escapechar = '\\'
    quoting = csv.QUOTE_MINIMAL
    quotechar = '"'


def log(stream, logfile=None):
    if not logfile:
        logstream = sys.stderr
        do_close = False
    elif isinstance(logfile, basestring):
        logstream = open(logfile, 'wb')
        do_close = True
    else:
        logstream = logfile
        do_close = False
    for row in stream:
        if isinstance(row, StreamHeader):
            writer = csv.writer(logstream, log_dialect)
            writer.writerow(row.fields)
        elif isinstance(row, StreamMeta):
            pass
        else:
            writer.writerow(list(row))
        yield row
    if do_close:
        logstream.close()


BabeBase.register("log", log)
Example #7
            if k == pk:
                reducer.row(elt)
            else:
                if pk is not None:
                    eg = reducer.end_group(metainfo.t)
                    if isinstance(eg, list):
                        for e in eg:
                            yield e
                    else:
                        yield eg
                pk = k
                reducer.begin_group(k)
                reducer.row(elt)


BabeBase.register('groupBy', group)
BabeBase.register('group', group)


def group_all(stream, reducer, typename=None, fields=None):
    """
    Group all keys
reducer can either be a function or a reducer object
if a function, reducer(t, rows) will be called with all the rows as a parameter
if an object, reducer.begin_group(), reducer.row()
 and reducer.end_group() will be called
    """
    reducer = build_reducer(reducer)
    reducer.begin_group(None)
    for elt in stream:
        if isinstance(elt, StreamHeader):
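
group calls the reducer every time the key changes, which assumes the rows arrive already grouped (typically sorted) by that key. The same pattern with itertools.groupby, stripped of the stream plumbing (group_sorted and the sample rows are illustrative):

from itertools import groupby

def group_sorted(rows, key, reduce_rows):
    # One reducer call per run of equal keys; rows must already be grouped by key.
    for k, grp in groupby(rows, key=key):
        yield reduce_rows(k, list(grp))

rows = [("a", 1), ("a", 2), ("b", 5)]
for out in group_sorted(rows, key=lambda r: r[0],
                        reduce_rows=lambda k, rs: (k, sum(v for _, v in rs))):
    print(out)
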
Example #8
class Bunch:
    def __init__(self, dictionary):
        self.__dict__ = dictionary


def iterate(stream, function, insert_fields=None, typename=None):
    metainfo = None
    for row in stream:
        if isinstance(row, StreamHeader):
            metainfo = row
            if insert_fields is not None:
                metainfo = metainfo.insert(typename=typename,
                                           fields=insert_fields)
            yield metainfo
        elif isinstance(row, StreamMeta):
            yield row
        else:
            d = row._asdict()
            #      values = tuple(row)
            if insert_fields is not None:
                for field in insert_fields:
                    d[field] = None
#      values = metainfo.t._make(values)
            result = function(Bunch(d))
            yield metainfo.t._make(d.values())


#      yield metainfo.t._make([result.__dict__[key] for key in metainfo.t._fields])

BabeBase.register("iterate", iterate)
Example #9
    """
Deduplicate a stream
If columns is specified only apply the  deduplication on the specified columns
Otherwise apply the deduplication over all values.
    """
    for row in stream:
        if isinstance(row, StreamHeader):
            metainfo = row
            if fields:
                indexes = [metainfo.fields.index(c) for c in fields]
            else:
                indexes = None
            s = set()
            yield row
        elif isinstance(row, StreamMeta):
            yield row
        else:
            if indexes:
                l = list(row)
                v = tuple([l[i] for i in indexes])
            else:
                v = row
            if v in s:
                pass
            else:
                yield row
                s.add(v)


BabeBase.register('dedup', dedup)
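
dedup keeps a set of already-seen values and drops repeats; when fields is given, only those columns form the key. The same logic without the stream wrapper (dedup_on is illustrative):

def dedup_on(rows, indexes=None):
    # Keep the first occurrence of each key; the key is the selected columns (or the whole row).
    seen = set()
    for row in rows:
        key = tuple(row[i] for i in indexes) if indexes else row
        if key not in seen:
            seen.add(key)
            yield row

rows = [("a", 1), ("a", 2), ("b", 1)]
print(list(dedup_on(rows, indexes=[0])))  # keeps one row per value of the first column
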
Example #10
def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """
	Pull objects from mongo as rows
	"""
    k = kwargs.copy()
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(
                **dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        yield StreamFooter()


BabeBase.registerFinalMethod("push_mongo", push_mongo)
BabeBase.register("pull_mongo", pull_mongo)
Example #11
	header = None
	for row in stream: 
		if isinstance(row, StreamHeader):
			if header == None:
				header = row.replace(partition=partition)
				yield header
			else:
				if not equals_types(header.t, row.t):
					raise Exception('Header types do not match')
		elif isinstance(row, StreamFooter):
			footer = row
		else:
			yield row
	yield footer

BabeBase.register('merge_substreams', merge_substreams)

def partition(stream, field): 
	"""Create substream per different value of 'column'"""
	beginning = False
	last_value = None
	header = None
	for row in stream: 
		if isinstance(row, StreamHeader): 
			beginning = True
			header = row 
		elif isinstance(row, StreamFooter): 
			if beginning == True:
				beginning = False
				continue  ## Empty partition: Emit neither header nor footer
			yield row 
Example #12
                fields = add_fields
            else:
                fields = [field for field in join_header.fields if field != join_key]
            header = row.insert(typename=None, fields=fields)
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            k = getattr(row, key)
            if k in d:
                dd = row._asdict()
                jrow = d[k]
                for field in fields:
                    dd[field] = getattr(jrow, field)
                yield header.t(**dd)
            else:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("join", row, "Not matching value for key")
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise Exception("No matching value for key %s" % k)
                elif on_error == BabeBase.ON_ERROR_NONE:
                    dd = row._asdict()
                    for f in fields:
                        dd[f] = None
                    yield header.t(**dd)
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass


BabeBase.register("join", join)
Example #13
                import_query = db_params["import_query"] % (tmpfifo.filename, table_name)
                p.stdin.write(import_query)
                p.stdin.flush()
                writestream = tmpfifo.open_write()
            elif "load_command" in db_params:
                load_command = [
                    Template(s).substitute(table=table_name, database=database) for s in db_params["load_command"]
                ]
                print load_command
                pp = Popen(load_command, stdin=PIPE, stdout=None, stderr=None)
                writestream = pp.stdin
            else:
                raise Exception("Missing load_command or import_query in db_kind spec")

            writer = UnicodeCSVWriter(writestream, dialect=sql_dialect(), encoding="utf-8")
            # writer = csv.writer(writestream, dialect=sql_dialect())
        elif isinstance(row, StreamFooter):
            if "import_query" in db_params:
                tmpfifo.close()
                p.stdin.close()
                p.wait()
            elif "load_command" in db_params:
                pp.stdin.close()
                pp.wait()
        else:
            writer.writerow(row)


BabeBase.register("pull_sql", pull_sql)
BabeBase.registerFinalMethod("push_sql", push_sql)
Example #14
                p = Popen(c, stdin=PIPE, stdout=None, stderr=None)
                tmpfifo = TempFifo()
                import_query = db_params['import_query'] % (tmpfifo.filename, table_name)
                p.stdin.write(import_query)
                p.stdin.flush()
                writestream = tmpfifo.open_write()
            elif 'load_command' in db_params:
                load_command = [Template(s).substitute(table=table_name, database=database) for s in db_params['load_command']]
                print load_command
                pp = Popen(load_command, stdin=PIPE, stdout=None, stderr=None)
                writestream = pp.stdin
            else:
                raise Exception("Missing load_command or import_query in db_kind spec")

            writer = UnicodeCSVWriter(writestream, dialect=sql_dialect(), encoding="utf-8")
            #writer = csv.writer(writestream, dialect=sql_dialect())
        elif isinstance(row, StreamFooter):
            if "import_query" in db_params:
                tmpfifo.close()
                p.stdin.close()
                p.wait()
            elif 'load_command' in db_params:
                pp.stdin.close()
                pp.wait()
        else:
            writer.writerow(row)


BabeBase.register('pull_sql', pull_sql)
BabeBase.registerFinalMethod('push_sql', push_sql)
Example #15
        page_token = response.get('pageToken', None)
        query_complete = response.get('jobComplete', False)

        if query_complete:
            if not metainfo:
                fields = [f['name'] for f in response['schema']['fields']]
                typename = kwargs.get('typename', 'BigQuery')
                metainfo = StreamHeader(**dict(kwargs, typename=typename, fields=fields))
                yield metainfo

            for row in response['rows']:
                yield metainfo.t(*[field['v'] for field in row['f']])

            if page_token is None:
                # The query is done and there are no more results
                # to read.
                yield StreamFooter()
                break

        response = bigquery.jobs().getQueryResults(
            pageToken=page_token,
            timeoutMs=timeout,
            **job_ref
        ).execute(
            num_retries=num_retries
        )


BabeBase.register('pull_bigquery', pull_bigquery)
BabeBase.registerFinalMethod('push_bigquery', push_bigquery)
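
pull_bigquery pages through results with pageToken until the service stops returning one. The generic pagination loop it relies on looks like this (iterate_pages and fetch_page are illustrative stand-ins, not the BigQuery client API):

def iterate_pages(fetch_page):
    # fetch_page(token) must return a dict with 'rows' and an optional 'pageToken'.
    token = None
    while True:
        response = fetch_page(token)
        for row in response["rows"]:
            yield row
        token = response.get("pageToken")
        if token is None:
            break

pages = {None: {"rows": [1, 2], "pageToken": "p2"}, "p2": {"rows": [3]}}
print(list(iterate_pages(lambda t: pages[t])))
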
Example #16
        if isinstance(row, StreamHeader):
            if header == None:
                header = row.replace(partition=partition)
                yield header
            else:
                if not equals_types(header.t, row.t):
                    raise Exception('Header types do not match')
        elif isinstance(row, StreamFooter):
            footer = row
        else:
            yield row
    if footer:
        yield footer


BabeBase.register('merge_substreams', merge_substreams)


def partition(stream, field):
    """Create substream per different value of 'column'"""
    beginning = False
    last_value = None
    header = None
    for row in stream:
        if isinstance(row, StreamHeader):
            beginning = True
            header = row
        elif isinstance(row, StreamFooter):
            if beginning == True:
                beginning = False
                continue  # Empty partition: Emit neither header nor footer
Example #17
def do_detect(s):
    global http_detect
    if not http_detect:
        from httpagentparser import detect
        http_detect = detect
    return http_detect(s)


def user_agent(stream, field, output_os=None, output_browser=None, output_browser_version=None):
    for row in stream:
        if isinstance(row, StreamHeader):
            header = row.insert(typename=None, fields=filter(lambda x: x is not None, [output_os, output_browser, output_browser_version]))
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            useragent = getattr(row, field)
            o = do_detect(useragent)
            d = []
            if output_os:
                # On devices such as the iPad, the OS is in "flavor" and the device name is in "dist"
                d.append(o['os']['name'] if 'os' in o else o['dist']['name'] if 'dist' in o else None)
            if output_browser:
                d.append(o['browser']['name'] if 'browser' in o else None)
            if output_browser_version:
                d.append(o['browser']['version'] if 'browser' in o and 'version' in o['browser'] else None)
            yield header.t(*(row + tuple(d)))

BabeBase.register("user_agent", user_agent)
Example #18
        api = tweepy.API(auth)
    else:
        api = tweepy.API()

    # If the authentication was successful, you should
    # see the name of the account print out
    #print api.me().name

    # If the application settings are set for "Read and Write" then
    # this line should tweet out the message to your account's
    # timeline. The "Read and Write" setting is on https://dev.twitter.com/apps
    #api.update_status('Updating using OAuth authentication via Tweepy!')
    metainfo = None
    if consumer_key:
        statuses = api.user_timeline(include_entities=True)
    else:
        statuses = api.public_timeline(include_entities=True)
    for u in statuses:
        flatten_status(u)
        if not metainfo:
            names = build_status_names(u)
            metainfo = StreamHeader(typename="Status", fields=names)
            yield metainfo
        u.__class__.__iter__ = lambda s: iter(
            [getattr(s, key) for key in names])
        yield u
    yield StreamFooter()


BabeBase.register('pull_twitter', pull_twitter)
Example #19

def minmaxN(stream, column, n, max=True):
    "Keep the n rows maximizing value for 'column' for each stream"
    itt = iter(stream)
    while True:
        elt = itt.next()
        if not isinstance(elt, StreamHeader):
            raise Exception("Missing metainfo")
        yield elt
        g = Guard()
        it = itertools.takewhile(g.filter, itt)
        f = heapq.nlargest if max else heapq.nsmallest
        for elt in f(n, it, key=lambda row: getattr(row, column)):
            yield elt
        yield g.footer


def maxN(stream, column, n):
    for k in minmaxN(stream, column, n, max=True):
        yield k


def minN(stream, column, n):
    for k in minmaxN(stream, column, n, max=False):
        yield k


BabeBase.register('maxN', maxN)
BabeBase.register('minN', minN)
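
minmaxN leans on heapq.nlargest / heapq.nsmallest, which keep only n candidates in memory instead of sorting the whole substream. A quick standalone check of that behaviour:

import heapq

rows = [{"v": i} for i in (5, 1, 9, 3, 7)]
print(heapq.nlargest(2, rows, key=lambda r: r["v"]))   # two rows with the largest v
print(heapq.nsmallest(2, rows, key=lambda r: r["v"]))  # two rows with the smallest v
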
Example #20
        for row in stream:
            if isinstance(row, StreamHeader):
                metainfo = row.augment(typename=typename, fields=[])
                yield metainfo
            elif isinstance(row, StreamMeta):
                yield row
            else:
                yield metainfo.t._make(list(function(row)))
    else:
        for row in stream:
            if isinstance(row, StreamMeta):
                yield row
            else:
                yield function(row)

BabeBase.register("mapTo", mapTo)


def bulkMapTo(stream, function, bulk_size, insert_fields=None, fields=None):
    header = None
    buf = []
    for row in stream:
        if isinstance(row, StreamHeader):
            if insert_fields:
                header = row.insert(typename=None, fields=insert_fields)
            elif fields:
                header = row.insert(typename=None, fields=fields)
            else:
                header = row
            yield header
        elif isinstance(row, StreamFooter) or len(buf) == bulk_size - 1:
Example #21
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
    else:
        api = tweepy.API()

    # If the authentication was successful, you should
    # see the name of the account print out
    #print api.me().name

    # If the application settings are set for "Read and Write" then
    # this line should tweet out the message to your account's
    # timeline. The "Read and Write" setting is on https://dev.twitter.com/apps
    #api.update_status('Updating using OAuth authentication via Tweepy!')
    metainfo = None
    if consumer_key:
        statuses = api.user_timeline(include_entities=True)
    else:
        statuses = api.public_timeline(include_entities=True)
    for u in statuses:
        flatten_status(u)
        if not metainfo:
            names = build_status_names(u)
            metainfo = StreamHeader(typename="Status", fields=names)
            yield metainfo
        u.__class__.__iter__ = lambda s: iter([getattr(s, key) for key in names])
        yield u
    yield StreamFooter()

BabeBase.register('pull_twitter', pull_twitter)
Example #22
		if isinstance(row, StreamHeader): 
			header = row
		elif isinstance(row, StreamFooter):
			# Header is: group fields + (other fields * each pivot value)
			other_fields =  [f for f in header.fields if not f in group and not f == pivot]
			other_fields_k = map(StreamHeader.keynormalize, other_fields)
			fields = group + [f + "-" + str(v) 
				for v in pivot_values.list for f in other_fields]					
			newheader = header.replace(fields=fields) 
			yield newheader
			for _, row_dict in groups.iteritems(): 
				## Create a line per group
				mrow = row_dict.itervalues().next()
				group_cols = [getattr(mrow, col) for col in group_n]
				for v in pivot_values:
					if v in row_dict:
						mrow = row_dict[v]
						group_cols.extend([getattr(mrow, col) for col in other_fields_k])
					else:
						group_cols.extend([None for col in other_fields])
				yield group_cols
			yield row 
		else:
			kgroup = ""
			for f in group_n:
				kgroup = kgroup + str(getattr(row, f))
			groups[kgroup][getattr(row, pivot)] = row
			pivot_values.add(getattr(row, pivot))

BabeBase.register("pivot", pivot)
Example #23
from collections import *

class Bunch:
  def __init__(self, dictionary):
    self.__dict__ = dictionary

def iterate(stream, function, insert_fields=None, typename=None):
  metainfo = None
  for row in stream:
    if isinstance(row, StreamHeader):
      metainfo = row
      if insert_fields is not None:
        metainfo = metainfo.insert(typename=typename, fields=insert_fields)
      yield metainfo
    elif isinstance(row, StreamMeta):
      yield row
    else:
      d = row._asdict()
#      values = tuple(row)
      if insert_fields is not None:
        for field in insert_fields:
          d[field] = None
#      values = metainfo.t._make(values)
      result = function(Bunch(d))
      yield metainfo.t._make(d.values())
#      yield metainfo.t._make([result.__dict__[key] for key in metainfo.t._fields])

BabeBase.register("iterate", iterate)


Example #24

def geoip_country_code(stream, field="ip", country_code="country_code", ignore_error=False, geoip_file = None): 
    """"
Add a 'country_code' field from IP address in field "IP"
    """
    gic = get_gic()
    for r in stream:
        if isinstance(r, StreamHeader):
            header = r.insert(typename=None, fields=[country_code])
            yield header 
        elif isinstance(r, StreamMeta):
            yield r
        else:
            ip = getattr(r, field)
            try: 
                cc = gic.country_code_by_addr(ip)
            except Exception as e:
                if ignore_error:
                    cc = None
                else:
                    raise e
            yield header.t(*(r + (cc,)))

## TODO : full region parsing

BabeBase.register("geoip_country_code", geoip_country_code)


Example #25
            return False
        else:
            return True

def minmaxN(stream, column, n, max=True):
    "Keep the n rows maximizing value for 'column' for each stream"
    itt = iter(stream)
    while True:
        elt = itt.next()
        if not isinstance(elt, StreamHeader):
            raise Exception("Missing metainfo")
        yield elt         
        g = Guard()
        it = itertools.takewhile(g.filter, itt)
        f = heapq.nlargest if max else heapq.nsmallest 
        for elt in f(n, it, key=lambda row : getattr(row, column)):
            yield elt
        yield g.footer
            
def maxN(stream, column, n):
    for k in minmaxN(stream, column, n, max=True):
        yield k

def minN(stream, column, n):
    for k in minmaxN(stream, column, n, max=False):
        yield k 
    
BabeBase.register('maxN', maxN)
BabeBase.register('minN', minN)

    
Example #26
        return {}


def user_agent(stream, field, output_os=None, output_browser=None, output_browser_version=None):
    for row in stream:
        if isinstance(row, StreamHeader):
            header = row.insert(typename=None,
                                fields=filter(lambda x: x is not None,
                                              [output_os, output_browser, output_browser_version]))
            yield header
        elif isinstance(row, StreamMeta):
            yield row
        else:
            useragent = getattr(row, field)
            o = do_detect(useragent)
            d = []
            if output_os:
                d.append(o['os']['name']
                         if 'os' in o
                         else o['dist']['name'] if 'dist' in o else None)
                pass
            if output_browser:
                d.append(o['browser']['name'] if 'browser' in o else None)
            if output_browser_version:
                d.append(o['browser']['version']
                         if 'browser' in o and 'version' in o['browser']
                         else None)
            yield header.t(*(row + tuple(d)))

BabeBase.register("user_agent", user_agent)
Example #27
        for row in stream:
            if isinstance(row, StreamHeader):
                metainfo = row.augment(typename=typename, fields=[])
                yield metainfo
            elif isinstance(row, StreamMeta):
                yield row 
            else:
                yield metainfo.t._make(list(function(row)))
    else:
        for row in stream:
            if isinstance(row, StreamMeta):
                yield row
            else: 
                yield function(row)
    
BabeBase.register("mapTo", mapTo)

def bulkMapTo(stream, function, bulk_size, insert_fields = None, fields = None): 
    header = None
    buf = []
    for row in stream: 
        if isinstance(row, StreamHeader): 
            if insert_fields: 
                header = row.insert(typename=None, fields=insert_fields)
            elif fields: 
                header = row.insert(typename=None, fields=fields)
            else:
                header = row
            yield header
        elif isinstance(row, StreamFooter) or len(buf) == bulk_size - 1:
            if not isinstance(row, StreamFooter): 
Example #28
            elif 'load_command' in db_params:
                load_command = [
                    Template(s).substitute(table=table_name, database=database)
                    for s in db_params['load_command']
                ]
                print load_command
                pp = Popen(load_command, stdin=PIPE, stdout=None, stderr=None)
                writestream = pp.stdin
            else:
                raise Exception(
                    "Missing load_command or import_query in db_kind spec")

            writer = UnicodeCSVWriter(writestream,
                                      dialect=sql_dialect(),
                                      encoding="utf-8")
            #writer = csv.writer(writestream, dialect=sql_dialect())
        elif isinstance(row, StreamFooter):
            if "import_query" in db_params:
                tmpfifo.close()
                p.stdin.close()
                p.wait()
            elif 'load_command' in db_params:
                pp.stdin.close()
                pp.wait()
        else:
            writer.writerow(row)


BabeBase.register('pull_sql', pull_sql)
BabeBase.registerFinalMethod('push_sql', push_sql)
Example #29
                    d[t] = int(v)
                elif g.group('float'):
                    d[t] = float(v)
            else:
                try: 
                    d[t] = parse_datetime(v)
                except ValueError: 
                    try: 
                        d[t] = parse_date(v)
                    except ValueError:
                        pass
        if len(d) > 0:
            return elt._replace(**d)
        else:
            return elt
BabeBase.register("typedetect", typedetect)


def primary_key_detect(stream, max=None): 
    d = deque()
    it = iter(stream)
    for linecount, row in enumerate(it):
        d.append(row)
        if isinstance(row,StreamHeader): 
            metainfo = row
            values = [set() for k in metainfo.fields]
            keys = set(xrange(0,len(metainfo.fields)))
        elif isinstance(row, StreamMeta):
            pass
        else:
            for idx, val in enumerate(row):
Example #30

def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """
    Pull objects from mongo as rows
    """
    k = kwargs.copy()
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(**dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        yield StreamFooter()

BabeBase.registerFinalMethod("push_mongo", push_mongo)
BabeBase.register("pull_mongo", pull_mongo)
Example #31

def sort(stream, field, reverse=False):
    buf = []
    for elt in stream:
        if isinstance(elt, StreamHeader):
            yield elt
        elif isinstance(elt, StreamFooter):
            buf.sort(key=lambda obj: getattr(obj, field), reverse=reverse)
            for row in buf:
                yield row
            yield elt
        else:
            buf.append(elt)

BabeBase.register('sort', sort)


def sort_diskbased(stream, field, nsize=100000):
    buf = []
    files = []
    count = 0
    t = None

    def iter_on_file(f):
        try:
            while True:
                (key, v) = cPickle.load(f)
                yield (key, t._make(v))
        except EOFError:
            f.close()
Example #32
            if k == pk:
                reducer.row(elt)
            else:
                if pk is not None:
                    eg = reducer.end_group(metainfo.t)
                    if isinstance(eg, list):
                        for e in eg:
                            yield e
                    else:
                        yield eg
                pk = k
                reducer.begin_group(k)
                reducer.row(elt)


BabeBase.register('groupBy', group)
BabeBase.register('group', group)


def group_all(stream, reducer, typename=None, fields=None):
    """
    Group all keys
reducer can either be a function or a reducer object
if a function, reducer(t, rows) will be called with all the rows as a parameter
if an object, reducer.begin_group(), reducer.row()
 and reducer.end_group() will be called
    """
    reducer = build_reducer(reducer)
    reducer.begin_group(None)
    for elt in stream:
        if isinstance(elt, StreamHeader):
Example #33
import itertools

def sort(stream, field, reverse=False):
    buf = []
    for elt in stream:
        if isinstance(elt, StreamHeader):
            yield elt
        elif isinstance(elt, StreamFooter):
            buf.sort(key=lambda obj: getattr(obj, field), reverse=reverse)
            for row in buf:
                yield row
            yield elt
        else:
            buf.append(elt)
        
BabeBase.register('sort', sort)        

def sort_diskbased(stream, field, nsize=100000):
    buf = []
    files = []
    count = 0 
    t = None
    def iter_on_file(f):
        try:
            while True:
                (key, v) = cPickle.load(f)
                yield (key, t._make(v))
        except EOFError:
            f.close()
    for elt in stream: 
        if isinstance(elt, StreamHeader):
Example #34
                if output_date:
                    date = datetime.date(time_value_ext.year,
                                         time_value_ext.month,
                                         time_value_ext.day)
                    d[output_date] = date
                if output_hour:
                    d[output_hour] = time_value_ext.hour
                yield header.t(**d)
            except Exception as e:
                if on_error == BabeBase.ON_ERROR_WARN:
                    BabeBase.log_warn("parse_time", row, e)
                elif on_error == BabeBase.ON_ERROR_FAIL:
                    raise e
                elif on_error == BabeBase.ON_ERROR_SKIP:
                    pass
                elif on_error == BabeBase.ON_ERROR_NONE:
                    d = row._asdict()
                    for k in [output_time, output_date, output_hour]:
                        if k:
                            d[k] = None
                    yield header.t(**d)

BabeBase.register("parse_time", stream_parse_datetime)


if __name__ == "__main__":
    print(parse_date('2011/04/01'))
    print(parse_date('01 June 2009'))
    print(parse_datetime('2011/04/01 03:43'))
    print(parse_datetime('2011/04/01 3pm45'))
Example #35
def sort(stream, field, reverse=False):
    buf = []
    for elt in stream:
        if isinstance(elt, StreamHeader):
            yield elt
        elif isinstance(elt, StreamFooter):
            buf.sort(key=lambda obj: getattr(obj, field), reverse=reverse)
            for row in buf:
                yield row
            yield elt
        else:
            buf.append(elt)


BabeBase.register('sort', sort)


def sort_diskbased(stream, field, nsize=100000):
    buf = []
    files = []
    count = 0
    t = None

    def iter_on_file(f):
        try:
            while True:
                (key, v) = cPickle.load(f)
                yield (key, t._make(v))
        except EOFError:
            f.close()