def pull(format, stream, kwargs):
    """
    Read SQL dump "INSERT INTO ... VALUES" statements from a single table.

        table  = The name of the table to read (mandatory)
        fields = The list of field names (mandatory)
    """
    fields = kwargs['fields']
    table = kwargs['table']
    header = StreamHeader(fields=fields, table=table)
    yield header
    prefix = "INSERT INTO `%s` VALUES " % table
    try:
        for line in stream:
            if not line.startswith(prefix):
                continue
            pos = len(prefix)
            while pos < len(line):
                (elts, pos) = parse_tuple(pos, line)
                yield header.t(*elts)
                if line[pos] == ',':
                    pos = pos + 1
                    continue
                elif line[pos] == ';':
                    break
                else:
                    raise Exception("ParseError pos %u" % pos)
    except TypeError:
        print(len(elts), elts)
        raise
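# pull() relies on a parse_tuple helper defined elsewhere in the codebase.
# The sketch below is a hypothetical minimal version, assuming it consumes
# one parenthesized value tuple starting at line[pos] and returns
# (values, position just past the closing parenthesis), with single-quoted
# strings using backslash escapes (mysqldump's default).
def parse_tuple(pos, line):
    assert line[pos] == '('            # every VALUES tuple opens with '('
    pos += 1
    elts, buf, in_string = [], [], False
    while True:
        c = line[pos]
        if in_string:
            if c == '\\':              # keep the escaped character verbatim
                buf.append(line[pos + 1])
                pos += 2
                continue
            elif c == "'":
                in_string = False      # closing quote
            else:
                buf.append(c)
        elif c == "'":
            in_string = True           # opening quote
        elif c in ',)':
            elts.append(''.join(buf))  # value boundary
            buf = []
            if c == ')':
                return elts, pos + 1   # caller checks for ',' or ';' next
        else:
            buf.append(c)
        pos += 1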
def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """Pull objects from mongo as rows."""
    k = kwargs.copy()
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)  # pymongo (pre-3.0 API): from pymongo import Connection
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(**dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        yield StreamFooter()
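# A hedged consumer sketch for the stream protocol the pull_* functions
# share: a StreamHeader first, then one named tuple per document, then a
# StreamFooter. The 'demo_db' and 'events' names are placeholders.
for item in pull_mongo(None, 'demo_db', 'events'):
    if isinstance(item, StreamHeader):
        print(item.fields)             # normalized, sorted field names
    elif isinstance(item, StreamFooter):
        break                          # end of stream
    else:
        print(item)                    # one row per mongo document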
def pull_bigquery(false_stream, project_id, query=None, timeout=10000,
                  num_retries=2, **kwargs):
    """Pull the rows of a BigQuery query result as a stream."""
    bigquery = get_bigquery()
    query_data = {
        'query': query,
        'timeoutMs': 0,  # a timeout of 0 means we will always fetch
                         # the results via getQueryResults
    }
    response = bigquery.jobs().query(
        projectId=project_id,
        body=query_data
    ).execute(num_retries=num_retries)
    metainfo = None
    job_ref = response['jobReference']
    while True:
        page_token = response.get('pageToken', None)
        query_complete = response.get('jobComplete', False)
        if query_complete:
            if not metainfo:
                fields = [f['name'] for f in response['schema']['fields']]
                typename = kwargs.get('typename', 'BigQuery')
                metainfo = StreamHeader(**dict(kwargs, typename=typename,
                                               fields=fields))
                yield metainfo
            for row in response['rows']:
                yield metainfo.t(*[field['v'] for field in row['f']])
            if page_token is None:
                # The query is done and there are no more results to read.
                yield StreamFooter()
                break
        response = bigquery.jobs().getQueryResults(
            pageToken=page_token,
            timeoutMs=timeout,
            **job_ref
        ).execute(num_retries=num_retries)
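# A hedged usage sketch: the project id below is a placeholder, and
# get_bigquery() is assumed (as in the function above) to return an
# authenticated googleapiclient service object for the BigQuery v2 API.
for item in pull_bigquery(None, 'my-project-id', query='SELECT 1 AS x'):
    if isinstance(item, StreamHeader):
        print(item.fields)             # column names from the result schema
    elif not isinstance(item, StreamFooter):
        print(item)                    # one named tuple per result row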
def transpose(stream, typename=None):
    """
    Transpose a stream. For each row, the 'unique identifier' for this
    row will be used as a column name:

        city,   b,      c
        PARIS,  foo,    bas
        LONDON, coucou, salut

    becomes

        field, PARIS, LONDON
        b,     foo,   coucou
        c,     bas,   salut

    (the transposed primary-key row 'city, PARIS, LONDON' is skipped).
    Without a primary key, line numbers are used as column names:

        b,      c
        foo,    bar
        coucou, salut

    becomes

        field, 1,   2
        b,     foo, coucou
        c,     bar, salut
    """
    for row in stream:
        if isinstance(row, StreamHeader):
            metainfo = row
            linecount = 0
            t_names = ['field']
            t_primary_key = 'field'
            t_rows = [[name] for name in metainfo.fields]
        elif isinstance(row, StreamFooter):
            t_metainfo = StreamHeader(source=metainfo.source,
                                      typename=typename,
                                      fields=t_names,
                                      primary_key=t_primary_key)
            yield t_metainfo
            for t_row in t_rows:
                if t_row[0] == metainfo.primary_key:  # Skip primary key
                    continue
                yield t_metainfo.t(*t_row)
            yield row
        else:
            linecount = linecount + 1
            c_id = metainfo.get_primary_identifier(row, linecount)
            t_names.append(c_id)
            for i, cell in enumerate(row):
                t_rows[i].append(cell)
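# A hedged end-to-end sketch that transposes the docstring's first
# example; the StreamHeader/StreamFooter constructor calls mirror the
# ones used above, with 'city' as the primary key.
def city_stream():
    h = StreamHeader(typename='city', fields=['city', 'b', 'c'],
                     primary_key='city')
    yield h
    yield h.t('PARIS', 'foo', 'bas')
    yield h.t('LONDON', 'coucou', 'salut')
    yield StreamFooter()

for item in transpose(city_stream(), typename='transposed'):
    print(item)
# Expected data rows: ('b', 'foo', 'coucou') and ('c', 'bas', 'salut')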