def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """Pull documents from a MongoDB collection and yield them as rows.

    Yields a StreamHeader (fields taken from kwargs['fields'], or from
    the first document's normalized, sorted keys), then one row per
    document, then a StreamFooter.  Yields nothing at all when the
    query matches no document.

    false_stream -- ignored placeholder stream argument
    db           -- database name
    collection   -- collection name
    spec         -- optional MongoDB query document passed to find()
    kwargs       -- extra options; 'fields' and 'typename' are consumed
                    here, the rest are forwarded

    NOTE(review): the same stripped kwargs dict 'k' is passed to both
    Connection(**k) and coll.find(spec, **k); any keyword aimed at one
    of them also reaches the other -- confirm this is intended.
    """
    k = kwargs.copy()
    # 'fields' and 'typename' are consumed locally; drop them before
    # forwarding the remaining kwargs to pymongo.
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            # Header is derived lazily from the first document.
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(
                **dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        # Footer only when at least one document was produced.
        yield StreamFooter()
def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """Yield a StreamHeader, one row per mongo document, then a footer.

    Field names come from kwargs['fields'] when given, otherwise from
    the normalized, sorted keys of the first matching document.  An
    empty result set yields nothing.
    """
    forwarded = kwargs.copy()
    # Strip the options consumed here before handing the rest to pymongo.
    for consumed in ('fields', 'typename'):
        if consumed in forwarded:
            del forwarded[consumed]
    connection = Connection(**forwarded)
    coll = connection[db][collection]
    metainfo = None
    fields = None
    for doc in coll.find(spec, **forwarded):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                # Sorted for a deterministic column order.
                fields = sorted(StreamHeader.keynormalize(n) for n in doc)
            header_kwargs = dict(kwargs,
                                 typename=kwargs.get('typename', collection),
                                 fields=fields)
            metainfo = StreamHeader(**header_kwargs)
            yield metainfo
        yield metainfo.t(*(doc[name] for name in fields))
    if metainfo:
        yield StreamFooter()
def pull(format, stream, kwargs): """ Read a SQL dump "INSERT VALUE" statements from a single table table = The name of the table to read (mandatory) fields = The sets """ fields = kwargs['fields'] table = kwargs['table'] header = StreamHeader(fields=fields, table=table) yield header prefix = "INSERT INTO `%s` VALUES " % table try: for line in stream: if not line.startswith(prefix): continue pos = len(prefix) while pos < len(line): (elts, pos) = parse_tuple(pos, line) yield header.t(*elts) if line[pos] == ',': pos = pos + 1 continue elif line[pos] == ';': break else: raise Exception("ParseError pos %u " % pos) except TypeError, e: print len(elts), elts raise e
def pull(format, stream, kwargs): """ Read a SQL dump "INSERT VALUE" statements from a single table table = The name of the table to read (mandatory) fields = The sets """ fields = kwargs['fields'] table = kwargs['table'] header = StreamHeader(fields=fields, table=table) yield header prefix = "INSERT INTO `%s` VALUES " % table try: for line in stream: if not line.startswith(prefix): continue pos = len(prefix) while pos < len(line): (elts, pos) = parse_tuple(pos, line) yield header.t(*elts) if line[pos] == ',': pos = pos+1 continue elif line[pos] == ';': break else: raise Exception("ParseError pos %u " % pos) except TypeError, e: print len(elts), elts raise e
def pull_bigquery(false_stream, project_id, query=None, timeout=10000,
                  num_retries=2, **kwargs):
    """Run a BigQuery query and yield its result set as rows.

    Submits the query as a job, then pages through getQueryResults
    until jobComplete is set and no pageToken remains.

    false_stream -- ignored placeholder stream argument
    project_id   -- Google Cloud project the job runs under
    query        -- SQL text to execute
    timeout      -- timeoutMs passed to each getQueryResults call
    num_retries  -- retry count forwarded to the API client
    kwargs       -- forwarded to StreamHeader; 'typename' overrides the
                    default "BigQuery" type name

    Yields a StreamHeader (fields taken from the response schema), one
    row per result row, then a StreamFooter.
    """
    bigquery = get_bigquery()
    query_data = {
        'query': query,
        'timeoutMs': 0,  # use a timeout of 0 means we'll always need
                         # to get the results via getQueryResults
    }
    response = bigquery.jobs().query(
        projectId=project_id,
        body=query_data
    ).execute(
        num_retries=num_retries
    )
    metainfo = None
    job_ref = response['jobReference']
    while True:
        page_token = response.get('pageToken', None)
        query_complete = response.get('jobComplete', False)
        if query_complete:
            if not metainfo:
                # First completed page: build the header from the schema.
                fields = [f['name'] for f in response['schema']['fields']]
                typename = kwargs.get('typename', 'BigQuery')
                metainfo = StreamHeader(
                    **dict(kwargs, typename=typename, fields=fields))
                yield metainfo
            # NOTE(review): assumes every completed page carries a
            # 'rows' key -- a query with zero results may omit it;
            # confirm against the getQueryResults response shape.
            for row in response['rows']:
                yield metainfo.t(*[field['v'] for field in row['f']])
            if page_token is None:
                # The query is done and there are no more results
                # to read.
                yield StreamFooter()
                break
        response = bigquery.jobs().getQueryResults(
            pageToken=page_token,
            timeoutMs=timeout,
            **job_ref
        ).execute(
            num_retries=num_retries
        )
def transpose(stream, typename=None):
    """Transpose a stream: each row's primary identifier becomes a column.

    Example::

        city,   b,      c
        PARIS,  foo,    bas
        LONDON, coucou, salut

    becomes::

        field, PARIS, LONDON
        b,     foo,   coucou
        c,     bas,   salut

    The original primary-key column is dropped from the transposed
    rows, since its values now serve as the new column names.
    """
    for item in stream:
        if isinstance(item, StreamHeader):
            header = item
            row_index = 0
            out_names = ['field']
            out_key = 'field'
            # One output row per input column, seeded with its name.
            out_rows = [[field_name] for field_name in header.fields]
        elif isinstance(item, StreamFooter):
            out_header = StreamHeader(source=header.source,
                                      typename=typename,
                                      fields=out_names,
                                      primary_key=out_key)
            yield out_header
            for out_row in out_rows:
                # Skip primary key: it is already encoded in the
                # transposed column names.
                if out_row[0] != header.primary_key:
                    yield out_header.t(*out_row)
            yield item
        else:
            row_index += 1
            column_name = header.get_primary_identifier(item, row_index)
            out_names.append(column_name)
            for position, value in enumerate(item):
                out_rows[position].append(value)
def linepull(stream, dialect, kwargs):
    """Yield each line of *stream* as a one-column row.

    The single field name comes from kwargs['fields'] when given,
    otherwise from the first line of the stream.
    """
    lines = iter(stream)
    field_names = kwargs.get('fields', None)
    if not field_names:
        # No explicit field name: consume the first line as the header.
        field_names = [lines.next().rstrip('\r\n')]
    header = StreamHeader(**dict(kwargs, fields=field_names))
    yield header
    make_row = header.t._make
    for raw_line in lines:
        yield make_row([raw_line.rstrip('\r\n')])
    yield StreamFooter()
def pull(format, stream, kwargs):
    """Yield every decoded line of *stream* as a single-field row.

    The stream is wrapped with a codecs reader (kwargs['encoding'],
    default utf8); the field name defaults to 'text'.
    """
    encoding = kwargs.get('encoding', 'utf8')
    reader = codecs.getreader(encoding)(stream)
    header = StreamHeader(
        **dict(kwargs, fields=kwargs.get('fields', ['text'])))
    yield header
    build = header.t._make
    for text_line in reader:
        yield build([text_line])
    yield StreamFooter()
def pull(format, stream, kwargs):
    """Yield one row per JSON object read line-by-line from *stream*.

    A fresh StreamHeader is emitted whenever the key list of the
    current JSON object differs from the previous line's keys.
    """
    decoded = codecs.getreader(kwargs.get('encoding', 'utf8'))(stream)
    last_fields = None
    header = None
    for raw_line in decoded:
        record = json.loads(raw_line)
        current_fields = record.keys()
        if last_fields != current_fields:
            header = StreamHeader(**dict(kwargs, fields=current_fields))
            last_fields = current_fields
            yield header
        yield header.t._make(record.values())
    yield StreamFooter()
def csvpull(stream, dialect, kwargs): reader = csv.reader(stream, dialect) fields = kwargs.get('fields', None) null_value = kwargs.get('null_value', "") ignore_malformed = kwargs.get('ignore_bad_lines', False) if not fields: fields = reader.next() metainfo = StreamHeader(**dict(kwargs, fields=fields)) yield metainfo for row in reader: try: yield metainfo.t._make([build_value(x, null_value) for x in row]) except Exception, e: if ignore_malformed: log.warn("Malformed line: %s, %s" % (row, e)) else: raise e
def read(format, stream, kwargs):
    """Yield rows from the first sheet of an .xls workbook (xlrd).

    Field names come from kwargs['fields'] when given; otherwise the
    first spreadsheet row is consumed as the header row.
    """
    import xlrd
    workbook = xlrd.open_workbook(
        file_contents=stream.read(),
        encoding_override=kwargs.get('encoding', None))
    sheet = workbook.sheet_by_index(0)
    fields = kwargs.get('fields', None)
    if fields:
        first_data_row = 0
    else:
        # Header row comes from the sheet itself, so data starts at 1.
        first_data_row = 1
        fields = [cell.value for cell in sheet.row(0)]
    header = StreamHeader(**dict(kwargs, fields=fields))
    yield header
    for row_index in xrange(first_data_row, sheet.nrows):
        yield header.t._make(map(valuenormalize, sheet.row(row_index)))
    yield StreamFooter()
def read(format, stream, kwargs):
    """Yield rows from the active sheet of an .xlsx workbook (openpyxl).

    Stops at the first fully-empty row.  Field names come from
    kwargs['fields'] or, failing that, from the first sheet row.
    """
    from openpyxl import load_workbook
    workbook = load_workbook(filename=stream, use_iterators=True)
    rows = workbook.get_active_sheet().iter_rows()
    field_names = kwargs.get('fields', None)
    if not field_names:
        field_names = [cell.internal_value for cell in rows.next()]
    header = StreamHeader(**dict(kwargs, fields=field_names))
    yield header
    for raw_row in rows:
        normalized = map(valuenormalize, raw_row)
        # A row with no truthy cell marks the end of the data.
        if not any(normalized):
            break
        yield header.t._make(normalized)
    yield StreamFooter()
def pull_twitter(false_stream, consumer_key=None, consumer_secret=None,
                 access_token=None, access_token_secret=None):
    """Pull statuses from Twitter via tweepy and yield them as rows.

    With a consumer_key the client authenticates via OAuth and reads
    the user's own timeline; without one it reads the public timeline.

    Yields a StreamHeader (typename "Status", fields derived from the
    first flattened status), then each status object, then a
    StreamFooter.  An empty timeline yields only the footer.
    """
    import tweepy
    if consumer_key:
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
    else:
        # Unauthenticated client: only public data is reachable.
        api = tweepy.API()
    # If the authentication was successful, you should
    # see the name of the account print out
    #print api.me().name
    # If the application settings are set for "Read and Write" then
    # this line should tweet out the message to your account's
    # timeline. The "Read and Write" setting is on https://dev.twitter.com/apps
    #api.update_status('Updating using OAuth authentication via Tweepy!')
    metainfo = None
    if consumer_key:
        statuses = api.user_timeline(include_entities=True)
    else:
        statuses = api.public_timeline(include_entities=True)
    for u in statuses:
        flatten_status(u)
        if not metainfo:
            # Field names are derived from the first status only; later
            # statuses are assumed to expose the same attributes.
            names = build_status_names(u)
            metainfo = StreamHeader(typename="Status", fields=names)
            yield metainfo
        # NOTE(review): this monkey-patches __iter__ on the status
        # CLASS (shared by all instances) so each status can be
        # consumed like a row tuple -- confirm no other code relies on
        # the class's original iteration behavior.
        u.__class__.__iter__ = lambda s: iter(
            [getattr(s, key) for key in names])
        yield u
    yield StreamFooter()
def pull_sql(false_stream, query=None, table=None, host=None,
             database_kind=None, database=None, ssh_host=None, user=None,
             password=None, sql_command=None, **kwargs):
    """Pull rows from a SQL database by driving its command-line client.

    query         : The query to execute, if not SELECT * FROM table
    table         : The table to fetch from
    database      : The database to query
    host          : The host to connect to
    ssh_host      : SSH to a remote connection. HOST or USER@HOST
    sql_command   : Override the connection command string prefix
    database_kind : Key into PULL_DB selecting the client's parameters

    The client's CSV output is parsed and yielded as a StreamHeader,
    one row per record, then a StreamFooter.  Raises Exception when
    the client reports an error or exits non-zero.

    NOTE(review): 'host' and 'ssh_host' are accepted but never used in
    this body -- confirm whether remote connections are expected to be
    encoded in sql_command instead.
    """
    ignore_bad_lines = kwargs.get('ignore_bad_lines', False)
    # An already-open upstream iterator takes precedence: drain it first.
    if hasattr(false_stream, 'stream') and false_stream.stream:
        for row in false_stream:
            yield row
    db_params = PULL_DB[database_kind]
    # Build the client command line: base command, then optional
    # separator/user/password flags, then the database name.
    if sql_command:
        c = sql_command
    else:
        c = db_params['command']
    if 'separator' in db_params:
        c = c + [db_params['separator'] % '\t']
    if user:
        c = c + [db_params['user'] % user]
    if password:
        c = c + [db_params['password'] % password]
    c = c + [database]
    if not query:
        query = 'SELECT * FROM %s' % table
    if db_params.get('need_pipe', False):
        # Some clients can only write results to a file: hand them a
        # named pipe and read the results back from it.
        tmpfifo = TempFifo()
        readstream = tmpfifo.open_read()
    else:
        tmpfifo = None
        readstream = None
    query_ins = Template(db_params['query_template']).substitute(
        query=query, out_filename=tmpfifo.filename if tmpfifo else None)
    # The query text is fed to the client on stdin; results come back
    # either on stdout or through the fifo.
    p = Popen(c, stdin=PIPE, stdout=None if readstream else PIPE,
              stderr=None)
    p.stdin.write(query_ins)
    p.stdin.flush()
    p.stdin.close()
    dialect = sql_dialect()
    stream = readstream if readstream else p.stdout
    #if kwargs.get('utf8_cleanup', False):
    #    stream = UTF8RecoderWithCleanup(stream, kwargs.get('encoding', 'utf-8'))
    #elif codecs.getreader(kwargs.get('encoding', 'utf-8')) != codecs.getreader('utf-8'):
    #    stream = UTF8Recoder(stream, kwargs.get('encoding', None))
    #else:
    #    pass
    reader = csv.reader(stream, dialect=dialect)
    fields = reader.next()
    ## Vectorwise specifics ...
    ## Remove the last character (trailing space) from the header.
    if database_kind == 'vectorwise':
        fields[-1] = fields[-1][:-1]
    if fields[0].startswith("E_"):
        # NOTE(review): presumably an "E_"-prefixed first column marks
        # client error output -- dump the rest of the stream to stderr
        # and abort; confirm this convention for all database kinds.
        print >> sys.stderr, ' '.join(fields)
        for line in stream:
            print >> sys.stderr, line.rstrip()
        raise Exception("Error in SQL Command")
    metainfo = StreamHeader(**dict(kwargs, typename=table, fields=fields))
    yield metainfo
    for row in reader:
        if database_kind == 'vectorwise':
            if len(row) == 0:
                print 'Error, empty row: %s ' % row
                continue
            # Strip the trailing pad character, as for the header.
            row[-1] = row[-1][:-1]
        try:
            yield metainfo.t._make([unicode(x, 'utf-8') for x in row])
        except UnicodeDecodeError:
            if ignore_bad_lines:
                print "Error on line ", x
            else:
                raise
    p.wait()
    if p.returncode != 0:
        raise Exception("SQL process failed with errcode %u" % p.returncode)
    yield StreamFooter()