def head(stream, n, all_streams=False):
    """Retrieve only the first n lines.

    If all_streams is true, apply head on each substream.
    Otherwise (default), only keep the first substream.
    """
    if not all_streams:
        for row in stream:
            if isinstance(row, StreamHeader):
                count = 0
            elif isinstance(row, StreamFooter):
                # Forward the footer so the substream stays well-formed.
                yield row
                break
            else:
                if count >= n:
                    yield StreamFooter()
                    break
                count = count + 1
            yield row
    else:
        skip = False
        for row in stream:
            if isinstance(row, StreamHeader):
                count = 0
            elif isinstance(row, StreamFooter):
                skip = False
            else:
                if count >= n:
                    skip = True
                count = count + 1
            if not skip:
                yield row
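# Illustrative usage sketch (not part of the library): build a tiny in-memory
# stream and keep only its first two rows. Field names and values are invented;
# it assumes StreamHeader exposes a row factory as '.t', as used in this module.
def _head_example():
    header = StreamHeader(fields=["name", "score"])

    def toy_stream():
        yield header
        for name, score in [("a", 1), ("b", 2), ("c", 3)]:
            yield header.t(name, score)
        yield StreamFooter()

    for row in head(toy_stream(), 2):
        print row  # header, then rows ("a", 1) and ("b", 2), then a footer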
def partition(stream, field):
    """Create one substream per distinct value of 'field'."""
    beginning = False
    last_value = None
    header = None
    for row in stream:
        if isinstance(row, StreamHeader):
            beginning = True
            header = row
        elif isinstance(row, StreamFooter):
            if beginning:
                beginning = False
                continue  # Empty partition: emit neither header nor footer
            yield row
        else:
            v = getattr(row, field)
            if beginning:
                beginning = False
                last_value = v
                yield header.replace(partition=[(field, v)])
            elif v != last_value:
                yield StreamFooter()
                yield header.replace(partition=[(field, v)])
                last_value = v
            yield row
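# Illustrative sketch: split an already-grouped stream into one substream per
# value of 'city'. The stream content is made up for the example.
def _partition_example():
    header = StreamHeader(fields=["city", "amount"])

    def grouped_stream():
        yield header
        for city, amount in [("Paris", 10), ("Paris", 4), ("Lyon", 7)]:
            yield header.t(city, amount)
        yield StreamFooter()

    for row in partition(grouped_stream(), "city"):
        # Expected: a header/rows/footer block for Paris, then one for Lyon.
        print row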
def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """Pull objects from mongo as rows."""
    k = kwargs.copy()
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(
                **dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        yield StreamFooter()
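# Hypothetical invocation (database, collection and filter are invented):
# stream every matching document of a local MongoDB instance as rows.
def _pull_mongo_example():
    for row in pull_mongo(None, db="analytics", collection="events",
                          spec={"type": "click"}):
        print row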
def linepull(stream, dialect, kwargs):
    it = iter(stream)
    fields = kwargs.get('fields', None)
    if not fields:
        fields = [it.next().rstrip('\r\n')]
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for row in it:
        yield metainfo.t._make([row.rstrip('\r\n')])
    yield StreamFooter()
def pull(format, stream, kwargs):
    stream = codecs.getreader(kwargs.get('encoding', 'utf8'))(stream)
    fields = kwargs.get('fields', ['text'])
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for line in stream:
        yield metainfo.t._make([line])
    yield StreamFooter()
def pull(format, stream, kwargs):
    stream = codecs.getreader(kwargs.get('encoding', 'utf8'))(stream)
    previous_fields = None
    for line in stream:
        data = json.loads(line)
        fields = data.keys()
        if previous_fields != fields:
            # Emit a new header whenever the set of keys changes.
            metainfo = StreamHeader(**dict(kwargs, fields=fields))
            previous_fields = fields
            yield metainfo
        yield metainfo.t._make(data.values())
    yield StreamFooter()
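# Sketch of the input the JSON pull above expects: one JSON object per line.
# The payload below is invented for illustration.
def _json_pull_example():
    import StringIO
    payload = '{"user": "a", "clicks": 1}\n{"user": "b", "clicks": 2}\n'
    for row in pull('json', StringIO.StringIO(payload), {}):
        print row  # a StreamHeader, two rows, then a StreamFooter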
def read(format, stream, kwargs):
    import xlrd
    wb = xlrd.open_workbook(file_contents=stream.read(),
                            encoding_override=kwargs.get('encoding', None))
    ws = wb.sheet_by_index(0)
    nrows = ws.nrows
    fields = kwargs.get('fields', None)
    if not fields:
        b = 1  # Field names come from the first row; data starts at row 1.
        fields = [cell.value for cell in ws.row(0)]
    else:
        b = 0
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for i in xrange(b, nrows):
        cells = ws.row(i)
        yield metainfo.t._make(map(valuenormalize, cells))
    yield StreamFooter()
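# Hypothetical usage ('scores.xls' is a made-up file name): stream the first
# worksheet of an .xls file, taking field names from its first row.
def _xls_read_example():
    with open('scores.xls', 'rb') as f:
        for row in read('xls', f, {'typename': 'scores'}):
            print row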
def read(format, stream, kwargs):
    from openpyxl import load_workbook
    wb = load_workbook(filename=stream, use_iterators=True)
    ws = wb.get_active_sheet()
    it = ws.iter_rows()
    fields = kwargs.get('fields', None)
    if not fields:
        fields = [cell.internal_value for cell in it.next()]
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for row in it:
        # Stop at the first fully empty row.
        nrow = map(valuenormalize, row)
        if not any(nrow):
            break
        yield metainfo.t._make(nrow)
    yield StreamFooter()
def pull_twitter(false_stream, consumer_key=None, consumer_secret=None,
                 access_token=None, access_token_secret=None):
    import tweepy
    if consumer_key:
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
    else:
        api = tweepy.API()
    # If the authentication was successful, you should see the name of the
    # account printed out:
    # print api.me().name
    # If the application settings are set for "Read and Write" then this line
    # should tweet out the message to your account's timeline. The
    # "Read and Write" setting is on https://dev.twitter.com/apps
    # api.update_status('Updating using OAuth authentication via Tweepy!')
    metainfo = None
    if consumer_key:
        statuses = api.user_timeline(include_entities=True)
    else:
        statuses = api.public_timeline(include_entities=True)
    for u in statuses:
        flatten_status(u)
        if not metainfo:
            names = build_status_names(u)
            metainfo = StreamHeader(typename="Status", fields=names)
            yield metainfo
            u.__class__.__iter__ = lambda s: iter(
                [getattr(s, key) for key in names])
        yield u
    yield StreamFooter()
def pull_sql(false_stream, query=None, table=None, host=None,
             database_kind=None, database=None, ssh_host=None, user=None,
             password=None, sql_command=None, **kwargs):
    """Pull rows from a SQL query against the database.

    query       : The query to execute, if not SELECT * FROM table
    table       : The table to fetch from
    database    : The database to query
    host        : The host to connect to
    ssh_host    : SSH to a remote connection. HOST or USER@HOST
    sql_command : Override the connection command string prefix
    """
    ignore_bad_lines = kwargs.get('ignore_bad_lines', False)
    # Existing iterator goes first.
    if hasattr(false_stream, 'stream') and false_stream.stream:
        for row in false_stream:
            yield row
    db_params = PULL_DB[database_kind]
    if sql_command:
        c = sql_command
    else:
        c = db_params['command']
    if 'separator' in db_params:
        c = c + [db_params['separator'] % '\t']
    if user:
        c = c + [db_params['user'] % user]
    if password:
        c = c + [db_params['password'] % password]
    c = c + [database]
    if not query:
        query = 'SELECT * FROM %s' % table
    if db_params.get('need_pipe', False):
        tmpfifo = TempFifo()
        readstream = tmpfifo.open_read()
    else:
        tmpfifo = None
        readstream = None
    query_ins = Template(db_params['query_template']).substitute(
        query=query,
        out_filename=tmpfifo.filename if tmpfifo else None)
    p = Popen(c, stdin=PIPE, stdout=None if readstream else PIPE, stderr=None)
    p.stdin.write(query_ins)
    p.stdin.flush()
    p.stdin.close()
    dialect = sql_dialect()
    stream = readstream if readstream else p.stdout
    # if kwargs.get('utf8_cleanup', False):
    #     stream = UTF8RecoderWithCleanup(stream, kwargs.get('encoding', 'utf-8'))
    # elif codecs.getreader(kwargs.get('encoding', 'utf-8')) != codecs.getreader('utf-8'):
    #     stream = UTF8Recoder(stream, kwargs.get('encoding', None))
    # else:
    #     pass
    reader = csv.reader(stream, dialect=dialect)
    fields = reader.next()
    # Vectorwise specifics: remove the trailing character (space) on the last field.
    if database_kind == 'vectorwise':
        fields[-1] = fields[-1][:-1]
    if fields[0].startswith("E_"):
        print >> sys.stderr, ' '.join(fields)
        for line in stream:
            print >> sys.stderr, line.rstrip()
        raise Exception("Error in SQL Command")
    metainfo = StreamHeader(**dict(kwargs, typename=table, fields=fields))
    yield metainfo
    for row in reader:
        if database_kind == 'vectorwise':
            if len(row) == 0:
                print >> sys.stderr, 'Error, empty row: %s' % row
                continue
            row[-1] = row[-1][:-1]
        try:
            yield metainfo.t._make([unicode(x, 'utf-8') for x in row])
        except UnicodeDecodeError:
            if ignore_bad_lines:
                print >> sys.stderr, "Error on line", row
            else:
                raise
    p.wait()
    if p.returncode != 0:
        raise Exception("SQL process failed with errcode %u" % p.returncode)
    yield StreamFooter()
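# Illustrative guess at the shape of a PULL_DB entry consumed by pull_sql above;
# the keys mirror the lookups in the code, but the concrete command line and
# templates are assumptions, not the library's actual configuration. The
# optional 'separator' key (formatted with '\t') is omitted here.
EXAMPLE_PULL_DB_ENTRY = {
    'command': ['mysql', '--batch'],   # argv prefix for the client process
    'user': '--user=%s',               # formatted with the user name
    'password': '--password=%s',       # formatted with the password
    'query_template': '$query;\n',     # Template using $query and $out_filename
    'need_pipe': False,                # True when results go through a FIFO
}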
def pull(format, stream, kwargs):
    # 'fields' is derived earlier in the original module (not shown here).
    table = kwargs['table']
    header = StreamHeader(fields=fields, table=table)
    yield header
    prefix = "INSERT INTO `%s` VALUES " % table
    try:
        for line in stream:
            if not line.startswith(prefix):
                continue
            pos = len(prefix)
            while pos < len(line):
                (elts, pos) = parse_tuple(pos, line)
                yield header.t(*elts)
                if line[pos] == ',':
                    pos = pos + 1
                    continue
                elif line[pos] == ';':
                    break
                else:
                    raise Exception("ParseError pos %u" % pos)
    except TypeError, e:
        print len(elts), elts
        raise e
    yield StreamFooter()


BabeBase.addPullPlugin("sql", ["sql"], pull)

if __name__ == "__main__":
    for line in sys.stdin:
        print parse_tuple(0, line)
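# Illustrative sketch of the input consumed above (the dump line is invented):
# parse_tuple(start, line) is expected to return the values of the tuple that
# starts at 'start' and the index just past its closing parenthesis, e.g.
#
#   line = "INSERT INTO `events` VALUES (1,'click'),(2,'view');\n"
#   elts, pos = parse_tuple(len("INSERT INTO `events` VALUES "), line)
#   # elts -> values of (1,'click'); line[pos] -> ',' so another tuple follows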