예제 #1
0
def head(stream, n, all_streams = False):
    """Retrieve only the first n rows of a stream.

    If all_streams is true, truncate every substream to n rows.
    Otherwise (default), keep only the first substream, truncated
    to n rows.

    Fix: the original dropped the natural StreamFooter when the first
    substream held fewer than n rows (the footer branch broke out of
    the loop before the trailing yield); the footer is now always
    emitted.  `count` is also initialised so a stream that starts
    without a header no longer raises NameError.
    """
    if not all_streams:
        count = 0
        for row in stream:
            if isinstance(row, StreamHeader):
                count = 0
            elif isinstance(row, StreamFooter):
                # Bug fix: propagate the terminating footer instead of
                # silently dropping it.
                yield row
                break
            else:
                if count >= n:
                    # Quota reached before the natural footer: close
                    # the substream with a synthetic one.
                    yield StreamFooter()
                    break
                count = count + 1
            yield row
    else:
        # Per-substream truncation: drop data rows past the n-th until
        # the next footer (which is always forwarded) resets the window.
        skip = False
        count = 0
        for row in stream:
            if isinstance(row, StreamHeader):
                count = 0
            elif isinstance(row, StreamFooter):
                skip = False
            else:
                if count >= n:
                    skip = True
                count = count + 1
            if not skip:
                yield row
예제 #2
0
파일: partition.py 프로젝트: waytai/PyBabe
def partition(stream, field):
    """Split a stream into one substream per distinct value of *field*.

    Each run of rows sharing the same value of *field* is wrapped in
    its own header/footer pair; the emitted header carries the
    partition value.  A substream with no data rows produces nothing.
    """
    pending_header = None   # header not yet emitted (no data row seen)
    current_header = None   # most recent StreamHeader from the input
    previous_value = None   # *field* value of the previous data row
    for item in stream:
        if isinstance(item, StreamHeader):
            current_header = item
            pending_header = item
        elif isinstance(item, StreamFooter):
            if pending_header is not None:
                # Empty partition: emit neither header nor footer.
                pending_header = None
            else:
                yield item
        else:
            value = getattr(item, field)
            if pending_header is not None:
                # First data row of this substream: open the partition.
                pending_header = None
                previous_value = value
                yield current_header.replace(partition=[(field, value)])
            elif value != previous_value:
                # Value changed: close current partition, open a new one.
                yield StreamFooter()
                yield current_header.replace(partition=[(field, value)])
                previous_value = value
            yield item
예제 #3
0
파일: mongo.py 프로젝트: ogirardot/PyBabe
def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """Pull documents from a MongoDB collection and yield them as rows.

    false_stream : placeholder stream argument (unused here).
    db           : database name.
    collection   : collection name; also the default row typename.
    spec         : optional query document passed to find().
    kwargs       : 'fields' (explicit field list), 'typename', plus
                   remaining options forwarded to Connection().

    Yields a StreamHeader, one row per document, then a StreamFooter.
    Nothing at all is emitted when the query matches no documents.
    """
    k = kwargs.copy()
    # 'fields' and 'typename' are stream-level options, not connection
    # parameters: strip them before connecting.
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    # NOTE(review): the stripped kwargs are also forwarded to find()
    # here — confirm they are all valid find() options.
    for doc in coll.find(spec, **k):
        if not metainfo:
            # Build the header lazily from the first document.
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(
                **dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        # Only close the stream if a header was ever emitted.
        yield StreamFooter()
예제 #4
0
파일: format_csv.py 프로젝트: waytai/PyBabe
def linepull(stream, dialect, kwargs):
    """Yield every line of *stream* as a single-field row.

    The field list comes from kwargs['fields'] when given; otherwise
    the first line of the stream is consumed and used as the field
    name.  *dialect* is accepted for interface compatibility only.
    """
    lines = iter(stream)
    field_names = kwargs.get('fields', None)
    if not field_names:
        # No explicit fields: the first line names the single column.
        field_names = [next(lines).rstrip('\r\n')]
    header = StreamHeader(**dict(kwargs, fields=field_names))
    yield header
    make_row = header.t._make
    for line in lines:
        yield make_row([line.rstrip('\r\n')])
    yield StreamFooter()
예제 #5
0
def pull(format, stream, kwargs):
    """Wrap a raw byte stream as a single-column text row stream.

    The stream is decoded with kwargs['encoding'] (default 'utf8');
    each decoded line becomes one row under the field list from
    kwargs['fields'] (default ['text']).
    """
    reader = codecs.getreader(kwargs.get('encoding', 'utf8'))(stream)
    field_names = kwargs.get('fields', ['text'])
    header = StreamHeader(**dict(kwargs, fields=field_names))
    yield header
    make_row = header.t._make
    for text_line in reader:
        yield make_row([text_line])
    yield StreamFooter()
예제 #6
0
def pull(format, stream, kwargs):
    """Pull newline-delimited JSON objects as rows.

    Each input line is parsed as a JSON object; a fresh StreamHeader
    is emitted whenever the object's key set differs from the previous
    line's, so schema changes start a new substream.

    Fix: a StreamFooter is now only emitted when a header was — the
    original yielded a footer even for a completely empty input,
    unlike the other pull functions (e.g. pull_mongo).
    """
    stream = codecs.getreader(kwargs.get('encoding', 'utf8'))(stream)

    previous_fields = None
    for line in stream:
        data = json.loads(line)
        fields = data.keys()
        if previous_fields != fields:
            # Key set changed (or first line): open a new substream.
            metainfo = StreamHeader(**dict(kwargs, fields=fields))
            previous_fields = fields
            yield metainfo
        # keys() and values() of the same dict enumerate in matching
        # order, so the row lines up with the header's fields.
        yield metainfo.t._make(data.values())
    if previous_fields is not None:
        # Only close the stream if it was ever opened.
        yield StreamFooter()
예제 #7
0
def read(format, stream, kwargs):
    """Read the first worksheet of an XLS workbook as a row stream.

    Field names come from kwargs['fields'] when given; otherwise the
    sheet's first row supplies them and data starts at row 1.
    kwargs['encoding'] overrides the workbook encoding if set.
    """
    import xlrd
    workbook = xlrd.open_workbook(
        file_contents=stream.read(),
        encoding_override=kwargs.get('encoding', None))
    sheet = workbook.sheet_by_index(0)
    field_names = kwargs.get('fields', None)
    if field_names:
        first_data_row = 0
    else:
        # No explicit fields: row 0 holds the column names.
        field_names = [cell.value for cell in sheet.row(0)]
        first_data_row = 1
    header = StreamHeader(**dict(kwargs, fields=field_names))
    yield header
    for row_index in xrange(first_data_row, sheet.nrows):
        yield header.t._make(map(valuenormalize, sheet.row(row_index)))
    yield StreamFooter()
예제 #8
0
def read(format, stream, kwargs):
    """Read the active sheet of an XLSX workbook as a row stream.

    Field names come from kwargs['fields'] when given; otherwise the
    first sheet row supplies them.  Iteration stops at the first row
    whose normalized cells are all falsy.
    """
    from openpyxl import load_workbook
    workbook = load_workbook(filename=stream, use_iterators=True)
    sheet = workbook.get_active_sheet()
    rows = sheet.iter_rows()
    field_names = kwargs.get('fields', None)
    if not field_names:
        # No explicit fields: the first row names the columns.
        field_names = [cell.internal_value for cell in next(rows)]
    header = StreamHeader(**dict(kwargs, fields=field_names))
    yield header
    for raw_row in rows:
        normalized = map(valuenormalize, raw_row)
        # A row with no truthy value marks the end of the data.
        if not any(normalized):
            break
        yield header.t._make(normalized)
    yield StreamFooter()
예제 #9
0
def pull_twitter(false_stream,
                 consumer_key=None,
                 consumer_secret=None,
                 access_token=None,
                 access_token_secret=None):
    """Pull a twitter timeline as a stream of Status rows.

    With OAuth credentials, fetches the authenticated user's timeline;
    without any, fetches the public timeline anonymously.

    Fix: a StreamFooter is now only emitted when a StreamHeader was —
    the original yielded a footer even for an empty timeline, unlike
    the other pull functions (e.g. pull_mongo).
    """
    import tweepy

    if consumer_key:
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
    else:
        # No credentials: unauthenticated API, public data only.
        api = tweepy.API()

    metainfo = None
    if consumer_key:
        statuses = api.user_timeline(include_entities=True)
    else:
        statuses = api.public_timeline(include_entities=True)
    for u in statuses:
        flatten_status(u)
        if not metainfo:
            # Build the header lazily from the first status.
            names = build_status_names(u)
            metainfo = StreamHeader(typename="Status", fields=names)
            yield metainfo
        # Make the status iterable in header-field order so it can be
        # consumed as a row.  NOTE(review): this patches the status
        # CLASS, so it affects every instance.
        u.__class__.__iter__ = lambda s: iter(
            [getattr(s, key) for key in names])
        yield u
    if metainfo:
        # Only close the stream if a header was ever emitted.
        yield StreamFooter()
예제 #10
0
파일: sql.py 프로젝트: ogirardot/PyBabe
def pull_sql(false_stream,
             query=None,
             table=None,
             host=None,
             database_kind=None,
             database=None,
             ssh_host=None,
             user=None,
             password=None,
             sql_command=None,
             **kwargs):
    """Pull rows from a SQL database by driving its command-line client.

    query         : The query to execute, if not SELECT * FROM table
    table         : The table to fetch from; also the row typename
    database      : The database to query
    host          : The host to connect to
    database_kind : key into PULL_DB selecting the client parameters
    ssh_host      : SSH to a remote connection. HOST or USER@HOST
    sql_command   : Override the connection command string prefix
    kwargs        : 'ignore_bad_lines' plus StreamHeader options

    The client process emits tab-separated CSV on stdout or, for
    databases flagged 'need_pipe', into a temporary named pipe; the
    CSV is parsed and yielded as a header, rows, then a footer.
    """

    ignore_bad_lines = kwargs.get('ignore_bad_lines', False)
    # Existing iterator go first.
    if hasattr(false_stream, 'stream') and false_stream.stream:
        for row in false_stream:
            yield row

    db_params = PULL_DB[database_kind]

    # Build the client command line: base command, field separator,
    # credentials, then the database name.
    if sql_command:
        c = sql_command
    else:
        c = db_params['command']

    if 'separator' in db_params:
        c = c + [db_params['separator'] % '\t']

    if user:
        c = c + [db_params['user'] % user]
    if password:
        c = c + [db_params['password'] % password]

    c = c + [database]

    if not query:
        query = 'SELECT * FROM %s' % table

    # Some clients cannot stream results on stdout; those write into a
    # temporary named pipe that we read back.
    if db_params.get('need_pipe', False):
        tmpfifo = TempFifo()
        readstream = tmpfifo.open_read()
    else:
        tmpfifo = None
        readstream = None

    # Render the query into the client's input template and feed it to
    # the child process on stdin.
    query_ins = Template(db_params['query_template']).substitute(
        query=query, out_filename=tmpfifo.filename if tmpfifo else None)
    p = Popen(c, stdin=PIPE, stdout=None if readstream else PIPE, stderr=None)
    p.stdin.write(query_ins)
    p.stdin.flush()
    p.stdin.close()
    dialect = sql_dialect()

    stream = readstream if readstream else p.stdout
    #if kwargs.get('utf8_cleanup', False):
    #    stream = UTF8RecoderWithCleanup(stream, kwargs.get('encoding', 'utf-8'))
    #elif codecs.getreader(kwargs.get('encoding', 'utf-8'))  != codecs.getreader('utf-8'):
    #    stream = UTF8Recoder(stream, kwargs.get('encoding', None))
    #else:
    #    pass
    reader = csv.reader(stream, dialect=dialect)
    fields = reader.next()
    ## Vectorwise specifics ...
    ## Remove the last character (space on the line)
    if database_kind == 'vectorwise':
        fields[-1] = fields[-1][:-1]
        # An "E_..." first column means the client reported an error:
        # dump its remaining output to stderr and abort.
        if fields[0].startswith("E_"):
            print >> sys.stderr, ' '.join(fields)
            for line in stream:
                print >> sys.stderr, line.rstrip()
            raise Exception("Error in SQL Command")
    metainfo = StreamHeader(**dict(kwargs, typename=table, fields=fields))

    yield metainfo
    for row in reader:
        if database_kind == 'vectorwise':
            if len(row) == 0:
                print 'Error, empty row: %s ' % row
                continue
            # Strip the trailing space vectorwise appends to each line.
            row[-1] = row[-1][:-1]
        try:
            yield metainfo.t._make([unicode(x, 'utf-8') for x in row])
        except UnicodeDecodeError:
            if ignore_bad_lines:
                # NOTE(review): 'x' here is the offending cell left over
                # from the comprehension, not a line number.
                print "Error on line ", x
            else:
                raise
    p.wait()
    if p.returncode != 0:
        raise Exception("SQL process failed with errcode %u" % p.returncode)
    yield StreamFooter()
예제 #11
0
	table = kwargs['table']
	header = StreamHeader(fields=fields, table=table)
	yield header 
	prefix = "INSERT INTO `%s` VALUES " % table 
	try: 
		for line in stream: 
			if not line.startswith(prefix):
				continue
			pos = len(prefix)
			while pos < len(line):
				(elts, pos) = parse_tuple(pos, line)
				yield header.t(*elts)
				if line[pos] == ',':
					pos = pos+1
					continue
				elif line[pos] == ';':
					break
				else:
					raise Exception("ParseError pos %u " % pos)
	except TypeError, e:
		print len(elts), elts 
		raise e
	yield StreamFooter()

# Register this module's pull() as the handler for the "sql" format.
BabeBase.addPullPlugin("sql", ["sql"], pull)

# Ad-hoc manual check: parse tuples from SQL INSERT dumps fed on stdin.
if __name__ == "__main__": 
	for line in sys.stdin:
		print parse_tuple(0, line)