Example #1
def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """
	Pull objects from mongo as rows
	"""
    k = kwargs.copy()
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(
                **dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        yield StreamFooter()
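A minimal sketch of how a caller might drive this generator. The database and collection names are hypothetical, and StreamHeader/StreamFooter are assumed to be importable from the same module that defines pull_mongo:

rows = pull_mongo(None, 'testdb', 'cities')
for item in rows:
    if isinstance(item, StreamHeader):
        print item.fields          # column names, announced once up front
    elif isinstance(item, StreamFooter):
        print 'done'               # marks the end of the table
    else:
        print item                 # a typed row built with metainfo.t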
Example #2
def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """
    Pull objects from mongo as rows
    """
    k = kwargs.copy()
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(**dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        yield StreamFooter()
Example #3
def pull(format, stream, kwargs):
    """
    Read a SQL dump "INSERT VALUE" statements from a single table

    table = The name of the table to read (mandatory)
    fields = The sets
    """

    fields = kwargs['fields']
    table = kwargs['table']
    header = StreamHeader(fields=fields, table=table)
    yield header
    prefix = "INSERT INTO `%s` VALUES " % table
    try:
        for line in stream:
            if not line.startswith(prefix):
                continue
            pos = len(prefix)
            while pos < len(line):
                (elts, pos) = parse_tuple(pos, line)
                yield header.t(*elts)
                if line[pos] == ',':
                    pos = pos + 1
                    continue
                elif line[pos] == ';':
                    break
                else:
                    raise Exception("ParseError pos %u " % pos)
    except TypeError as e:
        # Debug aid: show how many values were parsed before the failure.
        print len(elts), elts
        raise e
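The parse_tuple helper is defined elsewhere in the module. Judging from how it is called above (it returns the parsed values together with the position just past the closing parenthesis), a simplified stand-in might look like this sketch, which ignores the quoting and escaping a real dump parser must handle:

def parse_tuple(pos, line):
    # Simplified stand-in: expects line[pos] == '(' and returns the raw
    # values plus the index of the character after the closing ')'.
    if line[pos] != '(':
        raise Exception("ParseError pos %u " % pos)
    end = line.index(')', pos)
    elts = [v.strip("'") for v in line[pos + 1:end].split(',')]
    return (elts, end + 1)

With this stand-in, a dump line such as INSERT INTO `users` VALUES (1,'alice'),(2,'bob'); yields one row per value tuple.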
Example #4
def pull(format, stream, kwargs):
    """
    Read the "INSERT INTO ... VALUES" statements of a SQL dump for a single table.

    table = The name of the table to read (mandatory)
    fields = The field names of the table (mandatory)
    """

    fields = kwargs['fields']
    table = kwargs['table']
    header = StreamHeader(fields=fields, table=table)
    yield header
    prefix = "INSERT INTO `%s` VALUES " % table
    try:
        for line in stream:
            if not line.startswith(prefix):
                continue
            pos = len(prefix)
            while pos < len(line):
                (elts, pos) = parse_tuple(pos, line)
                yield header.t(*elts)
                if line[pos] == ',':
                    pos = pos + 1
                    continue
                elif line[pos] == ';':
                    break
                else:
                    raise Exception("ParseError pos %u " % pos)
    except TypeError as e:
        # Debug aid: show how many values were parsed before the failure.
        print len(elts), elts
        raise e
Example #5
def pull_bigquery(false_stream,
                  project_id,
                  query=None,
                  timeout=10000,
                  num_retries=2,
                  **kwargs):

    bigquery = get_bigquery()

    query_data = {
        'query': query,
        # A timeout of 0 means we always fetch the results via getQueryResults.
        'timeoutMs': 0,
    }

    response = bigquery.jobs().query(
        projectId=project_id,
        body=query_data
    ).execute(
        num_retries=num_retries
    )

    metainfo = None
    job_ref = response['jobReference']

    while True:

        page_token = response.get('pageToken', None)
        query_complete = response.get('jobComplete', False)

        if query_complete:
            if not metainfo:
                fields = [f['name'] for f in response['schema']['fields']]
                typename = kwargs.get('typename', 'BigQuery')
                metainfo = StreamHeader(**dict(kwargs, typename=typename, fields=fields))
                yield metainfo

            # 'rows' may be absent when the result set is empty.
            for row in response.get('rows', []):
                yield metainfo.t(*[field['v'] for field in row['f']])

            if page_token is None:
                # The query is done and there are no more results
                # to read.
                yield StreamFooter()
                break

        response = bigquery.jobs().getQueryResults(
            pageToken=page_token,
            timeoutMs=timeout,
            **job_ref
        ).execute(
            num_retries=num_retries
        )
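For reference, the query/getQueryResults responses that the loop above consumes look roughly like the dictionary below; the field names and values are purely illustrative:

sample_response = {
    'jobComplete': True,
    'jobReference': {'projectId': 'my-project', 'jobId': 'job_abc123'},
    # 'pageToken' appears here only when more result pages remain.
    'schema': {'fields': [{'name': 'city'}, {'name': 'population'}]},
    'rows': [
        {'f': [{'v': 'PARIS'}, {'v': '2148000'}]},
        {'f': [{'v': 'LONDON'}, {'v': '8908000'}]},
    ],
}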
Example #6
def transpose(stream, typename=None):
    """
    Transpose a stream.
    For each row, the 'unique identifier'
        for this row will be used as a column name.
    city, b, c
    PARIS, foo, bas
    LONDON, coucou, salut

    field, PARIS,LONDON
    city, PARIS, LONDON
    b, foo, coucou
    c, bas, salut

    b,c
    foo,bar
    coucou,salut

    field, 1, 2
    b,foo, coucou
    c,bar,salut

    """
    for row in stream:
        if isinstance(row, StreamHeader):
            metainfo = row
            linecount = 0
            t_names = ['field']
            t_primary_key = 'field'
            t_rows = [[name] for name in metainfo.fields]
        elif isinstance(row, StreamFooter):
            t_metainfo = StreamHeader(source=metainfo.source,
                                      typename=typename,
                                      fields=t_names,
                                      primary_key=t_primary_key)
            yield t_metainfo
            for t_row in t_rows:
                if t_row[0] == metainfo.primary_key:  # Skip primary key
                    continue
                yield t_metainfo.t(*t_row)
            yield row
        else:
            linecount = linecount + 1
            c_id = metainfo.get_primary_identifier(row, linecount)
            t_names.append(c_id)
            for i, cell in enumerate(row):
                t_rows[i].append(cell)
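A small sketch of feeding transpose() a hand-built stream that matches the first docstring example. The StreamHeader keyword arguments (source, fields, primary_key) are assumed from how they are used elsewhere on this page:

header = StreamHeader(source=None, fields=['city', 'b', 'c'], primary_key='city')
stream = [header,
          header.t('PARIS', 'foo', 'bas'),
          header.t('LONDON', 'coucou', 'salut'),
          StreamFooter()]
for row in transpose(stream, typename='transposed'):
    print row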
Example #7
def transpose(stream, typename=None):
    """
    Transpose a stream.
    For each row, the 'unique identifier'
        for this row will be used as a column name.
    city, b, c
    PARIS, foo, bas
    LONDON, coucou, salut

    field, PARIS,LONDON
    city, PARIS, LONDON
    b, foo, coucou
    c, bas, salut

    b,c
    foo,bar
    coucou,salut

    field, 1, 2
    b,foo, coucou
    c,bar,salut

    """
    for row in stream:
        if isinstance(row, StreamHeader):
            metainfo = row
            linecount = 0
            t_names = ['field']
            t_primary_key = 'field'
            t_rows = [[name] for name in metainfo.fields]
        elif isinstance(row, StreamFooter):
            t_metainfo = StreamHeader(source=metainfo.source,
                                      typename=typename,
                                      fields=t_names,
                                      primary_key=t_primary_key)
            yield t_metainfo
            for t_row in t_rows:
                if t_row[0] == metainfo.primary_key:  # Skip primary key
                    continue
                yield t_metainfo.t(*t_row)
            yield row
        else:
            linecount = linecount + 1
            c_id = metainfo.get_primary_identifier(row, linecount)
            t_names.append(c_id)
            for i, cell in enumerate(row):
                t_rows[i].append(cell)
Example #8
def linepull(stream, dialect, kwargs):
    it = iter(stream)
    fields = kwargs.get('fields', None)
    if not fields:
        fields = [it.next().rstrip('\r\n')]
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for row in it:
        yield metainfo.t._make([row.rstrip('\r\n')])
    yield StreamFooter()
Example #9
def pull(format, stream, kwargs):
    stream = codecs.getreader(kwargs.get('encoding', 'utf8'))(stream)

    fields = kwargs.get('fields', ['text'])

    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo

    for line in stream:
        yield metainfo.t._make([line])
    yield StreamFooter()
Example #10
def pull(format, stream, kwargs):
    stream = codecs.getreader(kwargs.get('encoding', 'utf8'))(stream)

    previous_fields = None
    for line in stream:
        data = json.loads(line)
        fields = data.keys()
        # Emit a new header whenever the set of keys changes between records.
        if previous_fields != fields:
            metainfo = StreamHeader(**dict(kwargs, fields=fields))
            previous_fields = fields
            yield metainfo
        yield metainfo.t._make(data.values())
    yield StreamFooter()
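A minimal sketch of driving this reader with an in-memory stream. StringIO and the two records are illustrative; a fresh StreamHeader would be emitted mid-stream if a later record introduced a different set of keys:

import json
from StringIO import StringIO

lines = [json.dumps({'user': 'alice', 'action': 'login'}),
         json.dumps({'user': 'bob', 'action': 'logout'})]
for item in pull('json', StringIO('\n'.join(lines) + '\n'), {}):
    print item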
Example #11
def csvpull(stream, dialect, kwargs):
    reader = csv.reader(stream, dialect)
    fields = kwargs.get('fields', None)
    null_value = kwargs.get('null_value', "")
    ignore_malformed = kwargs.get('ignore_bad_lines', False)
    if not fields:
        fields = reader.next()
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for row in reader:
        try:
            yield metainfo.t._make([build_value(x, null_value) for x in row])
        except Exception as e:
            if ignore_malformed:
                log.warn("Malformed line: %s, %s" % (row, e))
            else:
                raise e
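build_value is a helper defined elsewhere in the module. Based on how it is called above, a simplified stand-in might map the configured null marker to None and pass every other cell through unchanged (the real helper may also do type coercion):

def build_value(x, null_value):
    # Simplified stand-in: treat the configured null marker as a missing value.
    return None if x == null_value else x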
Example #12
def read(format, stream, kwargs):
    import xlrd
    wb = xlrd.open_workbook(file_contents=stream.read(),
                            encoding_override=kwargs.get('encoding', None))
    ws = wb.sheet_by_index(0)
    nrows = ws.nrows
    fields = kwargs.get('fields', None)
    if not fields:
        # No field names given: use the first row as the header; data starts at row 1.
        b = 1
        fields = [cell.value for cell in ws.row(0)]
    else:
        b = 0
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for i in xrange(b, nrows):
        cells = ws.row(i)
        yield metainfo.t._make(map(valuenormalize, cells))
    yield StreamFooter()
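A hedged usage sketch for the xlrd-based reader above; the workbook name and the empty options dict are illustrative:

with open('report.xls', 'rb') as f:
    for item in read('xls', f, {}):
        print item    # StreamHeader, then one row per spreadsheet line, then StreamFooter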
Example #13
def read(format, stream, kwargs):
    from openpyxl import load_workbook
    wb = load_workbook(filename=stream, use_iterators=True)
    ws = wb.get_active_sheet()
    it = ws.iter_rows()
    fields = kwargs.get('fields', None)
    if not fields:
        fields = [cell.internal_value for cell in it.next()]
    metainfo = StreamHeader(**dict(kwargs, fields=fields))
    yield metainfo
    for row in it:
        # Stop at the first completely empty row.
        nrow = map(valuenormalize, row)
        if not any(nrow):
            break
        yield metainfo.t._make(nrow)
    yield StreamFooter()
Example #14
def pull_twitter(false_stream,
                 consumer_key=None,
                 consumer_secret=None,
                 access_token=None,
                 access_token_secret=None):
    import tweepy

    if consumer_key:
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
    else:
        api = tweepy.API()

    # If the authentication was successful, you should
    # see the name of the account print out
    #print api.me().name

    # If the application settings are set for "Read and Write" then
    # this line should tweet out the message to your account's
    # timeline. The "Read and Write" setting is on https://dev.twitter.com/apps
    #api.update_status('Updating using OAuth authentication via Tweepy!')
    metainfo = None
    if consumer_key:
        statuses = api.user_timeline(include_entities=True)
    else:
        statuses = api.public_timeline(include_entities=True)
    for u in statuses:
        flatten_status(u)
        if not metainfo:
            names = build_status_names(u)
            metainfo = StreamHeader(typename="Status", fields=names)
            yield metainfo
        u.__class__.__iter__ = lambda s: iter(
            [getattr(s, key) for key in names])
        yield u
    yield StreamFooter()
Example #15
def pull_sql(false_stream,
             query=None,
             table=None,
             host=None,
             database_kind=None,
             database=None,
             ssh_host=None,
             user=None,
             password=None,
             sql_command=None,
             **kwargs):
    """Pull from SQL query to the database.  
    query : The query to execute, if not SELECT * FROM table
    table : The table to fetch from
    db    : The database to query
    host  : The host to connect to
    ssh_host : SSH to a remote connection. HOST  or USER@HOST
    command : Override the connection command string prefix
    """

    ignore_bad_lines = kwargs.get('ignore_bad_lines', False)
    # Rows from an existing upstream iterator are forwarded first.
    if hasattr(false_stream, 'stream') and false_stream.stream:
        for row in false_stream:
            yield row

    db_params = PULL_DB[database_kind]

    if sql_command:
        c = sql_command
    else:
        c = db_params['command']

    if 'separator' in db_params:
        c = c + [db_params['separator'] % '\t']

    if user:
        c = c + [db_params['user'] % user]
    if password:
        c = c + [db_params['password'] % password]

    c = c + [database]

    if not query:
        query = 'SELECT * FROM %s' % table

    if db_params.get('need_pipe', False):
        tmpfifo = TempFifo()
        readstream = tmpfifo.open_read()
    else:
        tmpfifo = None
        readstream = None

    query_ins = Template(db_params['query_template']).substitute(
        query=query, out_filename=tmpfifo.filename if tmpfifo else None)
    p = Popen(c, stdin=PIPE, stdout=None if readstream else PIPE, stderr=None)
    p.stdin.write(query_ins)
    p.stdin.flush()
    p.stdin.close()
    dialect = sql_dialect()

    stream = readstream if readstream else p.stdout
    #if kwargs.get('utf8_cleanup', False):
    #    stream = UTF8RecoderWithCleanup(stream, kwargs.get('encoding', 'utf-8'))
    #elif codecs.getreader(kwargs.get('encoding', 'utf-8'))  != codecs.getreader('utf-8'):
    #    stream = UTF8Recoder(stream, kwargs.get('encoding', None))
    #else:
    #    pass
    reader = csv.reader(stream, dialect=dialect)
    fields = reader.next()
    ## Vectorwise specifics ...
    ## Remove the last character (a trailing space on the line)
    if database_kind == 'vectorwise':
        fields[-1] = fields[-1][:-1]
        if fields[0].startswith("E_"):
            print >> sys.stderr, ' '.join(fields)
            for line in stream:
                print >> sys.stderr, line.rstrip()
            raise Exception("Error in SQL Command")
    metainfo = StreamHeader(**dict(kwargs, typename=table, fields=fields))

    yield metainfo
    for row in reader:
        if database_kind == 'vectorwise':
            if len(row) == 0:
                print 'Error, empty row: %s ' % row
                continue
            row[-1] = row[-1][:-1]
        try:
            yield metainfo.t._make([unicode(x, 'utf-8') for x in row])
        except UnicodeDecodeError:
            if ignore_bad_lines:
                print "Error on line ", x
            else:
                raise
    p.wait()
    if p.returncode != 0:
        raise Exception("SQL process failed with errcode %u" % p.returncode)
    yield StreamFooter()
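PULL_DB is a module-level table of per-database-kind settings that this function reads. The entry below is purely illustrative, with a made-up database kind and client command so that no real tool's flags are implied:

PULL_DB = {
    'exampledb': {                              # hypothetical database kind
        'command': ['exampledb-client'],        # hypothetical client command to spawn
        'separator': '--field-separator=%s',    # hypothetical flag, filled with '\t'
        'user': '--user=%s',
        'password': '--password=%s',
        'query_template': '$query\n',           # $out_filename is also available
        'need_pipe': False,                     # True routes results through a temporary FIFO
    },
}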