Example #1
def pull(format, stream, kwargs):
    """
    Read the "INSERT INTO ... VALUES" statements for a single table from a SQL dump.

    table = The name of the table to read (mandatory)
    fields = The list of fields to declare in the stream header (mandatory)
    """

    fields = kwargs['fields']
    table = kwargs['table']
    header = StreamHeader(fields=fields, table=table)
    yield header
    prefix = "INSERT INTO `%s` VALUES " % table
    try:
        for line in stream:
            if not line.startswith(prefix):
                continue
            pos = len(prefix)
            while pos < len(line):
                (elts, pos) = parse_tuple(pos, line)
                yield header.t(*elts)
                if line[pos] == ',':
                    pos = pos + 1
                    continue
                elif line[pos] == ';':
                    break
                else:
                    raise Exception("ParseError at pos %u" % pos)
    except TypeError:
        # Debug aid: report the tuple that could not be mapped to the header.
        print(len(elts), elts)
        raise
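A minimal driver for this generator might look like the following sketch. The file name, table name, and field list are placeholders, and StreamHeader is assumed to be importable from the same library as pull:

# Hypothetical usage sketch; 'dump.sql', 'users' and the field list are illustrative.
with open('dump.sql') as f:
    rows = pull(None, f, {'table': 'users', 'fields': ['id', 'name']})
    header = next(rows)   # the first yielded item is the StreamHeader
    for row in rows:      # the rest are row tuples built by header.t()
        print(row)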
Example #2
from pymongo import Connection  # legacy pymongo API; replaced by MongoClient in pymongo 3

def pull_mongo(false_stream, db, collection, spec=None, **kwargs):
    """
    Pull objects from MongoDB as rows.
    """
    k = kwargs.copy()
    if 'fields' in k:
        del k['fields']
    if 'typename' in k:
        del k['typename']
    connection = Connection(**k)
    db_ = connection[db]
    coll = db_[collection]
    metainfo = None
    for doc in coll.find(spec, **k):
        if not metainfo:
            fields = kwargs.get('fields', None)
            if not fields:
                fields = [StreamHeader.keynormalize(n) for n in doc]
                fields.sort()  # Mandatory for determinism.
            typename = kwargs.get('typename', collection)
            metainfo = StreamHeader(**dict(kwargs, typename=typename, fields=fields))
            yield metainfo
        yield metainfo.t(*[doc[field] for field in fields])
    if metainfo:
        yield StreamFooter()
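The same header/rows/footer protocol applies here; a hedged usage sketch, with the database, collection, and filter as placeholders (the connection defaults to a local MongoDB):

# Hypothetical usage sketch; 'mydb', 'users' and the filter are illustrative.
for item in pull_mongo(None, 'mydb', 'users', spec={'active': True}):
    if isinstance(item, StreamHeader):
        print(','.join(item.fields))
    elif isinstance(item, StreamFooter):
        break
    else:
        print(item)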
Example #3
def pull_bigquery(false_stream,
                  project_id,
                  query=None,
                  timeout=10000,
                  num_retries=2,
                  **kwargs):
    """
    Pull the rows of a BigQuery query as a stream.
    """
    bigquery = get_bigquery()  # assumed helper returning an authenticated BigQuery service object

    query_data = {
        'query': query,
        # A timeoutMs of 0 means the query call returns immediately, so the
        # results are always fetched via getQueryResults below.
        'timeoutMs': 0,
    }

    response = bigquery.jobs().query(
        projectId=project_id,
        body=query_data
    ).execute(
        num_retries=num_retries
    )

    metainfo = None
    job_ref = response['jobReference']

    while True:

        page_token = response.get('pageToken', None)
        query_complete = response.get('jobComplete', False)

        if query_complete:
            if not metainfo:
                fields = [f['name'] for f in response['schema']['fields']]
                typename = kwargs.get('typename', 'BigQuery')
                metainfo = StreamHeader(**dict(kwargs, typename=typename, fields=fields))
                yield metainfo

            for row in response['rows']:
                yield metainfo.t(*[field['v'] for field in row['f']])

            if page_token is None:
                # The query is done and there are no more results
                # to read.
                yield StreamFooter()
                break

        response = bigquery.jobs().getQueryResults(
            pageToken=page_token,
            timeoutMs=timeout,
            **job_ref
        ).execute(
            num_retries=num_retries
        )
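A usage sketch under the same assumptions; the project id and query string are placeholders and must be valid for the client returned by get_bigquery:

# Hypothetical usage sketch; project id and query are illustrative.
q = 'SELECT word, word_count FROM [publicdata:samples.shakespeare] LIMIT 10'
for item in pull_bigquery(None, 'my-project', query=q):
    if isinstance(item, StreamHeader):
        print(item.fields)
    elif isinstance(item, StreamFooter):
        break
    else:
        print(item)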
Example #4
def transpose(stream, typename=None):
    """
    Transpose a stream.
    For each row, the 'unique identifier'
        for this row will be used as a column name.
    city, b, c
    PARIS, foo, bas
    LONDON, coucou, salut

    field, PARIS,LONDON
    city, PARIS, LONDON
    b, foo, coucou
    c, bas, salut

    b,c
    foo,bar
    coucou,salut

    field, 1, 2
    b,foo, coucou
    c,bar,salut

    """
    for row in stream:
        if isinstance(row, StreamHeader):
            metainfo = row
            linecount = 0
            t_names = ['field']
            t_primary_key = 'field'
            t_rows = [[name] for name in metainfo.fields]
        elif isinstance(row, StreamFooter):
            t_metainfo = StreamHeader(source=metainfo.source,
                                      typename=typename,
                                      fields=t_names,
                                      primary_key=t_primary_key)
            yield t_metainfo
            for t_row in t_rows:
                if t_row[0] == metainfo.primary_key:  # Skip primary key
                    continue
                yield t_metainfo.t(*t_row)
            yield row
        else:
            linecount = linecount + 1
            c_id = metainfo.get_primary_identifier(row, linecount)
            t_names.append(c_id)
            for i, cell in enumerate(row):
                t_rows[i].append(cell)
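A self-contained sketch that mirrors the first docstring example; StreamHeader and StreamFooter are assumed to accept the keyword arguments already used in the function body above:

# Hypothetical demo of transpose(); field names and values mirror the docstring.
def demo_stream():
    header = StreamHeader(source=None, fields=['city', 'b', 'c'], primary_key='city')
    yield header
    yield header.t('PARIS', 'foo', 'bas')
    yield header.t('LONDON', 'coucou', 'salut')
    yield StreamFooter()

for item in transpose(demo_stream(), typename='transposed'):
    print(item)  # header with fields ['field', 'PARIS', 'LONDON'], then the b and c rows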