Example #1
def dt_move_large(config, task, dt_file, dt_partition, jobs):
  if config.verbose:
    print('DT TO TABLE LARGE', dt_partition)

  delimiter = '\n'
  disposition = 'WRITE_TRUNCATE'

  # decompression handler for gzip ( must be outside of chunks as it keeps track of stream across multiple calls )
  gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)

  # sliding view of data flowing out of decompression, used to buffer and delimit rows
  first_row = True
  view = ''

  # loop all chunks of file, decompress, and find row delimiter
  for data_gz in object_get_chunks(config, task['auth'],
                                   '%s:%s' % (task['bucket'], dt_file)):

    view += gz_handler.decompress(data_gz).decode('utf-8')

    if first_row:
      end = view.find(delimiter)
      schema = dt_schema(view[:end].split(','))
      view = view[(end + 1):]
      first_row = False

    end = view.rfind(delimiter)

    jobs.append(
        io_to_table(config, task['auth'], config.project,
                    task['to']['dataset'], dt_partition,
                    BytesIO(view[:end].encode()), 'CSV', schema, 0, disposition,
                    False))
    disposition = 'WRITE_APPEND'
    view = view[min(end + 1, len(view)):]
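
The pattern above ( one zlib decompressobj shared across all chunks, plus a string buffer cut at the last newline ) can be tried without any cloud dependencies. A minimal sketch, with the chunks simulated locally instead of coming from object_get_chunks, and all sample data made up:

import gzip
import zlib

# made-up CSV payload, compressed and split into chunks to stand in for
# what object_get_chunks would yield
rows = ''.join('id_%d,value_%d\n' % (i, i * 10) for i in range(1000))
blob = gzip.compress(rows.encode('utf-8'))
chunks = [blob[i:i + 4096] for i in range(0, len(blob), 4096)]

# one decompressobj survives across every chunk of the same file
gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)
view = ''
complete_rows = []

for data_gz in chunks:
  view += gz_handler.decompress(data_gz).decode('utf-8')
  end = view.rfind('\n')
  if end > -1:
    complete_rows.extend(view[:end].split('\n'))
    view = view[end + 1:]

print(len(complete_rows))  # 1000, every row comes out whole despite chunk splits
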
Example #2
def dt_header(dt_file):
  if project.verbose: print("DT HEADER")

  # build the bucket:file path for the dt file
  path = '%s:%s' % (project.task['bucket'], dt_file)

  # find first line of file ( gzip will decompress partial, and pull header out )
  sample_data = next(object_get_chunks(project.task['auth'], path, HEADER_SIZE))
  with gzip.GzipFile(fileobj=BytesIO(sample_data), mode='rb') as fo: 
    sample_header = fo.read(HEADER_SIZE).decode('utf-8').split('\n')[0]

  return sample_header.split(',')
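
The same header trick works on any gzip prefix. A minimal local sketch, where HEADER_SIZE and the sample payload are assumptions rather than the module's real values:

import gzip
from io import BytesIO

HEADER_SIZE = 1024  # assumption, the real constant lives elsewhere in the module
payload = 'col_a,col_b,col_c\n' + 'x,y,z\n' * 10000
blob = gzip.compress(payload.encode('utf-8'))

# GzipFile decompresses happily from just a prefix of the object, which is
# enough to recover the header row without fetching the whole file
sample_data = blob[:HEADER_SIZE]
with gzip.GzipFile(fileobj=BytesIO(sample_data), mode='rb') as fo:
  sample_header = fo.read(HEADER_SIZE).decode('utf-8').split('\n')[0]

print(sample_header.split(','))  # ['col_a', 'col_b', 'col_c']
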
Example #3
def report_file(auth, report_id=None, name=None, timeout=60, chunksize=None):
    """ Retrieves most recent DBM file by name or ID, if in progress, waits for it to complete.

  Bulletproofing: https://developers.google.com/bid-manager/v1/queries/getquery

  Timeout is in minutes ( retries will happen at 1 minute interval, default total time is 60 minutes )
  If chunksize is set to None then the whole file is downloaded at once.

  Args:
    * auth: (string) Either user or service.
    * report_id: (int) ID of DCm report to fetch ( either or name ).
    * name: (string) Name of report to fetch ( either or report_id ).
    * timeout: (int) Minutes to wait for in progress report before giving up.
    * chunksize: (int) number of bytes to download at a time, for memory constrained systems.

  Returns:
    * (filename, iterator) if file exists and is ready to download in chunks.
    * (filename, file) if file exists and chunking is off.
    * ('report_running.csv', None) if report is in progress.
    * (None, None) if file does not exist.

  """

    storage_path = report_fetch(auth, report_id, name, timeout)

    if storage_path == False:
        return None, None
    elif storage_path == True:
        return 'report_running.csv', None
    else:
        filename = RE_FILENAME.search(storage_path).groups(0)[0]

        # streaming ( chunked download is currently broken, so the single full
        # download below is always used )
        if 0:  # if chunksize:
            path = storage_path.split('?', 1)[0].replace(
                'https://storage.googleapis.com/', '').replace('/', ':', 1)
            return filename, object_get_chunks(auth, path, chunksize)

        # single object, read the whole report into memory at once
        else:
            return filename, StringIO(
                urlopen(storage_path).read().decode('UTF-8'))
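
A hypothetical caller, based only on the return contract documented in the docstring above; the 'user' auth string and the report name are made-up values:

# made-up report name and timeout for illustration
filename, data = report_file('user', name='My DBM Report', timeout=30)

if filename is None:
    print('report does not exist')
elif data is None:
    print('report is still running, retry later')
else:
    # with chunking disabled this is a StringIO holding the whole CSV
    print('downloaded %s, first line: %s' % (filename, data.readline().strip()))
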
Example #4
def get_entity(path):
  delimiter = ',\r\r'
  first = True
  view = ''

  for chunk in object_get_chunks(project.task['auth'], path, CHUNK_SIZE):
    # read the next chunk, remove all newlines, leaving only '\r\r' between records ( clever use of non display characters for parsing )
    view += chunk.getvalue().replace('\n', '')

    # first time through, scrap the leading bracket
    if first:
      view = view.strip('[\r\r')
      first = False

    # after replacing all newlines, only '\r\r' are left, clever Googlers
    end = view.rfind(delimiter)
    if end > -1:
      yield view[:end].replace(delimiter, '\n')
      view = view[end + 1:]

  # last one never delimits, so opportunity to trim extra bracket
  yield view.strip('\r\r]')
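
A minimal, self-contained sketch of the record-delimiting trick above, using made-up data; it advances past the full three-character delimiter after each cut, a small simplification of the example's end + 1:

from io import StringIO

# made-up payload: a JSON-ish array where records are separated by ',\r\r'
raw = '[\r\r{"id": 1,\n "name": "a"},\r\r{"id": 2,\n "name": "b"}\r\r]'
chunks = [StringIO(raw[i:i + 16]) for i in range(0, len(raw), 16)]

delimiter = ',\r\r'
first = True
view = ''
records = []

for chunk in chunks:
  # drop newlines so ',\r\r' is the only separator left between records
  view += chunk.getvalue().replace('\n', '')
  if first:
    view = view.lstrip('[\r')  # scrap the leading bracket
    first = False
  end = view.rfind(delimiter)
  if end > -1:
    records.extend(view[:end].split(delimiter))
    view = view[end + len(delimiter):]

# the last record never delimits, so trim the trailing bracket instead
records.append(view.rstrip('\r]'))
print(records)  # ['{"id": 1, "name": "a"}', '{"id": 2, "name": "b"}']
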
Example #5
def dt():
    if project.verbose: print('DT TO TABLE', project.task['to']['table'])

    delimiter = '\n'
    disposition = 'WRITE_TRUNCATE'

    # loop all dt files to match pattern
    path = '%s:%s' % (project.task['from']['bucket'],
                      project.task['from']['path'])
    for dt_file in object_list(project.task['auth'], path, files_only=True):

        # decompression handler for gzip ( must be outside of chunks as it keeps track of stream across multiple calls )
        gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)

        # sliding view of data flowing out of decompression, used to buffer and delimit rows
        first_row = True
        view = ''

        # loop all chunks of file, decompress, and find row delimiter
        for data_gz in object_get_chunks(project.task['auth'], dt_file):

            view += gz_handler.decompress(data_gz.read()).decode('utf-8')

            if first_row:
                end = view.find(delimiter)
                schema = dt_schema(view[:end].split(','))
                view = view[(end + 1):]
                first_row = False

            end = view.rfind(delimiter)

            io_to_table(project.task['auth'], project.id,
                        project.task['to']['dataset'],
                        project.task['to']['table'],
                        BytesIO(view[:end].encode()), 'CSV', schema, 0,
                        disposition, False)
            disposition = 'WRITE_APPEND'
            view = view[min(end + 1, len(view)):]