def dt_move_large(config, task, dt_file, dt_partition, jobs):
  if config.verbose:
    print('DT TO TABLE LARGE', dt_partition)

  delimiter = '\n'
  disposition = 'WRITE_TRUNCATE'

  # decompression handler for gzip ( must be created outside the chunk loop, it keeps stream state across multiple calls )
  gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)

  # sliding view of data flowing out of decompression, used to buffer and delimit rows
  first_row = True
  view = ''

  # loop all chunks of the file, decompress, and find the row delimiter
  for data_gz in object_get_chunks(config, task['auth'], '%s:%s' % (task['bucket'], dt_file)):

    view += gz_handler.decompress(data_gz).decode('utf-8')

    # the first chunk carries the header row, use it to derive the schema
    if first_row:
      end = view.find(delimiter)
      schema = dt_schema(view[:end].split(','))
      view = view[(end + 1):]
      first_row = False

    # emit only whole rows, truncating the table on the first load and appending after that
    end = view.rfind(delimiter)

    jobs.append(
      io_to_table(config, task['auth'], config.project,
                  task['to']['dataset'], dt_partition,
                  BytesIO(view[:end].encode()), 'CSV', schema, 0, disposition,
                  False))
    disposition = 'WRITE_APPEND'

    # keep the partial trailing row for the next chunk
    view = view[min(end + 1, len(view)):]
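# A minimal, self-contained sketch of the streaming pattern above, assuming nothing
# beyond the standard library: one zlib.decompressobj survives across all chunks so
# gzip stream state is preserved, and a sliding text view emits only whole
# '\n'-delimited rows. The generator name and arguments are illustrative only and
# are not part of this module.
import zlib

def _rows_from_gzip_chunks(chunks, delimiter='\n'):
  gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)  # 32 + MAX_WBITS accepts the gzip header
  view = ''
  for chunk in chunks:
    view += gz_handler.decompress(chunk).decode('utf-8')
    end = view.rfind(delimiter)
    if end > -1:
      yield view[:end]           # emit only complete rows
      view = view[end + 1:]      # keep the partial trailing row for the next chunk
  if view:
    yield view                   # flush whatever remains after the last chunk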
def dt_header(dt_file):
  if project.verbose:
    print('DT HEADER')

  # find first dt file to match pattern
  path = '%s:%s' % (project.task['bucket'], dt_file)

  # find first line of file ( gzip will decompress partial data, enough to pull the header out )
  sample_data = next(object_get_chunks(project.task['auth'], path, HEADER_SIZE))
  with gzip.GzipFile(fileobj=BytesIO(sample_data), mode='rb') as fo:
    sample_header = fo.read(HEADER_SIZE).decode('utf-8').split('\n')[0]

  return sample_header.split(',')
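# A sketch of the same "peek at the header" idea using only the standard library:
# decompress just a leading slice of the gzipped bytes and split off the first line.
# zlib.decompressobj tolerates a truncated stream, which is what makes reading only a
# sample safe. The 64 KB sample size is an assumption for the example, not the
# module's HEADER_SIZE.
import zlib

def _peek_csv_header(gzipped_prefix, sample_size=64 * 1024):
  gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)
  text = gz_handler.decompress(gzipped_prefix[:sample_size]).decode('utf-8')
  return text.split('\n', 1)[0].split(',')

# usage sketch ( an in-memory gzip buffer standing in for the first GCS chunk ):
#   buffer = io.BytesIO()
#   with gzip.GzipFile(fileobj=buffer, mode='wb') as fo:
#     fo.write(b'id,name,value\n1,a,10\n2,b,20\n')
#   _peek_csv_header(buffer.getvalue())  # ['id', 'name', 'value']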
def report_file(auth, report_id=None, name=None, timeout=60, chunksize=None):
  """ Retrieves most recent DBM file by name or ID, if in progress, waits for it to complete.

  Bulletproofing: https://developers.google.com/bid-manager/v1/queries/getquery

  Timeout is in minutes ( retries happen at 1 minute intervals, default total time is 60 minutes ).
  If chunksize is set to None then the whole file is downloaded at once.

  Args:
    * auth: (string) Either user or service.
    * report_id: (int) ID of DBM report to fetch ( either this or name ).
    * name: (string) Name of report to fetch ( either this or report_id ).
    * timeout: (int) Minutes to wait for in progress report before giving up.
    * chunksize: (int) Number of bytes to download at a time, for memory constrained systems.

  Returns:
    * (filename, iterator) if file exists and is ready to download in chunks.
    * (filename, file) if file exists and chunking is off.
    * ('report_running.csv', None) if report is in progress.
    * (None, None) if file does not exist.

  """

  storage_path = report_fetch(auth, report_id, name, timeout)

  if storage_path == False:
    return None, None
  elif storage_path == True:
    return 'report_running.csv', None
  else:
    filename = RE_FILENAME.search(storage_path).groups(0)[0]

    # streaming
    if 0:  # if chunksize: BROKEN SO DEFAULTING TO STREAMING
      path = storage_path.split('?', 1)[0].replace(
        'https://storage.googleapis.com/', '').replace('/', ':', 1)
      return filename, object_get_chunks(auth, path, chunksize)

    # single object
    else:
      return filename, StringIO(urlopen(storage_path).read().decode('UTF-8'))
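# A hedged usage sketch for report_file: the helper name, report id handling, and csv
# printing below are placeholders, not part of this module. Per the docstring, the
# caller distinguishes "missing", "still running", and "ready" by what comes back,
# then reads CSV rows from the returned file-like object.
import csv

def _print_report_rows(auth, report_id):
  filename, report = report_file(auth, report_id=report_id, timeout=60)
  if filename is None:
    print('report does not exist')
  elif report is None:
    print('report still running, try again later')
  else:
    print('reading', filename)
    for row in csv.reader(report):
      print(row)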
def get_entity(path):
  delimiter = ',\r\r'
  first = True
  view = ''

  for chunk in object_get_chunks(project.task['auth'], path, CHUNK_SIZE):

    # read the next chunk, remove all newlines, leaving only '\r\r' between records ( clever use of non display characters for parsing )
    view += chunk.getvalue().replace('\n', '')

    # first time through, scrap the leading bracket
    if first:
      view = view.strip('[\r\r')
      first = False

    # after replacing all newlines, only '\r\r' are left, clever Googlers
    end = view.rfind(delimiter)
    if end > -1:
      yield view[:end].replace(delimiter, '\n')
      view = view[end + 1:]

  # last one never delimits, so opportunity to trim extra bracket
  yield view.strip('\r\r]')
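# Hedged usage sketch: Entity Read File records are JSON objects, so each '\n'-joined
# block that get_entity yields can be split and parsed line by line. The helper name
# and the example path are placeholders; the JSON-per-record assumption follows from
# the '[ ... ]' brackets and ',\r\r' separators handled above.
import json

def _iter_entity_records(path):
  for block in get_entity(path):
    for line in block.split('\n'):
      if line:
        yield json.loads(line)

# for record in _iter_entity_records('bucket:entity/Advertiser.json'):
#   print(record.get('id'))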
def dt():
  if project.verbose:
    print('DT TO TABLE', project.task['to']['table'])

  delimiter = '\n'
  disposition = 'WRITE_TRUNCATE'

  # loop all dt files matching the pattern
  path = '%s:%s' % (project.task['from']['bucket'], project.task['from']['path'])
  for dt_file in object_list(project.task['auth'], path, files_only=True):

    # decompression handler for gzip ( must be created outside the chunk loop, it keeps stream state across multiple calls )
    gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)

    # sliding view of data flowing out of decompression, used to buffer and delimit rows
    first_row = True
    view = ''

    # loop all chunks of the file, decompress, and find the row delimiter
    for data_gz in object_get_chunks(project.task['auth'], dt_file):

      view += gz_handler.decompress(data_gz.read()).decode('utf-8')

      if first_row:
        end = view.find(delimiter)
        schema = dt_schema(view[:end].split(','))
        view = view[(end + 1):]
        first_row = False

      end = view.rfind(delimiter)

      io_to_table(project.task['auth'], project.id,
                  project.task['to']['dataset'], project.task['to']['table'],
                  BytesIO(view[:end].encode()), 'CSV', schema, 0, disposition,
                  False)
      disposition = 'WRITE_APPEND'

      view = view[min(end + 1, len(view)):]
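# The disposition handoff used in dt() and dt_move_large() is what lets many chunks
# and files accumulate into a single table: the first load job truncates the table,
# every later one appends. A minimal sketch of that pattern with a hypothetical load
# callable, for illustration only:
def _load_all(pieces, load):
  disposition = 'WRITE_TRUNCATE'
  for piece in pieces:
    load(piece, disposition)      # first call replaces any existing table contents
    disposition = 'WRITE_APPEND'  # every later call adds to it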