def sftp():
  # connect to the SFTP server, skipping host key verification
  cnopts = pysftp.CnOpts()
  cnopts.hostkeys = None

  sftp_configs = project.task['from']['sftp']['connection']
  sftp_configs['cnopts'] = cnopts
  sftp = pysftp.Connection(**sftp_configs)

  # resolve the remote file name, applying an optional day offset to the date pattern
  file_name = (datetime.datetime.now() + datetime.timedelta(
      project.task['from']['sftp'].get('day', 0))).strftime(
          project.task['from']['sftp']['file'])

  # download the remote file to a unique temporary path
  input_file_name = '/tmp/%s.csv' % str(uuid.uuid1())
  sftp.get(file_name, localpath=input_file_name)

  compression = project.task['from']['sftp'].get('compression', None)

  if 'table' in project.task['to']:
    input_file = None

    # if gzip compressed, decompress to a second temporary file first
    if compression == 'gzip':
      input_file = gzip.open(input_file_name, 'rb')
      uncompressed_file = '/tmp/%s.csv' % str(uuid.uuid1())
      out = open(uncompressed_file, 'wb')
      for line in input_file:
        if len(line) > 1:
          out.write(line)
      out.close()
      input_file.close()
      os.remove(input_file_name)
      input_file_name = uncompressed_file

    input_file = open(input_file_name, 'rb')

    # derive the schema from the CSV header row
    reader = csv.reader(input_file)
    header = next(reader)
    input_file.seek(0)
    schema = make_schema(header)

    # normalize the CSV, then load it into BigQuery
    output_file_name = '/tmp/%s.csv' % str(uuid.uuid1())
    clean_csv(input_file, output_file_name, len(header), header=True)
    input_file.close()

    output_file = open(output_file_name, 'rb')
    io_to_table(
        project.task['auth'],
        project.id,
        project.task['to'].get('dataset'),
        project.task['to'].get('table'),
        output_file,
        'CSV',
        schema,
        skip_rows=0,
        disposition=project.task['to'].get('write_disposition',
                                           'WRITE_TRUNCATE'))
    output_file.close()

    # clean up temporary files
    os.remove(input_file_name)
    os.remove(output_file_name)
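
# --- Illustrative only: a task configuration consistent with the keys read by
# sftp() above. Host, credentials, file pattern, and table names below are
# assumptions for the example, not values from the original recipe.
example_sftp_task = {
  'auth': 'service',
  'from': {
    'sftp': {
      'connection': {                       # passed straight through to pysftp.Connection(**...)
        'host': 'sftp.example.com',
        'username': 'reports',
        'password': '...'
      },
      'file': 'daily_report_%Y%m%d.csv',    # strftime pattern resolved at run time
      'day': -1,                            # optional day offset applied before formatting
      'compression': 'gzip'                 # optional, enables the decompression branch
    }
  },
  'to': {
    'dataset': 'my_dataset',
    'table': 'my_table',
    'write_disposition': 'WRITE_TRUNCATE'
  }
}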
def dt_move_large(config, task, dt_file, dt_partition, jobs):
  if config.verbose:
    print('DT TO TABLE LARGE', dt_partition)

  delimiter = '\n'
  disposition = 'WRITE_TRUNCATE'

  # decompression handler for gzip ( must be outside of chunks as it keeps track of stream across multiple calls )
  gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)

  # sliding view of data flowing out of decompression, used to buffer and delimit rows
  first_row = True
  view = ''

  # loop all chunks of file, decompress, and find row delimiter
  for data_gz in object_get_chunks(config, task['auth'],
                                   '%s:%s' % (task['bucket'], dt_file)):
    view += gz_handler.decompress(data_gz).decode('utf-8')

    if first_row:
      end = view.find(delimiter)
      schema = dt_schema(view[:end].split(','))
      view = view[(end + 1):]
      first_row = False

    end = view.rfind(delimiter)

    jobs.append(
        io_to_table(config, task['auth'], config.project,
                    task['to']['dataset'], dt_partition,
                    BytesIO(view[:end].encode()), 'CSV', schema, 0,
                    disposition, False))
    disposition = 'WRITE_APPEND'

    view = view[min(end + 1, len(view)):]
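
# --- A minimal, self-contained sketch (synthetic data, made-up chunk size) of
# the sliding-view pattern used in dt_move_large(): the first complete row is
# peeled off as the schema header, each flush stops at the last newline, and
# any partial row is carried over to the next chunk.
def _sliding_view_demo():
  import gzip
  import zlib

  rows = 'id,name\n1,alpha\n2,beta\n3,gamma\n'
  data = gzip.compress(rows.encode('utf-8'))
  chunks = [data[i:i + 16] for i in range(0, len(data), 16)]

  gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)  # 32 + MAX_WBITS accepts gzip headers
  first_row = True
  view = ''

  for chunk in chunks:
    view += gz_handler.decompress(chunk).decode('utf-8')

    if first_row and '\n' in view:
      header, view = view.split('\n', 1)  # first complete row becomes the header
      print('header:', header.split(','))
      first_row = False

    end = view.rfind('\n')  # flush only complete rows
    if end != -1:
      print('rows:', view[:end].splitlines())
      view = view[end + 1:]  # keep any partial row for the next chunk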
def dt():
  if project.verbose:
    print('DT TO TABLE', project.task['to']['table'])

  delimiter = '\n'
  disposition = 'WRITE_TRUNCATE'

  # loop all dt files to match pattern
  path = '%s:%s' % (project.task['from']['bucket'],
                    project.task['from']['path'])
  for dt_file in object_list(project.task['auth'], path, files_only=True):

    # decompression handler for gzip ( must be outside of chunks as it keeps track of stream across multiple calls )
    gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)

    # sliding view of data flowing out of decompression, used to buffer and delimit rows
    first_row = True
    view = ''

    # loop all chunks of file, decompress, and find row delimiter
    for data_gz in object_get_chunks(project.task['auth'], dt_file):
      view += gz_handler.decompress(data_gz.read()).decode('utf-8')

      if first_row:
        end = view.find(delimiter)
        schema = dt_schema(view[:end].split(','))
        view = view[(end + 1):]
        first_row = False

      end = view.rfind(delimiter)

      io_to_table(project.task['auth'], project.id,
                  project.task['to']['dataset'], project.task['to']['table'],
                  BytesIO(view[:end].encode()), 'CSV', schema, 0, disposition,
                  False)
      disposition = 'WRITE_APPEND'

      view = view[min(end + 1, len(view)):]
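
# --- dt_schema() and make_schema() are helpers defined elsewhere and are not
# shown here. As a hypothetical illustration only, a minimal stand-in could map
# each header cell to a BigQuery column definition; the real helpers may infer
# types or clean names differently.
def _dt_schema_sketch(header):
  schema = []
  for name in header:
    clean = name.strip().replace(' ', '_').replace('-', '_')
    schema.append({'name': clean, 'type': 'STRING', 'mode': 'NULLABLE'})
  return schema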