Code Example #1
File: run.py Project: Gregorfran/starthinker-gregor
import csv
import datetime
import gzip
import os
import uuid

import pysftp

# project, make_schema, clean_csv, and io_to_table are provided by the
# surrounding StarThinker module and are not shown in this excerpt.


def sftp():

    # disable host-key verification and open the SFTP connection
    cnopts = pysftp.CnOpts()
    cnopts.hostkeys = None
    sftp_configs = project.task['from']['sftp']['connection']
    sftp_configs['cnopts'] = cnopts
    sftp = pysftp.Connection(**sftp_configs)

    # build the date-templated remote file name, offset by the optional 'day'
    file_name = (datetime.datetime.now() + datetime.timedelta(
        project.task['from']['sftp'].get('day', 0))).strftime(
            project.task['from']['sftp']['file'])
    input_file_name = '/tmp/%s.csv' % str(uuid.uuid1())
    sftp.get(file_name, localpath=input_file_name)

    compression = project.task['from']['sftp'].get('compression', None)

    if 'table' in project.task['to']:
        input_file = None
        if compression == 'gzip':
            # decompress to a second temp file, dropping blank lines
            input_file = gzip.open(input_file_name, 'rb')
            uncompressed_file = '/tmp/%s.csv' % str(uuid.uuid1())

            out = open(uncompressed_file, 'wb')
            for line in input_file:
                if len(line) > 1:
                    out.write(line)
            out.close()
            input_file.close()

            os.remove(input_file_name)
            input_file_name = uncompressed_file

        # csv.reader needs a text-mode file in Python 3
        input_file = open(input_file_name, 'r', newline='')

        reader = csv.reader(input_file)
        header = next(reader)
        input_file.seek(0)
        schema = make_schema(header)
        output_file_name = '/tmp/%s.csv' % str(uuid.uuid1())
        clean_csv(input_file, output_file_name, len(header), header=True)
        input_file.close()

        output_file = open(output_file_name, 'rb')
        io_to_table(project.task['auth'],
                    project.id,
                    project.task['to'].get('dataset'),
                    project.task['to'].get('table'),
                    output_file,
                    'CSV',
                    schema,
                    skip_rows=0,
                    disposition=project.task['to'].get('write_disposition',
                                                       'WRITE_TRUNCATE'))
        output_file.close()

        os.remove(input_file_name)
        os.remove(output_file_name)
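The fetch above is driven by StarThinker's task configuration. As a standalone illustration of the same pysftp pattern, here is a minimal sketch with a hypothetical host, credentials, and file template (everything other than the pysftp calls is an assumption):

import datetime

import pysftp

# skip host-key verification, exactly as the example does; CnOpts() reads
# ~/.ssh/known_hosts if present
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None

# resolve yesterday's file from a strftime template, e.g. report_20240101.csv
file_name = (datetime.datetime.now()
             + datetime.timedelta(days=-1)).strftime('report_%Y%m%d.csv')

with pysftp.Connection('sftp.example.com', username='user',
                       password='secret', cnopts=cnopts) as conn:
    conn.get(file_name, localpath='/tmp/report.csv')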
Code Example #2
File: run.py Project: google/starthinker
import zlib
from io import BytesIO

# object_get_chunks, dt_schema, and io_to_table are provided by the
# surrounding StarThinker module and are not shown in this excerpt.


def dt_move_large(config, task, dt_file, dt_partition, jobs):
  if config.verbose:
    print('DT TO TABLE LARGE', dt_partition)

  delimiter = '\n'
  disposition = 'WRITE_TRUNCATE'

  # gzip decompression handler (must live outside the chunk loop because it
  # keeps stream state across multiple calls)
  gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)

  # sliding view of data flowing out of decompression, used to buffer and delimit rows
  first_row = True
  view = ''

  # loop all chunks of file, decompress, and find row delimiter
  for data_gz in object_get_chunks(config, task['auth'],
                                   '%s:%s' % (task['bucket'], dt_file)):

    view += gz_handler.decompress(data_gz).decode('utf-8')

    if first_row:
      end = view.find(delimiter)
      schema = dt_schema(view[:end].split(','))
      view = view[(end + 1):]
      first_row = False

    end = view.rfind(delimiter)
    if end == -1:
      continue  # no complete row decoded yet; keep buffering

    jobs.append(
        io_to_table(config, task['auth'], config.project,
                    task['to']['dataset'], dt_partition,
                    BytesIO(view[:end].encode()), 'CSV', schema, 0, disposition,
                    False))
    disposition = 'WRITE_APPEND'
    view = view[min(end + 1, len(view)):]
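The key technique above is streaming decompression: a single zlib.decompressobj(32 + zlib.MAX_WBITS) keeps gzip stream state across chunk boundaries, and the text buffer is only cut at the last complete newline so no row is ever split mid-chunk. A self-contained sketch of just that pattern (the sample payload and chunk size are invented for illustration):

import gzip
import zlib

# fake "chunked download": gzip a tiny CSV, then slice it into 8-byte chunks
payload = gzip.compress(b'h1,h2\na,1\nb,2\nc,3\n')
chunks = [payload[i:i + 8] for i in range(0, len(payload), 8)]

# one decompressor for the whole stream; 32 + MAX_WBITS auto-detects gzip
gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)
view = ''

for data_gz in chunks:
  view += gz_handler.decompress(data_gz).decode('utf-8')
  end = view.rfind('\n')
  if end == -1:
    continue  # no complete row yet, keep buffering
  print(view[:end].split('\n'))  # only complete rows are emitted
  view = view[end + 1:]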
Code Example #3
import zlib
from io import BytesIO

# project, object_list, object_get_chunks, dt_schema, and io_to_table are
# provided by the surrounding StarThinker module and are not shown here.


def dt():
    if project.verbose:
        print('DT TO TABLE', project.task['to']['table'])

    delimiter = '\n'
    disposition = 'WRITE_TRUNCATE'

    # loop all dt files to match pattern
    path = '%s:%s' % (project.task['from']['bucket'],
                      project.task['from']['path'])
    for dt_file in object_list(project.task['auth'], path, files_only=True):

        # gzip decompression handler (must live outside the chunk loop because
        # it keeps stream state across multiple calls)
        gz_handler = zlib.decompressobj(32 + zlib.MAX_WBITS)

        # sliding view of data flowing out of decompression, used to buffer and delimit rows
        first_row = True
        view = ''

        # loop all chunks of file, decompress, and find row delimiter
        for data_gz in object_get_chunks(project.task['auth'], dt_file):

            view += gz_handler.decompress(data_gz.read()).decode('utf-8')

            if first_row:
                end = view.find(delimiter)
                schema = dt_schema(view[:end].split(','))
                view = view[(end + 1):]
                first_row = False

            end = view.rfind(delimiter)
            if end == -1:
                continue  # no complete row decoded yet; keep buffering

            io_to_table(project.task['auth'], project.id,
                        project.task['to']['dataset'],
                        project.task['to']['table'],
                        BytesIO(view[:end].encode()),
                        'CSV', schema, 0, disposition, False)
            disposition = 'WRITE_APPEND'
            view = view[min(end + 1, len(view)):]
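Both DT examples flip disposition from WRITE_TRUNCATE to WRITE_APPEND so the first chunk replaces the table and every later chunk appends. A hedged sketch of that switch using the public google-cloud-bigquery client in place of StarThinker's io_to_table (the table ID and rows are hypothetical):

from io import BytesIO

from google.cloud import bigquery

client = bigquery.Client()  # assumes default credentials and project
table_id = 'my_project.my_dataset.my_table'  # hypothetical target table

disposition = 'WRITE_TRUNCATE'
for chunk in (b'a,1\n', b'b,2\n'):
    job_config = bigquery.LoadJobConfig(
        source_format='CSV',
        autodetect=True,  # infer a schema; real code would pass one explicitly
        write_disposition=disposition)
    client.load_table_from_file(
        BytesIO(chunk), table_id, job_config=job_config).result()
    disposition = 'WRITE_APPEND'  # append after the first load truncates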