コード例 #1
0
ファイル: store.py プロジェクト: yazici/starthinker
  def clear(self):
    """Reset the Bulkdozer store.

    Wipes the first row of the 'Store' tab in the feed (only when a feed
    is configured) and empties the in-memory caches.
    """
    trix = self.trix_id

    # Only touch the sheet when a feed is actually configured.
    if trix:
      sheets_clear(self.auth, trix, 'Store', 'A1:Z1')

    # Drop local state regardless of whether a feed exists.
    self._store, self._id_map = {}, {}
コード例 #2
0
ファイル: run.py プロジェクト: sjoerdapp/starthinker
def bigquery():
    """Execute the BigQuery task block from the project singleton.

    Dispatches on the shape of project.task:
      - 'from.query' present:
          'to.table'   -> run the query into a table (optional pre-process statement)
          'to.storage' -> run the query, save rows as CSV to a storage object
          'to.trix'    -> run the query, write rows into a Google Sheet range
          'to.sftp'    -> run the query, push rows to an SFTP destination
          otherwise    -> materialize the query as a view ('to.view')
      - otherwise: load a bucket file ('from.bucket' + 'from.path') into a table.

    All inputs come from project.task; no return value, side effects only.
    """

    if 'query' in project.task['from']:
        if 'table' in project.task['to']:
            if project.verbose:
                print "QUERY TO TABLE", project.task['to']['table']

            # Optional statement executed against the dataset before the main query.
            if 'pre_process_query' in project.task['to']:
                print 'executing statement'
                execute_statement(project.task['auth'],
                                  project.id,
                                  project.task['to']['dataset'],
                                  project.task['to']['pre_process_query'],
                                  use_legacy_sql=project.task['from'].get(
                                      'legacy', project.task['from'].get(
                                          'useLegacySql', True)))
            query_to_table(
                project.task['auth'],
                project.id,
                project.task['to']['dataset'],
                project.task['to']['table'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                disposition=project.task['write_disposition']
                if 'write_disposition' in project.task else 'WRITE_TRUNCATE',
                legacy=project.task['from'].get(
                    'legacy', project.task['from'].get(
                        'useLegacySql', True)),  # DEPRECATED: useLegacySql,
                target_project_id=project.task['to'].get(
                    'project_id', project.id))
        # NOTE: previously marked "not used", but it is referenced by
        # project/mauriciod/target_winrate.json - keep it.
        elif 'storage' in project.task['to']:
            if project.verbose:
                print "QUERY TO STORAGE", project.task['to']['storage']
            # Stage rows in a uniquely named temp CSV, upload it, then clean up.
            local_file_name = '/tmp/%s' % str(uuid.uuid1())
            rows = query_to_rows(project.task['auth'], project.id,
                                 project.task['from']['dataset'],
                                 project.task['from']['query'])

            f = open(local_file_name, 'wb')
            writer = csv.writer(f)
            writer.writerows(rows)
            f.close()

            f = open(local_file_name, 'rb')
            object_put(project.task['auth'], project.task['to']['storage'], f)
            f.close()

            os.remove(local_file_name)
        elif 'trix' in project.task['to']:
            if project.verbose:
                print "QUERY TO SHEET", project.task['to']['trix']
            rows = query_to_rows(project.task['auth'],
                                 project.id,
                                 project.task['from']['dataset'],
                                 project.task['from']['query'],
                                 legacy=project.task['from'].get(
                                     'legacy', True))

            # makes sure types are correct in sheet
            rows = rows_to_type(rows)

            # 'range' is A1 notation like 'Tab!A1:C'; split into tab and cells.
            sheets_clear(project.task['auth'], project.task['to']['trix'],
                         project.task['to']['range'].split('!')[0],
                         project.task['to']['range'].split('!')[1])
            sheets_write(project.task['auth'], project.task['to']['trix'],
                         project.task['to']['range'].split('!')[0],
                         project.task['to']['range'].split('!')[1], rows)
        elif 'sftp' in project.task['to']:
            # NOTE(review): this branch reads 'use_legacy_sql' while the other
            # branches read 'legacy' / 'useLegacySql' - confirm which key the
            # recipes actually supply.
            rows = query_to_rows(project.task['auth'],
                                 project.id,
                                 project.task['from']['dataset'],
                                 project.task['from']['query'],
                                 legacy=project.task['from'].get(
                                     'use_legacy_sql', True))

            # Only push when the query produced rows.
            if rows:
                if project.verbose: print "QUERY TO SFTP"
                put_rows(project.task['auth'], project.task['to'], '', rows)
        else:
            if project.verbose:
                print "QUERY TO VIEW", project.task['to']['view']
            query_to_view(
                project.task['auth'],
                project.id,
                project.task['to']['dataset'],
                project.task['to']['view'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                project.task['from'].get('legacy', project.task['from'].get(
                    'useLegacySql', True))  # DEPRECATED: useLegacySql
            )
    else:
        # No query given: load a storage bucket file straight into a table.
        if project.verbose:
            print "STORAGE TO TABLE", project.task['to']['table']
        storage_to_table(
            project.task['auth'], project.id, project.task['to']['dataset'],
            project.task['to']['table'], project.task['from']['bucket'] + ':' +
            project.task['from']['path'], project.task.get('schema', []),
            project.task.get('skip_rows', 1),
            project.task.get('structure', 'CSV'),
            project.task.get('disposition', 'WRITE_TRUNCATE'))
コード例 #3
0
  def clear(self):
    """Empty the Log tab of the Bulkdozer feed and rewind the write cursor.

    Intended to be called when a new execution starts so entries from the
    previous run do not linger.
    """
    # Wipe the 'A1:B' range of the 'Log' tab in the configured feed.
    sheets_clear(self.auth, self.trix_id, 'Log', 'A1:B')

    # Subsequent writes resume from the top of the tab.
    self._row = 1
コード例 #4
0
ファイル: run.py プロジェクト: sjoerdapp/starthinker
def sheets():
    """Execute the Sheets task block from the project singleton.

    Order of operations (each step optional, driven by project.task keys):
      'clear' -> 'delete' -> copy from 'template' or create tab ->
      'write' rows into range -> 'out': read range and load into BigQuery.

    No parameters or return value; all configuration comes from project.task
    and results are written as side effects.
    """
    if project.verbose: print 'SHEETS'

    # clear if specified
    if project.task.get('clear', False):
        sheets_clear(project.task['auth'], project.task['sheet'],
                     project.task['tab'], project.task['range'])

    # delete if specified ( after clear to prevent errors in case both are given )
    if project.task.get('delete', False):
        sheets_tab_delete(project.task['auth'], project.task['sheet'],
                          project.task['tab'])

    # copy a template tab when given, otherwise ensure the tab exists
    if 'template' in project.task:
        sheets_tab_copy(project.task['auth'],
                        project.task['template']['sheet'],
                        project.task['template']['tab'], project.task['sheet'],
                        project.task['tab'])
    else:
        sheets_tab_create(project.task['auth'], project.task['sheet'],
                          project.task['tab'])

    # write data if specified
    if 'write' in project.task:
        rows = get_rows(project.task['auth'], project.task['write'])
        sheets_write(project.task['auth'], project.task['sheet'],
                     project.task['tab'], project.task['range'], rows)

    # move if specified: read the sheet range and load it into BigQuery
    if 'out' in project.task:
        rows = sheets_read(project.task['auth'], project.task['sheet'],
                           project.task['tab'], project.task['range'])

        if rows:
            schema = None

            # RECOMMENDED: define schema in json
            if project.task['out']['bigquery'].get('schema'):
                if project.verbose: print 'SHEETS SCHEMA DEFINED'
                schema = project.task['out']['bigquery']['schema']
            # NOT RECOMMENDED: determine schema if missing
            else:
                if project.verbose:
                    print 'SHEETS SCHEMA DETECT ( Note Recommended - Define Schema In JSON )'
                # cast rows to types ( for schema detection )
                rows = rows_to_type(rows)
                rows, schema = get_schema(rows,
                                          project.task.get('header', False),
                                          infer_type=project.task.get(
                                              'infer_type', True))

            # write to table ( not using put because no use cases for other destinations )
            rows_to_table(
                auth=project.task['auth'],
                project_id=project.id,
                dataset_id=project.task['out']['bigquery']['dataset'],
                table_id=project.task['out']['bigquery']['table'],
                rows=rows,
                schema=schema,
                skip_rows=1 if project.task.get('header', False) else 0,
                disposition=project.task['out']['bigquery'].get(
                    'disposition', 'WRITE_TRUNCATE'))

        else:
            print 'SHEET EMPTY'
コード例 #5
0
ファイル: __init__.py プロジェクト: sjoerdapp/starthinker
def put_rows(auth, destination, filename, rows, variant=''):
  """Processes standard write JSON block for dynamic export of data.

  Allows us to quickly write the results of a script to a destination.  For
  example write the results of a DCM report into BigQuery.

  - Will write to multiple destinations if specified.
  - Extensible, add a handler to define a new destination ( be kind update the documentation json ).

  Include the following JSON in a recipe, then in the run.py handler when
  encountering that block pass it to this function and use the returned results.

    from utils.data import put_rows

    var_json = {
      "out":{
        "bigquery":{
          "dataset": [ string ],
          "table": [ string ],
          "schema": [ json - standard bigquery schema json ],
          "skip_rows": [ integer - for removing header ],
          "disposition": [ string - same as BigQuery documentation ]
        },
        "sheets":{
          "sheet":[ string - full URL, suggest using share link ],
          "tab":[ string ],
          "range":[ string - A1:A notation ],
          "delete": [ boolean - if sheet range should be cleared before writing ]
        },
        "storage":{
          "bucket": [ string ],
          "path": [ string ]
        },
        "directory":[ string - full path to place to write file ]
      }
    }

    values = put_rows('user', var_json)

  Or you can use it directly with project singleton.

    from util.project import project
    from utils.data import put_rows

    def something():
      values = get_rows(project.task['auth'], project.task['out'])

    if __name__ == "__main__":
      project.load('something')
      something()

  Args:
    auth: (string) The type of authentication to use, user or service.
    destination: (json) A json block resembling var_json described above.
    filename: (string) A unique filename if writing to medium requiring one,
      usually generated by script.
    rows: (list) The data being written as a list object.
    variant: (string) Appends this to the destination name to create a variant
      ( for example when downloading multiple tabs in a sheet ).

  Returns:
    None. All writes happen as side effects; multiple destinations may be
    written in a single call.
  """

  if 'bigquery' in destination:

    # JSON format loads rows directly, no CSV staging.
    if destination['bigquery'].get('format', 'CSV') == 'JSON':
      json_to_table(
        destination['bigquery'].get('auth', auth),
        destination['bigquery'].get('project_id', project.id),
        destination['bigquery']['dataset'],
        destination['bigquery']['table'] + variant,
        rows,
        destination['bigquery'].get('schema', []),
        destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'),
      )

    # Incremental loads default to append instead of truncate.
    elif destination['bigquery'].get('is_incremental_load', False):
      incremental_rows_to_table(
        destination['bigquery'].get('auth', auth),
        destination['bigquery'].get('project_id', project.id),
        destination['bigquery']['dataset'],
        destination['bigquery']['table'] + variant,
        rows,
        destination['bigquery'].get('schema', []),
        destination['bigquery'].get('skip_rows', 1),
        destination['bigquery'].get('disposition', 'WRITE_APPEND'),
        billing_project_id=project.id
      )

    else:
      rows_to_table(
        destination['bigquery'].get('auth', auth),
        destination['bigquery'].get('project_id', project.id),
        destination['bigquery']['dataset'],
        destination['bigquery']['table'] + variant,
        rows,
        destination['bigquery'].get('schema', []),
        destination['bigquery'].get('skip_rows', 1),
        destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'),
      )

  if 'sheets' in destination:
    # Optionally clear the target range before writing to it.
    if destination['sheets'].get('delete', False):
      sheets_clear(auth, destination['sheets']['sheet'], destination['sheets']['tab'] + variant, destination['sheets']['range'])
    sheets_write(auth, destination['sheets']['sheet'], destination['sheets']['tab'] + variant, destination['sheets']['range'], rows)

  if 'directory' in destination:
    file_out = destination['directory'] + variant + filename
    if project.verbose: print('SAVING %s' % file_out)
    makedirs_safe(parse_path(file_out))
    # Context manager closes the handle even if the CSV write fails.
    with open(file_out, 'wb') as save_file:
      save_file.write(rows_to_csv(rows).read())

  if 'storage' in destination and destination['storage'].get('bucket') and destination['storage'].get('path'):
    # create the bucket
    bucket_create(auth, project.id, destination['storage']['bucket'])

    # put the file
    file_out = destination['storage']['bucket'] + ':' + destination['storage']['path'] + variant + filename
    if project.verbose: print('SAVING %s' % file_out)
    object_put(auth, file_out, rows_to_csv(rows))

  # deprecated do not use
  if 'trix' in destination:
    trix_update(auth, destination['trix']['sheet_id'], destination['trix']['sheet_range'], rows_to_csv(rows), destination['trix']['clear'])

  if 'email' in destination:
    # Placeholder destination; intentionally not implemented.
    pass

  if 'sftp' in destination:
    try:
      # pysftp logs noisily to stderr; silence it for the duration of the push.
      sys.stderr = StringIO()

      # Host key checking is disabled; destination hosts are assumed trusted.
      cnopts = pysftp.CnOpts()
      cnopts.hostkeys = None

      # Optional file name prefix; popped so it is not passed as a connection arg.
      file_prefix = 'report'
      if 'file_prefix' in destination['sftp']:
        file_prefix = destination['sftp'].get('file_prefix')
        del destination['sftp']['file_prefix']

      sftp = pysftp.Connection(host=destination['sftp']['host'], username=destination['sftp']['username'], password=destination['sftp']['password'], port=destination['sftp']['port'], cnopts=cnopts)

      if 'directory' in destination['sftp']:
        sftp.cwd(destination['sftp']['directory'])

      # Stage the CSV in /tmp with a timestamped name, push it, then clean up.
      tmp_file_name = '/tmp/%s_%s.csv' % (file_prefix, datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S'))

      with open(tmp_file_name, 'wb') as tmp_file:
        tmp_file.write(rows_to_csv(rows).read())

      sftp.put(tmp_file_name)
      sftp.close()

      os.remove(tmp_file_name)
    except Exception as e:
      # BUG FIX: was 'except e:', which raised NameError instead of catching.
      # Restore stderr before reporting so the error is visible rather than
      # swallowed by the StringIO buffer installed above.
      sys.stderr = sys.__stderr__
      print(e)
      traceback.print_exc()
    finally:
      # Always restore stderr, on success or failure.
      sys.stderr = sys.__stderr__