def get_sheet_metadata(sheet, spreadsheet_id, client):
    sheet_id = sheet.get('properties', {}).get('sheetId')
    sheet_title = sheet.get('properties', {}).get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})
    sheet_title_encoded = urllib.parse.quote_plus(sheet_title)
    sheet_title_escaped = re.escape(sheet_title)

    # Create the querystring for the request
    querystring = '&'.join(
        ['%s=%s' % (key, value) for (key, value) in params.items()]).replace(
            '{sheet_title}', sheet_title_encoded)
    # Create the path for the request
    path = '{}?{}'.format(
        stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id),
        querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=sheet_title_escaped)
    # sheet_metadata: 1st `sheets` node in results
    sheet_metadata = sheet_md_results.get('sheets')[0]

    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
    try:
        sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
    except Exception as err:
        LOGGER.warning('{}'.format(err))
        LOGGER.warning('SKIPPING Malformed sheet: {}'.format(sheet_title))
        sheet_json_schema, columns = None, None

    return sheet_json_schema, columns
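
# Illustration only (assumed, not copied from the tap's streams.py): the function
# above pulls the 'sheet_metadata' entry out of a module-level STREAMS dict and
# expects it to provide 'api', 'path', and 'params', with '{spreadsheet_id}' and
# '{sheet_title}' placeholders that it substitutes itself. A minimal entry of that
# shape could look like the sketch below; the parameter values are assumptions.
# The function also assumes `import re`, `import urllib.parse`, and a module-level
# `LOGGER = singer.get_logger()`.
EXAMPLE_SHEET_METADATA_STREAM = {
    'api': 'sheets',
    'path': 'spreadsheets/{spreadsheet_id}',
    'params': {
        'includeGridData': 'true',
        'ranges': '{sheet_title}!1:2'
    }
}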
def get_sheet_metadata(sheet, spreadsheet_id, client):
    sheet_id = sheet.get('properties', {}).get('sheetId')
    sheet_title = sheet.get('properties', {}).get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    params = stream_metadata.get('params', {})

    # GET sheet_metadata
    sheet_md_results = client.request(
        endpoint=stream_name,
        spreadsheet_id=spreadsheet_id,
        sheet_title=sheet_title,
        params=params)
    # sheet_metadata: 1st `sheets` node in results
    sheet_metadata = sheet_md_results.get('sheets')[0]

    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
    try:
        sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)
    except Exception as err:
        LOGGER.warning('{}'.format(err))
        LOGGER.warning('SKIPPING Malformed sheet: {}'.format(sheet_title))
        sheet_json_schema, columns = None, None

    return sheet_json_schema, columns
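
# Hypothetical sketch (not the tap's actual client code): the variant above calls
# client.request(...) and lets the client build the URL from the STREAMS config
# instead of assembling the path and querystring inline. A method along these
# lines, wrapping an existing get(path=..., api=..., endpoint=...), could support
# that call; every name below is an assumption for illustration.
class SheetsClientSketch:
    def __init__(self, get_func):
        # get_func: a callable with the same signature as client.get(path=..., api=..., endpoint=...)
        self.get = get_func

    def request(self, endpoint, spreadsheet_id, sheet_title=None, params=None):
        stream_config = STREAMS.get(endpoint, {})
        query_params = dict(params or {})
        if sheet_title is not None:
            # Substitute the URL-encoded worksheet title into any parameter templates
            encoded_title = urllib.parse.quote_plus(sheet_title)
            query_params = {key: str(value).replace('{sheet_title}', encoded_title)
                            for key, value in query_params.items()}
        querystring = '&'.join('{}={}'.format(key, value)
                               for key, value in query_params.items())
        path = '{}?{}'.format(
            stream_config.get('path', '').replace('{spreadsheet_id}', spreadsheet_id),
            querystring)
        return self.get(path=path, api=stream_config.get('api', 'sheets'), endpoint=endpoint)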
def get_schemas(client, spreadsheet_id):
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(
                stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id),
                querystring)

            # GET spreadsheet_metadata, which includes sheets (basic metadata for each worksheet)
            spreadsheet_md_results = client.get(
                path=path, params=querystring, api=api, endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                # Loop thru each worksheet in spreadsheet
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(
                        sheet, spreadsheet_id, client)

                    # SKIP empty sheets (where sheet_json_schema and columns are None)
                    if sheet_json_schema and columns:
                        sheet_title = sheet.get('properties', {}).get('title')
                        schemas[sheet_title] = sheet_json_schema
                        sheet_mdata = metadata.new()
                        sheet_mdata = metadata.get_standard_metadata(
                            schema=sheet_json_schema,
                            key_properties=['__sdc_row'],
                            valid_replication_keys=None,
                            replication_method='FULL_TABLE')
                        field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata
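
# The schema loading above relies on a get_abs_path helper that is not shown in
# these excerpts. In Singer taps it is conventionally defined as below (assumed
# here, not copied from this tap's source); it needs `import os`.
import os

def get_abs_path(path):
    # Resolve a path relative to the directory containing this module
    return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)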
def get_sheet_metadata(sheet, spreadsheet_id, client):
    sheet_id = sheet.get('properties', {}).get('sheetId')
    sheet_title = sheet.get('properties', {}).get('title')
    LOGGER.info('sheet_id = {}, sheet_title = {}'.format(sheet_id, sheet_title))

    stream_name = 'sheet_metadata'
    stream_metadata = STREAMS.get(stream_name)
    api = stream_metadata.get('api', 'sheets')
    params = stream_metadata.get('params', {})

    querystring = '&'.join(
        ['%s=%s' % (key, value) for (key, value) in params.items()]).replace(
            '{sheet_title}', sheet_title)
    path = '{}?{}'.format(
        stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id),
        querystring)

    sheet_md_results = client.get(path=path, api=api, endpoint=stream_name)
    # sheet_metadata: 1st `sheets` node in results
    sheet_metadata = sheet_md_results.get('sheets')[0]

    # Create sheet_json_schema (for discovery/catalog) and columns (for sheet_metadata results)
    sheet_json_schema, columns = get_sheet_schema_columns(sheet_metadata)

    return sheet_json_schema, columns
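
# Hypothetical usage (values are made up for illustration): `sheet` is one element
# of the `sheets` array the Google Sheets API returns for a spreadsheet; only the
# fields read by get_sheet_metadata are shown here.
example_sheet = {
    'properties': {
        'sheetId': 0,
        'title': 'Sheet1',
        'gridProperties': {'rowCount': 1000, 'columnCount': 26}
    }
}
# sheet_json_schema, columns = get_sheet_metadata(example_sheet, spreadsheet_id, client)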
def get_schemas(client, spreadsheet_id):
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            # Prepare the querystring for the request
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])
            # Prepare the path for the request
            path = '{}?{}'.format(
                stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id),
                querystring)

            # GET spreadsheet_metadata, which includes sheets (basic metadata for each worksheet)
            spreadsheet_md_results = client.get(
                path=path, params=querystring, api=api, endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                # Loop thru each worksheet in spreadsheet
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(
                        sheet, spreadsheet_id, client)

                    # SKIP empty sheets (where sheet_json_schema and columns are None)
                    if sheet_json_schema and columns:
                        sheet_title = sheet.get('properties', {}).get('title')
                        schemas[sheet_title] = sheet_json_schema
                        sheet_mdata = metadata.new()
                        sheet_mdata = metadata.get_standard_metadata(
                            schema=sheet_json_schema,
                            key_properties=['__sdc_row'],
                            valid_replication_keys=None,
                            replication_method='FULL_TABLE')

                        # For each column, check whether `columnSkipped` is true and
                        # `prior_column_skipped` is false or None. `prior_column_skipped`
                        # is true only for the first of two consecutive empty-header columns.
                        # When a column is skipped, set its `inclusion` metadata to `unsupported`.
                        for column in columns:
                            if column.get('columnSkipped') and not column.get('prior_column_skipped'):
                                mdata = metadata.to_map(sheet_mdata)
                                sheet_mdata = metadata.write(
                                    mdata, ('properties', column.get('columnName')),
                                    'inclusion', 'unsupported')
                                sheet_mdata = metadata.to_list(mdata)

                        field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata
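
# Sketch of the conventional Singer discovery step that consumes get_schemas
# (assumed wiring, not necessarily this tap's discover.py; Catalog, CatalogEntry,
# and Schema are real singer-python classes):
from singer.catalog import Catalog, CatalogEntry
from singer.schema import Schema

def discover(client, spreadsheet_id):
    schemas, field_metadata = get_schemas(client, spreadsheet_id)
    catalog = Catalog([])
    for stream_name, schema_dict in schemas.items():
        schema = Schema.from_dict(schema_dict)
        mdata = field_metadata[stream_name]
        key_properties = None
        # The stream-level metadata entry (empty breadcrumb) carries table-key-properties
        for entry in mdata:
            table_key_properties = entry.get('metadata', {}).get('table-key-properties')
            if table_key_properties:
                key_properties = table_key_properties
        catalog.streams.append(CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=key_properties,
            schema=schema,
            metadata=mdata))
    return catalog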
def sync(client, config, catalog, state):
    start_date = config.get('start_date')
    spreadsheet_id = config.get('spreadsheet_id')

    # Get selected_streams from catalog, based on state last_stream
    # last_stream = stream that was being synced previously, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # FILE_METADATA
    file_metadata = {}
    stream_name = 'file_metadata'
    file_metadata_config = STREAMS.get(stream_name)

    # GET file_metadata
    LOGGER.info('GET file_metadata')
    file_metadata, time_extracted = get_data(
        stream_name=stream_name,
        endpoint_config=file_metadata_config,
        client=client,
        spreadsheet_id=spreadsheet_id)
    # Transform file_metadata
    LOGGER.info('Transform file_metadata')
    file_metadata_tf = transform_file_metadata(file_metadata)
    # LOGGER.info('file_metadata_tf = {}'.format(file_metadata_tf))

    # Check if the file has changed; if not, return (back to __init__)
    last_datetime = strptime_to_utc(get_bookmark(state, stream_name, start_date))
    this_datetime = strptime_to_utc(file_metadata.get('modifiedTime'))
    LOGGER.info('last_datetime = {}, this_datetime = {}'.format(last_datetime, this_datetime))
    if this_datetime <= last_datetime:
        LOGGER.info('this_datetime <= last_datetime, FILE NOT CHANGED. EXITING.')
        # Update file_metadata bookmark
        write_bookmark(state, 'file_metadata', strftime(this_datetime))
        return

    # Sync file_metadata if selected
    sync_stream(stream_name, selected_streams, catalog, state, file_metadata_tf, time_extracted)
    # file_metadata bookmark is updated at the end of sync

    # SPREADSHEET_METADATA
    spreadsheet_metadata = {}
    stream_name = 'spreadsheet_metadata'
    spreadsheet_metadata_config = STREAMS.get(stream_name)

    # GET spreadsheet_metadata
    LOGGER.info('GET spreadsheet_metadata')
    spreadsheet_metadata, ss_time_extracted = get_data(
        stream_name=stream_name,
        endpoint_config=spreadsheet_metadata_config,
        client=client,
        spreadsheet_id=spreadsheet_id)
    # Transform spreadsheet_metadata
    LOGGER.info('Transform spreadsheet_metadata')
    spreadsheet_metadata_tf = transform_spreadsheet_metadata(spreadsheet_metadata)

    # Sync spreadsheet_metadata if selected
    sync_stream(stream_name, selected_streams, catalog, state, spreadsheet_metadata_tf,
                ss_time_extracted)

    # SHEET_METADATA and SHEET_DATA
    sheets = spreadsheet_metadata.get('sheets')
    sheet_metadata = []
    sheets_loaded = []
    sheets_loaded_config = STREAMS['sheets_loaded']
    if sheets:
        # Loop thru sheets (worksheet tabs) in spreadsheet
        for sheet in sheets:
            sheet_title = sheet.get('properties', {}).get('title')
            sheet_id = sheet.get('properties', {}).get('sheetId')

            # GET sheet_metadata and columns
            sheet_schema, columns = get_sheet_metadata(sheet, spreadsheet_id, client)
            # LOGGER.info('sheet_schema: {}'.format(sheet_schema))

            # SKIP empty sheets (where sheet_schema and columns are None)
            if not sheet_schema or not columns:
                LOGGER.info('SKIPPING Empty Sheet: {}'.format(sheet_title))
            else:
                # Transform sheet_metadata
                sheet_metadata_tf = transform_sheet_metadata(spreadsheet_id, sheet, columns)
                # LOGGER.info('sheet_metadata_tf = {}'.format(sheet_metadata_tf))
                sheet_metadata.append(sheet_metadata_tf)

                # SHEET_DATA
                # Should this worksheet tab be synced?
                if sheet_title in selected_streams:
                    LOGGER.info('STARTED Syncing Sheet {}'.format(sheet_title))
                    update_currently_syncing(state, sheet_title)
                    selected_fields = get_selected_fields(catalog, sheet_title)
                    LOGGER.info('Stream: {}, selected_fields: {}'.format(
                        sheet_title, selected_fields))
                    write_schema(catalog, sheet_title)

                    # Emit a Singer ACTIVATE_VERSION message before the initial sync (but not
                    # subsequent syncs) and every time after a sheet sync is complete.
                    # This forces hard deletes on the data downstream if fewer records are sent.
                    # https://github.com/singer-io/singer-python/blob/master/singer/messages.py#L137
                    last_integer = int(get_bookmark(state, sheet_title, 0))
                    activate_version = int(time.time() * 1000)
                    activate_version_message = singer.ActivateVersionMessage(
                        stream=sheet_title,
                        version=activate_version)
                    if last_integer == 0:
                        # Initial load: send activate_version before AND after the data sync
                        singer.write_message(activate_version_message)
                        LOGGER.info('INITIAL SYNC, Stream: {}, Activate Version: {}'.format(
                            sheet_title, activate_version))

                    # Determine max range of columns and rows for "paging" through the data
                    sheet_last_col_index = 1
                    sheet_last_col_letter = 'A'
                    for col in columns:
                        col_index = col.get('columnIndex')
                        col_letter = col.get('columnLetter')
                        if col_index > sheet_last_col_index:
                            sheet_last_col_index = col_index
                            sheet_last_col_letter = col_letter
                    sheet_max_row = sheet.get('properties').get('gridProperties', {}).get('rowCount')

                    # Initialize paging for 1st batch
                    is_last_row = False
                    if config.get('batch_rows'):
                        batch_rows = config.get('batch_rows')
                    else:
                        batch_rows = 200
                    from_row = 2
                    if sheet_max_row < batch_rows:
                        to_row = sheet_max_row
                    else:
                        to_row = batch_rows

                    # Loop thru batches (each having batch_rows rows of data, 200 by default)
                    while not is_last_row and from_row < sheet_max_row and to_row <= sheet_max_row:
                        range_rows = 'A{}:{}{}'.format(from_row, sheet_last_col_letter, to_row)

                        # GET sheet_data for a worksheet tab
                        sheet_data, time_extracted = get_data(
                            stream_name=sheet_title,
                            endpoint_config=sheets_loaded_config,
                            client=client,
                            spreadsheet_id=spreadsheet_id,
                            range_rows=range_rows)
                        # Data is returned as a list of arrays, an array of values for each row
                        sheet_data_rows = sheet_data.get('values', [])

                        # Transform batch of rows to JSON with keys for each column
                        sheet_data_tf, row_num = transform_sheet_data(
                            spreadsheet_id=spreadsheet_id,
                            sheet_id=sheet_id,
                            sheet_title=sheet_title,
                            from_row=from_row,
                            columns=columns,
                            sheet_data_rows=sheet_data_rows)
                        if row_num < to_row:
                            is_last_row = True

                        # Process records, send batch of records to target
                        record_count = process_records(
                            catalog=catalog,
                            stream_name=sheet_title,
                            records=sheet_data_tf,
                            time_extracted=ss_time_extracted,
                            version=activate_version)
                        LOGGER.info('Sheet: {}, records processed: {}'.format(
                            sheet_title, record_count))

                        # Update paging from/to_row for next batch
                        from_row = to_row + 1
                        if to_row + batch_rows > sheet_max_row:
                            to_row = sheet_max_row
                        else:
                            to_row = to_row + batch_rows

                    # End of Stream: send Activate Version and update State
                    singer.write_message(activate_version_message)
                    write_bookmark(state, sheet_title, activate_version)
                    LOGGER.info('COMPLETE SYNC, Stream: {}, Activate Version: {}'.format(
                        sheet_title, activate_version))
                    LOGGER.info('FINISHED Syncing Sheet {}, Total Rows: {}'.format(
                        sheet_title, row_num - 2))  # excludes the header row
                    update_currently_syncing(state, None)

                    # SHEETS_LOADED
                    # Add sheet to sheets_loaded
                    sheet_loaded = {}
                    sheet_loaded['spreadsheetId'] = spreadsheet_id
                    sheet_loaded['sheetId'] = sheet_id
                    sheet_loaded['title'] = sheet_title
                    sheet_loaded['loadDate'] = strftime(utils.now())
                    sheet_loaded['lastRowNumber'] = row_num
                    sheets_loaded.append(sheet_loaded)

    stream_name = 'sheet_metadata'
    # Sync sheet_metadata if selected
    sync_stream(stream_name, selected_streams, catalog, state, sheet_metadata)

    stream_name = 'sheets_loaded'
    # Sync sheets_loaded if selected
    sync_stream(stream_name, selected_streams, catalog, state, sheets_loaded)

    # Update file_metadata bookmark
    write_bookmark(state, 'file_metadata', strftime(this_datetime))

    return
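
# Hedged sketch of an entry point that wires discovery and sync together.
# singer.utils.parse_args and handle_top_exception are real singer-python helpers;
# REQUIRED_CONFIG_KEYS, the GoogleClient context manager, and the discover()
# function are assumptions carried over from the sketches above.
import sys

REQUIRED_CONFIG_KEYS = ['client_id', 'client_secret', 'refresh_token',
                        'spreadsheet_id', 'start_date', 'user_agent']

@singer.utils.handle_top_exception(LOGGER)
def main():
    parsed_args = singer.utils.parse_args(REQUIRED_CONFIG_KEYS)
    config = parsed_args.config
    state = parsed_args.state or {}

    with GoogleClient(config['client_id'],
                      config['client_secret'],
                      config['refresh_token'],
                      config.get('user_agent')) as client:
        if parsed_args.discover:
            # Discovery mode: write the catalog to stdout
            catalog = discover(client, config['spreadsheet_id'])
            json.dump(catalog.to_dict(), sys.stdout, indent=2)
        elif parsed_args.catalog:
            # Sync mode: replicate the selected streams using the provided catalog and state
            sync(client=client, config=config, catalog=parsed_args.catalog, state=state)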