def _export_builds(dataset, table_name, builds, deadline):
  """Saves builds to BigQuery.

  Logs insert errors and returns a list of ids of builds that could not be
  inserted.
  """
  # BigQuery API doc:
  # https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll
  logging.info('sending %d rows', len(builds))

  pairs = [(b, build_pb2.Build()) for b in builds]
  model.builds_to_protos_async(
      pairs,
      load_tags=True,
      load_input_properties=True,
      load_output_properties=True,
      load_steps=True,
      load_infra=True,
  ).get_result()

  # Clear fields that we don't want in BigQuery.
  for _, proto in pairs:
    proto.infra.buildbucket.hostname = ''
    for s in proto.steps:
      s.summary_markdown = ''
      s.ClearField('logs')

  res = net.json_request(
      url=(('https://www.googleapis.com/bigquery/v2/'
            'projects/%s/datasets/%s/tables/%s/insertAll') %
           (app_identity.get_application_id(), dataset, table_name)),
      method='POST',
      payload={
          'kind': 'bigquery#tableDataInsertAllRequest',
          # Do not fail entire request because of one bad build.
          # We handle invalid rows below.
          'skipInvalidRows': True,
          'ignoreUnknownValues': False,
          'rows': [{
              'insertId': str(p.id),
              'json': bqh.message_to_dict(p),
          } for _, p in pairs],
      },
      scopes=bqh.INSERT_ROWS_SCOPE,
      # deadline parameter here is duration in seconds.
      deadline=(deadline - utils.utcnow()).total_seconds(),
  )

  failed_ids = []
  for err in res.get('insertErrors', []):
    _, bp = pairs[err['index']]
    failed_ids.append(bp.id)
    logging.error(
        'failed to insert row for build %d: %r', bp.id, err['errors'])
  return failed_ids
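
# Usage sketch (hypothetical; not part of the original module). It shows the
# deadline contract: _export_builds() takes an absolute datetime and converts
# it into the relative seconds that net.json_request() expects. Assumes the
# module imports datetime; _cron_export is a made-up name for illustration.
def _cron_export(dataset, table_name, builds):
  # Give the whole batch five minutes of wall-clock time.
  deadline = utils.utcnow() + datetime.timedelta(minutes=5)
  failed_ids = _export_builds(dataset, table_name, builds, deadline)
  if failed_ids:
    # insertAll reports per-row errors instead of failing the whole request,
    # so a later run can pick up just the builds that were not inserted.
    logging.warning('%d builds were not inserted', len(failed_ids))
  return failed_ids
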
def _send_to_bq(snapshots):
  """Sends the snapshots to BigQuery.

  Returns:
    Timestamps, encoded as strings, of snapshots that failed to be sent.
  """
  # See doc/Monitoring.md.
  dataset = 'isolated'
  table_name = 'stats'

  # BigQuery API doc:
  # https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll
  url = (
      'https://www.googleapis.com/bigquery/v2/projects/%s/datasets/%s/tables/'
      '%s/insertAll') % (app_identity.get_application_id(), dataset, table_name)
  payload = {
      'kind': 'bigquery#tableDataInsertAllRequest',
      # Do not fail the entire request because of one bad snapshot.
      # We handle invalid rows below.
      'skipInvalidRows': True,
      'ignoreUnknownValues': False,
      'rows': [{
          'insertId': s.timestamp_str,
          'json': bqh.message_to_dict(_to_proto(s)),
      } for s in snapshots],
  }
  res = net.json_request(
      url=url,
      method='POST',
      payload=payload,
      scopes=bqh.INSERT_ROWS_SCOPE,
      deadline=600)

  failed = []
  for err in res.get('insertErrors', []):
    t = snapshots[err['index']].timestamp_str
    if not failed:
      # Log the error for the first entry only; that is usually enough to
      # diagnose a schema failure.
      logging.error('Failed to insert row %s: %r', t, err['errors'])
    failed.append(t)
  return failed
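
# Hypothetical caller sketch; _fetch_pending_snapshots and _mark_for_retry are
# made-up names for illustration. The key property relied on here is that
# re-sending a snapshot with the same timestamp_str is safe, because BigQuery
# performs best-effort deduplication of streaming inserts keyed on insertId.
def _cron_send_to_bq():
  snapshots = _fetch_pending_snapshots()
  if not snapshots:
    return
  failed = _send_to_bq(snapshots)
  if failed:
    # Keep the failed timestamps around so the next run retries them.
    _mark_for_retry(failed)
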
def _send_to_bq_raw(dataset, table_name, rows):
  """Sends the rows to BigQuery.

  Arguments:
    dataset: BigQuery dataset name that contains the table.
    table_name: BigQuery table to stream the rows to.
    rows: list of (row_id, row) tuples to send to BQ.

  Returns:
    Indexes of rows that failed to be sent.
  """
  # BigQuery API doc:
  # https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll
  url = (
      'https://www.googleapis.com/bigquery/v2/projects/%s/datasets/%s/tables/'
      '%s/insertAll') % (app_identity.get_application_id(), dataset, table_name)
  payload = {
      'kind': 'bigquery#tableDataInsertAllRequest',
      # Do not fail the entire request because of one bad row.
      # We handle invalid rows below.
      'skipInvalidRows': True,
      'ignoreUnknownValues': False,
      'rows': [{
          'insertId': row_id,
          'json': bqh.message_to_dict(row),
      } for row_id, row in rows],
  }
  res = net.json_request(
      url=url,
      method='POST',
      payload=payload,
      scopes=bqh.INSERT_ROWS_SCOPE,
      deadline=600)

  dropped = 0
  failed = []
  # This error message string detects the case where we are pushing data that
  # is too old. It can occasionally happen when a cron job looks for old
  # entities and, by the time it sends them, BigQuery no longer accepts them.
  # Skip these rows and log a warning.
  out_of_time = (
      'You can only stream to date range within 365 days in the past '
      'and 183 days in the future relative to the current date')
  # https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll#response
  for line in res.get('insertErrors', []):
    i = line['index']
    err = line['errors'][0]
    if err['reason'] == 'invalid' and out_of_time in err['message']:
      # Silently drop it. The rationale is that if it is not skipped, the loop
      # will get stuck on it.
      dropped += 1
      continue
    if not failed:
      # Log the error for the first entry only; that is usually enough to
      # diagnose a schema failure.
      logging.error('Failed to insert row %s: %r', i, err)
    failed.append(i)
  if dropped:
    logging.warning('%d old rows silently dropped', dropped)
  return failed
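
# Illustrative retry wrapper (not in the original code): since
# _send_to_bq_raw() returns the indexes of rows that were not inserted, a
# caller can narrow the batch to just those rows and try again. Re-sending
# with the same row_id is safe because BigQuery deduplicates streaming
# inserts on insertId (best effort). _send_with_retries is a made-up name.
def _send_with_retries(dataset, table_name, rows, attempts=3):
  """Returns the (row_id, row) pairs still failing after all attempts."""
  pending = rows
  for _ in range(attempts):
    if not pending:
      break
    failed = _send_to_bq_raw(dataset, table_name, pending)
    pending = [pending[i] for i in failed]
  return pending
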