def Flush(self):
  """Finish writing JSON files and upload them to BigQuery."""
  self.bigquery = bigquery.GetBigQueryClient()
  # BigQuery job ids must be alphanum plus dash and underscore.
  urn_str = self.state.source_urn.RelativeName("aff4:/").replace(
      "/", "_").replace(":", "").replace(".", "-")

  for tracker in self.temp_output_trackers.values():
    # Close out the gzip handle and pass the original file handle to the
    # bigquery client so it sees the gzip'd content.
    tracker.gzip_filehandle.write("\n")
    tracker.gzip_filehandle.close()
    tracker.gzip_filehandle_parent.seek(0)

    # e.g. job_id: hunts_HFFE1D044_Results_ExportedFile_1446056474
    job_id = "{0}_{1}_{2}".format(
        urn_str, tracker.output_type,
        rdfvalue.RDFDatetime.Now().AsSecondsSinceEpoch())

    # If we have a job id stored, that means we failed last time. Re-use the
    # job id and append to the same file if it continues to fail. This avoids
    # writing many files on failure.
    if tracker.output_type in self.state.output_jobids:
      job_id = self.state.output_jobids[tracker.output_type]
    else:
      self.state.output_jobids[tracker.output_type] = job_id

    if (self.state.failure_count >=
        config.CONFIG["BigQuery.max_upload_failures"]):
      logging.error(
          "Exceeded BigQuery.max_upload_failures for %s. Giving up "
          "on BigQuery and writing to AFF4.", self.state.source_urn)
      self._WriteToAFF4(job_id, tracker.schema,
                        tracker.gzip_filehandle_parent, self.token)
    else:
      try:
        self.bigquery.InsertData(tracker.output_type,
                                 tracker.gzip_filehandle_parent,
                                 tracker.schema, job_id)
        # A success decays the failure count rather than resetting it, and
        # clears the stored job id so the next flush gets a fresh one.
        self.state.failure_count = max(0, self.state.failure_count - 1)
        del self.state.output_jobids[tracker.output_type]
      except bigquery.BigQueryJobUploadError:
        self.state.failure_count += 1
        self._WriteToAFF4(job_id, tracker.schema,
                          tracker.gzip_filehandle_parent, self.token)

  # Now that everything is in BigQuery we can remove the output streams.
  self.temp_output_trackers = {}
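
# A minimal, hypothetical sketch (not part of the plugin) of how the job id
# is derived in Flush() above. The URN string and timestamp are made-up
# examples; only the replace() chain and format string mirror the real code.
def _ExampleJobId(urn_relative_name, output_type, epoch_seconds):
  # BigQuery job ids may only contain alphanumerics, dashes and underscores,
  # so path separators become underscores, colons are dropped and dots become
  # dashes.
  urn_str = urn_relative_name.replace("/", "_").replace(":", "").replace(
      ".", "-")
  return "{0}_{1}_{2}".format(urn_str, output_type, epoch_seconds)

# Reproduces the example in the comment above:
assert (_ExampleJobId("hunts/HFFE1D044/Results", "ExportedFile", 1446056474)
        == "hunts_HFFE1D044_Results_ExportedFile_1446056474")
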
def testInsertData(self, mock_http, mock_build, mock_creds):
  bq_client = bigquery.GetBigQueryClient(
      service_account_json=self.SERVICE_ACCOUNT_JSON,
      project_id=self.PROJECT_ID)

  schema_data = json.load(
      open(
          os.path.join(config.CONFIG["Test.data_dir"], "bigquery",
                       "ExportedFile.schema"), "rb"))
  data_fd = open(
      os.path.join(config.CONFIG["Test.data_dir"], "bigquery",
                   "ExportedFile.json.gz"), "rb")
  now = rdfvalue.RDFDatetime.Now().AsSecondsSinceEpoch()
  job_id = "hunts_HFFE1D044_Results_%s" % now
  bq_client.InsertData("ExportedFile", data_fd, schema_data, job_id)

  # We should have called insert once.
  insert = mock_build.return_value.jobs.return_value.insert
  self.assertEqual(insert.call_count, 1)
  self.assertEqual(
      job_id, insert.call_args_list[0][1]["body"]["jobReference"]["jobId"])
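
# A hedged sketch of the production call chain the mocks above stand in for.
# It assumes the client is built on googleapiclient, as the
# mock_build.return_value.jobs.return_value.insert chain suggests; _InsertJob
# and its parameters are illustrative, not GRR's actual helper.
from googleapiclient import discovery
from googleapiclient import http

def _InsertJob(credentials, project_id, job_body, gzip_fd):
  service = discovery.build("bigquery", "v2", credentials=credentials)
  media = http.MediaIoBaseUpload(
      gzip_fd, mimetype="application/octet-stream", resumable=False)
  # jobs().insert() builds the request; execute() performs the upload. The
  # job id asserted in the test lives at job_body["jobReference"]["jobId"].
  return service.jobs().insert(
      projectId=project_id, body=job_body, media_body=media).execute()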