Example #1
    def Flush(self):
        """Finish writing JSON files, upload to cloudstorage and bigquery."""
        self.bigquery = bigquery.GetBigQueryClient()
        # BigQuery job ids must be alphanum plus dash and underscore.
        urn_str = self.state.source_urn.RelativeName("aff4:/").replace(
            "/", "_").replace(":", "").replace(".", "-")

        for tracker in self.temp_output_trackers.values():
            # Close out the gzip handle and pass the original file handle to the
            # bigquery client so it sees the gzip'd content.
            tracker.gzip_filehandle.write("\n")
            tracker.gzip_filehandle.close()
            tracker.gzip_filehandle_parent.seek(0)

            # e.g. job_id: hunts_HFFE1D044_Results_ExportedFile_1446056474
            job_id = "{0}_{1}_{2}".format(
                urn_str, tracker.output_type,
                rdfvalue.RDFDatetime.Now().AsSecondsSinceEpoch())

            # If we have a job id stored, that means we failed last time. Re-use the
            # job id and append to the same file if it continues to fail. This avoids
            # writing many files on failure.
            if tracker.output_type in self.state.output_jobids:
                job_id = self.state.output_jobids[tracker.output_type]
            else:
                self.state.output_jobids[tracker.output_type] = job_id

            if (self.state.failure_count >=
                    config.CONFIG["BigQuery.max_upload_failures"]):
                logging.error(
                    "Exceeded BigQuery.max_upload_failures for %s. Giving up "
                    "on BigQuery and writing to AFF4.", self.state.source_urn)
                self._WriteToAFF4(job_id, tracker.schema,
                                  tracker.gzip_filehandle_parent, self.token)

            else:
                try:
                    self.bigquery.InsertData(tracker.output_type,
                                             tracker.gzip_filehandle_parent,
                                             tracker.schema, job_id)
                    self.state.failure_count = max(
                        0, self.state.failure_count - 1)
                    del self.state.output_jobids[tracker.output_type]
                except bigquery.BigQueryJobUploadError:
                    self.state.failure_count += 1

                    self._WriteToAFF4(job_id, tracker.schema,
                                      tracker.gzip_filehandle_parent,
                                      self.token)

        # Now that everything is in BigQuery we can remove the output streams.
        self.temp_output_trackers = {}
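
The loop above assumes each entry in self.temp_output_trackers bundles a gzip writer layered over a plain temporary file: the gzip handle is closed so the gzip trailer gets written, and the parent handle is rewound so the BigQuery client uploads the compressed bytes from the start. Below is a minimal sketch of such a tracker, assuming a namedtuple-style container; the names TempOutputTracker and MakeTracker are illustrative, not necessarily the plugin's actual helpers.

import collections
import gzip
import tempfile

# Illustrative tracker mirroring the attributes Flush() relies on:
# gzip_filehandle (gzip writer), gzip_filehandle_parent (underlying temp
# file whose raw bytes get uploaded), plus output_type and schema.
TempOutputTracker = collections.namedtuple(
    "TempOutputTracker",
    ["output_type", "schema", "gzip_filehandle", "gzip_filehandle_parent"])


def MakeTracker(output_type, schema):
  # The gzip stream writes into the parent temp file, so reading the parent
  # from offset 0 yields the compressed JSON that BigQuery ingests.
  parent = tempfile.NamedTemporaryFile(suffix=".gz")
  gzip_fh = gzip.GzipFile(fileobj=parent, mode="wb")
  return TempOutputTracker(
      output_type=output_type,
      schema=schema,
      gzip_filehandle=gzip_fh,
      gzip_filehandle_parent=parent)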
Example #2
  def testInsertData(self, mock_http, mock_build, mock_creds):
    bq_client = bigquery.GetBigQueryClient(
        service_account_json=self.SERVICE_ACCOUNT_JSON,
        project_id=self.PROJECT_ID)

    with open(
        os.path.join(config.CONFIG["Test.data_dir"], "bigquery",
                     "ExportedFile.schema"), "rb") as schema_fd:
      schema_data = json.load(schema_fd)
    data_fd = open(
        os.path.join(config.CONFIG["Test.data_dir"], "bigquery",
                     "ExportedFile.json.gz"), "rb")
    now = rdfvalue.RDFDatetime.Now().AsSecondsSinceEpoch()
    job_id = "hunts_HFFE1D044_Results_%s" % now
    bq_client.InsertData("ExportedFile", data_fd, schema_data, job_id)

    # We should have called insert exactly once.
    insert = mock_build.return_value.jobs.return_value.insert
    self.assertEqual(insert.call_count, 1)
    self.assertEqual(
        job_id, insert.call_args_list[0][1]["body"]["jobReference"]["jobId"])
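
The mock_http, mock_build, and mock_creds parameters are presumably injected by mock.patch decorators on the test (the bottom-most decorator supplies the first argument); the exact patch targets are not shown here. The final assertion works because call_args_list[0][1] is the keyword-argument dict of the first recorded call, so indexing ["body"]["jobReference"]["jobId"] digs out the job id that was posted. A self-contained illustration of that pattern with a bare mock.Mock follows (the job id string is just an example value):

from unittest import mock

insert = mock.Mock()
# Simulate a single insert call, as the BigQuery client would issue it.
insert(body={"jobReference": {"jobId": "hunts_HFFE1D044_Results_1446056474"}})

assert insert.call_count == 1
# call_args_list[0] is the first recorded call; [1] is its kwargs dict.
assert insert.call_args_list[0][1]["body"]["jobReference"]["jobId"] == (
    "hunts_HFFE1D044_Results_1446056474")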