Example #1
  def Flush(self):
    """Finish writing JSON files, upload to cloudstorage and bigquery."""
    self.bigquery = bigquery.GetBigQueryClient()
    # BigQuery job ids must be alphanum plus dash and underscore.
    urn_str = self.state.source_urn.RelativeName("aff4:/").replace(
        "/", "_").replace(":", "").replace(".", "-")

    for tracker in self.temp_output_trackers.values():
      # Close out the gzip handle and pass the original file handle to the
      # bigquery client so it sees the gzip'd content.
      tracker.gzip_filehandle.write("\n")
      tracker.gzip_filehandle.close()
      tracker.gzip_filehandle_parent.seek(0)

      # e.g. job_id: hunts_HFFE1D044_Results_ExportedFile_1446056474
      job_id = "{0}_{1}_{2}".format(
          urn_str, tracker.output_type,
          rdfvalue.RDFDatetime.Now().AsSecondsFromEpoch())

      # If we have a job id stored, that means we failed last time. Re-use the
      # job id and append to the same file if it continues to fail. This avoids
      # writing many files on failure.
      if tracker.output_type in self.state.output_jobids:
        job_id = self.state.output_jobids[tracker.output_type]
      else:
        self.state.output_jobids[tracker.output_type] = job_id

      if (self.state.failure_count >=
          config.CONFIG["BigQuery.max_upload_failures"]):
        logging.error("Exceeded BigQuery.max_upload_failures for %s. Giving up "
                      "on BigQuery and writing to AFF4.", self.state.source_urn)
        self._WriteToAFF4(job_id, tracker.schema,
                          tracker.gzip_filehandle_parent, self.token)

      else:
        try:
          self.bigquery.InsertData(tracker.output_type,
                                   tracker.gzip_filehandle_parent,
                                   tracker.schema, job_id)
          self.state.failure_count = max(0, self.state.failure_count - 1)
          del self.state.output_jobids[tracker.output_type]
        except bigquery.BigQueryJobUploadError:
          self.state.failure_count += 1

          self._WriteToAFF4(job_id, tracker.schema,
                            tracker.gzip_filehandle_parent, self.token)

    # Now that everything is in bigquery we can remove the output streams
    self.temp_output_trackers = {}
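
The job-id bookkeeping above is the core of the retry behaviour: a failed upload leaves its job id in self.state.output_jobids, so the next Flush reuses the same BigQuery job id and keeps appending to one file instead of producing a new file per failure, while a success decays the failure counter and clears the stored id. A minimal standalone sketch of that pattern (the class and the make_job_id/upload/fallback callables are hypothetical names for illustration, not part of GRR):

class _UploadState(object):
  """Sketch of the job-id reuse and failure-count bookkeeping shown above."""

  def __init__(self, max_failures):
    self.output_jobids = {}
    self.failure_count = 0
    self.max_failures = max_failures

  def FlushOne(self, output_type, data, make_job_id, upload, fallback):
    # Reuse the job id left behind by a previously failed upload, if any,
    # so repeated failures keep appending to one job instead of many.
    job_id = self.output_jobids.setdefault(output_type,
                                           make_job_id(output_type))

    if self.failure_count >= self.max_failures:
      # Too many failures: give up on BigQuery and use the fallback sink.
      fallback(job_id, data)
      return

    try:
      upload(output_type, data, job_id)
      # Success: decay the failure counter and forget the stored job id.
      self.failure_count = max(0, self.failure_count - 1)
      del self.output_jobids[output_type]
    except Exception:
      self.failure_count += 1
      fallback(job_id, data)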
Example #2
  def testInsertData(self, mock_http, mock_build, mock_creds):
    bq_client = bigquery.GetBigQueryClient(
        service_account_json=self.SERVICE_ACCOUNT_JSON,
        project_id=self.PROJECT_ID)

    schema_data = json.load(
        open(
            os.path.join(config.CONFIG["Test.data_dir"], "bigquery",
                         "ExportedFile.schema"), "rb"))
    data_fd = open(
        os.path.join(config.CONFIG["Test.data_dir"], "bigquery",
                     "ExportedFile.json.gz"), "rb")
    now = rdfvalue.RDFDatetime.Now().AsSecondsSinceEpoch()
    job_id = "hunts_HFFE1D044_Results_%s" % now
    bq_client.InsertData("ExportedFile", data_fd, schema_data, job_id)

    # We should have called insert once
    insert = mock_build.return_value.jobs.return_value.insert
    self.assertEqual(insert.call_count, 1)
    self.assertEqual(
        job_id, insert.call_args_list[0][1]["body"]["jobReference"]["jobId"])
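
The mock_http, mock_build and mock_creds parameters imply three mock.patch decorators that are not part of this excerpt; decorators are applied bottom-up, so the lowest one feeds the first mock argument. The exact patch targets depend on how GRR's bigquery module imports httplib2, the API discovery client and the service-account credentials, so the targets in this sketch are assumptions rather than the module's actual layout:

  # Hedged sketch of the decorators omitted from the excerpt (assumes the
  # standard mock / unittest.mock library; patch targets are assumptions).
  @mock.patch.object(bigquery, "ServiceAccountCredentials")  # -> mock_creds
  @mock.patch.object(bigquery.discovery, "build")            # -> mock_build
  @mock.patch.object(bigquery.httplib2, "Http")              # -> mock_http
  def testInsertData(self, mock_http, mock_build, mock_creds):
    ...

Patching the discovery build function is what makes the assertion on mock_build.return_value.jobs.return_value.insert possible: the test inspects the insert call on the fake service object that the patched build returns.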