Code example #1
File: spec_test.py Project: cl3777/curation
    def test_get_full_result_log_when_all_exist(self, mock_hpo_csv):
        self._empty_hpo_buckets()
        hpos = resources.hpo_csv()
        hpo_0 = hpos[0]
        hpo_0_bucket = gcs_utils.get_hpo_bucket(hpo_0['hpo_id'])
        with open(FIVE_PERSONS_SUCCESS_RESULT_CSV, 'r') as fp:
            gcs_utils.upload_object(hpo_0_bucket, common.RESULT_CSV, fp)

        with open(FIVE_PERSONS_SUCCESS_RESULT_NO_HPO_JSON, 'r') as fp:
            hpo_0_expected_items = json.load(fp)
            for item in hpo_0_expected_items:
                item['hpo_id'] = hpo_0['hpo_id']

        hpo_1 = hpos[1]
        hpo_1_bucket = gcs_utils.get_hpo_bucket(hpo_1['hpo_id'])
        with open(ALL_FILES_UNPARSEABLE_VALIDATION_RESULT, 'r') as fp:
            gcs_utils.upload_object(hpo_1_bucket, common.RESULT_CSV, fp)

        with open(ALL_FILES_UNPARSEABLE_VALIDATION_RESULT_NO_HPO_JSON,
                  'r') as fp:
            hpo_1_expected_items = json.load(fp)
            for item in hpo_1_expected_items:
                item['hpo_id'] = hpo_1['hpo_id']

        expected = hpo_0_expected_items + hpo_1_expected_items
        actual = main.get_full_result_log()
        self.assertResultLogItemsEqual(expected, actual)
Code example #2
 def test_get_metadata_on_existing_file(self):
     expected_file_name = 'person.csv'
     with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
         gcs_utils.upload_object(self.hpo_bucket, expected_file_name, fp)
     metadata = gcs_utils.get_metadata(self.hpo_bucket, expected_file_name)
     self.assertIsNotNone(metadata)
     self.assertEqual(metadata['name'], expected_file_name)
Code example #3
File: bq_utils_test.py Project: ksdkalluri/curation
    def test_merge_with_unmatched_schema(self):
        running_jobs = []
        with open(NYC_FIVE_PERSONS_MEASUREMENT_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                    'measurement.csv', fp)
        result = bq_utils.load_cdm_csv('nyc', 'measurement')
        running_jobs.append(result['jobReference']['jobId'])

        with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('pitt', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        self.assertEqual(
            len(incomplete_jobs), 0,
            'loading tables {},{} timed out'.format('nyc_measurement',
                                                    'pitt_person'))

        table_names = ['nyc_measurement', 'pitt_person']
        success, error = bq_utils.merge_tables(bq_utils.get_dataset_id(),
                                               table_names,
                                               bq_utils.get_dataset_id(),
                                               'merged_nyc_pitt')
        self.assertFalse(success)
Code example #4
File: bq_utils_test.py Project: ksdkalluri/curation
 def test_load_ehr_observation(self):
     hpo_id = 'pitt'
     dataset_id = bq_utils.get_dataset_id()
     table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
     q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
         dataset_id=dataset_id, table_id=table_id)
     expected_observation_ids = [
         int(row['observation_id']) for row in resources._csv_to_list(
             PITT_FIVE_PERSONS_OBSERVATION_CSV)
     ]
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                 'observation.csv', fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors,
                       msg='pitt_observation load job failed: ' +
                       str(load_job_errors))
     query_results_response = bq_utils.query(q)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [
         int(row['f'][0]['v']) for row in query_results_response['rows']
     ]
     self.assertListEqual(actual_result, expected_observation_ids)
Code example #5
 def load_dataset_from_files(dataset_id, path):
     app_id = bq_utils.app_identity.get_application_id()
     bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
     test_util.empty_bucket(bucket)
     job_ids = []
     for table in common.CDM_TABLES:
         filename = table + '.csv'
         schema = os.path.join(resources.fields_path, table + '.json')
         f = os.path.join(path, filename)
         if os.path.exists(os.path.join(path, filename)):
             with open(f, 'r') as fp:
                 gcs_utils.upload_object(bucket, filename, fp)
         else:
             test_util.write_cloud_str(bucket, filename, '\n')
         gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                      filename=filename)
         load_results = bq_utils.load_csv(schema,
                                          gcs_path,
                                          app_id,
                                          dataset_id,
                                          table,
                                          allow_jagged_rows=True)
         load_job_id = load_results['jobReference']['jobId']
         job_ids.append(load_job_id)
     incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
     if len(incomplete_jobs) > 0:
         message = "Job id(s) %s failed to complete" % incomplete_jobs
         raise RuntimeError(message)
     test_util.empty_bucket(bucket)
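
A hedged usage sketch of the helper above; the local path is hypothetical and bq_utils is assumed to be imported as in the surrounding examples:

# Load every CDM table found under a local directory into the default dataset;
# tables without a matching CSV are loaded as empty (see the helper above).
load_dataset_from_files(bq_utils.get_dataset_id(), '/path/to/five_persons_data')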
Code example #6
 def test_get_object(self):
     with open(FIVE_PERSONS_PERSON_CSV, 'r') as fp:
         expected = fp.read()
     with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
         gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
     result = gcs_utils.get_object(self.hpo_bucket, 'person.csv')
     self.assertEqual(expected, result)
Code example #7
 def test_upload_object(self):
     bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
     self.assertEqual(len(bucket_items), 0)
     with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
         gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
     bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
     self.assertEqual(len(bucket_items), 1)
     bucket_item = bucket_items[0]
     self.assertEqual(bucket_item['name'], 'person.csv')
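
Examples #2, #6 and #7 above all follow the same round-trip pattern: upload a local file with gcs_utils.upload_object, then confirm it via list_bucket, get_metadata or get_object. A minimal sketch of that pattern, with a hypothetical bucket name and local file, assuming the project's gcs_utils module is importable:

import gcs_utils

bucket = 'some-hpo-bucket'                             # hypothetical bucket name
with open('person.csv', 'rb') as fp:                   # hypothetical local file
    gcs_utils.upload_object(bucket, 'person.csv', fp)  # upload from a file handle

# confirm the object landed in the bucket
items = gcs_utils.list_bucket(bucket)
assert any(item['name'] == 'person.csv' for item in items)

# read the uploaded content back as a string
content = gcs_utils.get_object(bucket, 'person.csv')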
Code example #8
    def test_integration_queries_to_retract_from_fake_dataset(self, mock_list_datasets, mock_is_ehr_dataset):
        mock_list_datasets.return_value = [{'id': self.project_id+':'+self.bq_dataset_id}]
        mock_is_ehr_dataset.return_value = True

        job_ids = []
        row_count_queries = {}
        # load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
            # store query for checking number of rows to delete
            row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(dataset_id=self.bq_dataset_id,
                                                                      table_id=hpo_table,
                                                                      pids=retract_data_bq.int_list_to_bq(
                                                                                self.person_ids))
            retract_data_bq.logger.debug('Preparing to load table %s.%s' % (self.bq_dataset_id,
                                                                            hpo_table))
            with open(cdm_file, 'rb') as f:
                gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id), cdm_file_name, f)
            result = bq_utils.load_cdm_csv(self.hpo_id, cdm_table, dataset_id=self.bq_dataset_id)
            retract_data_bq.logger.debug('Loading table %s.%s' % (self.bq_dataset_id,
                                                                  hpo_table))
            job_id = result['jobReference']['jobId']
            job_ids.append(job_id)
        incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
        self.assertEqual(len(incomplete_jobs), 0, 'NYC five person load job did not complete')
        retract_data_bq.logger.debug('All tables loaded successfully')

        # use query results to count number of expected row deletions
        expected_row_count = {}
        for table in row_count_queries:
            result = bq_utils.query(row_count_queries[table])
            expected_row_count[table] = retract_data_bq.to_int(result['totalRows'])

        # separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_before_retraction = {}
        for row in result:
            row_count_before_retraction[row['table_id']] = row['row_count']
        deid_flag = False

        # perform retraction
        retract_data_bq.run_retraction(self.test_project_id, self.person_ids, self.hpo_id, deid_flag)

        # find actual deleted rows
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_after_retraction = {}
        for row in result:
            row_count_after_retraction[row['table_id']] = row['row_count']
        for table in expected_row_count:
            self.assertEqual(expected_row_count[table],
                             row_count_before_retraction[table] - row_count_after_retraction[table])
Code example #9
    def test_query_result(self):
        with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
        result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)

        load_job_id = result['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0, 'loading table {} timed out'.format(PERSON))

        table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
        q = 'SELECT person_id FROM %s' % table_id
        result = bq_utils.query(q)
        self.assertEqual(5, int(result['totalRows']))
Code example #10
File: spec_test.py Project: cl3777/curation
 def test_get_full_result_log_when_one_does_not_exist(self, mock_hpo_csv):
     self._empty_hpo_buckets()
     hpos = resources.hpo_csv()
     hpo_0 = hpos[0]
     hpo_0_bucket = gcs_utils.get_hpo_bucket(hpo_0['hpo_id'])
     with open(FIVE_PERSONS_SUCCESS_RESULT_CSV, 'r') as fp:
         gcs_utils.upload_object(hpo_0_bucket, common.RESULT_CSV, fp)
     with open(FIVE_PERSONS_SUCCESS_RESULT_NO_HPO_JSON, 'r') as fp:
         expected = json.load(fp)
         for item in expected:
             item['hpo_id'] = hpo_0['hpo_id']
     actual = main.get_full_result_log()
     self.assertResultLogItemsEqual(expected, actual)
Code example #11
    def test_load_cdm_csv(self):
        with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
        result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
        self.assertEqual(result['status']['state'], 'RUNNING')

        load_job_id = result['jobReference']['jobId']
        table_id = result['configuration']['load']['destinationTable']['tableId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0, 'loading table {} timed out'.format(table_id))
        table_info = bq_utils.get_table_info(table_id)
        num_rows = table_info.get('numRows')
        self.assertEqual(num_rows, '5')
Code example #12
File: bq_utils_test.py Project: ksdkalluri/curation
    def test_merge_with_good_data(self):
        running_jobs = []
        with open(NYC_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('nyc', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('pitt', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        nyc_person_ids = [
            int(row['person_id'])
            for row in resources._csv_to_list(NYC_FIVE_PERSONS_PERSON_CSV)
        ]
        pitt_person_ids = [
            int(row['person_id'])
            for row in resources._csv_to_list(PITT_FIVE_PERSONS_PERSON_CSV)
        ]
        expected_result = nyc_person_ids + pitt_person_ids
        expected_result.sort()

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        self.assertEqual(
            len(incomplete_jobs), 0,
            'loading tables {},{} timed out'.format('nyc_person',
                                                    'pitt_person'))

        dataset_id = bq_utils.get_dataset_id()
        table_ids = ['nyc_person', 'pitt_person']
        merged_table_id = 'merged_nyc_pitt'
        success_flag, error = bq_utils.merge_tables(dataset_id, table_ids,
                                                    dataset_id,
                                                    merged_table_id)

        self.assertTrue(success_flag)
        self.assertEqual(error, "")

        query_string = 'SELECT person_id FROM {dataset_id}.{table_id}'.format(
            dataset_id=dataset_id, table_id=merged_table_id)
        merged_query_job_result = bq_utils.query(query_string)

        self.assertIsNone(merged_query_job_result.get('errors', None))
        actual_result = [
            int(row['f'][0]['v']) for row in merged_query_job_result['rows']
        ]
        actual_result.sort()
        self.assertListEqual(expected_result, actual_result)
Code example #13
 def _upload_file_to_bucket(bucket, dataset_id, path, table):
     app_id = bq_utils.app_identity.get_application_id()
     filename = table + '.csv'
     schema = os.path.join(resources.fields_path, table + '.json')
     f = os.path.join(path, filename)
     if os.path.exists(os.path.join(path, filename)):
         with open(f, 'r') as fp:
             gcs_utils.upload_object(bucket, filename, fp)
     else:
         test_util.write_cloud_str(bucket, filename, '\n')
     gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket, filename=filename)
     load_results = bq_utils.load_csv(schema, gcs_path, app_id, dataset_id, table, allow_jagged_rows=True)
     load_job_id = load_results['jobReference']['jobId']
     return load_job_id
Code example #14
    def test_load_cdm_csv(self):
        with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
        result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
        self.assertEqual(result['status']['state'], 'RUNNING')

        load_job_id = result['jobReference']['jobId']
        table_id = result['configuration']['load']['destinationTable'][
            'tableId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'loading table {} timed out'.format(table_id))
        query_response = bq_utils.query('SELECT 1 FROM %(table_id)s' %
                                        locals())
        self.assertEqual(query_response['totalRows'], '5')
Code example #15
File: bq_utils_test.py Project: ksdkalluri/curation
    def test_load_csv(self):
        from google.appengine.api import app_identity

        app_id = app_identity.get_application_id()
        table_name = 'achilles_analysis'
        schema_file_name = table_name + '.json'
        csv_file_name = table_name + '.csv'
        schema_path = os.path.join(resources.fields_path, schema_file_name)
        local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH,
                                      csv_file_name)
        with open(local_csv_path, 'r') as fp:
            response = gcs_utils.upload_object(self.hpo_bucket, csv_file_name,
                                               fp)
        hpo_bucket = self.hpo_bucket
        gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
        dataset_id = bq_utils.get_dataset_id()
        load_results = bq_utils.load_csv(schema_path, gcs_object_path, app_id,
                                         dataset_id, table_name)

        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'loading table {} timed out'.format(table_name))
        query_response = bq_utils.query('SELECT COUNT(1) FROM %(table_name)s' %
                                        locals())
        self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
Code example #16
def save_datasources_json(datasource_id=None,
                          folder_prefix="",
                          target_bucket=None):
    """
    Generate and save datasources.json (from curation report) in a GCS bucket

    :param datasource_id: the ID of the HPO or aggregate dataset that the report should go to
    :param folder_prefix: relative path in GCS to save to (without 'gs://')
    :param target_bucket: GCS bucket to save to. If not supplied, uses the
        bucket assigned to datasource_id.
    :return:
    """
    if datasource_id is None:
        if target_bucket is None:
            raise RuntimeError(
                'Cannot save datasources.json if neither datasource_id '
                'nor target_bucket is specified.')
        hpo_id = 'default'
    else:
        if target_bucket is None:
            target_bucket = gcs_utils.get_hpo_bucket(datasource_id)

    datasource = dict(name=datasource_id, folder=datasource_id, cdmVersion=5)
    datasources = dict(datasources=[datasource])
    datasources_fp = StringIO(json.dumps(datasources))
    result = gcs_utils.upload_object(
        target_bucket, folder_prefix + ACHILLES_EXPORT_DATASOURCES_JSON,
        datasources_fp)
    return result
Code example #17
def _upload_achilles_files(hpo_id=None, folder_prefix='', target_bucket=None):
    """
    Upload achilles web (index) files to the corresponding HPO bucket

    :param hpo_id: HPO whose bucket the files go into (ignored when target_bucket is given)
    :param folder_prefix: relative path in GCS to upload to
    :param target_bucket: GCS bucket to upload to; if None, the bucket of hpo_id is used
    :return: list of upload results
    """
    results = []
    if target_bucket is not None:
        bucket = target_bucket
    else:
        if hpo_id is None:
            raise RuntimeError(
                'either hpo_id or target_bucket must be specified')
        bucket = gcs_utils.get_hpo_bucket(hpo_id)
    logging.info('Uploading achilles index files to `gs://%s/%s`...', bucket,
                 folder_prefix)
    for filename in resources.ACHILLES_INDEX_FILES:
        logging.info('Uploading achilles file `%s` to bucket `%s`' %
                     (filename, bucket))
        bucket_file_name = filename.split(resources.resource_path +
                                          os.sep)[1].strip().replace(
                                              '\\', '/')
        with open(filename, 'rb') as fp:
            upload_result = gcs_utils.upload_object(
                bucket, folder_prefix + bucket_file_name, fp)
            results.append(upload_result)
    return results
Code example #18
File: main.py Project: juh7007/curation
def save_datasources_json(hpo_id, folder_prefix=""):
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    datasource = dict(name=hpo_id, folder=hpo_id, cdmVersion=5)
    datasources = dict(datasources=[datasource])
    datasources_fp = StringIO.StringIO(json.dumps(datasources))
    result = gcs_utils.upload_object(
        hpo_bucket, folder_prefix + ACHILLES_EXPORT_DATASOURCES_JSON,
        datasources_fp)
    return result
Code example #19
def retract(pids, bucket, found_files, folder_prefix, force_flag):
    """
    Retract from a folder in a GCS bucket all records associated with a pid

    :param pids: person_ids to retract
    :param bucket: bucket containing records to retract
    :param found_files: files found in the current folder
    :param folder_prefix: current folder being processed
    :param force_flag: if False then prompt for each file
    :return: metadata for each object updated in order to retract
    """
    result_list = []
    for file_name in found_files:
        table_name = file_name.split(".")[0]
        lines_removed = 0
        if force_flag:
            logger.debug("Attempting to force retract for person_ids %s in path %s/%s%s"
                         % (pids, bucket, folder_prefix, file_name))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            logger.debug("Are you sure you want to retract rows for person_ids %s from path %s/%s%s?"
                         % (pids, bucket, folder_prefix, file_name))
            response = get_response()
        if response == "Y":
            # Output and input file content initialization
            retracted_file_string = StringIO.StringIO()
            input_file_string = gcs_utils.get_object(bucket, folder_prefix + file_name)
            input_contents = input_file_string.split('\n')
            modified_flag = False

            logger.debug("Checking for person_ids %s in path %s/%s%s"
                         % (pids, bucket, folder_prefix, file_name))

            # Check if file has person_id in first or second column
            for input_line in input_contents:
                if input_line != '':
                    if (table_name in PID_IN_COL1 and get_integer(input_line.split(",")[0]) in pids) or \
                            (table_name in PID_IN_COL2 and get_integer(input_line.split(",")[1]) in pids):
                        lines_removed += 1
                        modified_flag = True
                    else:
                        retracted_file_string.write(input_line + '\n')

            # Write result back to bucket
            if modified_flag:
                logger.debug("Retracted %d rows from %s/%s%s" % (lines_removed, bucket, folder_prefix, file_name))
                logger.debug("Overwriting file %s/%s%s" % (bucket, folder_prefix, file_name))
                upload_result = gcs_utils.upload_object(bucket, folder_prefix + file_name, retracted_file_string)
                result_list.append(upload_result)
                logger.debug("Retraction successful for file %s/%s%s " % (bucket, folder_prefix, file_name))
            else:
                logger.debug("Skipping file %s/%s%s since pids %s not found" % (bucket, folder_prefix, file_name, pids))
        elif response.lower() == "n":
            logger.debug("Ignoring file %s" % file_name)
    return result_list
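
A hedged usage sketch of the function above; the person_ids, HPO id and folder prefix are hypothetical, and the file discovery assumes list_bucket returns full object names as in the tests earlier on this page:

pids = [1001, 1002]                        # hypothetical person_ids to retract
bucket = gcs_utils.get_hpo_bucket('nyc')   # hypothetical HPO
folder_prefix = '2019-06-01/'              # hypothetical submission folder
# file names inside the folder, relative to the prefix
found_files = [item['name'][len(folder_prefix):]
               for item in gcs_utils.list_bucket(bucket)
               if item['name'].startswith(folder_prefix)]
# force_flag=True skips the per-file confirmation prompt
updated_objects = retract(pids, bucket, found_files, folder_prefix, force_flag=True)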
Code example #20
File: retract_data.py Project: noahgengel/curation
def retract(pid, bucket, found_files, folder_prefix, force):
    """
    Retract from a folder in a GCS bucket all records associated with a pid

    :param pid: person_id
    :param bucket: bucket containing records to retract
    :param found_files: files found in the current folder
    :param folder_prefix: current folder being processed
    :param force: if False then prompt for each file
    :return: metadata for each object updated in order to retract
    """
    result_list = []
    for file_name in found_files:
        if force:
            print("Force retracting rows for person_id %s from path %s/%s%s" %
                  (pid, bucket, folder_prefix, file_name))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            print(
                "Are you sure you want to retract rows for person_id %s from path %s/%s%s?"
                % (pid, bucket, folder_prefix, file_name))
            response = get_response()
        if response == "Y":
            # Output and input file content initialization
            retracted_file_string = StringIO.StringIO()
            input_file_string = gcs_utils.get_object(bucket,
                                                     folder_prefix + file_name)
            input_contents = input_file_string.split('\n')
            modified_flag = False

            # Check if file has person_id in first or second column
            for input_line in input_contents:
                if input_line != '':
                    if (file_name in PID_IN_COL1 and get_integer(input_line.split(",")[0]) != pid) or \
                            (file_name in PID_IN_COL2 and get_integer(input_line.split(",")[1]) != pid):
                        retracted_file_string.write(input_line + '\n')
                    else:
                        modified_flag = True
            # TODO: return number of lines removed, message if no file in the folder was updated
            # Write result back to bucket
            if modified_flag:
                print("Overwriting file %s/%s%s" %
                      (bucket, folder_prefix, file_name))
                upload_result = gcs_utils.upload_object(
                    bucket, folder_prefix + file_name, retracted_file_string)
                result_list.append(upload_result)
            else:
                print("Skipping file %s/%s%s since pid %s not found" %
                      (bucket, folder_prefix, file_name, pid))
        elif response.lower() == "n":
            print("Ignoring file %s" % file_name)
    return result_list
Code example #21
    def _upload_file_to_bucket(bucket, dataset_id, path, table):
        app_id = bq_utils.app_identity.get_application_id()
        filename = table + '.csv'

        file_path = os.path.join(path, filename)
        try:
            with open(file_path, 'rb') as filepath:
                gcs_utils.upload_object(bucket, filename, filepath)
        except OSError:
            test_util.write_cloud_str(bucket, filename, '\n')

        gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                     filename=filename)
        load_results = bq_utils.load_csv(table,
                                         gcs_path,
                                         app_id,
                                         dataset_id,
                                         table,
                                         allow_jagged_rows=True)
        load_job_id = load_results['jobReference']['jobId']
        return load_job_id
Code example #22
def _write_string_to_file(bucket, name, string):
    """
    Save the validation results in GCS
    :param bucket: bucket to save to
    :param name: name of the file (object) to save to in GCS
    :param string: string to write
    :return:
    """
    f = StringIO.StringIO()
    f.write(string)
    f.seek(0)
    result = gcs_utils.upload_object(bucket, name, f)
    f.close()
    return result
Code example #23
def _write_string_to_file(bucket, name, string):
    """
    Save the validation results in GCS
    :param bucket: bucket to save to
    :param name: name of the file (object) to save to in GCS
    :param string: string to write
    :return:
    """
    f = StringIO()
    f.write(string)
    f.seek(0)
    result = gcs_utils.upload_object(bucket, name, f)
    f.close()
    return result
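
Example #23 above (and several of the examples that follow) uploads in-memory content rather than a file on disk: write into a StringIO buffer, seek back to the start, and pass the buffer to upload_object. A minimal sketch with a hypothetical bucket and object name, using io.StringIO as in the Python 3 variants:

from io import StringIO

import gcs_utils

buf = StringIO()
buf.write('"file_name","found"\n')    # any string content works
buf.seek(0)                           # rewind so upload_object reads from the start
result = gcs_utils.upload_object('some-hpo-bucket', 'report.csv', buf)
buf.close()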
Code example #24
File: main.py Project: noahgengel/curation
def _generate_site():
    """
    Construct HTML pages for each report, data_model, file_transfer and index.

    :return: 'okay' if successful; logs a critical error otherwise.
    """
    bucket = gcs_utils.get_drc_bucket()

    for page_name in PAGE_NAMES:
        # generate the page
        html = to_html(page_name)

        # write it to the drc shared bucket
        file_name = page_name + '.html'
        fp = StringIO.StringIO(html)
        gcs_utils.upload_object(bucket, file_name, fp)

    # aggregate result logs and write to bucket
    full_result_log = get_full_result_log()
    content = json.dumps(full_result_log)
    fp = StringIO.StringIO(content)
    gcs_utils.upload_object(bucket, LOG_JSON, fp)
    return 'okay'
Code example #25
def _save_warnings_in_gcs(bucket, name, warnings):
    """
    Save the warnings in GCS
    :param bucket: bucket to save to
    :param name: name of the file (object) to save to in GCS
    :param warnings: list of tuples (<file_name>, <message>)
    :return:
    """
    f = StringIO.StringIO()
    f.write('"file_name","message"\n')
    for (file_name, message) in warnings:
        line = '"%(file_name)s","%(message)s"\n' % locals()
        f.write(line)
    f.seek(0)
    result = gcs_utils.upload_object(bucket, name, f)
    f.close()
    return result
Code example #26
def _save_errors_in_gcs(bucket, name, errors):
    """Save errors.csv into hpo bucket

    :bucket:  bucket to save in
    :name: file_name to save to
    :errors: list of tuples (<file_name>, <message>)
    :returns: result of the upload operation (not currently used)

    """
    f = StringIO.StringIO()
    f.write('"file_name","errors"\n')
    for (file_name, message) in errors:
        line = '"%(file_name)s","%(message)s"\n' % locals()
        f.write(line)
    f.seek(0)
    result = gcs_utils.upload_object(bucket, name, f)
    f.close()
    return result
Code example #27
def _save_result_in_gcs(bucket, name, results):
    """
    Save the validation results in GCS
    :param bucket: bucket to save to
    :param name: name of the file (object) to save to in GCS
    :param results: list of tuples (<file_name>, <found>, <parsed>, <loaded>)
    :return:
    """
    f = StringIO.StringIO()
    f.write('"file_name","found","parsed","loaded"\n')
    for (file_name, found, parsed, loaded) in results:
        line = '"%(file_name)s","%(found)s","%(parsed)s","%(loaded)s"\n' % locals()
        f.write(line)
    f.seek(0)
    result = gcs_utils.upload_object(bucket, name, f)
    f.close()
    return result
Code example #28
File: main.py Project: juh7007/curation
def _upload_achilles_files(hpo_id, folder_prefix):
    """uploads achilles web files to the corresponding hpo bucket

    :param hpo_id: HPO whose bucket the files go into
    :param folder_prefix: relative path in GCS to upload to
    :return: list of upload results

    """
    results = []
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    for filename in common.ACHILLES_INDEX_FILES:
        logging.debug('Uploading achilles file `%s` to bucket `%s`' %
                      (filename, bucket))
        bucket_file_name = filename.split(resources.resource_path +
                                          os.sep)[1].strip()
        with open(filename, 'r') as fp:
            upload_result = gcs_utils.upload_object(
                bucket, folder_prefix + bucket_file_name, fp)
            results.append(upload_result)
    return results
Code example #29
def run_export(hpo_id=None, folder_prefix="", target_bucket=None):
    """
    Run export queries for an HPO and store JSON payloads in specified folder in (optional) target bucket

    :param hpo_id: ID of the HPO to run the export for. This is the data source name in the report.
    :param folder_prefix: Relative base path to store the report (empty by default).
    :param target_bucket: Bucket to save report. If None, use bucket associated with hpo_id.
    """
    results = []

    # Use a separate variable rather than hpo_id, since hpo_id=None is needed in the calls below
    datasource_name = 'default'
    if hpo_id is None:
        if target_bucket is None:
            raise RuntimeError(
                'Cannot export if neither hpo_id nor target_bucket is specified.'
            )
    else:
        datasource_name = hpo_id
        if target_bucket is None:
            target_bucket = gcs_utils.get_hpo_bucket(hpo_id)

    logging.info('Exporting %s report to bucket %s', datasource_name,
                 target_bucket)

    # Run export queries and store json payloads in specified folder in the target bucket
    reports_prefix = folder_prefix + ACHILLES_EXPORT_PREFIX_STRING + datasource_name + '/'
    for export_name in common.ALL_REPORTS:
        sql_path = os.path.join(export.EXPORT_PATH, export_name)
        result = export.export_from_path(sql_path, hpo_id)
        content = json.dumps(result)
        fp = StringIO(content)
        result = gcs_utils.upload_object(
            target_bucket, reports_prefix + export_name + '.json', fp)
        results.append(result)
    result = save_datasources_json(hpo_id=hpo_id,
                                   folder_prefix=folder_prefix,
                                   target_bucket=target_bucket)
    results.append(result)
    return results
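
Two hedged usage sketches of run_export above; the HPO id, folder prefix and bucket name are hypothetical:

# Export the reports for HPO 'nyc' into a dated folder of its own bucket
# (target_bucket omitted, so the HPO's bucket is looked up):
results = run_export(hpo_id='nyc', folder_prefix='2019-06-01/')

# Export to an explicit bucket with no HPO; the datasource is reported as 'default':
results = run_export(folder_prefix='2019-06-01/', target_bucket='some-aggregate-bucket')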
Code example #30
File: main.py Project: juh7007/curation
def run_export(hpo_id, folder_prefix):
    """
    Run export queries for the given HPO; this also updates the datasources.json file.
    """
    results = []
    logging.info('running export for hpo_id %s' % hpo_id)
    # TODO : add check for required tables
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    _reports_prefix = ACHILLES_EXPORT_PREFIX_STRING + hpo_id + "/"
    for export_name in common.ALL_REPORTS:
        sql_path = os.path.join(export.EXPORT_PATH, export_name)
        result = export.export_from_path(sql_path, hpo_id)
        content = json.dumps(result)
        fp = StringIO.StringIO(content)
        result = gcs_utils.upload_object(
            hpo_bucket,
            folder_prefix + _reports_prefix + export_name + '.json', fp)
        results.append(result)

    datasources_json_result = save_datasources_json(hpo_id, folder_prefix)
    results.append(datasources_json_result)

    return results