Example 1
    def test_run_export(self):
        folder_prefix = 'dummy-prefix-2018-03-24/'
        main._upload_achilles_files(test_util.FAKE_HPO_ID, folder_prefix)
        main.run_export(hpo_id=test_util.FAKE_HPO_ID,
                        folder_prefix=folder_prefix)
        bucket_objects = gcs_utils.list_bucket(self.hpo_bucket)
        actual_object_names = [obj['name'] for obj in bucket_objects]
        prefix = folder_prefix + common.ACHILLES_EXPORT_PREFIX_STRING + test_util.FAKE_HPO_ID + '/'
        for report in common.ALL_REPORT_FILES:
            expected_object_name = prefix + report
            self.assertIn(expected_object_name, actual_object_names)

        datasources_json_path = folder_prefix + common.ACHILLES_EXPORT_DATASOURCES_JSON
        self.assertIn(datasources_json_path, actual_object_names)
        datasources_json = gcs_utils.get_object(self.hpo_bucket,
                                                datasources_json_path)
        datasources_actual = json.loads(datasources_json)
        datasources_expected = {
            'datasources': [{
                'name': test_util.FAKE_HPO_ID,
                'folder': test_util.FAKE_HPO_ID,
                'cdmVersion': 5
            }]
        }
        self.assertDictEqual(datasources_expected, datasources_actual)
Example 2
def get_full_result_log():
    """Gather the result.csv log entries from every HPO bucket into a single list"""
    full_log = []
    for hpo in resources.hpo_csv():
        hpo_id = hpo['hpo_id']
        hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)

        try:
            # TODO : figure out possible errors and catch specific bucket inexistence error
            obj_metadata = gcs_utils.get_metadata(hpo_bucket, RESULT_CSV)
        except Exception:
            logging.warning(
                'skipping hpo %s. bucket does not exist.', hpo_id)
            continue

        if obj_metadata is None:
            logging.info('%s was not found in %s' % (RESULT_CSV, hpo_bucket))
        else:
            hpo_result = gcs_utils.get_object(hpo_bucket, RESULT_CSV)
            hpo_result_file = StringIO.StringIO(hpo_result)
            hpo_result_items = resources._csv_file_to_list(hpo_result_file)
            result_objects = map(
                lambda item: hpo_log_item_to_obj(hpo_id, item),
                hpo_result_items)
            full_log.extend(result_objects)
    return full_log
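# The function above is Python 2 (StringIO.StringIO, list-returning map). A
# minimal, self-contained Python 3 sketch of the same CSV-to-dicts step; the
# csv.DictReader stand-in for resources._csv_file_to_list and the sample rows
# are assumptions, not the project's code.
import csv
from io import StringIO

def csv_file_to_list(fp):
    # assumed behavior of resources._csv_file_to_list: one dict per row
    return list(csv.DictReader(fp))

rows = csv_file_to_list(StringIO('file_name,loaded\nperson.csv,1\n'))
result_objects = map(lambda item: (item['file_name'], item['loaded']), rows)
full_log = []
full_log.extend(result_objects)  # extend() consumes the lazy Python 3 map
assert full_log == [('person.csv', '1')]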
Example 3
    def test_get_object(self):
        with open(FIVE_PERSONS_PERSON_CSV, 'r') as fp:
            expected = fp.read()
        with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
        result = gcs_utils.get_object(self.hpo_bucket, 'person.csv')
        self.assertEqual(expected, result)
Example 4
    def test_run_export_with_target_bucket(self):
        folder_prefix = 'dummy-prefix-2018-03-24/'
        bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
        test_util.get_synpuf_results_files()
        test_util.populate_achilles(self.hpo_bucket, hpo_id=None)
        main.run_export(folder_prefix=folder_prefix, target_bucket=bucket_nyc)
        bucket_objects = gcs_utils.list_bucket(bucket_nyc)
        actual_object_names = [obj['name'] for obj in bucket_objects]
        prefix = folder_prefix + common.ACHILLES_EXPORT_PREFIX_STRING + 'default/'
        for report in common.ALL_REPORT_FILES:
            expected_object_name = prefix + report
            self.assertIn(expected_object_name, actual_object_names)

        datasources_json_path = folder_prefix + common.ACHILLES_EXPORT_DATASOURCES_JSON
        self.assertIn(datasources_json_path, actual_object_names)
        datasources_json = gcs_utils.get_object(bucket_nyc,
                                                datasources_json_path)
        datasources_actual = json.loads(datasources_json)
        datasources_expected = {
            'datasources': [{
                'name': 'default',
                'folder': 'default',
                'cdmVersion': 5
            }]
        }
        self.assertDictEqual(datasources_expected, datasources_actual)
Example 5
    def test_run_export_with_target_bucket_and_datasource_id(
            self, mock_is_hpo_id):
        # validation/main.py INTEGRATION TEST
        mock_is_hpo_id.return_value = True
        folder_prefix = 'dummy-prefix-2018-03-24/'
        bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
        main.run_export(datasource_id=FAKE_HPO_ID,
                        folder_prefix=folder_prefix,
                        target_bucket=bucket_nyc)
        bucket_objects = gcs_utils.list_bucket(bucket_nyc)
        actual_object_names = [obj['name'] for obj in bucket_objects]
        prefix = folder_prefix + common.ACHILLES_EXPORT_PREFIX_STRING + FAKE_HPO_ID + '/'
        for report in common.ALL_REPORT_FILES:
            expected_object_name = prefix + report
            self.assertIn(expected_object_name, actual_object_names)
        datasources_json_path = folder_prefix + common.ACHILLES_EXPORT_DATASOURCES_JSON
        self.assertIn(datasources_json_path, actual_object_names)
        datasources_json = gcs_utils.get_object(bucket_nyc,
                                                datasources_json_path)
        datasources_actual = json.loads(datasources_json)
        datasources_expected = {
            'datasources': [{
                'name': FAKE_HPO_ID,
                'folder': FAKE_HPO_ID,
                'cdmVersion': 5
            }]
        }
        self.assertDictEqual(datasources_expected, datasources_actual)
Example 6
def retract(pids, bucket, found_files, folder_prefix, force_flag):
    """
    Retract from a folder in a GCS bucket all records associated with a pid

    :param pids: person_ids to retract
    :param bucket: bucket containing records to retract
    :param found_files: files found in the current folder
    :param folder_prefix: current folder being processed
    :param force_flag: if False then prompt for each file
    :return: metadata for each object updated in order to retract
    """
    result_list = []
    for file_name in found_files:
        table_name = file_name.split(".")[0]
        lines_removed = 0
        if force_flag:
            logger.debug("Attempting to force retract for person_ids %s in path %s/%s%s"
                         % (pids, bucket, folder_prefix, file_name))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            logger.debug("Are you sure you want to retract rows for person_ids %s from path %s/%s%s?"
                         % (pids, bucket, folder_prefix, file_name))
            response = get_response()
        if response == "Y":
            # Output and input file content initialization
            retracted_file_string = StringIO.StringIO()
            input_file_string = gcs_utils.get_object(bucket, folder_prefix + file_name)
            input_contents = input_file_string.split('\n')
            modified_flag = False

            logger.debug("Checking for person_ids %s in path %s/%s%s"
                         % (pids, bucket, folder_prefix, file_name))

            # Check if file has person_id in first or second column
            for input_line in input_contents:
                if input_line != '':
                    if (table_name in PID_IN_COL1 and get_integer(input_line.split(",")[0]) in pids) or \
                            (table_name in PID_IN_COL2 and get_integer(input_line.split(",")[1]) in pids):
                        lines_removed += 1
                        modified_flag = True
                    else:
                        retracted_file_string.write(input_line + '\n')

            # Write result back to bucket
            if modified_flag:
                logger.debug("Retracted %d rows from %s/%s%s" % (lines_removed, bucket, folder_prefix, file_name))
                logger.debug("Overwriting file %s/%s%s" % (bucket, folder_prefix, file_name))
                upload_result = gcs_utils.upload_object(bucket, folder_prefix + file_name, retracted_file_string)
                result_list.append(upload_result)
                logger.debug("Retraction successful for file %s/%s%s " % (bucket, folder_prefix, file_name))
            else:
                logger.debug("Skipping file %s/%s%s since pids %s not found" % (bucket, folder_prefix, file_name, pids))
        elif response.lower() == "n":
            logger.debug("Ignoring file %s" % file_name)
    return result_list
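# get_response is called above but not shown in these excerpts; a minimal
# sketch of what such a Y/n prompt helper might look like (an assumption,
# not the project's actual implementation; raw_input matches the Python 2
# idioms used above).
def get_response():
    """Prompt until the user enters Y, n, or N and return the answer"""
    prompt = 'Please press Y to proceed or n to skip: '
    response = raw_input(prompt)
    while response not in ('Y', 'n', 'N'):
        response = raw_input(prompt)
    return response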
Example 7
def all_required_files_loaded(hpo_id, folder_prefix):
    """Check the submission's result.csv to verify every required file was loaded"""
    result_file = gcs_utils.get_object(gcs_utils.get_hpo_bucket(hpo_id),
                                       folder_prefix + common.RESULT_CSV)
    result_file = StringIO.StringIO(result_file)
    result_items = resources._csv_file_to_list(result_file)
    for item in result_items:
        if item['file_name'] in common.REQUIRED_FILES:
            if item['loaded'] != '1':
                return False
    return True
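# A local check of the same loaded-file logic, with assumed REQUIRED_FILES
# contents and a hand-built result listing (file names are illustrative):
REQUIRED_FILES = ['person.csv', 'measurement.csv']

def required_files_loaded(result_items):
    # same rule as all_required_files_loaded above, minus the GCS fetch
    for item in result_items:
        if item['file_name'] in REQUIRED_FILES and item['loaded'] != '1':
            return False
    return True

items = [{'file_name': 'person.csv', 'loaded': '1'},
         {'file_name': 'measurement.csv', 'loaded': '0'}]
assert required_files_loaded(items) is False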
Example 8
def retract(pid, bucket, found_files, folder_prefix, force):
    """
    Retract from a folder in a GCS bucket all records associated with a pid

    :param pid: person_id
    :param bucket: bucket containing records to retract
    :param found_files: files found in the current folder
    :param folder_prefix: current folder being processed
    :param force: if False then prompt for each file
    :return: metadata for each object updated in order to retract
    """
    result_list = []
    for file_name in found_files:
        table_name = file_name.split(".")[0]
        if force:
            print("Force retracting rows for person_id %s from path %s/%s%s" %
                  (pid, bucket, folder_prefix, file_name))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            print(
                "Are you sure you want to retract rows for person_id %s from path %s/%s%s?"
                % (pid, bucket, folder_prefix, file_name))
            response = get_response()
        if response == "Y":
            # Output and input file content initialization
            retracted_file_string = StringIO.StringIO()
            input_file_string = gcs_utils.get_object(bucket,
                                                     folder_prefix + file_name)
            input_contents = input_file_string.split('\n')
            modified_flag = False

            # Check if file has person_id in first or second column
            for input_line in input_contents:
                if input_line != '':
                    if (table_name in PID_IN_COL1 and get_integer(input_line.split(",")[0]) == pid) or \
                            (table_name in PID_IN_COL2 and get_integer(input_line.split(",")[1]) == pid):
                        # drop the line: it belongs to the person being retracted
                        modified_flag = True
                    else:
                        retracted_file_string.write(input_line + '\n')
            # TODO: return number of lines removed, message if no file in the folder was updated
            # Write result back to bucket
            if modified_flag:
                print("Overwriting file %s/%s%s" %
                      (bucket, folder_prefix, file_name))
                upload_result = gcs_utils.upload_object(
                    bucket, folder_prefix + file_name, retracted_file_string)
                result_list.append(upload_result)
            else:
                print("Skipping file %s/%s%s since pid %s not found" %
                      (bucket, folder_prefix, file_name, pid))
        elif response.lower() == "n":
            print("Ignoring file %s" % file_name)
    return result_list
Example 9
def read_cloud_file(bucket, name):
    """Return the contents of the object stored at gs://<bucket>/<name>"""
    return gcs_utils.get_object(bucket, name)
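# Hypothetical usage of the wrapper above; the bucket and object names are
# illustrative, not taken from the source.
contents = read_cloud_file('some-hpo-bucket', 'dummy-prefix-2018-03-24/result.csv')
for line in contents.split('\n'):
    print(line)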
Example 10
def retract(pids, bucket, found_files, folder_prefix, force_flag):
    """
    Retract from a folder in a GCS bucket all records associated with a pid
    pid table must follow schema described in retract_data_bq.PID_TABLE_FIELDS and must reside in sandbox_dataset_id
    This function removes lines from all files containing person_ids if they exist in pid_table_id
    Throws SyntaxError/TypeError/ValueError if non-ints are found

    :param pids: person_ids to retract
    :param bucket: bucket containing records to retract
    :param found_files: files found in the current folder
    :param folder_prefix: current folder being processed
    :param force_flag: if False then prompt for each file
    :return: metadata for each object updated in order to retract
    """
    result_list = []
    for file_name in found_files:
        table_name = file_name.split(".")[0]
        lines_removed = 0
        file_gcs_path = '%s/%s%s' % (bucket, folder_prefix, file_name)
        if force_flag:
            logger.info(
                "Attempting to force retract for person_ids %s in path %s" %
                (pids, file_gcs_path))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            logger.info(
                "Are you sure you want to retract rows for person_ids %s from path %s?"
                % (pids, file_gcs_path))
            response = get_response()
        if response == "Y":
            # Output and input file content initialization
            retracted_file_string = BytesIO()
            input_file_bytes = gcs_utils.get_object(bucket,
                                                    folder_prefix + file_name,
                                                    as_text=False)
            input_file_lines = input_file_bytes.split(b'\n')
            input_header = input_file_lines[0]
            input_contents = input_file_lines[1:]
            retracted_file_string.write(input_header + b'\n')
            logger.info("Checking for person_ids %s in path %s" %
                        (pids, file_gcs_path))

            # Check if file has person_id in first or second column
            for input_line in input_contents:
                input_line = input_line.strip()
                # ensure line is not empty
                if input_line:
                    cols = input_line.split(b',')
                    # ensure at least two columns exist
                    if len(cols) > 1:
                        col_1 = cols[0]
                        col_2 = cols[1]
                        # skip if non-integer is encountered and keep the line as is
                        try:
                            if (table_name in PID_IN_COL1 and int(col_1) in pids) or \
                                    (table_name in PID_IN_COL2 and int(col_2) in pids):
                                # do not write back this line since it contains a pid to retract
                                # increment removed lines counter
                                lines_removed += 1
                            else:
                                # pid not found, retain this line
                                retracted_file_string.write(input_line + b'\n')
                        except ValueError:
                            # write back non-num lines
                            retracted_file_string.write(input_line + b'\n')
                    else:
                        # write back ill-formed lines. Note: These lines do not make it into BigQuery
                        retracted_file_string.write(input_line + b'\n')

            # Write result back to bucket
            if lines_removed > 0:
                logger.info("%d rows retracted from %s, overwriting..." %
                            (lines_removed, file_gcs_path))
                upload_result = gcs_utils.upload_object(
                    bucket, folder_prefix + file_name, retracted_file_string)
                result_list.append(upload_result)
                logger.info("Retraction successful for file %s" %
                            file_gcs_path)
            else:
                logger.info("Not updating file %s since pids %s not found" %
                            (file_gcs_path, pids))
        elif response.lower() == "n":
            logger.info("Skipping file %s" % file_gcs_path)
    return result_list
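# The byte-level filtering above is easy to verify without touching GCS; a
# minimal sketch that isolates it as a pure function. retract_lines, the
# PID_IN_COL1 contents, and the sample rows are illustrative assumptions,
# and only the first-column case is shown.
from io import BytesIO

PID_IN_COL1 = ['person']

def retract_lines(table_name, data, pids):
    """Return (filtered_bytes, lines_removed) using the same rules as retract"""
    out = BytesIO()
    lines = data.split(b'\n')
    out.write(lines[0] + b'\n')  # header row is always kept
    removed = 0
    for line in lines[1:]:
        line = line.strip()
        if not line:
            continue
        cols = line.split(b',')
        try:
            if len(cols) > 1 and table_name in PID_IN_COL1 and int(cols[0].decode()) in pids:
                removed += 1  # drop this line: it belongs to a retracted pid
                continue
        except ValueError:
            pass  # non-integer pid column: keep the line as-is
        out.write(line + b'\n')
    return out.getvalue(), removed

filtered, n = retract_lines('person', b'person_id,gender\n1,8507\n2,8532\n', {1})
assert n == 1 and filtered == b'person_id,gender\n2,8532\n'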