Example #1
def app(request):

    _app.config.from_object("peregrine.test_settings")
    app_init(_app)

    sheepdog_blueprint = sheepdog.blueprint.create_blueprint('submission')
    _app.register_blueprint(sheepdog_blueprint, url_prefix='/v0/submission')

    _app.logger.info('Initializing IndexClient')
    _app.index_client = IndexClient(
        _app.config['INDEX_CLIENT']['host'],
        version=_app.config['INDEX_CLIENT']['version'],
        auth=_app.config['INDEX_CLIENT']['auth'])
    try:
        _app.logger.info('Initializing Auth driver')
    except Exception:
        _app.logger.exception("Couldn't initialize auth, continuing anyway")

    _app.logger.setLevel(os.environ.get("GDC_LOG_LEVEL", "WARNING"))
    _app.jwt_public_keys = {
        _app.config['USER_API']: {
            'key-test': utils.read_file('resources/keys/test_public_key.pem')
        }
    }
    return _app
Example #2
def dist_get_record(record):

    # Sort the list of distributed ID services
    # Ones with which the request matches a hint will be first
    # Followed by those that don't match the hint
    sorted_dist = sorted(blueprint.dist,
                         key=lambda k: hint_match(record, k['hints']),
                         reverse=True)

    for indexd in sorted_dist:
        try:
            if indexd['type'] == "doi":
                fetcher_client = DOIClient(baseurl=indexd['host'])
                res = fetcher_client.get(record)
            elif indexd['type'] == "dos":
                fetcher_client = DOSClient(baseurl=indexd['host'])
                res = fetcher_client.get(record)
            else:
                fetcher_client = IndexClient(baseurl=indexd['host'])
                res = fetcher_client.global_get(record, no_dist=True)
        except Exception:
            # a lot of things can go wrong with the get, but in general we don't care here.
            continue

        if res:
            json = res.to_json()
            json['from_index_service'] = {
                'host': indexd['host'],
                'name': indexd['name'],
            }
            return json

    raise IndexNoRecordFound('no record found')
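
Note: `hint_match` itself is not part of this example. Under the assumption that each entry in `hints` is a regular expression tested against the requested identifier, a minimal sketch of such a helper could look like this:

import re

def hint_match(record, hints):
    # Plausible behaviour: a service "matches" when any of its hint patterns
    # matches the requested identifier, so those services sort first (reverse=True).
    for hint in hints:
        if re.match(hint, record):
            return True
    return False
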
Example #3
def db_init(app):
    app.logger.info('Initializing PsqlGraph driver')
    app.db = PsqlGraphDriver(
        host=app.config['PSQLGRAPH']['host'],
        user=app.config['PSQLGRAPH']['user'],
        password=app.config['PSQLGRAPH']['password'],
        database=app.config['PSQLGRAPH']['database'],
        set_flush_timestamps=True,
    )

    app.userdb = SQLAlchemyDriver(app.config['PSQL_USER_DB_CONNECTION'])
    flask_scoped_session(app.userdb.Session, app)

    app.oauth2 = OAuth2Client(**app.config['OAUTH2'])

    app.logger.info('Initializing Indexd driver')
    app.signpost = IndexClient(app.config['SIGNPOST']['host'],
                               version=app.config['SIGNPOST']['version'],
                               auth=app.config['SIGNPOST']['auth'])
    try:
        app.logger.info('Initializing Auth driver')
        app.auth = AuthDriver(app.config["AUTH_ADMIN_CREDS"],
                              app.config["INTERNAL_AUTH"])
    except Exception:
        app.logger.exception("Couldn't initialize auth, continuing anyway")
Example #4
def db_init(app):
    app.logger.info("Initializing PsqlGraph driver")
    connect_args = {}
    if app.config.get("PSQLGRAPH") and app.config["PSQLGRAPH"].get("sslmode"):
        connect_args["sslmode"] = app.config["PSQLGRAPH"]["sslmode"]
    app.db = PsqlGraphDriver(
        host=app.config["PSQLGRAPH"]["host"],
        user=app.config["PSQLGRAPH"]["user"],
        password=app.config["PSQLGRAPH"]["password"],
        database=app.config["PSQLGRAPH"]["database"],
        set_flush_timestamps=True,
        connect_args=connect_args,
        isolation_level=app.config["PSQLGRAPH"].get(
            "isolation_level", "READ_COMMITTED"
        ),
    )
    if app.config.get("AUTO_MIGRATE_DATABASE"):
        migrate_database(app)

    app.oauth_client = oauth2_client.OAuthClient(**app.config["OAUTH2"])

    app.logger.info("Initializing index client")
    app.index_client = IndexClient(
        app.config["INDEX_CLIENT"]["host"],
        version=app.config["INDEX_CLIENT"]["version"],
        auth=app.config["INDEX_CLIENT"]["auth"],
    )
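
For orientation, a partial, illustrative sketch of the settings this function reads is shown below (the OAUTH2 block is omitted). Every value is a placeholder; only the keys, including the optional sslmode and isolation_level, follow the lookups above:

# Illustrative settings only; values are placeholders, keys mirror db_init above.
PSQLGRAPH = {
    "host": "localhost",
    "user": "postgres",
    "password": "postgres",
    "database": "example_db",
    "sslmode": "require",                  # optional; copied into connect_args
    "isolation_level": "READ_COMMITTED",   # optional; this is also the default used above
}
AUTO_MIGRATE_DATABASE = True
INDEX_CLIENT = {
    "host": "http://localhost:8000",
    "version": "v0",
    "auth": ("indexd_user", "indexd_password"),
}
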
Example #5
def app(request):

    _app.config.from_object("peregrine.test_settings")
    app_init(_app)

    sheepdog_blueprint = sheepdog.blueprint.create_blueprint("submission")
    _app.register_blueprint(sheepdog_blueprint, url_prefix="/v0/submission")

    _app.logger.info("Initializing IndexClient")
    _app.index_client = IndexClient(
        _app.config["INDEX_CLIENT"]["host"],
        version=_app.config["INDEX_CLIENT"]["version"],
        auth=_app.config["INDEX_CLIENT"]["auth"],
    )
    try:
        _app.logger.info("Initializing Auth driver")
    except Exception:
        _app.logger.exception("Couldn't initialize auth, continuing anyway")

    _app.logger.setLevel(os.environ.get("GDC_LOG_LEVEL", "WARNING"))
    _app.jwt_public_keys = {
        _app.config["USER_API"]: {
            "key-test": utils.read_file("resources/keys/test_public_key.pem")
        }
    }
    return _app
Example #6
def get_indexd_records():
    """
    Get all indexd records
    """
    results = {}
    indexd_client = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )
    it = indexd_client.list(page_size=1000)

    progress = 0
    for doc in it:
        progress += 1
        results[doc.did] = doc.urls

    return results
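
The module-level INDEXD settings used here are not shown in the example. Assuming the usual host/version/auth layout implied by the constructor call, a placeholder sketch and the shape of the returned mapping would be:

# Placeholder configuration with the keys this example reads; real values differ.
INDEXD = {
    "host": "http://localhost:8000",
    "version": "v0",
    "auth": {"username": "indexd_user", "password": "indexd_password"},
}

records = get_indexd_records()
# e.g. {"<guid>": ["s3://bucket/key", "gs://bucket/key"], ...}
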
Example #7
def test_hashes(get_request_mock, handle_error_mock):
    from indexclient.client import IndexClient
    input_params = {
        'hashes': {
            'md5': '00000000000000000000000000000001'
        },
        'size': '1'
    }

    expected_format = {
        'hash': ['md5:00000000000000000000000000000001'],
        'size': '1',
        'limit': 1
    }

    with patch("indexclient.client.IndexClient._get") as get_mock:
        client = IndexClient('base_url')
        client.get_with_params(input_params)

        assert get_mock.called
        args, kwargs = get_mock.call_args_list[0]
        assert kwargs['params'] == expected_format
Example #8
def test_hashes(get_request_mock, handle_error_mock):
    from indexclient.client import IndexClient

    input_params = {
        "hashes": {
            "md5": "00000000000000000000000000000001"
        },
        "size": "1"
    }

    expected_format = {
        "hash": ["md5:00000000000000000000000000000001"],
        "size": "1",
        "limit": 1,
    }

    with patch("indexclient.client.IndexClient._get") as get_mock:
        client = IndexClient("base_url")
        client.get_with_params(input_params)

        assert get_mock.called
        args, kwargs = get_mock.call_args_list[0]
        assert kwargs["params"] == expected_format
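
Put differently, the test expects get_with_params to flatten the "hashes" mapping into indexd's "hash" query parameter (algorithm:digest strings) and to add a "limit". A standalone sketch of that conversion, written independently of the real client code, is:

def to_index_params(params):
    # Sketch of the translation the test above asserts: {"hashes": {...}, "size": s}
    # becomes {"hash": ["algo:digest", ...], "size": s, "limit": 1}.
    converted = {"limit": 1}
    if "size" in params:
        converted["size"] = params["size"]
    if "hashes" in params:
        converted["hash"] = [
            "{}:{}".format(algo, digest) for algo, digest in params["hashes"].items()
        ]
    return converted
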
Example #9
def db_init(app):
    app.logger.info('Initializing PsqlGraph driver')
    app.db = PsqlGraphDriver(
        host=app.config['PSQLGRAPH']['host'],
        user=app.config['PSQLGRAPH']['user'],
        password=app.config['PSQLGRAPH']['password'],
        database=app.config['PSQLGRAPH']['database'],
        set_flush_timestamps=True,
    )

    app.oauth2 = OAuth2Client(**app.config['OAUTH2'])

    app.logger.info('Initializing Indexd driver')
    app.signpost = IndexClient(app.config['SIGNPOST']['host'],
                               version=app.config['SIGNPOST']['version'],
                               auth=app.config['SIGNPOST']['auth'])
Example #10
def db_init(app):
    app.logger.info("Initializing PsqlGraph driver")
    app.db = PsqlGraphDriver(
        host=app.config["PSQLGRAPH"]["host"],
        user=app.config["PSQLGRAPH"]["user"],
        password=app.config["PSQLGRAPH"]["password"],
        database=app.config["PSQLGRAPH"]["database"],
        set_flush_timestamps=True,
    )

    app.oauth2 = OAuth2Client(**app.config["OAUTH2"])

    app.logger.info("Initializing Indexd driver")
    app.index_client = IndexClient(
        app.config["INDEX_CLIENT"]["host"],
        version=app.config["INDEX_CLIENT"]["version"],
        auth=app.config["INDEX_CLIENT"]["auth"],
    )
Example #11
def db_init(app):
    app.logger.info("Initializing PsqlGraph driver")
    app.db = PsqlGraphDriver(
        host=app.config["PSQLGRAPH"]["host"],
        user=app.config["PSQLGRAPH"]["user"],
        password=app.config["PSQLGRAPH"]["password"],
        database=app.config["PSQLGRAPH"]["database"],
        set_flush_timestamps=True,
    )
    if app.config.get("AUTO_MIGRATE_DATABASE"):
        migrate_database(app)

    app.oauth_client = oauth2_client.OAuthClient(**app.config["OAUTH2"])

    app.logger.info("Initializing index client")
    app.index_client = IndexClient(
        app.config["INDEX_CLIENT"]["host"],
        version=app.config["INDEX_CLIENT"]["version"],
        auth=app.config["INDEX_CLIENT"]["auth"],
    )
Example #12
    def __init__(
        self,
        global_config,
        files,
        total_files,
        job_name,
        copied_objects,
        manager_ns,
        bucket=None,
    ):
        """
        Class constructor

        Args:
            global_config(dict): a configuration
            {
                "multi_part_upload_threads": 10,
                "data_chunk_size": 1024*1024*5
            }
            files(list(str)): list of copying files
            total_files(int): total number of files
            job_name(str): copying|indexing
            copied_objects(dict): a dictionary of copied files, keyed by uuid/file_name
            manager_ns(ManagerNamespace): for synchronization
            bucket(str): source bucket

        """
        self.bucket = bucket
        self.files = files
        self.total_files = total_files
        self.global_config = global_config
        self.job_name = job_name
        self.copied_objects = copied_objects
        self.manager_ns = manager_ns

        self.indexclient = IndexClient(
            INDEXD["host"],
            INDEXD["version"],
            (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
        )
Example #13
    def __init__(
        self,
        global_config,
        files,
        total_files,
        job_name,
        copied_objects,
        manager_ns,
        bucket=None,
    ):
        """
        Class constructor

        Args:
            global_config(dict): a configuration
            {
                "multi_part_upload_threads": 10,
                "data_chunk_size": 1024*1024*5
            }
            files(list(str)): list of copying files
            total_files(int): total number of files
            job_name(str): copying|indexing
            copied_objects(dict): a dictionary of copied files, keyed by uuid/file_name
            manager_ns(ManagerNamespace): for synchronization
            bucket(str): source bucket

        """
        self.bucket = bucket
        self.files = files
        self.total_files = total_files
        self.global_config = global_config
        self.job_name = job_name
        self.copied_objects = copied_objects
        self.manager_ns = manager_ns

        self.indexclient = IndexClient(
            INDEXD["host"],
            INDEXD["version"],
            (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
        )
Example #14
def dist_get_record(record):

    # Sort the list of distributed ID services
    # Ones with which the request matches a hint will be first
    # Followed by those that don't match the hint
    sorted_dist = sorted(
        blueprint.dist, key=lambda k: hint_match(record, k["hints"]), reverse=True
    )

    for indexd in sorted_dist:
        try:
            if indexd["type"] == "doi":  # Digital Object Identifier
                fetcher_client = DOIClient(baseurl=indexd["host"])
                res = fetcher_client.get(record)
            elif indexd["type"] == "dos":  # Data Object Service
                fetcher_client = DOSClient(baseurl=indexd["host"])
                res = fetcher_client.get(record)
            elif indexd["type"] == "hs":  # HydroShare and CommonsShare
                fetcher_client = HSClient(baseurl=indexd["host"])
                res = fetcher_client.get(record)
            else:
                fetcher_client = IndexClient(baseurl=indexd["host"])
                res = fetcher_client.global_get(record, no_dist=True)
        except Exception:
            # a lot of things can go wrong with the get, but in general we don't care here.
            continue

        if res:
            json = res.to_json()
            json["from_index_service"] = {
                "host": indexd["host"],
                "name": indexd["name"],
            }
            return json

    raise IndexNoRecordFound("no record found")
Example #15
def delete_objects_from_cloud_resources(manifest,
                                        log_bucket,
                                        release,
                                        dry_run=True):
    """
    Delete objects from S3 and GS.
    For safety, use filename instead of file_name in the manifest file
    to avoid accidental deletion.

    Args:
        manifest(str): manifest file
        log_bucket(str): the S3 bucket where deletion logs are uploaded
        release(str): data release
        dry_run(bool): if True, the program does not actually delete the files (report only)
    """
    session = boto3.session.Session()
    s3_sess = session.resource("s3")

    try:
        s3_sess.meta.client.head_bucket(Bucket=log_bucket)
    except botocore.exceptions.ClientError as e:
        logger.error(
            "The bucket {} does not exist or you have no access. Detail {}".
            format(log_bucket, e))
        return

    indexclient = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )

    if manifest.startswith("s3://"):
        file_infos = get_fileinfo_list_from_s3_manifest(manifest)
    else:
        file_infos = get_fileinfo_list_from_csv_manifest(manifest)

    s3 = boto3.resource("s3")
    gs_client = storage.Client()

    ignored_dict = get_ignored_files(IGNORED_FILES, "\t")

    aws_deletion_logs = []
    gs_deletion_logs = []
    num = 0
    for fi in file_infos:
        num = num + 1
        logger.info("Start to process file {}".format(num))
        try:
            aws_target_bucket = get_aws_bucket_name(fi, PROJECT_ACL)
        except UserError as e:
            aws_deletion_logs.append(
                DeletionLog(url=fi.get("id") + "/" + fi.get("filename"),
                            message=e.message))
            aws_target_bucket = None

        if not dry_run:
            if aws_target_bucket:
                aws_deletion_logs.append(
                    _remove_object_from_s3(s3, indexclient, fi,
                                           aws_target_bucket, dry_run))
            try:
                google_target_bucket = get_google_bucket_name(fi, PROJECT_ACL)
            except UserError as e:
                logger.warning(e)
                gs_deletion_logs.append(
                    DeletionLog(url=fi.get("id") + "/" + fi.get("filename"),
                                message=e.message))
                continue
            gs_deletion_logs.append(
                _remove_object_from_gs(gs_client, indexclient, fi,
                                       google_target_bucket, ignored_dict))
            delete_record_from_indexd(fi.get("id"), indexclient)

    aws_log_list = []
    for log in aws_deletion_logs:
        aws_log_list.append(log.to_dict())
    aws_log_json = {}
    aws_log_json["data"] = aws_log_list

    gs_log_list = []
    for log in gs_deletion_logs:
        gs_log_list.append(log.to_dict())
    gs_log_json = {}
    gs_log_json["data"] = gs_log_list

    timestr = time.strftime("%Y%m%d-%H%M%S")
    gs_filename = timestr + "gs_deletion_log.json"
    aws_filename = timestr + "aws_deletion_log.json"

    if not dry_run:
        try:
            s3 = boto3.client("s3")
            with open(aws_filename, "w") as outfile:
                json.dump(aws_log_json, outfile)
            s3.upload_file(aws_filename, log_bucket,
                           release + "/" + basename(aws_filename))

            with open(gs_filename, "w") as outfile:
                json.dump(gs_log_json, outfile)
            s3.upload_file(gs_filename, log_bucket,
                           release + "/" + basename(gs_filename))
        except Exception as e:
            logger.error(e)
    else:
        logger.info(
            "All following files are for redaction.\nIf there is nothing below that means there is nothing to redact!!!\n\n"
        )
        logger.info("url\n")
        for log in aws_log_list:
            if log["deleted"]:
                logger.info(log["url"])
Example #16
def index_client():
    return IndexClient(SIGNPOST['host'], SIGNPOST['version'], SIGNPOST['auth'])
Example #17
def manifest_indexing(manifest, prefix=None, replace_urls=False):
    """
    Loop through all the files in the manifest and update/create records in indexd.
    A record is updated if a url is missing from its url list or its acl has changed.

    """
    indexclient = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )
    try:
        files = get_fileinfos_from_tsv_manifest(manifest)
    except Exception as e:
        logger.error("Can not read {}. Detail {}".format(manifest, e))
        return

    prefix = prefix + "/" if prefix else ""
    number_indexed_files = 0
    for fi in files:
        try:
            urls = fi.get("url").split(" ")

            if fi.get("acl").lower() in {"[u'open']", "['open']"}:
                acl = ["*"]
            else:
                acl = [
                    element.strip().replace("'", "")
                    for element in fi.get("acl")[1:-1].split(",")
                ]

            doc = indexclient.get(prefix + fi.get("GUID"))
            if doc is not None:
                need_update = False

                for url in urls:
                    if not replace_urls and url not in doc.urls:
                        doc.urls.append(url)
                        need_update = True

                if replace_urls and set(urls) != set(doc.urls):
                    doc.urls = urls
                    need_update = True

                    # indexd doesn't like when records have metadata for non-existing
                    # urls
                    new_urls_metadata = copy.deepcopy(doc.urls_metadata)
                    for url, metadata in doc.urls_metadata.items():
                        if url not in urls:
                            del new_urls_metadata[url]

                    doc.urls_metadata = new_urls_metadata

                if set(doc.acl) != set(acl):
                    doc.acl = acl
                    need_update = True

                if need_update:
                    doc.patch()
            else:
                doc = indexclient.create(
                    did=prefix + fi.get("GUID"),
                    hashes={"md5": fi.get("md5")},
                    size=fi.get("size", 0),
                    acl=acl,
                    urls=urls,
                )
            number_indexed_files += 1
            if number_indexed_files % 10 == 0 or number_indexed_files == len(
                    files):
                logger.info("Progress {}%".format(number_indexed_files *
                                                  100.0 / len(files)))

        except Exception as e:
            # Don't break for any reason
            logger.error(
                "Can not update/create an indexd record with uuid {}. Detail {}"
                .format(fi.get("GUID"), e))
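
The manifest rows themselves are not shown. Judging only from the fields accessed above, a hypothetical entry returned by get_fileinfos_from_tsv_manifest would look roughly like this (all values made up):

# Hypothetical manifest row; only the keys read above (GUID, url, acl, md5, size) matter.
fi = {
    "GUID": "11111111-2222-3333-4444-555555555555",
    "url": "s3://source-bucket/key gs://source-bucket/key",  # space-separated URLs
    "acl": "['phs000123']",   # or "['open']", which maps to acl = ["*"]
    "md5": "00000000000000000000000000000001",
    "size": 12345,
}
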
Example #18
def exec_google_copy(fi, ignored_dict, global_config):
    """
    copy a file to google bucket.
    Args:
        fi(dict): a dictionary of a copying file
        ignored_dict(dict): a dictionary of ignored files, keyed by file id
        global_config(dict): a configuration
            {
                "chunk_size_download": 1024,
                "chunk_size_upload": 1024
            }
    Returns:
        DataFlowLog
    """
    if fi["size"] == 0:
        msg = "can not copy {} to GOOGLE bucket since it is empty file".format(fi["id"])
        return DataFlowLog(message=msg)

    indexd_client = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )

    if not ignored_dict:
        raise UserError(
            "Expecting non-empty IGNORED_FILES. Please check if ignored_files_manifest.py is configured correctly!!!"
        )
    try:
        bucket_name = utils.get_google_bucket_name(fi, PROJECT_ACL)
    except UserError as e:
        msg = "can not copy {} to GOOGLE bucket. Detail {}. {}".format(
            fi["id"], e, PROJECT_ACL
        )
        logger.error(msg)
        return DataFlowLog(message=msg)

    if not bucket_exists(bucket_name):
        msg = "There is no bucket with provided name {}\n".format(bucket_name)
        logger.error(msg)
        return DataFlowLog(message=msg)

    if fi["id"] in ignored_dict:
        logger.info(
            "{} is ignored. Start to check indexd for u5aa objects".format(fi["id"])
        )
        _update_indexd_for_5aa_object(fi, bucket_name, ignored_dict, indexd_client)
        return DataFlowLog(message="{} is in the ignored list".format(fi["id"]))

    client = storage.Client()
    sess = AuthorizedSession(client._credentials)
    blob_name = fi.get("id") + "/" + fi.get("file_name")

    _check_and_handle_changed_acl_object(fi)

    if blob_exists(bucket_name, blob_name):
        logger.info("{} is already copied".format(fi["id"]))
    else:
        try:
            logger.info(
                "Start to stream {}. Size {} (MB)".format(
                    fi["id"], fi["size"] * 1.0 / 1000 / 1000
                )
            )
            tries = 0
            while tries < NUM_STREAMING_TRIES:
                try:
                    resumable_streaming_copy(
                        fi, client, bucket_name, blob_name, global_config
                    )
                    if fail_resumable_copy_blob(sess, bucket_name, blob_name, fi):
                        delete_object(sess, bucket_name, blob_name)
                    else:
                        break
                except Exception as e:
                    logger.warning(e)
                    tries += 1
            if tries == NUM_STREAMING_TRIES:
                logger.error(
                    "Can not stream {} after multiple attemps".format(fi.get("id"))
                )
            else:
                logger.info(
                    "Finish streaming {}. Size {} (MB)".format(
                        fi["id"], fi["size"] * 1.0 / 1000 / 1000
                    )
                )
        except APIError as e:
            logger.error(str(e))
            return DataFlowLog(message=str(e))
        except Exception as e:
            # Don't break (Not expected)
            logger.error(str(e))
            return DataFlowLog(message=str(e))

    # Confirm that the object was copied
    if blob_exists(bucket_name, blob_name):
        try:
            if indexd_utils.update_url(fi, indexd_client, provider="gs"):
                logger.info("Successfully update indexd for {}".format(fi["id"]))
            else:
                logger.info("Can not update indexd for {}".format(fi["id"]))
        except APIError as e:
            logger.error(e)
            return DataFlowLog(copy_success=True, message=e)
    else:
        msg = "can not copy {} to GOOGLE bucket after multiple attempts. Check the error detail in logs".format(
            blob_name
        )
        logger.error(msg)
        return DataFlowLog(message=msg)

    return DataFlowLog(
        copy_success=True,
        index_success=True,
        message="object {} successfully copied ".format(blob_name),
    )
Example #19
def index_client():
    return IndexClient(INDEX_CLIENT["host"], INDEX_CLIENT["version"],
                       INDEX_CLIENT["auth"])
Example #20
def index_client():
    return IndexClient(INDEX_CLIENT['host'], INDEX_CLIENT['version'],
                       INDEX_CLIENT['auth'])
Example #21
def index_client():
    return IndexClient(SIGNPOST["host"], SIGNPOST["version"], SIGNPOST["auth"])
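
Taken together, the examples follow one pattern: build the client from a base URL, an API version and an auth pair, then read and update records through the returned document objects. A condensed recap with placeholder values:

from indexclient.client import IndexClient

client = IndexClient(
    "http://localhost:8000",            # indexd base URL (placeholder)
    version="v0",
    auth=("indexd_user", "indexd_password"),
)

doc = client.get("11111111-2222-3333-4444-555555555555")   # returns None if the GUID is unknown
if doc is not None:
    doc.urls.append("s3://some-bucket/new-location")
    doc.patch()                         # push the modified fields back to indexd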