def test_load_id_no_confdir(local_gcp, mocked_confdir,
                            mocked_alternate_confdir):
    shutil.rmtree(mocked_confdir)
    shutil.rmtree(mocked_alternate_confdir)
    alt_gcp = globus_sdk.LocalGlobusConnectPersonal(
        config_dir=mocked_alternate_confdir)
    assert local_gcp.endpoint_id is None
    assert alt_gcp.endpoint_id is None
Example #2
    def get_local_endpoint(self):
        """Fetches the local endpoint. Returns None if no local endpoint is available.

        Ensure that your globus connect personal is running!

        Returns:
            str: local endpoint id
        """

        localEndpoint = globus_sdk.LocalGlobusConnectPersonal()
        return localEndpoint.endpoint_id
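
A minimal standalone sketch of the same lookup the method above wraps; endpoint_id is None when Globus Connect Personal is not installed or has not been set up on the machine:

import globus_sdk

endpoint_id = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
if endpoint_id is None:
    print("No local Globus Connect Personal endpoint found; is GCP running?")
else:
    print(f"Local endpoint ID: {endpoint_id}")
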
def test_localep_load_id_alternate_conf_dir(mocked_alternate_confdir,
                                            write_gcp_id_file):
    gcp = globus_sdk.LocalGlobusConnectPersonal(
        config_dir=normalize_config_dir_argument(mocked_alternate_confdir))
    assert gcp.endpoint_id is None
    write_gcp_id_file("foobar", alternate=True)
    assert gcp.endpoint_id == "foobar"
    write_gcp_id_file("xyz", alternate=True)
    assert gcp.endpoint_id == "foobar"
    del gcp.endpoint_id
    assert gcp.endpoint_id == "xyz"
Example #4
def login(refresh_tokens, force, local_server, browser):
    pc = pilot.commands.get_pilot_client()
    is_logged_in = pc.is_logged_in()
    if is_logged_in and not force:
        click.echo('You are already logged in.')
        return
    elif is_logged_in and force:
        pc.logout()

    prev_info = pc.profile.load_user_info()
    scopes = pc.context.get_value('scopes') or pc.DEFAULT_SCOPES
    pc.login(refresh_tokens=refresh_tokens,
             no_local_server=not local_server,
             no_browser=not browser,
             force=force,
             requested_scopes=scopes)
    if not pc.project.load_all():
        log.debug('No project info saved, updating...')
        pc.context.update_with_diff()
    click.secho('You have been logged in.', fg='green')

    local_ep = (pc.profile.load_option('local_endpoint')
                or globus_sdk.LocalGlobusConnectPersonal().endpoint_id)
    local_path = pc.profile.load_option('local_endpoint_path')
    log.debug('Local Endpoint set to {}:{}'.format(local_ep, local_path))
    tc = pc.get_transfer_client()
    try:
        if local_ep:
            name = tc.get_endpoint(local_ep).data['display_name']
            pc.profile.save_option('local_endpoint', local_ep)
            pc.profile.save_option('local_endpoint_path', local_path)
            pc.profile.save_option('local_endpoint_name', name)
            endpoint_utils.test_local_endpoint()
    except pilot.exc.LocalEndpointUnresponsive as leu:
        log.debug('Endpoint UUID: {}, local path: {}'.format(
            local_ep, local_path))
        click.secho(str(leu), fg='yellow')
    if prev_info != pc.profile.load_user_info():
        pitems = [('Name:', pc.profile.name),
                  ('Organization:', pc.profile.organization),
                  ('Local Endpoint:',
                   pc.profile.load_option('local_endpoint_name'))]
        pstr = '\n'.join(['{:16}{}'.format(t, v) for t, v in pitems])
        report = ('Your personal info has been saved as: \n{}\n\n'
                  'You can update these with "pilot profile -i"'.format(pstr))
        click.secho(report, fg='blue')
Example #5
def get_local_endpoint_id():
    """
    Get the endpoint ID of a local Globus Connect Personal endpoint.

    Returns
    -------
    endpoint_id : `str`
        The endpoint ID.

    Raises
    ------
    ConnectionError
        If no local endpoint can be detected a connection error is raised.

    """
    local_endpoint = globus_sdk.LocalGlobusConnectPersonal()
    endpoint_id = local_endpoint.endpoint_id

    if not endpoint_id:
        raise ConnectionError(
            "Can not find a local Globus Connect endpoint.")

    return endpoint_id
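
A short calling sketch for the helper above (illustrative only, not part of the original module):

try:
    endpoint_id = get_local_endpoint_id()
except ConnectionError:
    # No local Globus Connect Personal endpoint could be detected
    endpoint_id = None
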
Example #6
    def start_deriva_flow(self, data_path, dcc_id, catalog_id=None, schema=None, server=None,
                          output_dir=None, delete_dir=False, handle_git_repos=True,
                          dry_run=False, test_sub=False, globus=False, disable_validation=False,
                          **kwargs):
        """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

        Arguments:
            data_path (str): The path to the data to ingest into DERIVA. The path can be:
                    1) A directory to be formatted into a BDBag
                    2) A Git repository to be copied into a BDBag
                    3) A premade BDBag directory
                    4) A premade BDBag in an archive file
            dcc_id (str): The CFDE-recognized DCC ID for this submission.
            catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                    Default None, to create a new catalog.
            schema (str): The named schema or schema file link to validate data against.
                    Default None, to only validate against the declared TableSchema.
            server (str): The DERIVA server to ingest to.
                    Default None, to use the Action Provider-set default.
            output_dir (str): The path to create an output directory in. The resulting
                    BDBag archive will be named after this directory.
                    If not set, the directory will be turned into a BDBag in-place.
                    For Git repositories, this is automatically set, but can be overridden.
                    If data_path is a file, this has no effect.
                    This dir MUST NOT be in the `data_path` directory or any subdirectories.
                    Default None.
            delete_dir (bool): Should the output_dir be deleted after submission?
                    Has no effect if output_dir is not specified.
                    For Git repositories, this is always True.
                    Default False.
            handle_git_repos (bool): Should Git repositories be detected and handled?
                    When this is False, Git repositories are handled as simple directories
                    instead of Git repositories.
                    Default True.
            dry_run (bool): Should the data be validated and bagged without starting the Flow?
                    When True, does not ingest into DERIVA or start the Globus Automate Flow,
                    and the return value will not have valid DERIVA Flow information.
                    Default False.
            test_sub (bool): Should the submission be run in "test mode", where
                    the submission will be ingested into DERIVA and immediately deleted?
                    When True, the data will not remain in DERIVA to be viewed and the
                    Flow will terminate before any curation step.
                    Default False.
            globus (bool): Should the data be transferred using Globus Transfer? Default False.

        Other keyword arguments are passed directly to the ``make_bag()`` function of the
        BDBag API (see https://github.com/fair-research/bdbag for details).
        """
        self.check()
        logger.debug("Startup: Validating input")

        catalogs = self.remote_config['CATALOGS']
        if catalog_id in catalogs.keys():
            if schema:
                raise ValueError("You may not specify a schema ('{}') when ingesting to "
                                 "a named catalog ('{}'). Retry without specifying "
                                 "a schema.".format(schema, catalog_id))
            schema = catalogs[catalog_id]

        # Verify the dcc is valid
        if ':' not in dcc_id:
            dcc_id = f"cfde_registry_dcc:{dcc_id}"
        if not self.valid_dcc(dcc_id):
            raise exc.InvalidInput("Error: The dcc you've specified is not valid. Please double "
                                   "check the spelling and try again.")

        # Coerces the BDBag path to a .zip archive
        data_path = bdbag_utils.get_bag(
            data_path, output_dir=output_dir, delete_dir=delete_dir,
            handle_git_repos=handle_git_repos, bdbag_kwargs=kwargs
        )
        # Raises exc.ValidationException if something doesn't match up with the schema
        if not disable_validation:
            validation.validate_user_submission(data_path, schema)

        flow_info = self.remote_config["FLOWS"][self.service_instance]
        dest_path = "{}{}".format(flow_info["cfde_ep_path"], os.path.basename(data_path))

        # If doing dry run, stop here before making Flow input
        if dry_run:
            return {
                "success": True,
                "message": "Dry run validated successfully. No data was transferred."
            }

        logger.debug("Creating input for Flow")
        flow_input = {
            "cfde_ep_id": flow_info["cfde_ep_id"],
            "test_sub": test_sub,
            "dcc_id": dcc_id
        }
        if catalog_id:
            flow_input["catalog_id"] = str(catalog_id)
        if server:
            flow_input["server"] = server

        # Transfer data via globus
        if globus:
            local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
            logger.debug(f'Local endpoint: {local_endpoint}')
            if not local_endpoint:
                raise exc.EndpointUnavailable("Globus Connect Personal installation not found. To "
                                              "install, please visit "
                                              "https://www.globus.org/globus-connect-personal")
            try:
                self.transfer_client.operation_ls(local_endpoint, path=os.path.dirname(data_path))
                logger.debug("Successfully connected to Globus Connect Personal endpoint "
                             f"'{local_endpoint}'")
            except globus_sdk.exc.TransferAPIError as e:

                # Unable to connect
                if e.http_status == 502:
                    raise exc.EndpointUnavailable(f"Unable to connect to local endpoint "
                                                  f"'{local_endpoint}'. Please verify that Globus "
                                                  "Connect Personal is running.")
                # Forbidden
                elif e.http_status == 403:
                    raise exc.EndpointUnavailable(f"Unable to access '{data_path}' on local "
                                                  f"endpoint '{local_endpoint}'. Please set the "
                                                  "access preferences in Globus Connect Personal "
                                                  "to permit access.")

                else:
                    raise exc.EndpointUnavailable(e.message)

            # Populate Transfer fields in Flow
            flow_input.update({
                "source_endpoint_id": local_endpoint,
                "source_path": data_path,
                "cfde_ep_path": dest_path,
                "cfde_ep_url": flow_info["cfde_ep_url"],
                "is_directory": False,
            })

        # Otherwise, HTTP PUT the BDBag on the server
        else:
            logger.debug("Uploading with HTTPS PUT")
            data_url = "{}{}".format(flow_info["cfde_ep_url"], dest_path)
            globus_http.upload(data_path, data_url, self.https_authorizer)
            flow_input.update({
                "source_endpoint_id": False,
                "data_url": data_url,
            })

        logger.debug("Flow input populated:\n{}".format(json.dumps(flow_input, indent=4,
                                                                   sort_keys=True)))
        # Get Flow scope
        flow_id = flow_info["flow_id"]
        # Start Flow
        logger.debug("Starting Flow - Submitting data")
        try:
            flow_res = self.flow_client.run_flow(flow_id, self.flow_scope, flow_input)
        except globus_sdk.GlobusAPIError as e:
            if e.http_status == 404:
                return {
                    "success": False,
                    "error": ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                              "Demo Globus Group? Check your membership or apply for access "
                              "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                              "0efb3ba9a670/about")
                }
            else:
                raise
        self.last_flow_run = {
            "flow_id": flow_id,
            "flow_instance_id": flow_res["action_id"]
        }
        logger.debug("Flow started successfully.")

        return {
            "success": True,
            "message": ("Started DERIVA ingest flow\nYour dataset has been "
                        "submitted\nYou can check the progress with: cfde-submit status\n"),
            "flow_id": flow_id,
            "flow_instance_id": flow_res["action_id"],
            "cfde_dest_path": dest_path,
            "http_link": "{}{}".format(flow_info["cfde_ep_url"], dest_path),
            "globus_web_link": ("https://app.globus.org/file-manager?origin_id={}&origin_path={}"
                                .format(flow_info["cfde_ep_id"], os.path.dirname(dest_path)))
        }
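
A hedged calling sketch for start_deriva_flow() above. The client construction is package-specific and not shown in this excerpt, so `client`, the data path, and the DCC ID below are illustrative placeholders:

# 'client' is a hypothetical, already-authenticated instance of the class above.
# dry_run=True validates and bags the data without starting the Globus Automate Flow.
result = client.start_deriva_flow(
    data_path="/path/to/submission_dir",
    dcc_id="cfde_registry_dcc:example",
    dry_run=True,
)
print(result["success"], result["message"])
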
def do_transfers(transfer):

    local_ep = globus_sdk.LocalGlobusConnectPersonal()
    local_ep_id = local_ep.endpoint_id
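    # local_ep_id is None when Globus Connect Personal is not installed or not set up on this host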

    p.opt["task_label"] = p.opt["archive_date_time"].strftime(
        p.opt["taskLabel"])

    #p.opt["task_label"] = p.opt["task_label"].decode('utf-8')

    logging.info(
        f"Creating TransferData object with label '{p.opt['task_label']}'")
    #logging.info(f"task_label -  {type(p.opt['task_label'])}")

    tdata = globus_sdk.TransferData(transfer,
                                    local_ep_id,
                                    p.opt["archiveEndPoint"],
                                    label=p.opt["task_label"])
    #tdata = globus_sdk.TransferData(transfer, local_ep_id, p.opt["archiveEndPoint"])

    logging.info("\nBEGINNING PROCESSING OF archiveItems")
    for item, item_info in p.opt["archiveItems"].items():
        logging.info(f"Starting on {item}")

        ii = copy.deepcopy(item_info)
        ii["key"] = item
        logging.verbose(f"Storing {item} as key")
        # substitute date/time strings and env variables in item info
        #logging.verbose(f"ii keys: {ii.keys()}")
        for ii_key in ("source", "destination", "tarFileName", "cdDirTar"):
            if ii.get(ii_key):
                logging.verbose(f"swapping {ii_key}: {ii[ii_key]}")
                ii[ii_key] = p.opt["archive_date_time"].strftime(ii[ii_key])
                ii[ii_key] = os.path.expandvars(ii[ii_key])
                logging.verbose(f"after swap {ii_key}: {ii[ii_key]}")

        # initialize number of files to 0
        ii['num_files'] = 0

        add_to_email(f"\nSOURCE:      {ii['source']}\n")
        add_to_email(f"DESTINATION: {ii['destination']}\n")

        if "*" in ii["source"] or "?" in ii[
                "source"]:  # Is there a '*' or '?' in the source?
            logging.verbose(f"Found wildcard in source: {ii['source']}")
            expanded_sources = glob.glob(ii['source'])
            ii["glob"] = True

            if len(expanded_sources) == 0:
                log_and_email(
                    f"Source expands to zero targets: {ii['source']}.  SKIPPING!",
                    logging.error)
                continue

        else:
            ii["glob"] = False

        if ii.get("glob") == True and not ii.get("tarFileName"):
            # can't handle both dirs and files in a glob
            file_glob = False
            dir_glob = False
            for es in expanded_sources:
                if os.path.isfile(es):
                    file_glob = True
                if os.path.isdir(es):
                    dir_glob = True
            if file_glob and dir_glob:
                log_and_email(
                    f"glob: {ii['source']} expands to files and dirs.  Not allowed.  Skipping this archive item.",
                    logging.error)
                continue

            for es_ix, es in enumerate(expanded_sources):
                # skip files whose basename starts with underscore if set to skip them
                if ii.get("skipUnderscoreFiles") and os.path.basename(es).startswith('_'):
                    continue

                ii["source"] = es

                # mark whether this is the last item in the glob expansion
                ii["last_glob"] = (es_ix == len(expanded_sources) - 1)
                if not prepare_and_add_transfer(transfer, tdata, ii):
                    continue

        else:
            if not ii["glob"] and not os.path.exists(ii["source"]):
                log_and_email(
                    f"{ii['source']} does not exist. Skipping this archive item.",
                    logging.error)
                continue

            # setting last glob to True for tarring with a glob so expected file size/number is checked
            ii["last_glob"] = True
            if not prepare_and_add_transfer(transfer, tdata, ii):
                continue

    # submit all tasks for transfer
    if p.opt['submitTasks']:
        submit_transfer_task(transfer, tdata)
Example #8
def upload(dataframe, destination, metadata, gcp, update, test, dry_run,
           verbose, no_analyze):
    """
    Create a search entry and upload this file to the GCS Endpoint.

    # TODO: Fault tolerance for interrupted or failed file uploads (rollback)
    """
    pc = pilot.commands.get_pilot_client()
    if not pc.is_logged_in():
        click.echo('You are not logged in.')
        return

    if test:
        click.secho('Using test location: {}'.format(pc.TESTING_DIR),
                    fg='yellow')
        click.secho('Using test index for Globus Search', fg='yellow')

    if not destination:
        path = pc.get_path('', '', test)
        dirs = pc.ls('', '', test)
        click.echo('No Destination Provided. Please select one from the '
                   'directory "{}":\n{}'.format(path, '\t '.join(dirs)))
        return

    try:
        pc.ls(dataframe, destination, test)
    except globus_sdk.exc.TransferAPIError as tapie:
        if tapie.code == 'ClientError.NotFound':
            url = pc.get_globus_app_url('', test)
            click.secho('Directory does not exist: "{}"\nPlease create it at: '
                        '{}'.format(destination, url),
                        err=True,
                        bg='red')
            return 1
        else:
            click.secho(tapie.message, err=True, bg='red')
            return 1

    if metadata is not None:
        with open(metadata) as mf_fh:
            user_metadata = json.load(mf_fh)
    else:
        user_metadata = {}

    filename = os.path.basename(dataframe)
    prev_metadata = pc.get_search_entry(filename, destination, test)

    url = pc.get_globus_http_url(filename, destination, test)
    new_metadata = scrape_metadata(dataframe, url, no_analyze, test)

    try:
        new_metadata = update_metadata(new_metadata, prev_metadata,
                                       user_metadata)
        subject = pc.get_subject_url(filename, destination, test)
        gmeta = gen_gmeta(subject, pc.GROUP, new_metadata)
    except (RequiredUploadFields, ValidationError) as e:
        click.secho('Error Validating Metadata: {}'.format(e), fg='red')
        return 1

    if json.dumps(new_metadata) == json.dumps(prev_metadata):
        click.secho(
            'Files and search entry are an exact match. No update '
            'necessary.',
            fg='green')
        return 1

    if prev_metadata and not update:
        last_updated = prev_metadata['dc']['dates'][-1]['date']
        dt = datetime.datetime.strptime(last_updated, '%Y-%m-%dT%H:%M:%S.%fZ')
        click.echo('Existing record found for {}, specify -u to update.\n'
                   'Last updated: {: %A, %b %d, %Y}'
                   ''.format(filename, dt))
        return 1

    if dry_run:
        click.echo('Success! (Dry Run -- No changes made.)')
        click.echo(
            'Pre-existing record: {}'.format('yes' if prev_metadata else 'no'))
        click.echo('Version: {}'.format(new_metadata['dc']['version']))
        click.echo('Search Subject: {}\nURL: {}'.format(subject, url))
        if verbose:
            click.echo('Ingesting the following data:')
            click.echo(json.dumps(new_metadata, indent=2))
        return

    click.echo('Ingesting record into search...')
    pc.ingest_entry(gmeta, test)
    click.echo('Success!')

    if prev_metadata and not files_modified(new_metadata['files'],
                                            prev_metadata['files']):
        click.echo('Metadata updated, dataframe is already up to date.')
        return
    if gcp:
        local_ep = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
        if not local_ep:
            raise Exception('No local GCP client found')
        auth = pc.get_authorizers()['transfer.api.globus.org']
        tc = globus_sdk.TransferClient(authorizer=auth)
        tdata = globus_sdk.TransferData(tc,
                                        local_ep,
                                        pc.ENDPOINT,
                                        label='{} Transfer'.format(
                                            pc.APP_NAME),
                                        notify_on_succeeded=False,
                                        sync_level='checksum',
                                        encrypt_data=True)
        path = pc.get_path(filename, destination, test)
        tdata.add_item(dataframe, path)
        click.echo('Starting Transfer...')
        transfer_result = tc.submit_transfer(tdata)
        short_path = os.path.join(destination, filename)
        pilot.config.config.add_transfer_log(transfer_result, short_path)
        click.echo('{}. You can check the status below: \n'
                   'https://app.globus.org/activity/{}/overview\n'
                   'URL will be: {}'.format(transfer_result['message'],
                                            transfer_result['task_id'], url))
    else:
        click.echo('Uploading data...')
        response = pc.upload(dataframe, destination, test)
        if response.status_code == 200:
            click.echo('Upload Successful! URL is \n{}'.format(url))
        else:
            click.echo('Failed with status code: {}'.format(
                response.status_code))
def local_gcp():
    return globus_sdk.LocalGlobusConnectPersonal()
Example #10
    def start_deriva_flow(self,
                          data_path,
                          dcc_id,
                          catalog_id=None,
                          schema=None,
                          server=None,
                          output_dir=None,
                          delete_dir=False,
                          handle_git_repos=True,
                          dry_run=False,
                          test_sub=False,
                          verbose=False,
                          force_http=False,
                          **kwargs):
        """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

        Arguments:
            data_path (str): The path to the data to ingest into DERIVA. The path can be:
                    1) A directory to be formatted into a BDBag
                    2) A Git repository to be copied into a BDBag
                    3) A premade BDBag directory
                    4) A premade BDBag in an archive file
            dcc_id (str): The CFDE-recognized DCC ID for this submission.
            catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                    Default None, to create a new catalog.
            schema (str): The named schema or schema file link to validate data against.
                    Default None, to only validate against the declared TableSchema.
            server (str): The DERIVA server to ingest to.
                    Default None, to use the Action Provider-set default.
            output_dir (str): The path to create an output directory in. The resulting
                    BDBag archive will be named after this directory.
                    If not set, the directory will be turned into a BDBag in-place.
                    For Git repositories, this is automatically set, but can be overridden.
                    If data_path is a file, this has no effect.
                    This dir MUST NOT be in the `data_path` directory or any subdirectories.
                    Default None.
            delete_dir (bool): Should the output_dir be deleted after submission?
                    Has no effect if output_dir is not specified.
                    For Git repositories, this is always True.
                    Default False.
            handle_git_repos (bool): Should Git repositories be detected and handled?
                    When this is False, Git repositories are handled as simple directories
                    instead of Git repositories.
                    Default True.
            dry_run (bool): Should the data be validated and bagged without starting the Flow?
                    When True, does not ingest into DERIVA or start the Globus Automate Flow,
                    and the return value will not have valid DERIVA Flow information.
                    Default False.
            test_sub (bool): Should the submission be run in "test mode", where
                    the submission will be ingested into DERIVA and immediately deleted?
                    When True, the data will not remain in DERIVA to be viewed and the
                    Flow will terminate before any curation step.
                    Default False.
            verbose (bool): Should intermediate status messages be printed out?
                    Default False.

        Keyword Arguments:
            force_http (bool): Should the data be sent using HTTP instead of Globus Transfer,
                    even if Globus Transfer is available? Because Globus Transfer is more
                    robust than HTTP, it is highly recommended to leave this False.
                    Default False.

        Other keyword arguments are passed directly to the ``make_bag()`` function of the
        BDBag API (see https://github.com/fair-research/bdbag for details).
        """
        self.check()
        if verbose:
            print("Startup: Validating input")

        catalogs = self.remote_config['CATALOGS']
        if catalog_id in catalogs.keys():
            if schema:
                raise ValueError(
                    "You may not specify a schema ('{}') when ingesting to "
                    "a named catalog ('{}'). Retry without specifying "
                    "a schema.".format(schema, catalog_id))
            schema = catalogs[catalog_id]

        # Coerces the BDBag path to a .zip archive
        data_path = validation.validate_user_submission(
            data_path,
            schema,
            output_dir=output_dir,
            delete_dir=delete_dir,
            handle_git_repos=handle_git_repos,
            bdbag_kwargs=kwargs)

        flow_info = self.remote_config["FLOWS"][self.service_instance]
        dest_path = "{}{}".format(flow_info["cfde_ep_path"],
                                  os.path.basename(data_path))

        # If doing dry run, stop here before making Flow input
        if dry_run:
            return {
                "success": True,
                "message": "Dry run validated successfully. No data was transferred."
            }

        logger.debug("Creating input for Flow")
        flow_input = {
            "cfde_ep_id": flow_info["cfde_ep_id"],
            "test_sub": test_sub,
            "dcc_id": dcc_id
        }
        if catalog_id:
            flow_input["catalog_id"] = str(catalog_id)
        if server:
            flow_input["server"] = server

        # If local EP exists (and not force_http), can use Transfer
        # Local EP fetched now in case GCP started after Client creation
        local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
        logger.debug(f'Local endpoint: {local_endpoint}')
        if local_endpoint and not force_http:
            logger.debug(
                "Using local Globus Connect Personal Endpoint '{}'".format(
                    local_endpoint))
            # Populate Transfer fields in Flow
            flow_input.update({
                "source_endpoint_id": local_endpoint,
                "source_path": data_path,
                "cfde_ep_path": dest_path,
                "cfde_ep_url": flow_info["cfde_ep_url"],
                "is_directory": False,
            })
        # Otherwise, we must PUT the BDBag on the server
        else:
            logger.debug("GCP Not installed, uploading with HTTP PUT instead")
            data_url = "{}{}".format(flow_info["cfde_ep_url"], dest_path)
            globus_http.upload(data_path, data_url, self.https_authorizer)
            flow_input.update({
                "source_endpoint_id": False,
                "data_url": data_url,
            })

        if verbose:
            print("Flow input populated:\n{}".format(
                json.dumps(flow_input, indent=4, sort_keys=True)))
        # Get Flow scope
        flow_id = flow_info["flow_id"]
        # Start Flow
        if verbose:
            print("Starting Flow - Submitting data")
        try:
            flow_res = self.flow_client.run_flow(flow_id, self.flow_scope,
                                                 flow_input)
        except globus_sdk.GlobusAPIError as e:
            if e.http_status == 404:
                return {
                    "success": False,
                    "error": ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                              "Demo Globus Group? Check your membership or apply for access "
                              "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                              "0efb3ba9a670/about")
                }
            else:
                raise
        self.last_flow_run = {
            "flow_id": flow_id,
            "flow_instance_id": flow_res["action_id"]
        }
        if verbose:
            print("Flow started successfully.")

        return {
            "success": True,
            "message": ("Started DERIVA ingest flow\nYour dataset has been "
                        "submitted\nYou can check the progress with: cfde-submit status\n"),
            "flow_id": flow_id,
            "flow_instance_id": flow_res["action_id"],
            "cfde_dest_path": dest_path,
            "http_link": "{}{}".format(flow_info["cfde_ep_url"], dest_path),
            "globus_web_link": ("https://app.globus.org/file-manager?origin_id={}&origin_path={}"
                                .format(flow_info["cfde_ep_id"], os.path.dirname(dest_path)))
        }
Example #11
    def start_deriva_flow(self,
                          data_path,
                          dcc_id,
                          catalog_id=None,
                          schema=None,
                          server=None,
                          dataset_acls=None,
                          output_dir=None,
                          delete_dir=False,
                          handle_git_repos=True,
                          dry_run=False,
                          test_sub=False,
                          verbose=False,
                          **kwargs):
        """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

        Arguments:
            data_path (str): The path to the data to ingest into DERIVA. The path can be:
                    1) A directory to be formatted into a BDBag
                    2) A Git repository to be copied into a BDBag
                    3) A premade BDBag directory
                    4) A premade BDBag in an archive file
            dcc_id (str): The CFDE-recognized DCC ID for this submission.
            catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                    Default None, to create a new catalog.
            schema (str): The named schema or schema file link to validate data against.
                    Default None, to only validate against the declared TableSchema.
            server (str): The DERIVA server to ingest to.
                    Default None, to use the Action Provider-set default.
            dataset_acls (dict): The DERIVA ACL(s) to set on the final dataset.
                    Default None, to use the CFDE default ACLs.
            output_dir (str): The path to create an output directory in. The resulting
                    BDBag archive will be named after this directory.
                    If not set, the directory will be turned into a BDBag in-place.
                    For Git repositories, this is automatically set, but can be overridden.
                    If data_path is a file, this has no effect.
                    This dir MUST NOT be in the `data_path` directory or any subdirectories.
                    Default None.
            delete_dir (bool): Should the output_dir be deleted after submission?
                    Has no effect if output_dir is not specified.
                    For Git repositories, this is always True.
                    Default False.
            handle_git_repos (bool): Should Git repositories be detected and handled?
                    When this is False, Git repositories are handled as simple directories
                    instead of Git repositories.
                    Default True.
            dry_run (bool): Should the data be validated and bagged without starting the Flow?
                    When True, does not ingest into DERIVA or start the Globus Automate Flow,
                    and the return value will not have valid DERIVA Flow information.
                    Default False.
            test_sub (bool): Should the submission be run in "test mode", where
                    the submission will be ingested into DERIVA and immediately deleted?
                    When True, the data will not remain in DERIVA to be viewed and the
                    Flow will terminate before any curation step.
                    Default False.
            verbose (bool): Should intermediate status messages be printed out?
                    Default False.

        Keyword Arguments:
            force_http (bool): Should the data be sent using HTTP instead of Globus Transfer,
                    even if Globus Transfer is available? Because Globus Transfer is more
                    robust than HTTP, it is highly recommended to leave this False.
                    Default False.

        Other keyword arguments are passed directly to the ``make_bag()`` function of the
        BDBag API (see https://github.com/fair-research/bdbag for details).
        """
        if verbose:
            print("Startup: Validating input")
        data_path = os.path.abspath(data_path)
        if not os.path.exists(data_path):
            raise FileNotFoundError(
                "Path '{}' does not exist".format(data_path))

        if catalog_id in self.catalogs.keys():
            if schema:
                raise ValueError(
                    "You may not specify a schema ('{}') when ingesting to "
                    "a named catalog ('{}'). Retry without specifying "
                    "a schema.".format(schema, catalog_id))
            schema = self.catalogs[catalog_id]
        # Pull out known kwargs
        force_http = kwargs.pop("force_http", False)

        if handle_git_repos:
            if verbose:
                print("Checking for a Git repository")
            # If Git repo, set output_dir appropriately
            try:
                repo = git.Repo(data_path, search_parent_directories=True)
            # Not Git repo
            except git.InvalidGitRepositoryError:
                if verbose:
                    print("Not a Git repo")
            # Path not found, turn into standard FileNotFoundError
            except git.NoSuchPathError:
                raise FileNotFoundError(
                    "Path '{}' does not exist".format(data_path))
            # Is Git repo
            else:
                if verbose:
                    print("Git repo found, collecting metadata")
                # Needs to not have slash at end - is known Git repo already, slash
                # interferes with os.path.basename/dirname
                if data_path.endswith("/"):
                    data_path = data_path[:-1]
                # Set output_dir to new dir named with HEAD commit hash
                new_dir_name = "{}_{}".format(os.path.basename(data_path),
                                              str(repo.head.commit))
                output_dir = os.path.join(os.path.dirname(data_path),
                                          new_dir_name)
                # Delete temp dir after archival
                delete_dir = True

        # If dir and not already BDBag, make BDBag
        if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
            if verbose:
                print("Creating BDBag out of directory '{}'".format(data_path))
            # If output_dir specified, copy data to output dir first
            if output_dir:
                if verbose:
                    print("Copying data to '{}' before creating BDBag".format(
                        output_dir))
                output_dir = os.path.abspath(output_dir)
                # If shutil.copytree is called when the destination dir is inside the source dir
                # by more than one layer, it will recurse infinitely.
                # (e.g. /source => /source/dir/dest)
                # Exactly one layer is technically okay (e.g. /source => /source/dest),
                # but it's easier to forbid all parent/child dir cases.
                # Check for this error condition by determining if output_dir is a child
                # of data_path.
                if os.path.commonpath([data_path]) == os.path.commonpath(
                    [data_path, output_dir]):
                    raise ValueError(
                        "The output_dir ('{}') must not be in data_path ('{}')"
                        .format(output_dir, data_path))
                try:
                    shutil.copytree(data_path, output_dir)
                except FileExistsError:
                    raise FileExistsError(
                        ("The output directory must not exist. "
                         "Delete '{}' to submit.\nYou can set delete_dir=True "
                         "to avoid this issue in the future."
                         ).format(output_dir))
                # Process new dir instead of old path
                data_path = output_dir
            # If output_dir not specified, never delete data dir
            else:
                delete_dir = False
            # Make bag
            bdbag_api.make_bag(data_path, **kwargs)
            if not bdbag_api.is_bag(data_path):
                raise ValueError(
                    "Failed to create BDBag from {}".format(data_path))
            elif verbose:
                print("BDBag created at '{}'".format(data_path))

        # If dir (must be BDBag at this point), archive
        if os.path.isdir(data_path):
            if verbose:
                print("Archiving BDBag at '{}' using '{}'".format(
                    data_path, CONFIG["ARCHIVE_FORMAT"]))
            new_data_path = bdbag_api.archive_bag(data_path,
                                                  CONFIG["ARCHIVE_FORMAT"])
            if verbose:
                print("BDBag archived to file '{}'".format(new_data_path))
            # If requested (e.g. Git repo copied dir), delete data dir
            if delete_dir:
                if verbose:
                    print("Removing old directory '{}'".format(data_path))
                shutil.rmtree(data_path)
            # Overwrite data_path - don't care about dir for uploading
            data_path = new_data_path

        # Validate TableSchema in BDBag
        if verbose:
            print("Validating TableSchema in BDBag '{}'".format(data_path))
        validation_res = ts_validate(data_path, schema=schema)
        if not validation_res["is_valid"]:
            return {
                "success": False,
                "error": ("TableSchema invalid due to the following errors: \n{}\n"
                          .format(validation_res["error"]))
            }
        elif verbose:
            print("Validation successful")

        # Now BDBag is archived file
        # Set path on destination
        dest_path = "{}{}".format(self.flow_info["cfde_ep_path"],
                                  os.path.basename(data_path))

        # If doing dry run, stop here before making Flow input
        if dry_run:
            return {
                "success": True,
                "message": "Dry run validated successfully. No data was transferred."
            }

        # Set up Flow
        if verbose:
            print("Creating input for Flow")
        # If local EP exists (and not force_http), can use Transfer
        # Local EP fetched now in case GCP started after Client creation
        local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
        if local_endpoint and not force_http:
            if verbose:
                print(
                    "Using local Globus Connect Personal Endpoint '{}'".format(
                        local_endpoint))
            # Populate Transfer fields in Flow
            flow_id = self.flow_info["flow_id"]
            flow_input = {
                "source_endpoint_id": local_endpoint,
                "source_path": data_path,
                "cfde_ep_id": self.flow_info["cfde_ep_id"],
                "cfde_ep_path": dest_path,
                "cfde_ep_url": self.flow_info["cfde_ep_url"],
                "is_directory": False,
                "test_sub": test_sub,
                "dcc_id": dcc_id
            }
            if catalog_id:
                flow_input["catalog_id"] = str(catalog_id)
            if server:
                flow_input["server"] = server
        # Otherwise, we must PUT the BDBag on the server
        else:
            if verbose:
                print("No Globus Endpoint detected; using HTTP upload instead")
            headers = {}
            self.__https_authorizer.set_authorization_header(headers)
            data_url = "{}{}".format(self.flow_info["cfde_ep_url"], dest_path)

            with open(data_path, 'rb') as bag_file:
                bag_data = bag_file.read()

            put_res = requests.put(data_url, data=bag_data, headers=headers)

            # Regenerate headers on 401
            if put_res.status_code == 401:
                self.__https_authorizer.handle_missing_authorization()
                self.__https_authorizer.set_authorization_header(headers)
                put_res = requests.put(data_url,
                                       data=bag_data,
                                       headers=headers)

            # Error message on failed PUT or any unexpected response
            if put_res.status_code >= 300:
                return {
                    "success": False,
                    "error": ("Could not upload BDBag to server (error {}):\n{}".format(
                        put_res.status_code, put_res.content))
                }
            elif put_res.status_code != 200:
                print(
                    "Warning: HTTP upload returned status code {}, which was unexpected."
                    .format(put_res.status_code))

            if verbose:
                print("Upload successful to '{}': {} {}".format(
                    data_url, put_res.status_code, put_res.content))

            flow_id = self.flow_info["flow_id"]
            flow_input = {
                "source_endpoint_id": False,
                "data_url": data_url,
                "test_sub": test_sub,
                "dcc_id": dcc_id
            }
            if catalog_id:
                flow_input["catalog_id"] = str(catalog_id)
            if server:
                flow_input["server"] = server

        if verbose:
            print("Flow input populated:\n{}".format(
                json.dumps(flow_input, indent=4, sort_keys=True)))
        # Get Flow scope
        flow_def = self.flow_client.get_flow(flow_id)
        flow_scope = flow_def["globus_auth_scope"]
        # Start Flow
        if verbose:
            print("Starting Flow - Submitting data")
        try:
            flow_res = self.flow_client.run_flow(flow_id, flow_scope,
                                                 flow_input)
        except globus_sdk.GlobusAPIError as e:
            if e.http_status == 404:
                return {
                    "success": False,
                    "error": ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                              "Demo Globus Group? Check your membership or apply for access "
                              "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                              "0efb3ba9a670/about")
                }
            else:
                raise
        self.last_flow_run = {
            "flow_id": flow_id,
            "flow_instance_id": flow_res["action_id"]
        }
        if verbose:
            print("Flow started successfully.")

        return {
            "success": True,
            "message": ("Started DERIVA ingest Flow\nFlow ID: {}\nFlow Instance ID: {}"
                        .format(flow_id, flow_res["action_id"])),
            "flow_id": flow_id,
            "flow_instance_id": flow_res["action_id"],
            "cfde_dest_path": dest_path,
            "http_link": "{}{}".format(self.flow_info["cfde_ep_url"], dest_path),
            "globus_web_link": ("https://app.globus.org/file-manager?origin_id={}&origin_path={}"
                                .format(self.flow_info["cfde_ep_id"], os.path.dirname(dest_path)))
        }
Example #12
def getTokens():

    tokens = None
    try:
        # if we already have tokens, load and use them
        tokens = load_tokens_from_file(p.opt["globusTokenFile"])
    except Exception:
        pass

    if not tokens:
        # if we need to get tokens, start the Native App authentication process
        tokens = do_native_app_authentication(CLIENT_ID, REDIRECT_URI, SCOPES)

        try:
            save_tokens_to_file(p.opt["globusTokenFile"], tokens)
        except Exception:
            pass

    transfer_tokens = tokens['transfer.api.globus.org']

    auth_client = globus_sdk.NativeAppAuthClient(client_id=CLIENT_ID)

    authorizer = globus_sdk.RefreshTokenAuthorizer(
        transfer_tokens['refresh_token'],
        auth_client,
        access_token=transfer_tokens['access_token'],
        expires_at=transfer_tokens['expires_at_seconds'],
        on_refresh=update_tokens_file_on_refresh)

    transfer = globus_sdk.TransferClient(authorizer=authorizer)

    myproxy_lifetime = 720  #in hours.  What's the maximum?
    try:
        r = transfer.endpoint_autoactivate(p.opt["archiveEndPoint"],
                                           if_expires_in=3600)
        while (r["code"] == "AutoActivationFailed"):
            print(
                "Endpoint requires manual activation, please use your UCAS name/password for this activation. "
                "You can activate via the command line or via web browser:\n"
                "WEB BROWSER -- Open the following URL in a browser to activate the "
                "endpoint:")
            print(
                f"https://app.globus.org/file-manager?origin_id={p.opt['archiveEndPoint']}"
            )
            print("CMD LINE -- run this from your shell: ")
            print(
                f"globus endpoint activate --myproxy --myproxy-lifetime {myproxy_lifetime} {p.opt['archiveEndPoint']}"
            )
            input("Press ENTER after activating the endpoint:")
            r = transfer.endpoint_autoactivate(p.opt["archiveEndPoint"],
                                               if_expires_in=3600)

    except globus_sdk.exc.GlobusAPIError as ex:
        print("endpoint_autoactivation failed.")
        print(ex)
        if ex.http_status == 401:
            sys.exit('Refresh token has expired. '
                     'Please delete refresh-tokens.json and try again.')
        else:
            raise ex

    # print out a directory listing from an endpoint
    #print("Looking at archive end point")
    #for entry in transfer.operation_ls(p.opt["archiveEndPoint"], path='/~/'):
    #    print(entry['name'] + ('/' if entry['type'] == 'dir' else ''))

    # revoke the access token that was just used to make requests against
    # the Transfer API to demonstrate that the RefreshTokenAuthorizer will
    # automatically get a new one
    #auth_client.oauth2_revoke_token(authorizer.access_token)
    # Allow a little bit of time for the token revocation to settle
    #time.sleep(1)
    # Verify that the access token is no longer valid
    #token_status = auth_client.oauth2_validate_token(
    #    transfer_tokens['access_token'])
    #assert token_status['active'] is False, 'Token was expected to be invalid.'

    #print('\nDoing a second directory listing with a new access token:')
    #for entry in transfer.operation_ls(p.opt["archiveEndPoint"], path='/~/'):
    #    print(entry['name'] + ('/' if entry['type'] == 'dir' else ''))

    local_ep = globus_sdk.LocalGlobusConnectPersonal()
    local_ep_id = local_ep.endpoint_id

    #print("Looking at local end point")
    #for entry in transfer.operation_ls(local_ep_id):
    #    print(f"Local file: {entry['name']}")

    logging.info("BEGINNING PROCESSING OF archiveItems")
    for item, item_info in p.opt["archiveItems"].items():
        logging.info(f"Transferring {item}")
        if not item_info["source"].startswith('/'):
            logging.error(
                f"{item} source: {item_info['source']} must be absolute.  SKIPPING!"
            )
            continue
        if not item_info["destination"].startswith('/'):
            logging.error(
                f"{item} source: {item_info['destination']} must be absolute.  SKIPPING!"
            )
            continue
        try:
            transfer.operation_ls(p.opt["archiveEndPoint"],
                                  path=item_info["destination"])
        except globus_sdk.exc.TransferAPIError as e:
            logging.fatal(
                f"Destination path ({item_info['destination']}) does not exist on archiveEndPoint."
            )
            logging.fatal(e)
            sys.exit(1)

        # get leaf dir from source, and add it to destination
        dirname, leaf = os.path.split(item_info['source'])
        if leaf == '':
            _, leaf = os.path.split(dirname)
        destination_directory = os.path.join(item_info['destination'],
                                             leaf) + '/'

        # Check if destination_dir already exists, and skip if so
        # TODO: add support to overwrite?
        try:
            transfer.operation_ls(p.opt["archiveEndPoint"],
                                  path=destination_directory)
            logging.error(
                f"Destination {destination_directory} already exists on archiveEndPoint.  SKIPPING!"
            )
            continue
        except globus_sdk.exc.TransferAPIError as e:
            if e.code != u'ClientError.NotFound':
                logging.fatal(
                    f"Can't ls {p.opt['archiveEndPoint']} : {destination_directory}"
                )
                logging.fatal(e)
                sys.exit(1)

        # create destination directory
        try:
            logging.info(
                f"Creating destination directory {destination_directory}")
            transfer.operation_mkdir(p.opt["archiveEndPoint"],
                                     destination_directory)
        except globus_sdk.exc.TransferAPIError as e:
            logging.fatal(
                f"Can't mkdir {p.opt['archiveEndPoint']} : {destination_directory}"
            )
            logging.fatal(e)
            sys.exit(1)

        # TODO: set permissions for users to read dir
        #       look at https://github.com/globus/automation-examples/blob/master/share_data.py

        #tdata = globus_sdk.TransferData(transfer, local_ep_id, p.opt["archiveEndPoint"], label=item_info["transfer-label"])
        tdata = globus_sdk.TransferData(transfer, local_ep_id,
                                        p.opt["archiveEndPoint"])
        tdata.add_item(item_info["source"],
                       destination_directory,
                       recursive=True)
        try:
            logging.info(
                f"Submitting transfer task - {item_info['transfer-label']}")
            task = transfer.submit_transfer(tdata)
        except globus_sdk.exc.TransferAPIError as e:
            logging.fatal("Transfer task submission failed")
            logging.fatal(e)
            sys.exit(1)
        logging.info(f"Task ID: {task['task_id']}")
        logging.info(
            f"This task can be monitored via the Web UI: https://app.globus.org/activity/{task['task_id']}"
        )