Example #1
0
def check_md5(filename: str,
              figshare_checksum: str,
              log=None) -> bool:
    """
    Perform checksum after file retrieval against Figshare's computation

    :param filename: Full path of file on server
    :param figshare_checksum: MD5 checksum string from supplied_md5 metadata
    :param log: logger.LogClass object. Default is stdout via python logging

    :return: ``True`` if passed, ``False`` if not a match
    """

    # Resolve the logger at call time instead of via a call-at-definition-time
    # default; this matches the log=None convention used elsewhere here
    if log is None:
        log = log_stdout()

    log.info("Performing MD5 checksum ...")

    hash_md5 = hashlib.md5()
    with open(filename, "rb") as f:
        # Handle large files by chunking
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)

    checksum_compute = hash_md5.hexdigest()
    checksum_pass = checksum_compute == figshare_checksum
    if checksum_pass:
        log.info("MD5 Checksum passed!!!")
    else:
        log.warning("Checksum failed!!!")
    log.info(f"MD5 Result:  {checksum_compute}")
    log.info(f"Expectation: {figshare_checksum}")

    return checksum_pass
    def __init__(self,
                 article_id,
                 fs_admin,
                 curation_id=None,
                 verbose=True,
                 log=None):
        """
        Initialize curation metadata for a Figshare deposit.

        :param article_id: Figshare article ID
        :param fs_admin: Figshare admin interface object; used by
               ``get_curation_id``/``get_curation_dict`` (defined elsewhere)
        :param curation_id: Figshare curation ID. Retrieved via
               ``get_curation_id()`` when not provided
        :param verbose: Enable verbose output
        :param log: logger.LogClass object. Default is stdout via python logging
        """
        self.article_id = article_id
        self.fs_admin = fs_admin
        self.verbose = verbose

        # Default to stdout logging when no logger is supplied
        if log is None:
            self.log = log_stdout()
        else:
            self.log = log

        # Retrieves specific information for article (includes authors)
        if curation_id is None:
            self.curation_id = self.get_curation_id()
        else:
            self.curation_id = curation_id
        self.curation_dict = self.get_curation_dict()

        # Set version number: an unapproved deposit becomes the next version
        if self.curation_dict['status'] == 'approved':
            self.version_no = self.curation_dict['version']
        else:
            self.version_no = self.curation_dict['version'] + 1

        self.name_dict = self.get_name_dict()
        self.folderName = self.get_folder_name()
Example #3
0
    def __init__(self,
                 token: str,
                 stage: bool = False,
                 admin_filter: list = None,
                 log=None):
        """
        Initialize Figshare account/institution API settings.

        :param token: Figshare OAuth token
        :param stage: Use the Figshare staging (figsh.com) endpoint instead
               of production
        :param admin_filter: List of administrative accounts to filter out.
               When provided, ``ignore_admin`` is enabled
        :param log: logger.LogClass object. Default is stdout via python logging
        """

        self.token = token
        self.stage = stage

        # Production vs staging API base endpoint
        if not self.stage:
            self.baseurl = "https://api.figshare.com/v2/account/"
        else:
            self.baseurl = "https://api.figsh.com/v2/account/"

        self.baseurl_institute = self.baseurl + "institution/"

        self.headers = {'Content-Type': 'application/json'}
        if self.token:
            self.headers['Authorization'] = f'token {self.token}'

        self.admin_filter = admin_filter
        # Admin-account filtering applies only when a filter list is supplied
        self.ignore_admin = admin_filter is not None

        # Resolve the logger at call time (log=None convention) rather than
        # via a call-at-definition-time default
        self.log = log_stdout() if log is None else log
Example #4
0
def private_file_retrieve(url, filename=None, token=None, log=None):
    """
    Purpose:
      Custom Request to privately retrieve a file with a token.
      This was built off of the figshare Python code, but a urlretrieve
      did not handle providing a token in the header.

    :param url: Full URL (str)
    :param filename: Full filename for file to be written (str)
    :param token: API token (str)
    :param log: logger.LogClass object. Default is stdout via python logging

    :raises HTTPError: On a failed request or an I/O error during download
    """

    if log is None:
        log = log_stdout()

    headers = {}
    if token:
        headers['Authorization'] = f'token {token}'

    try:
        # HEAD request first to surface authorization/not-found errors early
        h = requests.head(url, headers=headers)
        h.raise_for_status()

        # Chunk read and write with stream option and copyfileobj
        with requests.get(url, stream=True, headers=headers) as r:
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
    except (HTTPError, IOError) as error:
        log.warning(error)
        # Chain the original exception so the root cause is preserved
        raise HTTPError(error) from error
Example #5
0
def save_metadata(json_response: Union[list, dict],
                  out_file_prefix: str,
                  metadata_source: str = 'CURATION',
                  root_directory: str = '',
                  metadata_directory: str = '',
                  save_csv: bool = False,
                  overwrite: bool = False,
                  log=None):
    """
    Write metadata contents to JSON and CSV file

    :param json_response: Content in list or dict
    :param out_file_prefix: Filename prefix. Appends .json and .csv
    :param metadata_source: Source of metadata (e.g. 'CURATION')
    :param root_directory: Full path containing the working directory.
           Defaults to the current working directory
    :param metadata_directory: Metadata path, relative to ``root_directory``
    :param save_csv: Save a CSV file. Default: False
    :param overwrite: Overwrite file if it exists. Default: False
    :param log: LogClass or logging object. Default: log_stdout()
    """

    if log is None:
        log = log_stdout()

    log.debug("starting ...")
    log.info("")
    log.info(f"** SAVING {metadata_source} METADATA **")

    if not root_directory:
        root_directory = os.getcwd()

    metadata_path = os.path.join(root_directory, metadata_directory)

    # os.path.join (rather than hard-coded "/") keeps the path portable
    out_file_prefix = os.path.join(metadata_path, out_file_prefix)

    # Write JSON file (skip when present unless overwrite is requested)
    json_out_file = f"{out_file_prefix}.json"
    if not os.path.exists(json_out_file):
        write_json(json_out_file, json_response, log)
    else:
        log.info(f"File exists: {json_out_file}")
        if overwrite:
            log.info("Overwriting!")
            write_json(json_out_file, json_response, log)

    # Write CSV file (same exists/overwrite policy as JSON)
    if save_csv:
        df = pd.DataFrame.from_dict(json_response, orient='columns')
        csv_out_file = f"{out_file_prefix}.csv"
        if not os.path.exists(csv_out_file):
            log.info(f"Writing: {csv_out_file}")
            df.to_csv(csv_out_file, index=False)
        else:
            log.info(f"File exists: {csv_out_file}")
            if overwrite:
                log.info("Overwriting!")
                df.to_csv(csv_out_file, index=False)

    log.debug("finished.")
def test_log_stdout():
    """Ensure ``logger.log_stdout()`` returns a usable ``logging.Logger``."""
    log0 = logger.log_stdout()

    # Exercise the common log levels; none of these should raise
    log0.info("Print INFO test")
    log0.debug("Print DEBUG test")
    log0.warning("Print WARNING test")

    assert isinstance(log0, logging.Logger)
Example #7
0
def tiny_url(url: str, alias=None, log=None) -> str:
    """
    Purpose:
      Generate a TinyURL

    :param url: str. HTTP URL
    :param alias: str. Alias if desired
    :param log: LogClass or logger object

    :return: response_data: str containing the shortened TinyURL

    :raises HTTPError: If the TinyURL creation request fails
    """
    if log is None:
        log = log_stdout()

    endpoint = "http://tinyurl.com/api-create.php"
    encoded_url = urlencode({'url': url}, safe=url_safe, quote_via=quote)
    log.debug(f"encoded_url: {encoded_url}")
    get_url = f"{endpoint}?{encoded_url}"
    log.debug(f"get_url : {get_url}")

    params = dict()
    if alias is not None:
        params = {'alias': alias}

    expected_url = f"https://tinyurl.com/{alias}"

    # Check whether the alias already resolves to an existing TinyURL
    expected_response = requests.get(expected_url)
    if expected_response.status_code == 200:
        log.info("TinyURL link already exists!")

        expected_request_url = f"{url}&alias={alias}"
        if expected_response.url != expected_request_url:
            # Alias exists but points at a different URL: create a fresh one
            log.warning("Input URL changed!")
            log.debug(f"Previous URL: {expected_response.url}")
            log.debug(f"New URL: {expected_request_url}")
            log.warning("Creating new TinyURL")
            response = requests.get(get_url)
            response_data = response.text
        else:
            response_data = expected_url
    else:
        log.info("TinyURL link does not exist. Creating!")
        # GET still works if the TinyURL alias exists, but points to the same URL
        response = requests.get(get_url, params=params)

        try:
            response.raise_for_status()
            response_data = response.text
        except HTTPError as error:
            log.warning(f"Caught an HTTPError: {error}")
            # Format the body into the message; the original passed it as a
            # stray positional argument that logging cannot format
            log.warning(f"Body:\n{response.text}")
            # Re-raise the original HTTPError rather than a bare class,
            # preserving the status and response for callers
            raise

    return response_data
Example #8
0
def review_report(depositor_name='',
                  curation_dict=config_default_dict['curation'],
                  log=None):
    """
    Purpose:
      Retrieve Curation Review Report and save on curation server

    :param depositor_name: Depositor name; used to build folder and file names
    :param curation_dict: Curation configuration dict
    :param log: logger.LogClass object. Default is stdout via python logging
    """

    if log is None:
        log = log_stdout()

    log.info("")
    log.info("** CREATING CURATION REVIEW REPORT **")

    root_directory_main = curation_dict[curation_dict['parent_dir']]
    todo_folder = curation_dict['folder_todo']
    folder_ual_rdm = curation_dict['folder_ual_rdm']
    report_url = curation_dict['report_url']

    staging_directory = join(root_directory_main, todo_folder)

    # Complete path to UAL_RDM folder
    out_path = join(staging_directory, depositor_name, folder_ual_rdm)
    if not exists(out_path):
        log.info(f"Creating folder : {out_path}")
        makedirs(out_path, mode=0o777, exist_ok=True)
    else:
        # warning() instead of the deprecated warn()
        log.warning(f"!!!! Folder exists, not creating : {out_path}")

    # MS-Word document filename
    simplify_name = depositor_name.replace('/v', '_v')
    filename = f'ReDATA-DepositReview_{simplify_name}.docx'
    out_file = join(out_path, filename)

    # Write file only when not already present
    if not exists(out_file):
        log.info(f"Saving ReDATA Curation Report to: {out_path}")
        log.info(f"Saving as : (unknown)")
        urlretrieve(report_url, out_file)
        permissions.curation(out_path)
    else:
        log.info(f"!!!! ReDATA Curation Report exists in {out_path} !!!!")
        log.info("!!!! Will not override !!!!")
Example #9
0
    def __init__(self,
                 curation_dict=config_default_dict['curation'],
                 log=None):
        """
        Initialize curation-stage folder settings.

        :param curation_dict: Curation configuration dict
        :param log: logger.LogClass object. Default is stdout via python logging
        """

        # Default to stdout logging when no logger is supplied
        self.log = log_stdout() if log is None else log

        self.root_directory_main = curation_dict[curation_dict['parent_dir']]
        self.todo_folder = curation_dict['folder_todo']
        self.underreview_folder = curation_dict['folder_underreview']
        self.reviewed_folder = curation_dict['folder_reviewed']
        self.published_folder = curation_dict['folder_published']
        self.rejected_folder = curation_dict['folder_rejected']

        # Ordered curation workflow stages (rejected intentionally excluded)
        self.stage_list = [
            self.todo_folder, self.underreview_folder, self.reviewed_folder,
            self.published_folder
        ]
Example #10
0
def walkthrough(data_path, ignore='', log=None):
    """
    Purpose:
      Perform walkthrough to find other README files

    :param data_path: path to DATA folder
    :param ignore: full path of default README.txt to ignore
    :param log: logger.LogClass object. Default is stdout via python logging
    """

    # Default to stdout logging; the original also had a redundant
    # `else: log = log` branch which is dropped here
    if log is None:
        log = log_stdout()

    for dir_path, dir_names, files in walk(data_path):
        for file in files:
            if 'README' in file.upper():  # case insensitive
                file_fullname = join(dir_path, file)
                if file_fullname != ignore:
                    log.info(f"File exists : {file_fullname}")
Example #11
0
    def __init__(self, config_dict=config_default_dict,
                 mc: move.MoveClass = None,
                 log=None, interactive=True):
        """
        Initialize Qualtrics API settings and logging.

        :param config_dict: dict of dict with curation/qualtrics sections
        :param mc: move.MoveClass object. Created when not provided
        :param log: logger.LogClass object. Default is stdout via python logging
        :param interactive: Enable interactive prompting
        """

        self.interactive = interactive

        self.curation_dict = config_dict['curation']
        self.dict = config_dict['qualtrics']
        self.token = self.dict['token']
        self.data_center = self.dict['datacenter']

        self.baseurl = f"https://{self.data_center}.qualtrics.com/API/v3/"
        self.headers = {"X-API-TOKEN": self.token,
                        "Content-Type": "application/json"}
        self.survey_id = self.dict['survey_id']
        self.file_format = 'csv'

        self.readme_survey_id = self.dict['readme_survey_id']

        # Initialize Deposit Agreement info
        self.da_response_id: str = ''
        self.da_survey_id: str = ''

        # Logging: when the supplied logger has a FileHandler, record its
        # filename so other code can tell file logging is active
        self.file_logging = False
        if log is None:
            self.log = log_stdout()
        else:
            self.log = log
            for handler in log.handlers:
                if isinstance(handler, logging.FileHandler):
                    self.log_filename = handler.baseFilename
                    self.file_logging = True

        # Use the supplied MoveClass or construct one from the curation config
        if mc:
            self.mc = mc
        else:
            self.mc = move.MoveClass(curation_dict=self.curation_dict)
Example #12
0
    def __init__(self,
                 dn: DepositorName,
                 config_dict=config_default_dict,
                 update=False,
                 q: Qualtrics = None,
                 interactive=True,
                 log=None):
        """
        Initialize README construction settings for a deposit.

        :param dn: DepositorName object for the deposit
        :param config_dict: dict of dict with curation/qualtrics sections
        :param update: When True, update an existing README.txt instead of
               prompting to create a new one
        :param q: Qualtrics object. Created when not provided
        :param interactive: Prompt the user before creating the README
        :param log: logger.LogClass object. Default is stdout via python logging
        """
        self.config_dict = config_dict
        self.interactive = interactive

        self.dn = dn
        self.folderName = self.dn.folderName
        self.article_id = self.dn.article_id
        self.article_dict = self.dn.curation_dict

        # Default to stdout logging when no logger is supplied
        if log is None:
            self.log = log_stdout()
        else:
            self.log = log

        self.log.info("")
        if not update:
            self.log.info("** STARTING README.txt CONSTRUCTION **")
            if self.interactive:
                self.log.info("PROMPT: Do you wish to create a README file?")
                self.user_response = input(
                    "PROMPT: Type 'Yes'/'yes'. Anything else will exit : "
                ).lower()
                self.log.info(f"RESPONSE: {self.user_response}")
            else:
                self.log.info(
                    "Interactive mode disabled. Always creating README.txt")
                self.user_response = 'yes'
        else:
            self.log.info("** UPDATING README.txt **")
            self.user_response = 'yes'

        # Stop initialization early when the user declined
        if self.user_response != 'yes':
            return

        self.curation_dict = self.config_dict['curation']
        self.root_directory_main = self.curation_dict[
            self.curation_dict['parent_dir']]

        # Always obtain current data curation stage
        self.mc = move.MoveClass(curation_dict=self.curation_dict)
        self.current_stage = self.mc.get_source_stage(self.folderName)
        self.log.info(f"Current stage: {self.current_stage}")
        self.root_directory = join(self.root_directory_main,
                                   self.current_stage)

        # Paths
        self.folder_path = join(self.root_directory, self.folderName)
        self.metadata_path = join(
            self.folder_path,
            self.curation_dict['folder_metadata'])  # METADATA
        self.data_path = join(self.folder_path,
                              self.curation_dict['folder_copy_data'])  # DATA
        self.original_data_path = join(
            self.folder_path,
            self.curation_dict['folder_data'])  # ORIGINAL_DATA

        # This is the full path of the final README.txt file for creation
        self.readme_file_path = join(self.data_path, 'README.txt')

        # Symlink template name in METADATA
        self.default_readme_file = self.curation_dict['readme_template']

        # Use the supplied Qualtrics object or construct one
        if q:
            self.q = q
        else:
            self.q = Qualtrics(config_dict=self.config_dict,
                               mc=self.mc,
                               interactive=interactive,
                               log=self.log)

        # Retrieve Figshare metadata for jinja template engine
        self.figshare_readme_dict = self.retrieve_article_metadata()

        # Retrieve Qualtrics README information for jinja template engine
        self.qualtrics_readme_dict = self.retrieve_qualtrics_readme()

        # Retrieve list of README files provided by user
        self.README_files = self.get_readme_files()

        try:
            # Define template_source
            self.template_source = self.check_for_readme()

            if self.template_source == 'default':
                self.readme_template = self.select_template()
            else:
                self.readme_template = 'user_readme_template.md'

            # Save copy of template in DATA as README_template.md
            self.save_template()

            # Import README template as jinja2 template
            self.jinja_template = self.import_template()
        except SystemError:
            # More than one user-provided README was found; flag and continue
            self.template_source = 'unknown'
            self.log.warning("More than one README files found!")
Example #13
0
    def __init__(self,
                 article_id,
                 log=None,
                 config_dict=config_default_dict,
                 metadata_only=False):
        """
        Set up the prerequisite curation workflow for a deposit.

        :param article_id: Figshare article ID
        :param log: logger.LogClass object. Default is stdout via python logging
        :param config_dict: dict of dict with hierarchy of sections
               (figshare, curation, qualtrics) followed by options
        :param metadata_only: When True, only downloads the item metadata

        :raises SystemError: When the deposit has been archived (closed)
        """

        # If log is not defined, then output log to stdout
        if log is None:
            self.log = log_stdout()
        else:
            self.log = log

        self.mc = move.MoveClass(curation_dict=config_dict['curation'],
                                 log=self.log)

        self.root_directory = join(self.mc.root_directory_main,
                                   self.mc.todo_folder)

        self.article_id = article_id

        self.curation_dict = config_dict['curation']
        self.figshare_dict = config_dict['figshare']

        self.fs = Figshare(token=self.figshare_dict['token'],
                           private=True,
                           stage=self.figshare_dict['stage'])
        self.fs_admin = FigshareInstituteAdmin(**self.figshare_dict,
                                               log=self.log)

        self.dn = DepositorName(self.article_id, self.fs_admin, log=self.log)

        # Sub-folders for data curation workflow
        self.data_directory = join(self.dn.folderName,
                                   self.curation_dict['folder_data'])
        self.copy_data_directory = join(self.dn.folderName,
                                        self.curation_dict['folder_copy_data'])
        self.metadata_directory = join(self.dn.folderName,
                                       self.curation_dict['folder_metadata'])

        self.metadata_only = metadata_only

        # Check if deposit is not archived (e.g., deleted by user, us, etc)
        if self.dn.curation_dict['status'] == 'closed':
            self.log.warning(
                "This deposit was archived for one of many reasons!")
            self.log.info(f"resolution_comment metadata info: "
                          f"'{self.dn.curation_dict['resolution_comment']}'")
            self.log.warning("Stopping data curation for this deposit")
            raise SystemError

        # Check if dataset has been retrieved
        try:
            source_stage = self.mc.get_source_stage(self.dn.folderName,
                                                    verbose=False)
            # warning() instead of the deprecated warn()
            self.log.warning(
                f"Curation folder exists in {source_stage}. Will not retrieve!"
            )
            self.new_set = False
        except FileNotFoundError:
            self.new_set = True
            # Create folders
            self.make_folders()
            self.write_curation_metadata()
Example #14
0
def workflow(article_id,
             browser=True,
             log=None,
             config_dict=config_default_dict,
             metadata_only=False):
    """
    Purpose:
      This function follows our initial set-up to:
       1. Retrieve the data for a given deposit
       2. Set permissions and ownership (the latter needs to be tested and performed)
       3. Download curatorial review report
       4. Download Qualtrics Deposit Agreement form
       5. Check the README file

    :param article_id: str or int, Figshare article id
    :param browser: bool indicates opening a web browser for Qualtrics survey. Default: True
    :param log: logger.LogClass object. Default is stdout via python logging
    :param config_dict: dict of dict with hierarchy of sections
           (figshare, curation, qualtrics) follow by options
    :param metadata_only: When True, only downloads the item metadata.
    """

    # If log is not defined, then output log to stdout
    if log is None:
        log = log_stdout()

    try:
        pw = PrerequisiteWorkflow(article_id,
                                  log=log,
                                  config_dict=config_dict,
                                  metadata_only=metadata_only)
    except SystemError:
        # Deposit was archived/closed; nothing to curate
        return

    # Perform prerequisite workflow if dataset is entirely new
    if pw.new_set:
        # Check if a DOI is reserved. If not, reserve DOI
        pw.reserve_doi()

        # Retrieve data and place in 1.ToDo curation folder
        pw.download_data()

        # Download curation report
        pw.download_report()

        # Download Qualtrics deposit agreement form
        curation_dict = config_dict['curation']
        out_path = join(
            curation_dict[curation_dict['parent_dir']],
            curation_dict['folder_todo'],
            pw.dn.folderName,
            curation_dict['folder_ual_rdm'],
        )
        log.debug(f"out_path: {out_path}")
        q = Qualtrics(config_dict=config_dict, log=log)
        q.retrieve_deposit_agreement(pw.dn, out_path=out_path, browser=browser)

        # Check for README file and create one if it does not exist
        rc = ReadmeClass(pw.dn, log=log, config_dict=config_dict, q=q)
        try:
            rc.main()

            # Move to next curation stage, 2.UnderReview curation folder
            if rc.template_source != 'unknown':
                log.info(
                    "PROMPT: Do you wish to move deposit to the next curation stage?"
                )
                user_response = input(
                    "PROMPT: Type 'Yes'/'yes'. Anything else will skip : ")
                log.info(f"RESPONSE: {user_response}")
                if user_response.lower() == 'yes':
                    pw.move_to_next()
                else:
                    log.info("Skipping move ...")
        except SystemExit as msg:
            # README construction bailed out; advise the follow-up command
            log.warning(msg)
            log.info(" > To construct, run the `update_readme` command")
Example #15
0
def download_files(article_id,
                   fs,
                   root_directory=None,
                   data_directory=None,
                   metadata_directory=None,
                   log=None,
                   metadata_only=False):
    """
    Purpose:
      Retrieve data for a Figshare deposit following data curation workflow

    :param article_id: Figshare article ID (int)
    :param fs: Figshare object
    :param root_directory: Root path for curation workflow (str)
    :param data_directory: Relative folder path for primary location of data (str)
    :param metadata_directory: Relative folder path for primary location of metadata (str)
    :param log: logger.LogClass object. Default is stdout via python logging
    :param metadata_only: bool indicates whether to retrieve metadata only.
           Default: False. If set, no files are downloaded
    """

    if log is None:
        log = log_stdout()

    log.info("")
    if metadata_only:
        log.info(f"** NO FILE RETRIEVAL: metadata_only={metadata_only} **")
    else:
        log.info("** DOWNLOADING DATA **")

    if root_directory is None:
        root_directory = os.getcwd()

    # Retrieve article information
    # article_details = fs.get_article_details(article_id)

    file_list = fs.list_files(article_id)
    n_files = len(file_list)

    if not data_directory:
        dir_path = os.path.join(root_directory, f"figshare_{article_id}/")
    else:
        dir_path = os.path.join(root_directory, data_directory)

    os.makedirs(dir_path, exist_ok=True)  # This might require Python >=3.2
    permissions.curation(dir_path)

    log.info(f"Total number of files: {n_files}")

    out_file_prefix = f"file_list_original_{article_id}"
    save_metadata(file_list,
                  out_file_prefix,
                  root_directory=root_directory,
                  metadata_directory=metadata_directory,
                  save_csv=True,
                  log=log)

    if not metadata_only:
        for n, file_dict in enumerate(file_list):
            log.info(f"Retrieving {n+1} of {n_files} : "
                     f"{file_dict['name']} ({file_dict['size']})")
            log.info(f"URL: {file_dict['download_url']}")
            filename = os.path.join(dir_path, file_dict['name'])
            retrieve_cnt = 0
            checksum_flag = False
            if not exists(filename):
                # Retry the download until the MD5 checksum passes or the
                # attempt budget (N_TRIES_MD5) is exhausted
                while retrieve_cnt < N_TRIES_MD5:
                    log.info(f"Retrieval attempt #{retrieve_cnt + 1}")
                    try:
                        private_file_retrieve(file_dict['download_url'],
                                              filename=filename,
                                              token=fs.token,
                                              log=log)
                        log.info("Download successful!")
                        retrieve_cnt += 1
                    except (HTTPError, IOError):
                        retrieve_cnt += 1

                    # Perform checksum
                    if exists(filename):
                        if not file_dict['is_link_only']:
                            checksum_flag = check_md5(
                                filename, file_dict['supplied_md5'])
                            if checksum_flag:
                                break
                        else:
                            log.info(
                                "Not performing checksum on linked-only record"
                            )
                            break
                else:
                    # while/else: runs only when no break occurred, i.e.
                    # every attempt was used without a passing checksum
                    if not checksum_flag:
                        log.warning("File retrieval unsuccessful! "
                                    f"Aborted after {N_TRIES_MD5} tries")
            else:
                log.info("File exists! Not overwriting!")

    # Change permissions on folders and files
    # permissions.curation(dir_path)
    permissions.curation(dir_path, mode=0o555)  # read and execute only