def check_md5(filename: str, figshare_checksum: str,
              log: Logger = log_stdout()) -> bool:
    """
    Perform checksum after file retrieval against Figshare's computation

    :param filename: Full path of file on server
    :param figshare_checksum: MD5 checksum string from supplied_md5 metadata
    :param log: logger.LogClass object. Default is stdout via python logging

    :return: ``True`` if passed, ``False`` if not a match
    """

    log.info("Performing MD5 checksum ...")

    checksum_pass = False

    hash_md5 = hashlib.md5()
    with open(filename, "rb") as f:
        # Handle large files by chunking
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)

    checksum_compute = hash_md5.hexdigest()
    if checksum_compute == figshare_checksum:
        checksum_pass = True
        log.info("MD5 Checksum passed!!!")
    else:
        log.warning("Checksum failed!!!")
        log.info(f"MD5 Result: {checksum_compute}")
        log.info(f"Expectation: {figshare_checksum}")

    return checksum_pass
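# Usage sketch (illustrative only): the path and checksum below are hypothetical
# placeholders, not values from this codebase. In the curation workflow, the
# expected value comes from the `supplied_md5` field of Figshare's file metadata.
def _example_check_md5():
    expected_md5 = "9e107d9d372bb6826bd81d3542a419d6"  # placeholder supplied_md5
    if not check_md5("/tmp/figshare_1234567/archive.zip", expected_md5):
        raise IOError("MD5 mismatch; consider re-downloading the file")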
def __init__(self, article_id, fs_admin, curation_id=None, verbose=True,
             log=None):
    self.article_id = article_id
    self.fs_admin = fs_admin
    self.verbose = verbose

    if isinstance(log, type(None)):
        self.log = log_stdout()
    else:
        self.log = log

    # Retrieves specific information for article (includes authors)
    if isinstance(curation_id, type(None)):
        self.curation_id = self.get_curation_id()
    else:
        self.curation_id = curation_id
    self.curation_dict = self.get_curation_dict()

    # Set version number
    if self.curation_dict['status'] == 'approved':
        self.version_no = self.curation_dict['version']
    else:
        self.version_no = self.curation_dict['version'] + 1

    self.name_dict = self.get_name_dict()
    self.folderName = self.get_folder_name()
def __init__(self, token: str, stage: bool = False,
             admin_filter: list = None,
             log: Logger = log_stdout()):
    self.token = token
    self.stage = stage

    if not self.stage:
        self.baseurl = "https://api.figshare.com/v2/account/"
    else:
        self.baseurl = "https://api.figsh.com/v2/account/"

    self.baseurl_institute = self.baseurl + "institution/"

    self.headers = {'Content-Type': 'application/json'}
    if self.token:
        self.headers['Authorization'] = f'token {self.token}'

    self.admin_filter = admin_filter
    if admin_filter is not None:
        self.ignore_admin = True
    else:
        self.ignore_admin = False

    self.log = log
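# Usage sketch (illustrative only; the token is a placeholder). In the curation
# workflow, FigshareInstituteAdmin is normally constructed from
# config_dict['figshare'] (see PrerequisiteWorkflow below).
def _example_figshare_institute_admin():
    fs_admin = FigshareInstituteAdmin(token="abcdef0123456789", stage=False)
    print(fs_admin.baseurl_institute)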
def private_file_retrieve(url, filename=None, token=None, log=None):
    """
    Purpose:
      Custom Request to privately retrieve a file with a token.
      This was built off of the figshare Python code, but urlretrieve
      does not handle providing a token in the header.

    :param url: Full URL (str)
    :param filename: Full filename for file to be written (str)
    :param token: API token (str)
    :param log: logger.LogClass object. Default is stdout via python logging
    """

    if isinstance(log, type(None)):
        log = log_stdout()

    headers = dict()

    if token:
        headers['Authorization'] = f'token {token}'

    try:
        h = requests.head(url, headers=headers)
        h.raise_for_status()

        # Chunk read and write with stream option and copyfileobj
        with requests.get(url, stream=True, headers=headers) as r:
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
    except (HTTPError, IOError) as error:
        log.warning(error)
        raise HTTPError(error)
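# Usage sketch (illustrative only; the URL, destination path, and token are
# hypothetical placeholders). In practice the download URL and API token come
# from Figshare file metadata and the curation configuration, respectively.
def _example_private_file_retrieve():
    private_file_retrieve(
        "https://api.figshare.com/v2/file/download/00000000",  # placeholder URL
        filename="/tmp/archive.zip",                           # placeholder destination
        token="abcdef0123456789",                              # placeholder API token
    )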
def save_metadata(json_response: Union[list, dict],
                  out_file_prefix: str,
                  metadata_source: str = 'CURATION',
                  root_directory: str = '',
                  metadata_directory: str = '',
                  save_csv: bool = False,
                  overwrite: bool = False,
                  log=None):
    """
    Write metadata contents to JSON and CSV file

    :param json_response: Content in list or dict
    :param out_file_prefix: Filename prefix. Appends .json and .csv
    :param metadata_source: Source of metadata. Default: 'CURATION'
    :param root_directory: Full path containing the working directory
    :param metadata_directory: Metadata path
    :param save_csv: Save a CSV file. Default: False
    :param overwrite: Overwrite file if it exists. Default: False
    :param log: LogClass or logging object. Default: log_stdout()
    """

    if log is None:
        log = log_stdout()

    log.debug("starting ...")

    log.info("")
    log.info(f"** SAVING {metadata_source} METADATA **")

    if not root_directory:
        root_directory = os.getcwd()

    metadata_path = os.path.join(root_directory, metadata_directory)
    out_file_prefix = f"{metadata_path}/{out_file_prefix}"

    # Write JSON file
    json_out_file = f"{out_file_prefix}.json"
    if not os.path.exists(json_out_file):
        write_json(json_out_file, json_response, log)
    else:
        log.info(f"File exists: {json_out_file}")
        if overwrite:
            log.info("Overwriting!")
            write_json(json_out_file, json_response, log)

    # Write CSV file
    if save_csv:
        df = pd.DataFrame.from_dict(json_response, orient='columns')

        csv_out_file = f"{out_file_prefix}.csv"
        if not os.path.exists(csv_out_file):
            log.info(f"Writing: {csv_out_file}")
            df.to_csv(csv_out_file, index=False)
        else:
            log.info(f"File exists: {csv_out_file}")
            if overwrite:
                log.info("Overwriting!")
                df.to_csv(csv_out_file, index=False)

    log.debug("finished.")
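# Usage sketch (illustrative only; the records and directories are hypothetical
# placeholders). A list of dicts, such as a Figshare file listing, is written to
# <root_directory>/<metadata_directory>/<prefix>.json and, with save_csv=True,
# a matching CSV file.
def _example_save_metadata():
    file_list = [{"name": "data.csv", "size": 1024},
                 {"name": "README.txt", "size": 2048}]  # placeholder records
    save_metadata(file_list, "file_list_original_1234567",
                  root_directory="/tmp/curation",
                  metadata_directory="METADATA",
                  save_csv=True, overwrite=False)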
def test_log_stdout():
    log0 = logger.log_stdout()

    log0.info("Print INFO test")
    log0.debug("Print DEBUG test")
    log0.warning("Print WARNING test")

    assert isinstance(log0, logging.Logger)
def tiny_url(url: str, alias=None, log=None) -> str:
    """
    Purpose:
      Generate a TinyURL

    :param url: str. HTTP URL
    :param alias: str. Alias if desired
    :param log: LogClass or logger object

    :return: response_data: str containing the shortened TinyURL
    """

    if log is None:
        log = log_stdout()

    endpoint = "http://tinyurl.com/api-create.php"

    encoded_url = urlencode({'url': url}, safe=url_safe, quote_via=quote)
    log.debug(f"encoded_url: {encoded_url}")

    get_url = f"{endpoint}?{encoded_url}"
    log.debug(f"get_url : {get_url}")

    params = dict()
    if alias is not None:
        params = {'alias': alias}

        expected_url = f"https://tinyurl.com/{alias}"
        expected_response = requests.get(expected_url)
        if expected_response.status_code == 200:
            log.info("TinyURL link already exists!")
            expected_request_url = f"{url}&alias={alias}"
            if expected_response.url != expected_request_url:
                log.warning("Input URL changed!")
                log.debug(f"Previous URL: {expected_response.url}")
                log.debug(f"New URL: {expected_request_url}")
                log.warning("Creating new TinyURL")
                response = requests.get(get_url)
                response_data = response.text
            else:
                response_data = expected_url
            return response_data

    log.info("TinyURL link does not exist. Creating!")
    # GET still works if the TinyURL alias exists, but points to the same URL
    response = requests.get(get_url, params=params)
    try:
        response.raise_for_status()
        response_data = response.text
    except HTTPError as error:
        log.warning(f"Caught an HTTPError: {error}")
        log.warning(f"Body:\n{response.text}")
        raise HTTPError(error)

    return response_data
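# Usage sketch (illustrative only; the long URL and alias are hypothetical
# placeholders). Assumes the module-level `url_safe` constant used by the
# urlencode call above is defined.
def _example_tiny_url():
    short_link = tiny_url("https://example.org/some/very/long/landing/page",
                          alias="redata-example")  # placeholder alias
    print(short_link)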
def review_report(depositor_name='',
                  curation_dict=config_default_dict['curation'],
                  log=None):
    """
    Purpose:
      Retrieve Curation Review Report and save on curation server
    """

    if isinstance(log, type(None)):
        log = log_stdout()

    log.info("")
    log.info("** CREATING CURATION REVIEW REPORT **")

    root_directory_main = curation_dict[curation_dict['parent_dir']]
    todo_folder = curation_dict['folder_todo']
    folder_ual_rdm = curation_dict['folder_ual_rdm']
    report_url = curation_dict['report_url']

    staging_directory = join(root_directory_main, todo_folder)

    # Complete path to UAL_RDM folder
    out_path = join(staging_directory, depositor_name, folder_ual_rdm)

    if not exists(out_path):
        log.info(f"Creating folder : {out_path}")
        makedirs(out_path, mode=0o777, exist_ok=True)
    else:
        log.warning(f"!!!! Folder exists, not creating : {out_path}")

    # MS-Word document filename
    simplify_name = depositor_name.replace('/v', '_v')
    filename = f'ReDATA-DepositReview_{simplify_name}.docx'
    out_file = join(out_path, filename)

    # Write file
    if not exists(out_file):
        log.info(f"Saving ReDATA Curation Report to: {out_path}")
        log.info(f"Saving as : {filename}")
        urlretrieve(report_url, out_file)

        permissions.curation(out_path)
    else:
        log.info(f"!!!! ReDATA Curation Report exists in {out_path} !!!!")
        log.info("!!!! Will not overwrite !!!!")
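# Usage sketch (illustrative only; the depositor folder name is a hypothetical
# placeholder following the <depositor>/v<version> pattern). The curation settings
# default to config_default_dict['curation'].
def _example_review_report():
    review_report(depositor_name="jsmith/v1.0")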
def __init__(self, curation_dict=config_default_dict['curation'], log=None):
    if isinstance(log, type(None)):
        self.log = log_stdout()
    else:
        self.log = log

    self.root_directory_main = curation_dict[curation_dict['parent_dir']]

    self.todo_folder = curation_dict['folder_todo']
    self.underreview_folder = curation_dict['folder_underreview']
    self.reviewed_folder = curation_dict['folder_reviewed']
    self.published_folder = curation_dict['folder_published']
    self.rejected_folder = curation_dict['folder_rejected']

    self.stage_list = [
        self.todo_folder,
        self.underreview_folder,
        self.reviewed_folder,
        self.published_folder
    ]
def walkthrough(data_path, ignore='', log=None):
    """
    Purpose:
      Perform walkthrough to find other README files

    :param data_path: path to DATA folder
    :param ignore: full path of default README.txt to ignore
    :param log: logger.LogClass object. Default is stdout via python logging
    """

    if isinstance(log, type(None)):
        log = log_stdout()

    for dir_path, dir_names, files in walk(data_path):
        for file in files:
            if 'README' in file.upper():  # case insensitive
                file_fullname = join(dir_path, file)
                if file_fullname != ignore:
                    log.info(f"File exists : {file_fullname}")
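# Usage sketch (illustrative only; paths are hypothetical placeholders). Scans a
# DATA folder for additional README files while skipping the curation-provided
# README.txt.
def _example_walkthrough():
    walkthrough("/tmp/curation/1.ToDo/jsmith/v1.0/DATA",
                ignore="/tmp/curation/1.ToDo/jsmith/v1.0/DATA/README.txt")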
def __init__(self, config_dict=config_default_dict, mc: move.MoveClass = None,
             log=None, interactive=True):
    self.interactive = interactive

    self.curation_dict = config_dict['curation']

    self.dict = config_dict['qualtrics']
    self.token = self.dict['token']
    self.data_center = self.dict['datacenter']
    self.baseurl = f"https://{self.data_center}.qualtrics.com/API/v3/"
    self.headers = {"X-API-TOKEN": self.token,
                    "Content-Type": "application/json"}
    self.survey_id = self.dict['survey_id']
    self.file_format = 'csv'
    self.readme_survey_id = self.dict['readme_survey_id']

    # Initialize Deposit Agreement info
    self.da_response_id: str = ''
    self.da_survey_id: str = ''

    # Logging
    self.file_logging = False
    if isinstance(log, type(None)):
        self.log = log_stdout()
    else:
        self.log = log

        for handler in log.handlers:
            if isinstance(handler, logging.FileHandler):
                self.log_filename = handler.baseFilename
                self.file_logging = True

    if mc:
        self.mc = mc
    else:
        self.mc = move.MoveClass(curation_dict=self.curation_dict)
def __init__(self, dn: DepositorName, config_dict=config_default_dict,
             update=False, q: Qualtrics = None, interactive=True, log=None):
    self.config_dict = config_dict
    self.interactive = interactive

    self.dn = dn
    self.folderName = self.dn.folderName
    self.article_id = self.dn.article_id
    self.article_dict = self.dn.curation_dict

    if isinstance(log, type(None)):
        self.log = log_stdout()
    else:
        self.log = log

    self.log.info("")
    if not update:
        self.log.info("** STARTING README.txt CONSTRUCTION **")

        if self.interactive:
            self.log.info("PROMPT: Do you wish to create a README file?")
            self.user_response = input(
                "PROMPT: Type 'Yes'/'yes'. Anything else will exit : ").lower()
            self.log.info(f"RESPONSE: {self.user_response}")
        else:
            self.log.info("Interactive mode disabled. Always creating README.txt")
            self.user_response = 'yes'
    else:
        self.log.info("** UPDATING README.txt **")
        self.user_response = 'yes'

    if self.user_response != 'yes':
        return

    self.curation_dict = self.config_dict['curation']
    self.root_directory_main = self.curation_dict[
        self.curation_dict['parent_dir']]

    # Always obtain current data curation stage
    self.mc = move.MoveClass(curation_dict=self.curation_dict)
    self.current_stage = self.mc.get_source_stage(self.folderName)
    self.log.info(f"Current stage: {self.current_stage}")
    self.root_directory = join(self.root_directory_main, self.current_stage)

    # Paths
    self.folder_path = join(self.root_directory, self.folderName)
    self.metadata_path = join(
        self.folder_path, self.curation_dict['folder_metadata'])  # METADATA
    self.data_path = join(
        self.folder_path, self.curation_dict['folder_copy_data'])  # DATA
    self.original_data_path = join(
        self.folder_path, self.curation_dict['folder_data'])  # ORIGINAL_DATA

    # This is the full path of the final README.txt file for creation
    self.readme_file_path = join(self.data_path, 'README.txt')

    # Symlink template name in METADATA
    self.default_readme_file = self.curation_dict['readme_template']

    # Use or initialize Qualtrics object
    if q:
        self.q = q
    else:
        self.q = Qualtrics(config_dict=self.config_dict, mc=self.mc,
                           interactive=interactive, log=self.log)

    # Retrieve Figshare metadata for jinja template engine
    self.figshare_readme_dict = self.retrieve_article_metadata()

    # Retrieve Qualtrics README information for jinja template engine
    self.qualtrics_readme_dict = self.retrieve_qualtrics_readme()

    # Retrieve list of README files provided by user
    self.README_files = self.get_readme_files()

    try:
        # Define template_source
        self.template_source = self.check_for_readme()
        if self.template_source == 'default':
            self.readme_template = self.select_template()
        else:
            self.readme_template = 'user_readme_template.md'

        # Save copy of template in DATA as README_template.md
        self.save_template()

        # Import README template as jinja2 template
        self.jinja_template = self.import_template()
    except SystemError:
        self.template_source = 'unknown'
        self.log.warning("More than one README file found!")
def __init__(self, article_id, log=None, config_dict=config_default_dict,
             metadata_only=False):
    # If log is not defined, then output log to stdout
    if isinstance(log, type(None)):
        self.log = log_stdout()
    else:
        self.log = log

    self.mc = move.MoveClass(curation_dict=config_dict['curation'],
                             log=self.log)

    self.root_directory = join(self.mc.root_directory_main,
                               self.mc.todo_folder)

    self.article_id = article_id

    self.curation_dict = config_dict['curation']

    self.figshare_dict = config_dict['figshare']
    self.fs = Figshare(token=self.figshare_dict['token'], private=True,
                       stage=self.figshare_dict['stage'])
    self.fs_admin = FigshareInstituteAdmin(**self.figshare_dict,
                                           log=self.log)

    self.dn = DepositorName(self.article_id, self.fs_admin, log=self.log)

    # Sub-folders for data curation workflow
    self.data_directory = join(self.dn.folderName,
                               self.curation_dict['folder_data'])
    self.copy_data_directory = join(self.dn.folderName,
                                    self.curation_dict['folder_copy_data'])
    self.metadata_directory = join(self.dn.folderName,
                                   self.curation_dict['folder_metadata'])

    self.metadata_only = metadata_only

    # Check if deposit is not archived (e.g., deleted by user, us, etc.)
    if self.dn.curation_dict['status'] == 'closed':
        self.log.warning("This deposit was archived for one of many reasons!")
        self.log.info(f"resolution_comment metadata info: "
                      f"'{self.dn.curation_dict['resolution_comment']}'")
        self.log.warning("Stopping data curation for this deposit")
        raise SystemError

    # Check if dataset has been retrieved
    try:
        source_stage = self.mc.get_source_stage(self.dn.folderName,
                                                verbose=False)
        self.log.warning(f"Curation folder exists in {source_stage}. "
                         f"Will not retrieve!")
        self.new_set = False
    except FileNotFoundError:
        self.new_set = True

    # Create folders
    self.make_folders()

    self.write_curation_metadata()
def workflow(article_id, browser=True, log=None,
             config_dict=config_default_dict, metadata_only=False):
    """
    Purpose:
      This function follows our initial set-up to:
       1. Retrieve the data for a given deposit
       2. Set permissions and ownership (the latter needs to be tested and performed)
       3. Download curatorial review report
       4. Download Qualtrics Deposit Agreement form
       5. Check the README file

    :param article_id: str or int, Figshare article id
    :param browser: bool indicates opening a web browser for Qualtrics survey.
                    Default: True
    :param log: logger.LogClass object. Default is stdout via python logging
    :param config_dict: dict of dict with hierarchy of sections
                        (figshare, curation, qualtrics) followed by options
    :param metadata_only: When True, only downloads the item metadata
    """

    # If log is not defined, then output log to stdout
    if isinstance(log, type(None)):
        log = log_stdout()

    try:
        pw = PrerequisiteWorkflow(article_id, log=log, config_dict=config_dict,
                                  metadata_only=metadata_only)
    except SystemError:
        return

    # Perform prerequisite workflow if dataset is entirely new
    if pw.new_set:
        # Check if a DOI is reserved. If not, reserve DOI
        pw.reserve_doi()

        # Retrieve data and place in 1.ToDo curation folder
        pw.download_data()

        # Download curation report
        pw.download_report()

        # Download Qualtrics deposit agreement form
        curation_dict = config_dict['curation']
        out_path = join(
            curation_dict[curation_dict['parent_dir']],
            curation_dict['folder_todo'],
            pw.dn.folderName,
            curation_dict['folder_ual_rdm'],
        )
        log.debug(f"out_path: {out_path}")

        q = Qualtrics(config_dict=config_dict, log=log)
        q.retrieve_deposit_agreement(pw.dn, out_path=out_path, browser=browser)

        # Check for README file and create one if it does not exist
        rc = ReadmeClass(pw.dn, log=log, config_dict=config_dict, q=q)
        try:
            rc.main()

            # Move to next curation stage, 2.UnderReview curation folder
            if rc.template_source != 'unknown':
                log.info("PROMPT: Do you wish to move deposit to the next curation stage?")
                user_response = input("PROMPT: Type 'Yes'/'yes'. Anything else will skip : ")
                log.info(f"RESPONSE: {user_response}")

                if user_response.lower() == 'yes':
                    pw.move_to_next()
                else:
                    log.info("Skipping move ...")
        except SystemExit as msg:
            log.warning(msg)
            log.info(" > To construct, run the `update_readme` command")
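# Usage sketch (illustrative only; the article ID is a placeholder). Runs the full
# retrieval workflow without opening a browser for the Qualtrics survey.
def _example_workflow():
    workflow(1234567, browser=False)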
def download_files(article_id, fs, root_directory=None, data_directory=None,
                   metadata_directory=None, log=None, metadata_only=False):
    """
    Purpose:
      Retrieve data for a Figshare deposit following data curation workflow

    :param article_id: Figshare article ID (int)
    :param fs: Figshare object
    :param root_directory: Root path for curation workflow (str)
    :param data_directory: Relative folder path for primary location of data (str)
    :param metadata_directory: Relative folder path for primary location of metadata (str)
    :param log: logger.LogClass object. Default is stdout via python logging
    :param metadata_only: bool indicates whether to retrieve only the metadata.
                          Default: False. If set, no files are downloaded
    """

    if isinstance(log, type(None)):
        log = log_stdout()

    log.info("")
    if metadata_only:
        log.info(f"** NO FILE RETRIEVAL: metadata_only={metadata_only} **")
    else:
        log.info("** DOWNLOADING DATA **")

    if root_directory is None:
        root_directory = os.getcwd()

    # Retrieve article information
    # article_details = fs.get_article_details(article_id)

    file_list = fs.list_files(article_id)
    n_files = len(file_list)

    if not data_directory:
        dir_path = os.path.join(root_directory, f"figshare_{article_id}/")
    else:
        dir_path = os.path.join(root_directory, data_directory)

    os.makedirs(dir_path, exist_ok=True)  # This might require Python >=3.2
    permissions.curation(dir_path)

    log.info(f"Total number of files: {n_files}")

    out_file_prefix = f"file_list_original_{article_id}"
    save_metadata(file_list, out_file_prefix,
                  root_directory=root_directory,
                  metadata_directory=metadata_directory,
                  save_csv=True, log=log)

    if not metadata_only:
        for n, file_dict in zip(range(n_files), file_list):
            log.info(f"Retrieving {n+1} of {n_files} : "
                     f"{file_dict['name']} ({file_dict['size']})")
            log.info(f"URL: {file_dict['download_url']}")
            filename = os.path.join(dir_path, file_dict['name'])
            retrieve_cnt = 0
            checksum_flag = False
            if not exists(filename):
                while retrieve_cnt < N_TRIES_MD5:
                    log.info(f"Retrieval attempt #{retrieve_cnt + 1}")
                    try:
                        private_file_retrieve(file_dict['download_url'],
                                              filename=filename,
                                              token=fs.token, log=log)
                        log.info("Download successful!")
                        retrieve_cnt += 1
                    except (HTTPError, IOError):
                        retrieve_cnt += 1

                    # Perform checksum
                    if exists(filename):
                        if not file_dict['is_link_only']:
                            checksum_flag = check_md5(filename,
                                                      file_dict['supplied_md5'])
                            if checksum_flag:
                                break
                        else:
                            log.info("Not performing checksum on linked-only record")
                            break
                else:
                    if not checksum_flag:
                        log.warning("File retrieval unsuccessful! "
                                    f"Aborted after {N_TRIES_MD5} tries")
            else:
                log.info("File exists! Not overwriting!")

    # Change permissions on folders and files
    # permissions.curation(dir_path)
    permissions.curation(dir_path, mode=0o555)  # read and execute only
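# Usage sketch (illustrative only; the token, article ID, and directories are
# hypothetical placeholders). The Figshare object is constructed the same way
# PrerequisiteWorkflow does above, from the figshare configuration.
def _example_download_files():
    fs = Figshare(token="abcdef0123456789", private=True, stage=False)
    download_files(1234567, fs,
                   root_directory="/tmp/curation/1.ToDo",
                   data_directory="jsmith/v1.0/ORIGINAL_DATA",
                   metadata_directory="jsmith/v1.0/METADATA",
                   metadata_only=False)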