def test_date_validation(self):
    """Check that validate_date accepts valid dates and rejects invalid ones."""
    # A date passed without an explicit date_format must be accepted.
    plain_result = validate_date("2012-12-12")
    self.assertTrue(plain_result)

    # A full timestamp is accepted when the matching format is supplied.
    timestamp_result = validate_date("2012-12-12 12:12:12",
                                     date_format="%Y-%m-%d %H:%M:%S")
    self.assertTrue(timestamp_result)

    # Malformed or out-of-range dates must raise ValueError.
    for bad_date in ("201222-2-12", "202-22-12", "2012-22-12", "2012-02-42"):
        self.assertRaises(ValueError, validate_date, bad_date)
def get_records_to_harvest(parameters): """ Get APSRecord to harvest. Using the given parameters dict (from bst_apsharvest), we check how to get the list of records to process. Returns a tuple of (record_list, harvest_from_date, date_checked) where record_list is the list of APSRecord instances, harvest_from_date is the decided date to harvest from and date_checked is the datetime when the harvest was initiated. """ # This is the list of APSRecord objects to be harvested. final_record_list = APSRecordList() new_harvest_date = None harvest_from_date = None harvest_until_date = None if parameters.get("input_file"): # We get input from file with open(parameters.get("input_file")) as fd: for line in fd.readlines(): doi = line.strip() if not doi: continue final_record_list.append(APSRecord(doi=doi)) if parameters.get("threshold_date"): # Input from user. Validate date try: validate_date(parameters.get("threshold_date")) except ValueError, e: write_message("Error parsing from_date, use (YYYY-MM-DD): %s" % (str(e),), stream=sys.stderr) raise
def test_date_validation(self):
    """Exercise validate_date with well-formed and malformed date strings."""
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    # Well-formed input: validate_date is expected to return a truthy value.
    self.assertTrue(validate_date("2012-12-12"))
    self.assertTrue(validate_date("2012-12-12 12:12:12",
                                  date_format=timestamp_format))

    # Malformed input: each of these strings has to trigger a ValueError.
    rejected_inputs = [
        "201222-2-12",
        "202-22-12",
        "2012-22-12",
        "2012-02-42",
    ]
    for candidate in rejected_inputs:
        self.assertRaises(ValueError, validate_date, candidate)
def get_records_to_harvest(parameters): """ Get APSRecord to harvest. Using the given parameters dict (from bst_apsharvest), we check how to get the list of records to process. Returns a tuple of (record_list, date_checked) where record_list is the list of APSRecord instances and date_checked is the datetime when checking was done. """ # This is the list of APSRecord objects to be harvested. final_record_list = APSRecordList() new_harvest_date = None if parameters.get("threshold_date"): # Input from user. Validate date try: validate_date(parameters.get("threshold_date")) except ValueError, e: write_message("Error parsing from_date, use (YYYY-MM-DD): %s" % (str(e),), stream=sys.stderr) raise
def get_records_to_harvest(parameters): """ Get APSRecord to harvest. Using the given parameters dict (from bst_apsharvest), we check how to get the list of records to process. Returns a tuple of (record_list, date_checked) where record_list is the list of APSRecord instances and date_checked is the datetime when checking was done. """ # This is the list of APSRecord objects to be harvested. final_record_list = APSRecordList() new_harvest_date = None if parameters.get("threshold_date"): # Input from user. Validate date try: validate_date(parameters.get("threshold_date")) except ValueError, e: write_message("Error parsing from_date, use (YYYY-MM-DD): %s" % (str(e), ), stream=sys.stderr) raise
def bst_apsharvest(dois="", recids="", query="", records="", new_mode="email", update_mode="email", from_date="", until_date=None, metadata="yes", fulltext="yes", hidden="yes", match="no"): """ Task to download APS metadata + fulltext given a list of arguments. Operates in two ways: 1. Harvesting of new/updated metadata+fulltext from APS via REST API This means that new records are being looked for at APS servers. Active when from_date and until_date is given, in addition when a DOI not already in the system is given. If the value "last" is given to from_date the harvester will harvest any new records since last run. If match is set to "yes" the records harvested will be matched against the database and split into "new" and "updated" records. 2. Attachment of fulltext only from APS for existing records When the records to be processed already exists in the system, the task only harvests the fulltext's themselves and attaches them to the records. Examples: Get full update for existing records via record identifier: >>> bst_apsharvest(recids="13,513,333") Get full update for existing records via a search query and unhide fulltext: >>> bst_apsharvest(query="find j prstab", hidden="no") Get metadata only update for an existing doi: >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no") Get fulltext only update for a record and append to record: >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append") Get new records from APS, send update to holding pen and email new records >>> bst_apsharvest(from_date="last", update_mode="o") Get records from APS updated between given dates, insert new and correct >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04", new_mode="insert", update_mode="correct") @param dois: comma-separated list of DOIs to download fulltext/metadata for. @type dois: string @param recids: comma-separated list of recids of record containing a DOI to download fulltext for. 
@type recids: string @param query: an Invenio search query of records to download fulltext for. @type query: string @param records: get any records modified, created or both since last time in the database to download fulltext for, can be either: "new" - fetches all new records added "modified" - fetches all modified records added "both" - both of the above @type records: string @param new_mode: which mode should the fulltext files for new records be submitted in: "email" - does NOT run bibupload and sends an email instead. Default. "insert" - inserts the records into the database "append" - appends the fulltext to the existing attached files "correct" - corrects existing attached fulltext files, or adds new "replace" - replaces all attached files with new fulltext file The fulltext is appended by default to new records. @type new_mode: string @param update_mode: which mode should the fulltext files for updated records be submitted in: "email" - does NOT run bibupload and sends an email instead. Default. "insert" - inserts the records into the database "append" - appends the fulltext to the existing attached files "correct" - corrects existing attached fulltext files, or adds new "replace" - replaces all attached files with new fulltext file The fulltext is appended by default to new records. @type update_mode: string @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01 If the value is "last" it means to get records since last harvest. @type from_date: string @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01 @type until_date: string @param metadata: should the record have metadata attached? "yes" or "no" @type metadata: string @param fulltext: should the record have fulltext attached? "yes" or "no" @type fulltext: string @param hidden: should the fulltext be hidden when attached? "yes" or "no" @type hidden: string @param match: should a simple match with the database be done? "yes" or "no" @type match: string """ # This is the list of APSRecord objects to be harvested. 
final_record_list = APSRecordList() task_update_progress("Parsing input parameters") # Validate modes for mode in [new_mode, update_mode]: if mode not in ("append", "a", "correct", "c", "o", "replace", "r", "insert", "i", "email"): raise Exception("Warning: given upload mode '%s' is not valid." % (mode,)) # We hide fulltext by default if hidden.lower() == "no": hidden = False else: hidden = True # We attach fulltext by default if fulltext.lower() == "no": fulltext = False else: fulltext = True # We attach meta-data by default if metadata.lower() == "no": metadata = False else: metadata = True # We do not match records by default if match.lower() == "yes": match = True else: match = False if from_date: # We get records from APS directly new_harvest_date = None perpage = 100 # Are we harvesting from last time or a specific date? if from_date == "last": dummy, harvest_from_date = fetch_last_updated(name="apsharvest_api_download") # Keeping current time until completed harvest. new_harvest_date = datetime.datetime.now() else: # Input from user. Validate date try: harvest_from_date = validate_date(from_date) except ValueError, e: write_message("Error parsing from_date, use (YYYY-MM-DD): %s" % (str(e),), stream=sys.stderr) return 1 # Turn harvest_from_date back into a string (away from datetime object) harvest_from_date = harvest_from_date.strftime("%Y-%m-%d") status_message = "Checking for new records from APS from %s" % \ (harvest_from_date,) if until_date: # Input from user. Validate date try: validate_date(until_date) except ValueError, e: write_message("Error parsing until_date, use (YYYY-MM-DD): %s" % (str(e),), stream=sys.stderr) return 1 status_message += " until %s" % (until_date,)
raise if parameters.get("from_date"): # We get records from APS directly perpage = 100 # Are we harvesting from last time or a specific date? if parameters.get("from_date") == "last": dummy, harvest_from_date = fetch_last_updated(name="apsharvest_api_download") # Keeping current time until completed harvest. new_harvest_date = datetime.datetime.now() else: # Input from user. Validate date try: harvest_from_date = validate_date(parameters.get("from_date")) except ValueError, e: write_message("Error parsing from_date, use (YYYY-MM-DD): %s" % (str(e),), stream=sys.stderr) raise # If threshold is not given, set it to from_date - CFG_APSHARVEST_THRESHOLD_DAYS if not parameters.get("threshold_date"): new_threshold_date = harvest_from_date - datetime.timedelta(days=CFG_APSHARVEST_THRESHOLD_DAYS) parameters["threshold_date"] = new_threshold_date.strftime("%Y-%m-%d") write_message("Setting dynamic threshold date to %s." % ( parameters["threshold_date"], )) status_message = "Checking for new records from APS from %s" % \
def bst_apsharvest(dois="", recids="", query="", records="", new_mode="email", update_mode="email", from_date="", until_date=None, metadata="yes", fulltext="yes", hidden="yes", match="no", reportonly="no", threshold_date=None, devmode="no"): """ Task to download APS metadata + fulltext given a list of arguments. Operates in two ways: 1. Harvesting of new/updated metadata+fulltext from APS via REST API This means that new records are being looked for at APS servers. Active when from_date and until_date is given, in addition when a DOI not already in the system is given. If the value "last" is given to from_date the harvester will harvest any new records since last run. If match is set to "yes" the records harvested will be matched against the database and split into "new" and "updated" records. 2. Attachment of fulltext only from APS for existing records When the records to be processed already exists in the system, the task only harvests the fulltext's themselves and attaches them to the records. Examples: Get full update for existing records via record identifier: >>> bst_apsharvest(recids="13,513,333") Get full update for existing records via a search query and unhide fulltext: >>> bst_apsharvest(query="find j prstab", hidden="no") Get metadata only update for an existing doi: >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no") Get fulltext only update for a record and append to record: >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append") Get new records from APS, send update to holding pen and email new records >>> bst_apsharvest(from_date="last", update_mode="o") Get records from APS updated between given dates, insert new and correct >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04", new_mode="insert", update_mode="correct") @param dois: comma-separated list of DOIs to download fulltext/metadata for. 
@type dois: string @param recids: comma-separated list of recids of record containing a DOI to download fulltext for. @type recids: string @param query: an Invenio search query of records to download fulltext for. @type query: string @param records: get any records modified, created or both since last time in the database to download fulltext for, can be either: "new" - fetches all new records added "modified" - fetches all modified records added "both" - both of the above @type records: string @param new_mode: which mode should the fulltext files for new records be submitted in: "email" - does NOT run bibupload and sends an email instead. Default. "insert" - inserts the records into the database "append" - appends the fulltext to the existing attached files "correct" - corrects existing attached fulltext files, or adds new "replace" - replaces all attached files with new fulltext file The fulltext is appended by default to new records. @type new_mode: string @param update_mode: which mode should the fulltext files for updated records be submitted in: "email" - does NOT run bibupload and sends an email instead. Default. "insert" - inserts the records into the database "append" - appends the fulltext to the existing attached files "correct" - corrects existing attached fulltext files, or adds new "replace" - replaces all attached files with new fulltext file The fulltext is appended by default to new records. @type update_mode: string @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01 If the value is "last" it means to get records since last harvest. @type from_date: string @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01 @type until_date: string @param metadata: should the record have metadata attached? "yes" or "no" @type metadata: string @param fulltext: should the record have fulltext attached? "yes" or "no" @type fulltext: string @param hidden: should the fulltext be hidden when attached? "yes" or "no" @type hidden: string @param match: should a simple match with the database be done? 
"yes" or "no" @type match: string @param reportonly: only report number of records to harvest, then exit? "yes" or "no" @type reportonly: string @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01 @type threshold_date: string @param devmode: Activate devmode. Full verbosity and no uploads/mails. @type devmode: string """ # This is the list of APSRecord objects to be harvested. final_record_list = APSRecordList() task_update_progress("Parsing input parameters") # Validate modes for mode in [new_mode, update_mode]: if mode not in ("append", "a", "correct", "c", "o", "replace", "r", "insert", "i", "email"): raise Exception("Warning: given upload mode '%s' is not valid." % (mode, )) # We hide fulltext by default if hidden.lower() == "no": hidden = False else: hidden = True # We attach fulltext by default if fulltext.lower() == "no": fulltext = False else: fulltext = True # We attach meta-data by default if metadata.lower() == "no": metadata = False else: metadata = True # We do not match records by default if match.lower() == "yes": match = True else: match = False # We do not reportonly by default if devmode.lower() == "yes": devmode = True task_set_task_param('verbose', 9) else: devmode = False # We do not reportonly by default if reportonly.lower() == "yes": reportonly = True else: reportonly = False if threshold_date: # Input from user. Validate date try: harvest_from_date = validate_date(threshold_date) except ValueError, e: write_message("Error parsing from_date, use (YYYY-MM-DD): %s" % (str(e), ), stream=sys.stderr) return 1
if from_date: # We get records from APS directly new_harvest_date = None perpage = 100 # Are we harvesting from last time or a specific date? if from_date == "last": dummy, harvest_from_date = fetch_last_updated( name="apsharvest_api_download") # Keeping current time until completed harvest. new_harvest_date = datetime.datetime.now() else: # Input from user. Validate date try: harvest_from_date = validate_date(from_date) except ValueError, e: write_message("Error parsing from_date, use (YYYY-MM-DD): %s" % (str(e), ), stream=sys.stderr) return 1 # Turn harvest_from_date back into a string (away from datetime object) harvest_from_date = harvest_from_date.strftime("%Y-%m-%d") status_message = "Checking for new records from APS from %s" % \ (harvest_from_date,) if until_date: # Input from user. Validate date try: validate_date(until_date)
if from_date: # We get records from APS directly new_harvest_date = None perpage = 100 # Are we harvesting from last time or a specific date? if from_date == "last": dummy, harvest_from_date = fetch_last_updated(name="apsharvest_api_download") # Keeping current time until completed harvest. new_harvest_date = datetime.datetime.now() else: # Input from user. Validate date try: harvest_from_date = validate_date(from_date) except ValueError, e: write_message("Error parsing from_date, use (YYYY-MM-DD): %s" % (str(e),), stream=sys.stderr) return 1 # Turn harvest_from_date back into a string (away from datetime object) harvest_from_date = harvest_from_date.strftime("%Y-%m-%d") status_message = "Checking for new records from APS from %s" % \ (harvest_from_date,) if until_date: # Input from user. Validate date try: validate_date(until_date)