(record.recid or "new record"), verbose=2) record.add_metadata(path_to_converted) except APSHarvesterConversionError, e: msg = "Metadata conversion failed: %s\n%s" % \ (str(e), traceback.format_exc()[:-1]) write_message(msg, stream=sys.stderr) yield record, msg write_message("Converted metadata for %s" % (record.recid or "new record"), verbose=2) if parameters.get("fulltext"): record.add_fft(fulltext_file, parameters.get("hidden")) if record.date: store_last_updated(record.recid, record.date, name="apsharvest") yield record, "" def process_record_submission(self, parameters): """Run the submission process.""" if parameters.get("match"): # We will do a simple match with the database new_records, existing_records = self.check_records() self.records_to_insert.extend(new_records) self.records_to_update.extend(existing_records) else: # We insert everything self.records_to_insert.extend(self.records_harvested) if self.records_to_insert:
taskid = submit_records(record_filename, records_to_update, update_mode, taskid, silent=records and True or False) if not taskid: # Something went wrong write_message("Records were not submitted correctly") if records_failed: submit_records_via_mail(subject="APSHarvest failed records", body="%s" % ("\n".join([rec.doi for rec in records_failed]))) if from_date == "last": # Harvest of new records from APS successful # we update last harvested date store_last_updated(None, new_harvest_date, name="apsharvest_api_download") # We are done write_message("Harvested %d records. (%d failed)" % (count, len(records_failed))) def APS_connect(from_param, until_param=None, page=1, perpage=100): """ Manages connection to APS site and return connector. """ host = 'http://harvest.aps.org' function = '/content/journals/articles' from_param = 'from=' + str(from_param) params = "?" + from_param
except APSHarvesterConversionError, e: msg = "Metadata conversion failed: %s\n%s" % \ (str(e), traceback.format_exc()[:-1]) write_message(msg, stream=sys.stderr) yield record, msg write_message("Converted metadata for %s" % (record.recid or "new record"), verbose=2) if parameters.get("fulltext"): record.add_fft(fulltext_file, parameters.get("hidden")) if record.date: store_last_updated(record.recid, record.date, name="apsharvest") yield record, "" def process_record_submission(self, parameters): """Run the submission process.""" if parameters.get("match"): # We will do a simple match with the database new_records, existing_records = self.check_records() self.records_to_insert.extend(new_records) self.records_to_update.extend(existing_records) else: # We insert everything self.records_to_insert.extend(self.records_harvested)
def bst_apsharvest(dois="", recids="", query="", records="", new_mode="email",
                   update_mode="email", from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes", match="no",
                   reportonly="no", threshold_date=None, devmode="no",
                   input_file=""):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    The task works in one of two ways:

    1. Harvesting of new/updated metadata+fulltext from APS via REST API

       New records are looked for at the APS servers. Active when from_date
       and until_date are given, and also when a DOI that is not already in
       the system is given.

       If from_date is "last", the harvester fetches any new records since
       the last run.

       If match is "yes", the harvested records are matched against the
       database and split into "new" and "updated" records.

    2. Attachment of fulltext only from APS for existing records

       When the records to be processed already exist in the system, the
       task only harvests the fulltexts and attaches them to the records.

    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")

    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: upload mode for new records (see the examples above):
                 "email" - does NOT run bibupload and sends an email instead. Default.
                 "insert" - inserts the records into the database
                 "append" - appends the fulltext to the existing attached files
                 "correct" - corrects existing attached fulltext files, or adds new
                 "replace" - replaces all attached files with new fulltext file
                 The values "a", "c", "o", "r" and "i" are accepted as well.

                 The fulltext is appended by default to new records.
    @type new_mode: string

    @param update_mode: upload mode for updates (see the examples above):
                 "email" - does NOT run bibupload and sends an email instead. Default.
                 "insert" - inserts the records into the database
                 "append" - appends the fulltext to the existing attached files
                 "correct" - corrects existing attached fulltext files, or adds new
                 "replace" - replaces all attached files with new fulltext file
                 The values "a", "c", "o", "r" and "i" are accepted as well
                 (the example above uses "o" to send updates to the holding pen).

                 The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param metadata: should the record have metadata attached? "yes" or "no".
                     Anything other than "no" counts as "yes".
    @type metadata: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit? "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string

    @param input_file: harvests articles with given file containing one DOI per line.
    @type input_file: string

    @return: False when input_file is given but does not exist; None otherwise.
    @raise Exception: when new_mode or update_mode is not a recognised mode.
    """
    task_update_progress("Parsing input parameters")

    # Reject unknown upload modes up front. The loop variable keeps the name
    # `mode` on purpose: it is a local and therefore ends up in the
    # `parameters` dict captured through locals() further down.
    for mode in (new_mode, update_mode):
        if mode not in ("append", "a", "correct", "c", "o", "replace", "r",
                        "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid." %
                            (mode,))

    # Turn the textual yes/no switches into booleans.
    # On unless explicitly "no": hide fulltext, attach fulltext, attach metadata.
    hidden = hidden.lower() != "no"
    fulltext = fulltext.lower() != "no"
    metadata = metadata.lower() != "no"

    # Off unless explicitly "yes": database matching, devmode, report-only.
    match = match.lower() == "yes"
    devmode = devmode.lower() == "yes"
    if devmode:
        # Devmode runs the task at full verbosity.
        task_set_task_param('verbose', 9)
    reportonly = reportonly.lower() == "yes"

    # A given input file has to exist before any harvesting starts.
    if input_file and not os.path.exists(input_file):
        write_message("Input file {0} does not exist!".format(input_file),
                      stream=sys.stderr)
        return False

    # Unify all parameters into a dict using locals. No other local names
    # may be introduced above this line, or they would leak into the dict.
    parameters = locals()

    # 1: We analyze parameters and fetch all requested records from APS
    records_to_fetch, harvested_from, harvest_started = \
        get_records_to_harvest(parameters)
    write_message("Found %d record(s) to download." % (len(records_to_fetch),))

    if reportonly:
        write_message("'Report-only' mode. We exit now.")
        return

    if not records_to_fetch:
        # No records to harvest, quit.
        write_message("Nothing to harvest.")
        return

    # 2: Extract fulltext/metadata XML and upload bunches of
    #    records as configured
    harvest_job = APSHarvestJob(CFG_APSHARVEST_DIR,
                                date_started=harvest_started,
                                date_harvested_from=harvested_from)
    harvested_count = process_records(harvest_job, parameters,
                                      records_to_fetch)

    if parameters.get("from_date") == "last":
        # Harvest of new records from APS successful
        # we update last harvested date
        store_last_updated(None, harvest_started,
                           name="apsharvest_api_download")

    # We are done
    write_message("Harvested %d records. (%d failed)" %
                  (harvested_count, len(harvest_job.records_failed)))
def bst_apsharvest(dois="", recids="", query="", records="", new_mode="email",
                   update_mode="email", from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes", match="no",
                   reportonly="no", threshold_date=None, devmode="no",
                   input_file=""):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    The task works in one of two ways:

    1. Harvesting of new/updated metadata+fulltext from APS via REST API

       New records are looked for at the APS servers. Active when from_date
       and until_date are given, and also when a DOI that is not already in
       the system is given.

       If from_date is "last", the harvester fetches any new records since
       the last run.

       If match is "yes", the harvested records are matched against the
       database and split into "new" and "updated" records.

    2. Attachment of fulltext only from APS for existing records

       When the records to be processed already exist in the system, the
       task only harvests the fulltexts and attaches them to the records.

    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")

    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: upload mode for new records (see the examples above):
                 "email" - does NOT run bibupload and sends an email instead. Default.
                 "insert" - inserts the records into the database
                 "append" - appends the fulltext to the existing attached files
                 "correct" - corrects existing attached fulltext files, or adds new
                 "replace" - replaces all attached files with new fulltext file
                 The values "a", "c", "o", "r" and "i" are accepted as well.

                 The fulltext is appended by default to new records.
    @type new_mode: string

    @param update_mode: upload mode for updates (see the examples above):
                 "email" - does NOT run bibupload and sends an email instead. Default.
                 "insert" - inserts the records into the database
                 "append" - appends the fulltext to the existing attached files
                 "correct" - corrects existing attached fulltext files, or adds new
                 "replace" - replaces all attached files with new fulltext file
                 The values "a", "c", "o", "r" and "i" are accepted as well
                 (the example above uses "o" to send updates to the holding pen).

                 The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param metadata: should the record have metadata attached? "yes" or "no".
                     Anything other than "no" counts as "yes".
    @type metadata: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit? "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string

    @param input_file: harvests articles with given file containing one DOI per line.
    @type input_file: string

    @return: False when input_file is given but does not exist; None otherwise.
    @raise Exception: when new_mode or update_mode is not a recognised mode.
    """
    task_update_progress("Parsing input parameters")

    # Reject unknown upload modes up front. The loop variable keeps the name
    # `mode` on purpose: it is a local and therefore ends up in the
    # `parameters` dict captured through locals() further down.
    for mode in (new_mode, update_mode):
        if mode not in ("append", "a", "correct", "c", "o", "replace", "r",
                        "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid." %
                            (mode, ))

    # Turn the textual yes/no switches into booleans.
    # On unless explicitly "no": hide fulltext, attach fulltext, attach metadata.
    hidden = hidden.lower() != "no"
    fulltext = fulltext.lower() != "no"
    metadata = metadata.lower() != "no"

    # Off unless explicitly "yes": database matching, devmode, report-only.
    match = match.lower() == "yes"
    devmode = devmode.lower() == "yes"
    if devmode:
        # Devmode runs the task at full verbosity.
        task_set_task_param('verbose', 9)
    reportonly = reportonly.lower() == "yes"

    # A given input file has to exist before any harvesting starts.
    if input_file and not os.path.exists(input_file):
        write_message("Input file {0} does not exist!".format(input_file),
                      stream=sys.stderr)
        return False

    # Unify all parameters into a dict using locals. No other local names
    # may be introduced above this line, or they would leak into the dict.
    parameters = locals()

    # 1: We analyze parameters and fetch all requested records from APS
    records_to_fetch, harvested_from, harvest_started = \
        get_records_to_harvest(parameters)
    write_message("Found %d record(s) to download." %
                  (len(records_to_fetch), ))

    if reportonly:
        write_message("'Report-only' mode. We exit now.")
        return

    if not records_to_fetch:
        # No records to harvest, quit.
        write_message("Nothing to harvest.")
        return

    # 2: Extract fulltext/metadata XML and upload bunches of
    #    records as configured
    harvest_job = APSHarvestJob(CFG_APSHARVEST_DIR,
                                date_started=harvest_started,
                                date_harvested_from=harvested_from)
    harvested_count = process_records(harvest_job, parameters,
                                      records_to_fetch)

    if parameters.get("from_date") == "last":
        # Harvest of new records from APS successful
        # we update last harvested date
        store_last_updated(None, harvest_started,
                           name="apsharvest_api_download")

    # We are done
    write_message("Harvested %d records. (%d failed)" %
                  (harvested_count, len(harvest_job.records_failed)))