def __init__(self, directory):
    self.records_harvested = []
    self.records_to_insert = []
    self.records_to_update = []
    self.records_failed = []
    self.out_folder = create_work_folder(directory)
    self.date_started = datetime.datetime.now()
    self.mail_subject = "APS harvest results: %s" % \
        (self.date_started.strftime("%Y-%m-%d %H:%M:%S"),)
    from invenio.refextract_kbs import get_kbs
    journal_mappings = get_kbs()
    if journal_mappings and "journals" in journal_mappings:
        journal_mappings = journal_mappings['journals'][1]
    else:
        journal_mappings = None
    self.journal_mappings = journal_mappings
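
# A minimal usage sketch, assuming this __init__ belongs to a harvest-job
# class (called APSHarvestJob here; the name is an assumption) that carries
# the per-run state, with CFG_APSHARVEST_DIR as the working directory:
#
#     job = APSHarvestJob(CFG_APSHARVEST_DIR)
#     job.records_harvested.append(record)  # filled in as the run proceeds
#     print(job.mail_subject)               # "APS harvest results: 2014-05-01 12:00:00"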
def bst_consyn_harvest(feed=None, package=None, package_list=None,
                       batch_size='500', delete_zip='False'):
    """
    Task to convert XML files from consyn.elsevier.com to MARC XML files.

    There are three execution modes:
    1. Download from an atom feed.
    2. Extract a zip package.
    3. Extract a list of zip packages.

    @param feed: The URL of the atom feed to download.
    @type feed: string

    @param package: A path to a zip package
    @type package: string

    @param package_list: A path to a file with a list of paths to zip packages
    @type package_list: string

    @param batch_size: The number of records contained in each output file
    @type batch_size: string representation of an integer

    @param delete_zip: Flag to indicate whether the downloaded zip files
                       should be deleted from the disk after processing
    @type delete_zip: string representation of a boolean
    """
    if not feed:
        feed = "https://consyn.elsevier.com/batch/atom?key=%s" % \
               (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning: batch_size parameter is not a valid integer,\n'
                      'the default value \'500\' has been used!\n')
    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning: delete_zip parameter is not a valid Boolean '
                      '(True/False),\n'
                      'the default value \'False\' has been used!\n')
    out_folder = create_work_folder(CFG_CONSYN_OUT_DIRECTORY)

    try:
        run_sql("SELECT filename FROM CONSYNHARVEST")
    except ProgrammingError:
        # Table missing, create it.
        run_sql("CREATE TABLE CONSYNHARVEST ("
                "filename VARCHAR(100) NOT NULL PRIMARY KEY,"
                "date VARCHAR(50),"
                "size VARCHAR(30) );")

    if not package and not package_list:
        download_feed(feed, batch_size, delete_zip, new_sources, out_folder)
    elif package:
        extract_package(package, batch_size, delete_zip, out_folder)
    elif package_list:
        extract_multiple_packages(package_list, batch_size,
                                  delete_zip, new_sources,
                                  out_folder)
    task_sleep_now_if_required(can_stop_too=True)
    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()
    els = ElsevierPackage(path="whatever", CONSYN=True)
    task_update_progress("Converting files 2/2...")
    fetch_xml_files(consyn_files, els, new_files)
    task_sleep_now_if_required(can_stop_too=False)
    create_collection(batch_size, new_files, new_sources, out_folder)
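
# A usage sketch of the three modes (paths are hypothetical; as a BibSched
# tasklet the parameters arrive as strings, hence batch_size='500' etc.):
#
#     bst_consyn_harvest()                                  # 1. download the default atom feed
#     bst_consyn_harvest(package='/tmp/vtex00403986.zip')   # 2. extract one zip package
#     bst_consyn_harvest(package_list='/tmp/packages.txt',  # 3. extract every listed package
#                        batch_size='200', delete_zip='True')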
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        # Harvest the fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'PoS server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', filename),
                                            ('t', 'PoS'),
                                            ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001',
                                         controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                break

        if not found:
            error_records.append(rec)

        # Upload to the FTP server
        tempfile_path = '/tmp/%s.xml' % (contribution,)
        with open(tempfile_path, 'w') as tempfile:
            tempfile.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server"
                          % tempfile_path)
        except Exception:
            write_message("Failed to upload %s to FTP server" % tempfile_path)
        remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
        (total_records,
         len(insert_records),
         len(append_records),
         len(error_records),
         "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl,)
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
def bst_consyn_harvest(feed=None, package=None, package_list=None,
                       batch_size='500', delete_zip='False',
                       upload_FTP='True'):
    """
    Task to convert XML files from consyn.elsevier.com to MARC XML files.

    There are three execution modes:
    1. Download from an atom feed.
    2. Extract a zip package.
    3. Extract a list of zip packages.

    :param feed: The URL of the atom feed to download.
    :type feed: string

    :param package: A path to a zip package
    :type package: string

    :param package_list: A path to a file with a list of paths to zip packages
    :type package_list: string

    :param batch_size: The number of records contained in each output file
    :type batch_size: string representation of an integer

    :param delete_zip: Flag to indicate whether the downloaded zip files
                       should be deleted from the disk after processing
    :type delete_zip: string representation of a boolean

    :param upload_FTP: Flag to indicate whether the result files
                       should be uploaded to the FTP server
    :type upload_FTP: string representation of a boolean
    """
    if not feed:
        feed = "https://consyn.elsevier.com/batch/atom?key=%s" % \
               (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning: batch_size parameter is not a valid integer,\n'
                      'the default value \'500\' has been used!\n')

    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning: delete_zip parameter is not a valid Boolean '
                      '(True/False),\n'
                      'the default value \'False\' has been used!\n')

    if upload_FTP.lower() == 'true':
        upload_FTP = True
    elif upload_FTP.lower() == 'false':
        upload_FTP = False
    else:
        upload_FTP = True
        write_message('Warning: upload_FTP parameter is not a valid Boolean '
                      '(True/False),\n'
                      'the default value \'True\' has been used!\n')

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        # Create the base output directory if missing; the nested work
        # folder returned by create_work_folder is removed again.
        rmdir(create_work_folder(CFG_CONSYN_OUT_DIRECTORY))
    out_folder = CFG_CONSYN_OUT_DIRECTORY

    els = ElsevierPackage(CONSYN=True)
    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if not package and not package_list:
        download_feed(feed, batch_size, delete_zip, new_sources, out_folder)
        task_update_progress("Converting files 2/3...")
        task_sleep_now_if_required(can_stop_too=True)
        fetch_xml_files(consyn_files, els, new_files)
        task_sleep_now_if_required(can_stop_too=False)
    else:
        xml_files = []
        if package:
            xml_files = extract_package(package, batch_size,
                                        delete_zip, out_folder)
        elif package_list:
            xml_files = extract_multiple_packages(package_list, batch_size,
                                                  delete_zip, new_sources,
                                                  out_folder)
        task_update_progress("Converting files 2/3...")
        results = convert_files(xml_files, els, prefix=consyn_files)
        for dummy, (status_code, result) in results.iteritems():
            if status_code == StatusCodes.OK:
                new_files.append(result)
    task_update_progress("Compiling output 3/3...")
    create_collection(batch_size, new_files, new_sources,
                      out_folder, upload_FTP)
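
# Usage sketch for the upload_FTP variant (hypothetical values; like the
# other flags, upload_FTP is parsed from a string):
#
#     bst_consyn_harvest(package='/tmp/vtex00403986.zip', upload_FTP='False')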
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName("record"):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(":")[2]
        conference = conference.split("/")[0]
        contribution = identifier.split(":")[2]
        contribution = contribution.split("/")[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(" ", ""), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        # Harvest the fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll("a")
        found = False

        for link in links:
            url = urllib.quote(link["href"], safe=":/")
            if url.endswith(".pdf"):
                found = True
                if results:
                    rec = {}
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, "856", ind1="4",
                                 subfields=[("u", url),
                                            ("y", "Fulltext")])
                record_add_field(rec, "FFT",
                                 subfields=[("a", filename),
                                            ("t", "PoS"),
                                            ("d", "Fulltext")])
                try:
                    print("Downloading " + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, "001",
                                         controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                break

        if not found:
            error_records.append(rec)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % (
        total_records,
        len(insert_records),
        len(append_records),
        len(error_records),
        "\n".join(created_files),
    )
    print(subject)
    print(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL, CFG_POSHARVEST_EMAIL, subject, body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
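
# Worked example of the identifier parsing above (the OAI identifier is
# illustrative, assuming the usual oai:<repository>:<conference>/<contribution>
# shape):
#
#     identifier = "oai:pos.sissa.it:LATTICE 2013/001"
#     identifier.split(":")[2]   -> "LATTICE 2013/001"
#     conference, contribution   -> "LATTICE 2013", "001"
#     "PoS(%s)%s" % (...)        -> "PoS(LATTICE 2013)001"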
write_message("Found %d record(s) to download." % (len(final_record_list),))

if reportonly:
    write_message("'Report-only' mode. We exit now.")
    return

if not final_record_list:
    # No records to harvest, quit.
    write_message("Nothing to harvest.")
    return

# 2: Fetch fulltext/metadata XML and upload bunches of records as configured

# Create the working directory if it does not exist
out_folder = create_work_folder(CFG_APSHARVEST_DIR)

from invenio.refextract_kbs import get_kbs
journal_mappings = get_kbs()
if journal_mappings and "journals" in journal_mappings:
    journal_mappings = journal_mappings['journals'][1]
else:
    journal_mappings = None

now = datetime.datetime.now()
mail_subject = "APS harvest results: %s" % \
    (now.strftime("%Y-%m-%d %H:%M:%S"),)

count = 0
taskid = 0
records_harvested = []
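
# Assumed shape of get_kbs() behind the lookup above (inferred from the
# indexing, not from a documented API): the "journals" entry is a pair whose
# second element maps journal-title variants to their standardized form,
# roughly:
#
#     get_kbs()["journals"][1]  ->  {"PHYSICAL REVIEW LETTERS": "Phys.Rev.Lett.", ...}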