def __init__(self, journal_mappings=None):
    """Store the journal-title mappings used by this instance.

    :param journal_mappings: mappings to use. When omitted (or falsy),
        the second item of the ``'journals'`` entry returned by
        ``get_kbs()`` is used instead; if that entry is missing
        (``KeyError``) an empty dict is stored.
    """
    if not journal_mappings:
        # Nothing usable was supplied: fall back to the knowledge base.
        try:
            journal_mappings = get_kbs()['journals'][1]
        except KeyError:
            journal_mappings = {}
    self.journal_mappings = journal_mappings
def _datestamp_in_window(datestamp, from_date, to_date):
    """Tell whether ``datestamp`` satisfies the optional date bounds.

    A falsy ``from_date`` or ``to_date`` means that side is unbounded.
    """
    return ((not from_date or datestamp >= from_date) and
            (not to_date or datestamp <= to_date))


def convert_files(files_to_convert, source_folder, marc_folder,
                  from_date, to_date):
    """Converts the xml source files to marc xml files.

    For every source file the target path is ``marc_folder`` joined with
    the file's base name.  If the target already exists it is only
    reported (when its datestamp is inside the date window); otherwise the
    source is converted, written to the target and reported.

    :param files_to_convert: paths of the XML source files.
    :param source_folder: not used by this function; kept for interface
        compatibility with existing callers.
    :param marc_folder: folder where the converted files are written.
    :param from_date: lower bound (inclusive) for the file datestamp, or a
        falsy value for no lower bound.
    :param to_date: upper bound (inclusive) for the file datestamp, or a
        falsy value for no upper bound.
    :return: list of target file paths that already existed or were
        written during this call.
    """
    converted_files = []
    journal_mappings = get_kbs()['journals'][1]
    edp = EDPSciencesPackage(journal_mappings)
    for counter, filename in enumerate(files_to_convert, 1):
        task_update_progress('Converting files 3/3 \t%s of %s' %
                             (counter, len(files_to_convert)))
        target_file = join(marc_folder, filename.split('/')[-1])
        target_folder = dirname(target_file)
        if not exists(target_folder):
            makedirs(target_folder)
        datestamp = edp.get_date(filename)

        if exists(target_file):
            # Already converted earlier: just report it if it is in range.
            if _datestamp_in_window(datestamp, from_date, to_date):
                converted_files.append(target_file)
            continue

        # NOTE: ``refextract`` is a name defined outside this function.
        if 'xml_rich' in filename:
            # Rich XML files are converted regardless of the date window.
            record = edp.get_record_rich(filename, refextract)
        elif _datestamp_in_window(datestamp, from_date, to_date):
            record = edp.get_record(filename, refextract)
        else:
            record = ""

        if record:
            write_message("Converted file: %s" % (filename,))
            with open(target_file, 'w') as out:
                out.write(record)
            converted_files.append(target_file)
    return converted_files
def normalize_journal_name(value=None):
    """Normalize a journal name via knowledgebase lookup.

    The name is upper-cased, has ``re_punctuation`` matches replaced by a
    space, is passed through ``re_group_captured_multiple_space`` and is
    stripped; the result is the key looked up in the journals knowledge
    base.

    :param value: raw journal name, or ``None``.
    :return: the knowledge-base value for the cleaned name, the original
        ``value`` when there is no entry, or ``''`` when ``value`` is None.
    """
    if value is None:
        return ''

    lookup_key = re_group_captured_multiple_space.sub(
        u' ', re_punctuation.sub(u' ', value.upper())).strip()
    journal_titles = get_kbs()['journals'][1]
    if lookup_key in journal_titles:
        return journal_titles[lookup_key]
    return value
def parse_references(reference_lines, recid=None, kbs_files=None):
    """Parse a list of references

    Given a list of raw reference lines (list of strings),
    output the MARC-XML content extracted version

    :param reference_lines: raw reference lines (list of strings).
    :param recid: record identifier forwarded to ``build_record``.
    :param kbs_files: custom knowledge-base files forwarded to ``get_kbs``.
    :return: whatever ``build_record`` returns for the parsed references.
    """
    # Load the RefExtract knowledge bases.
    knowledge_bases = get_kbs(custom_kbs_files=kbs_files)

    # Recognise the elements of each line; the third item of the result
    # is not used here.
    elements, stats, dummy_bad_titles_count = parse_references_elements(
        reference_lines, knowledge_bases)

    # Turn the recognised elements into fields, then into the record.
    return build_record(stats, build_references(elements), recid=recid)
def parse_references(reference_lines, recid=1, kbs_files=None):
    """Parse a list of references

    Given a list of raw reference lines (list of strings),
    output the MARC-XML content extracted version

    :param reference_lines: raw reference lines (list of strings).
    :param recid: record identifier forwarded to ``create_xml_record``.
    :param kbs_files: custom knowledge-base files forwarded to ``get_kbs``.
    :return: whatever ``create_xml_record`` returns for the parsed
        references.
    """
    # Load the RefExtract knowledge bases.
    knowledge_bases = get_kbs(custom_kbs_files=kbs_files)

    # Recognise the elements of each line; the third item of the result
    # is not used here.
    elements, stats, dummy_bad_titles_count = parse_references_elements(
        reference_lines, knowledge_bases)

    # Build the references XML, then wrap it into the output record.
    references_xml = build_xml_references(elements)
    return create_xml_record(stats, recid, references_xml)
def __init__(self, directory):
    """Set up the bookkeeping state for a harvest run.

    Initialises the empty record lists, creates the work folder via
    ``create_work_folder(directory)``, records the start time, builds the
    mail subject from it and loads the journal mappings.

    :param directory: passed to ``create_work_folder``; its result is
        stored as ``self.out_folder``.
    """
    self.records_harvested = []
    self.records_to_insert = []
    self.records_to_update = []
    self.records_failed = []

    self.out_folder = create_work_folder(directory)

    started = datetime.datetime.now()
    self.date_started = started
    self.mail_subject = "APS harvest results: %s" % \
        (started.strftime("%Y-%m-%d %H:%M:%S"),)

    from invenio.refextract_kbs import get_kbs
    kbs = get_kbs()
    # Only use the knowledge bases when a 'journals' entry is available.
    has_journals = kbs and "journals" in kbs
    self.journal_mappings = kbs['journals'][1] if has_journals else None
def cli_main(options, args):
    """Convert the journal names of the records found in the given files.

    Prints the usage and returns when help was requested or no input path
    was given.  Otherwise every file in ``args`` is read, its content is
    turned into records with ``create_records``, the records are passed
    through ``convert_journals_list`` and the collected results are handed
    to ``write_records``.

    :param options: parsed command-line options; ``help`` and
        ``kb_journals`` are read here and the whole object is forwarded to
        ``write_records``.
    :param args: paths of the input files.
    """
    if options.help or not args:
        usage()
        return

    # A custom journals knowledge base overrides the default one.
    kbs_files = {}
    if options.kb_journals:
        kbs_files['journals'] = options.kb_journals

    kb = get_kbs(custom_kbs_files=kbs_files)['journals']

    out_records = []
    for path in args:
        # The context manager closes the file even if reading fails.
        with open(path) as xml_file:
            xml = xml_file.read()
        out_records += convert_journals_list(kb, create_records(xml))

    write_records(options, out_records)
def setUp(self):
    """Load a journals knowledge base holding a single test mapping."""
    journal_pairs = [("TEST JOURNAL NAME", "Converted")]
    loaded_kbs = get_kbs(custom_kbs_files={'journals': journal_pairs})
    self.kb = loaded_kbs['journals']
def _build_journal_mappings(self):
    """Populate ``self.journal_mappings`` from the knowledge bases.

    Uses the second item of the ``'journals'`` entry returned by
    ``get_kbs()``; an empty dict is stored when that lookup raises
    ``KeyError``.
    """
    try:
        mappings = get_kbs()['journals'][1]
    except KeyError:
        mappings = {}
    self.journal_mappings = mappings
def bst_consyn_harvest(feed_url=None, package=None, feed_file=None,
                       package_list_file=None, batch_size='500',
                       delete_zip='False', submit='False',
                       threshold_date=None):
    """ Task to convert xml files from consyn.elsevier.com to Marc xml files.
    There are four execution modes:
    1. Download from an atom feed url.
    2. Extract and convert a zip package.
    3. Download from an atom feed file.
    4. Extract and convert a list of zip packages.

    The mode is chosen from the arguments in this order of precedence:
    ``package``, ``package_list_file``, ``feed_file`` and finally the
    atom feed url.

    The feed is stored to the file system under the folder feeds.
    If no errors occur during the execution of the tasklet the feed
    is deleted. Records may be recovered running the tasklet again with
    the modes 2, 3 or 4.

    :param feed_url: A URL to the atom feed.
    :type feed_url: string.

    :param package: A path to a zip package.
    :type package: string.

    :param feed_file: A path to an atom feed file.
    :type feed_file: string.

    :param package_list_file: A path to a file with a list of paths
                              to zip packages. The file must contain
                              the path to each package in a different
                              line.
    :type package_list_file: string.

    :param batch_size: The number of records contained in each output file.
                       Falls back to 500 (with a warning) when it is not
                       a valid integer.
    :type batch_size: string representation of an integer.

    :param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not. Falls back to
                       False (with a warning) when it is neither 'true'
                       nor 'false' (case-insensitive).
    :type delete_zip: string representation of a boolean.

    :param submit: Flag to indicate whether the result files
                   should be submitted by email and uploaded
                   to FTP server. Falls back to False (with a warning)
                   when it is neither 'true' nor 'false'
                   (case-insensitive).
    :type submit: string representation of a boolean.

    :param threshold_date: threshold date; only records that were
                           published after threshold_date are converted.
                           When it is not in the expected format the task
                           status is set to "ERROR" and the function
                           returns without doing any work.
    :type threshold_date: string in the format YYYY-MM-DD
    """
    if not feed_url:
        feed_url = "https://consyn.elsevier.com/batch/atom?key=%s" % \
                   (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    feed_location = ''

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning batch_size parameter is not a valid integer\n'
                      'the default value \'500\' has been used!\n')

    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning delete_zip parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')

    if submit.lower() == 'true':
        submit = True
    elif submit.lower() == 'false':
        submit = False
    else:
        submit = False
        # The message names the actual parameter (it used to mention a
        # non-existent "upload_FTP" parameter).
        write_message('Warning submit parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')

    if threshold_date:
        date_format = "%Y-%m-%d"
        try:
            # Parsing and re-formatting validates and normalises the date.
            date = datetime.strptime(threshold_date, date_format)
            threshold_date = date.strftime('%Y-%m-%d')
        except ValueError:
            write_message('Error threshold_date parameter is not '
                          'in the right format. It should be in '
                          'form "YYYY-MM-DD".')
            task_update_status("ERROR")
            return

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        makedirs(CFG_CONSYN_OUT_DIRECTORY)
    out_folder = CFG_CONSYN_OUT_DIRECTORY
    journal_mappings = get_kbs()['journals'][1]
    els = ElsevierPackage(CONSYN=True,
                          journal_mappings=journal_mappings)

    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if package:
        xml_files = extract_package(package, delete_zip, out_folder,
                                    new_sources)
    elif package_list_file:
        package_list = []
        with open(package_list_file, 'r') as package_file:
            for line in package_file:
                line = line.strip()
                if line:
                    package_list.append(line)
        xml_files = extract_multiple_packages(
            package_list, delete_zip, new_sources, out_folder
        )
    elif feed_file:
        entries = parse_feed(feed_file)
        package_list = []
        for entry in entries:
            # Each feed entry carries the download link first and the
            # package file name second.
            link = entry[0]
            package_path = join(CFG_CONSYN_OUT_DIRECTORY, entry[1])
            # Every package is listed exactly once.  The list used to be
            # appended to while it was being iterated, which listed (and
            # therefore extracted) each downloaded package twice.
            package_list.append(package_path)
            task_sleep_now_if_required()
            if exists(package_path):
                continue
            link = link.replace(' ', '%20')
            try:
                message = ("Downloading %s to %s\n" % (link, package_path))
                write_message(message)
                download_url(link, "zip", package_path, 5, 60.0)
            except InvenioFileDownloadError as err:
                message = "URL could not be opened: " + link
                write_message(message)
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
        xml_files = extract_multiple_packages(
            package_list, delete_zip, new_sources, out_folder
        )
    else:
        feeds_folder = join(CFG_CONSYN_OUT_DIRECTORY, 'feeds')
        if not exists(feeds_folder):
            makedirs(feeds_folder)
        date = datetime.now().strftime("%Y.%m.%d")
        feed_location = "feed-%s.xml" % date
        feed_location = join(feeds_folder, feed_location)
        xml_files = download_feed(feed_url, delete_zip, new_sources,
                                  out_folder, feed_location)

    task_update_progress("Converting files 2/3...")
    task_sleep_now_if_required()
    results = convert_files(xml_files, els,
                            prefix=consyn_files,
                            threshold_date=threshold_date)
    for dummy, (status_code, result) in results.iteritems():
        if status_code == StatusCodes.OK:
            new_files.append(result)

    task_update_progress("Compiling output 3/3...")
    task_sleep_now_if_required()
    create_collection(batch_size, new_files, new_sources,
                      out_folder, submit)

    # The stored feed is only removed after an error-free run, so that a
    # failed run can be repeated from the feed file.
    if feed_location and not _errors_detected:
        remove(feed_location)
    for error in _errors_detected:
        write_message(str(error))
def bst_consyn_harvest(feed_url=None, package=None, feed_file=None,
                       package_list_file=None, batch_size='500',
                       delete_zip='False', submit='False',
                       threshold_date=None):
    """ Task to convert xml files from consyn.elsevier.com to Marc xml files.
    There are four execution modes:
    1. Download from an atom feed url.
    2. Extract and convert a zip package.
    3. Download from an atom feed file.
    4. Extract and convert a list of zip packages.

    The mode is chosen from the arguments in this order of precedence:
    ``package``, ``package_list_file``, ``feed_file`` and finally the
    atom feed url.

    The feed is stored to the file system under the folder feeds.
    If no errors occur during the execution of the tasklet the feed
    is deleted. Records may be recovered running the tasklet again with
    the modes 2, 3 or 4.

    :param feed_url: A URL to the atom feed.
    :type feed_url: string.

    :param package: A path to a zip package.
    :type package: string.

    :param feed_file: A path to an atom feed file.
    :type feed_file: string.

    :param package_list_file: A path to a file with a list of paths
                              to zip packages. The file must contain
                              the path to each package in a different
                              line.
    :type package_list_file: string.

    :param batch_size: The number of records contained in each output file.
                       Falls back to 500 (with a warning) when it is not
                       a valid integer.
    :type batch_size: string representation of an integer.

    :param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not. Falls back to
                       False (with a warning) when it is neither 'true'
                       nor 'false' (case-insensitive).
    :type delete_zip: string representation of a boolean.

    :param submit: Flag to indicate whether the result files
                   should be submitted by email and uploaded
                   to FTP server. Falls back to False (with a warning)
                   when it is neither 'true' nor 'false'
                   (case-insensitive).
    :type submit: string representation of a boolean.

    :param threshold_date: threshold date; only records that were
                           published after threshold_date are converted.
                           When it is not in the expected format the task
                           status is set to "ERROR" and the function
                           returns without doing any work.
    :type threshold_date: string in the format YYYY-MM-DD
    """
    if not feed_url:
        feed_url = "https://consyn.elsevier.com/batch/atom?key=%s" % \
                   (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    feed_location = ''

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning batch_size parameter is not a valid integer\n'
                      'the default value \'500\' has been used!\n')

    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning delete_zip parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')

    if submit.lower() == 'true':
        submit = True
    elif submit.lower() == 'false':
        submit = False
    else:
        submit = False
        # The message names the actual parameter (it used to mention a
        # non-existent "upload_FTP" parameter).
        write_message('Warning submit parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')

    if threshold_date:
        date_format = "%Y-%m-%d"
        try:
            # Parsing and re-formatting validates and normalises the date.
            date = datetime.strptime(threshold_date, date_format)
            threshold_date = date.strftime('%Y-%m-%d')
        except ValueError:
            write_message('Error threshold_date parameter is not '
                          'in the right format. It should be in '
                          'form "YYYY-MM-DD".')
            task_update_status("ERROR")
            return

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        makedirs(CFG_CONSYN_OUT_DIRECTORY)
    out_folder = CFG_CONSYN_OUT_DIRECTORY
    journal_mappings = get_kbs()['journals'][1]
    els = ElsevierPackage(CONSYN=True,
                          journal_mappings=journal_mappings)

    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if package:
        xml_files = extract_package(package, delete_zip, out_folder,
                                    new_sources)
    elif package_list_file:
        package_list = []
        with open(package_list_file, 'r') as package_file:
            for line in package_file:
                line = line.strip()
                if line:
                    package_list.append(line)
        xml_files = extract_multiple_packages(
            package_list, delete_zip, new_sources, out_folder
        )
    elif feed_file:
        entries = parse_feed(feed_file)
        package_list = []
        for entry in entries:
            # Each feed entry carries the download link first and the
            # package file name second.
            link = entry[0]
            package_path = join(CFG_CONSYN_OUT_DIRECTORY, entry[1])
            # Every package is listed exactly once.  The list used to be
            # appended to while it was being iterated, which listed (and
            # therefore extracted) each downloaded package twice.
            package_list.append(package_path)
            task_sleep_now_if_required()
            if exists(package_path):
                continue
            link = link.replace(' ', '%20')
            try:
                message = ("Downloading %s to %s\n" % (link, package_path))
                write_message(message)
                download_url(link, "zip", package_path, 5, 60.0)
            except InvenioFileDownloadError as err:
                message = "URL could not be opened: " + link
                write_message(message)
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
        xml_files = extract_multiple_packages(
            package_list, delete_zip, new_sources, out_folder
        )
    else:
        feeds_folder = join(CFG_CONSYN_OUT_DIRECTORY, 'feeds')
        if not exists(feeds_folder):
            makedirs(feeds_folder)
        date = datetime.now().strftime("%Y.%m.%d")
        feed_location = "feed-%s.xml" % date
        feed_location = join(feeds_folder, feed_location)
        xml_files = download_feed(feed_url, delete_zip, new_sources,
                                  out_folder, feed_location)

    task_update_progress("Converting files 2/3...")
    task_sleep_now_if_required()
    results = convert_files(xml_files, els,
                            prefix=consyn_files,
                            threshold_date=threshold_date)
    for dummy, (status_code, result) in results.iteritems():
        if status_code == StatusCodes.OK:
            new_files.append(result)

    task_update_progress("Compiling output 3/3...")
    task_sleep_now_if_required()
    create_collection(batch_size, new_files, new_sources,
                      out_folder, submit)

    # The stored feed is only removed after an error-free run, so that a
    # failed run can be repeated from the feed file.
    if feed_location and not _errors_detected:
        remove(feed_location)
    for error in _errors_detected:
        write_message(str(error))
if reportonly: write_message("'Report-only' mode. We exit now.") return if not final_record_list: # No records to harvest, quit. write_message("Nothing to harvest.") return #2: Fetch fulltext/metadata XML and upload bunches of records as configured # Create working directory if not exists out_folder = create_work_folder(CFG_APSHARVEST_DIR) from invenio.refextract_kbs import get_kbs journal_mappings = get_kbs() if journal_mappings and "journals" in journal_mappings: journal_mappings = journal_mappings['journals'][1] else: journal_mappings = None now = datetime.datetime.now() mail_subject = "APS harvest results: %s" % \ (now.strftime("%Y-%m-%d %H:%M:%S"),) count = 0 taskid = 0 records_harvested = [] records_to_insert = [] records_to_update = [] records_failed = []
def __init__(self):
    """Load the journal mappings from the knowledge bases.

    ``self.journal_mappings`` receives the second item of the
    ``'journals'`` entry returned by ``get_kbs()``, or an empty dict when
    that lookup raises ``KeyError``.
    """
    try:
        loaded_mappings = get_kbs()['journals'][1]
    except KeyError:
        # No journals knowledge base: work with an empty mapping.
        loaded_mappings = {}
    self.journal_mappings = loaded_mappings