def download_files(from_date, to_date):
    """Download the new files from the EDP Sciences FTP server.

    Lists the ``incoming`` directory on the server, keeps the entries for
    which ``is_younger(name, from_date, ftp)`` is true and downloads those
    that are not yet present in the local ``packages`` folder.

    @param from_date: date bound handed to ``is_younger``
    @param to_date: accepted for interface compatibility; not used here
    @return: list of local paths (inside the packages folder) for every
        remote file that passed the date filter, whether it was downloaded
        by this call or was already present
    """
    download_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, 'packages')
    # A set makes the "already downloaded?" check O(1) per file.
    old_files = set(listdir(download_folder))
    ftp = FtpHandler(CFG_EDPSCIENCE_SERVER,
                     CFG_EDPSCIENCE_USERNAME,
                     CFG_EDPSCIENCE_PASSWORD)
    try:
        ftp.cd('incoming')
        # Real lists (not filter objects) so len() and re-iteration are safe.
        new_files = [name for name in ftp.ls()[0]
                     if is_younger(name, from_date, ftp)]
        files_to_download = [name for name in new_files
                             if name not in old_files]
        # Progress is counted against what is actually being downloaded.
        total = len(files_to_download)
        for counter, filename in enumerate(files_to_download, 1):
            task_update_progress('Downloading files 1/3 \t%s of %s'
                                 % (counter, total))
            write_message('Downloading file %s' % (filename,))
            ftp.download(filename, download_folder)
    finally:
        # Release the connection even when listing or a download fails.
        ftp.close()
    return [join(download_folder, name) for name in new_files]
def iterate_over_new(list, fmt):
    """Format every record in *list* and store the result in ``bibfmt``.

    Each record is formatted on the fly, zlib-compressed and written to the
    ``bibfmt`` table: an existing (record, format) row is updated, otherwise
    a new row is inserted.

    @param list: the record IDs to format
    @param fmt: the output format to use
    @return: tuple (number of records, time spent formatting and storing,
        0) -- the last slot is never incremented in this function
    """
    # The value is overwritten for each record below; the lookup itself is
    # kept so the sequence of external calls is unchanged.
    start_date = task_get_task_param('task_starting_time')
    tbibformat = 0  # accumulated os.times()[4] delta around format + store
    tbibupload = 0  # returned untouched
    tot = len(list)
    count = 0
    for count, recID in enumerate(list, 1):
        began = os.times()[4]
        # Timestamp written to bibfmt.last_updated for this record.
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(
            format_record(recID, fmt, on_the_fly=True))
        row_exists = run_sql(
            'SELECT id FROM bibfmt WHERE id_bibrec=%s AND format=%s',
            (recID, fmt))
        if row_exists:
            run_sql('UPDATE bibfmt SET last_updated=%s, value=%s '
                    'WHERE id_bibrec=%s AND format=%s',
                    (start_date, formatted_record, recID, fmt))
        else:
            run_sql('INSERT INTO bibfmt(id_bibrec, format, last_updated, value) '
                    'VALUES(%s, %s, %s, %s)',
                    (recID, fmt, start_date, formatted_record))
        tbibformat += os.times()[4] - began
        if count % 100 == 0:
            # Report, and give the scheduler a chance to pause/stop us,
            # once per hundred records.
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if tot % 100 != 0:
        # The last partial hundred has not been reported yet.
        write_message(" ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
def iterate_over_new(list, fmt):
    """Format every record in *list* and store the result in ``bibfmt``.

    Each record is formatted on the fly, zlib-compressed and written with a
    single ``REPLACE LOW_PRIORITY`` statement.

    @param list: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format and
        store, 0) -- the last slot is never incremented in this function
    """
    # The value is overwritten for each record below; the lookup itself is
    # kept so the sequence of external calls is unchanged.
    start_date = task_get_task_param('task_starting_time')
    tbibformat = 0  # accumulated os.times()[4] delta around format + store
    tbibupload = 0  # returned untouched
    tot = len(list)
    count = 0
    for count, recID in enumerate(list, 1):
        began = os.times()[4]
        # Timestamp written to bibfmt.last_updated for this record.
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(
            format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt '
                '(id_bibrec, format, last_updated, value) '
                'VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        tbibformat += os.times()[4] - began
        if count % 100 == 0:
            # Report, and give the scheduler a chance to pause/stop us,
            # once per hundred records.
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if tot % 100 != 0:
        # The last partial hundred has not been reported yet.
        write_message(" ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
def iterate_over_new(recIDs, fmt):
    """Reformat every record in *recIDs* for the given output format.

    The function used to reformat a record is looked up in
    ``_CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS`` by lower-cased format name,
    falling back to ``_update_format``.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, 0) --
        the last slot is never incremented in this function
    """
    tbibformat = 0  # accumulated os.times()[4] delta around each reformat
    tbibupload = 0  # returned untouched
    tot = len(recIDs)
    reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get(
        fmt.lower(), _update_format)
    # Count from 1 so that "formatted N records" is the number of records
    # actually done when the message is emitted.
    for count, recID in enumerate(recIDs, 1):
        t1 = os.times()[4]
        reformat_function(recID, fmt)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if tot % 100 != 0:
        # The last partial hundred has not been reported yet.
        write_message(" ... formatted %s records out of %s" % (tot, tot))
    return tot, tbibformat, tbibupload
def watch_directory(new_job_dir=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS,
                    old_job_dir=CFG_BIBENCODE_DAEMON_DIR_OLDJOBS):
    """Scan a folder for job files, then parse and execute them.

    Every entry of *new_job_dir* that passes ``has_signature`` is decoded,
    dispatched (batch jobs to ``process_batch``, the others to
    ``launch_task``) and then moved to *old_job_dir*.

    @param new_job_dir: path to the directory with new jobs
    @type new_job_dir: string
    @param old_job_dir: path to the directory where the old jobs are moved
    @type old_job_dir: string
    @return: always 1
    """
    global _NUMBER, _TASKID
    write_message('Checking directory %s for new jobs' % new_job_dir)
    task_update_progress('Checking for new jobs')
    _TASKID = task_get_task_param('task_id')
    for job_name in os.listdir(new_job_dir):
        incoming_path = os.path.join(new_job_dir, job_name)
        if not has_signature(incoming_path):
            continue
        write_message('New Job found: %s' % job_name)
        job = json_decode_file(incoming_path)
        archived_path = os.path.join(old_job_dir, job_name)
        if getval(job, 'isbatch'):
            # The batch engine needs the job description, so it is handed
            # the path the file will have inside the old-jobs directory.
            process_batch(archived_path)
        elif not launch_task(job_to_args(job)):
            write_message('Error submitting task')
        # Move the file to the done directory.
        shutil.move(incoming_path, archived_path)
        # Update the number for the next job.
        _NUMBER += 1
    return 1
def fill_self_cites_tables(config):
    """Fill the self-cites tables for every record in ``bibrec``.

    Meant for a site that never ran the self-cites daemon: all record IDs
    are read once and processed in ascending order.

    @param config: mapping providing at least the 'algorithm' key; it is
        also passed through to ``update_self_cites_tables``
    """
    def _checkpoint(template, position):
        """Report progress and let the scheduler pause the task."""
        msg = template % (position, total)
        task_update_progress(msg)
        write_message(msg)
        task_sleep_now_if_required()

    algorithm = config['algorithm']
    tags = get_authors_tags()
    all_ids = [row[0] for row in run_sql('SELECT id FROM bibrec ORDER BY id')]
    total = len(all_ids)
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)

    if algorithm == 'friends':
        # The intermediary tables are only needed for the friends
        # algorithm or assimilated.
        for position, recid in enumerate(all_ids):
            if position % 1000 == 0:
                _checkpoint('intermediate %d/%d', position)
            update_self_cites_tables(recid, config, tags)

    # Fill the self-cites table itself.
    for position, recid in enumerate(all_ids):
        if position % 1000 == 0:
            _checkpoint('final %d/%d', position)
        compute_and_store_self_citations(recid, tags, citations_fun)
def bst_doi_timestamp(reset=0):
    """Record newly discovered DOIs in the ``doi`` table.

    For every publisher in ``CFG_SCOAP3_DOIS`` the DOIs modified since the
    last run are retrieved; those not yet in the table are inserted with
    the current time as creation date. The whole publisher loop is repeated
    for as long as any publisher fails with a connection, timeout or JSON
    value error.

    @param reset: when ``int(reset)`` is true, ignore the stored dates and
        start again from the fixed 2014-01-01 baseline
    """
    prepate_doi_table()
    now = datetime.now()
    baseline = datetime(2014, 1, 1)
    overlap = timedelta(days=4)  # re-check a few days before the last run
    newest = run_sql("SELECT max(creation_date) FROM doi")[0][0]
    last_run = ((newest or baseline) - overlap).strftime("%Y-%m-%d")
    if int(reset):
        last_run = (baseline - overlap).strftime("%Y-%m-%d")
    write_message("Retrieving DOIs modified since %s" % last_run)

    while True:
        must_retry = False
        for publisher, re_match in CFG_SCOAP3_DOIS.items():
            task_update_progress("Retrieving DOIs for %s" % publisher)
            write_message("Retriving DOIs for %s" % publisher)
            try:
                found = get_all_modified_dois(publisher, last_run, re_match,
                                              debug=True)
                for doi in found:
                    if not run_sql("SELECT doi FROM doi WHERE doi=%s", (doi, )):
                        write_message("New DOI discovered for publisher %s: %s"
                                      % (publisher, doi))
                        run_sql("INSERT INTO doi(doi, creation_date) "
                                "VALUES(%s, %s)", (doi, now))
            except URLError as e:
                write_message("Problem with connection! %s" % (e,))
                must_retry = True
            except socket.timeout as e:
                write_message("Timeout error %s" % (e,))
                write_message("Finishing and rescheduling")
                must_retry = True
            except ValueError as e:
                write_message("Value error in JSON string! %s" % (e,))
                must_retry = True
        if not must_retry:
            break
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    """Commit to Solr when the flush counter is full, or on a final commit.

    A commit happens when the counter has reached ``flush - 1`` (``flush``
    being a task option), or when *final_commit* is set and at least one
    addition is pending. Otherwise the counter is simply incremented.

    @param next_commit_counter: number of additions since the last commit
    @param final_commit: force a commit if anything is pending
    @param recid: optional record ID mentioned in the status message
    @return: the updated counter (0 right after a commit attempt)
    """
    counter_full = next_commit_counter == task_get_option("flush") - 1
    if counter_full or (final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)
        try:
            # Commits might cause an exception, most likely a timeout while
            # hitting a background merge. Changes will then be committed
            # later by the calling (periodical) task. Autocommits can also
            # be used in the solrconfig.
            SOLR_CONNECTION.commit()
        except Exception:
            # Best effort: report and carry on. Unlike a bare ``except``,
            # this lets SystemExit/KeyboardInterrupt propagate.
            register_exception(alert_admin=True)
        next_commit_counter = 0
        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
def solr_add_ranges(id_ranges):
    """Index the given record-ID ranges in Solr, newest sub-range first.

    Each input range is cut into sub-ranges of at most ``flush`` IDs (a
    task option); the sub-ranges are then indexed in reverse order and a
    final commit is requested.

    @param id_ranges: iterable of sequences whose first two items are the
        lower and upper record ID (both inclusive)
    """
    chunk_length = task_get_option("flush")
    pending = []
    for id_range in id_ranges:
        start, last = id_range[0], id_range[1]
        while start <= last:
            pending.append((start, min(start + chunk_length - 1, last)))
            start += chunk_length

    tags_to_index = get_tags()
    next_commit_counter = 0
    # Latest records go first: this allows the ranker to return better
    # results during long indexing runs, as the ranker cuts the hitset
    # using latest records.
    for lower_recid, upper_recid in reversed(pending):
        status_msg = "Solr ranking indexer called for %s-%s" % (lower_recid,
                                                                upper_recid)
        write_message(status_msg)
        task_update_progress(status_msg)
        next_commit_counter = solr_add_range(lower_recid, upper_recid,
                                             tags_to_index, next_commit_counter)
    solr_commit_if_necessary(next_commit_counter, final_commit=True)
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """Write the record URLs into one or more sitemap files.

    A new sitemap is started (and registered in the index) whenever, at a
    multiple of 100 URLs, the current one has reached ``MAX_SIZE`` or
    ``MAX_RECORDS``.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that
        will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap file(s)
    """
    def _start_sitemap(number):
        """Create sitemap writer *number* and register its URL in the index."""
        fresh = SitemapWriter(number, output_directory, sitemap_name)
        sitemap_index_writer.add_url(fresh.get_sitemap_url())
        return fresh

    sitemap_id = 1
    writer = _start_sitemap(sitemap_id)
    nb_urls = 0
    total = len(records)
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % total)
    task_sleep_now_if_required(can_stop_too=True)

    for position, (recid, lastmod) in enumerate(records):
        # The size/record limits are only examined every 100 URLs.
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                   or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = _start_sitemap(sitemap_id)
        record_url = CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid)
        nb_urls = writer.add_url(record_url,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_RECORDS,
                                 priority=DEFAULT_PRIORITY_RECORDS)
        if position % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s"
                                 % (sitemap_name, position + 1, total))
            task_sleep_now_if_required(can_stop_too=True)
def bst_consyn_harvest(CONSYNATOMURL="https://consyn.elsevier.com/batch/atom?key=QUhvbHRrYW1wOzM0Mjc%253d"):
    """Download the ATOM feed from consyn.elsevier.com.

    Makes sure the output directory and the ``CONSYNHARVEST`` table exist,
    then downloads the feed and reads it. On a download failure the task
    status is set to "CERROR" and the function returns.

    NOTE(review): the default URL embeds what looks like an access key;
    consider moving it to configuration.

    @param CONSYNATOMURL: The URL of the atom feed to download.
    """
    if not os.path.exists(CFG_CONSYN_OUT_DIRECTORY):
        # Create the directory one component at a time, starting from "/";
        # the first split item (before the leading slash) is skipped.
        folder = "/"
        for component in CFG_CONSYN_OUT_DIRECTORY.split("/")[1:]:
            folder = os.path.join(folder, component).strip()
            if not os.path.exists(folder):
                os.mkdir(folder)

    try:
        run_sql("SELECT filename FROM CONSYNHARVEST")
    except Exception:
        # The probe query failed: create the table. ``Exception`` rather
        # than a bare except, so SystemExit/KeyboardInterrupt still
        # propagate.
        run_sql("CREATE TABLE CONSYNHARVEST ("
                "filename VARCHAR(100) NOT NULL PRIMARY KEY,"
                "date VARCHAR(50),"
                "size VARCHAR(30) );")

    # Get list of entries from XML document
    # NOTE(review): xmlString is not used further in this function as
    # written here.
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_file = download_url(url=CONSYNATOMURL,
                                   retry_count=5,
                                   timeout=60.0)
        # Context manager so the handle is closed even if read() fails.
        with open(result_file, 'r') as feed_handle:
            xmlString = feed_handle.read()
    except InvenioFileDownloadError as err:
        write_message("URL could not be opened: %s" % (CONSYNATOMURL,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
def iterate_over_new(recIDs, fmt):
    """Pre-format every record in *recIDs* and save the result.

    Each record goes through ``format_record_1st_pass`` and the outcome is
    stored with ``save_preformatted_record`` (low priority).

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format and
        save, 0) -- the last slot is never incremented in this function
    """
    tbibformat = 0  # accumulated os.times()[4] delta around format + save
    tbibupload = 0  # returned untouched
    tot = len(recIDs)
    # Count from 1 so that "formatted N records" is the number of records
    # actually done when the message is emitted.
    for count, recID in enumerate(recIDs, 1):
        t1 = os.times()[4]
        formatted_record, needs_2nd_pass = format_record_1st_pass(
            recID=recID,
            of=fmt,
            on_the_fly=True,
            save_missing=False)
        save_preformatted_record(recID=recID,
                                 of=fmt,
                                 res=formatted_record,
                                 needs_2nd_pass=needs_2nd_pass,
                                 low_priority=True)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if tot % 100 != 0:
        # The last partial hundred has not been reported yet.
        write_message(" ... formatted %s records out of %s" % (tot, tot))
    return tot, tbibformat, tbibupload
def bst_move_dbdump(sourcedir, destdir, number_to_keep):
    """Transfer the single local MySQL dump file to another directory.

    Looks in *sourcedir* for files named ``<CFG_DATABASE_NAME>-dbdump-*``.
    Unless exactly one is found, a message is written and nothing is done.
    NOTE(review): only a copy (``shutil.copy``) is performed here, and
    *number_to_keep* is not used in this function as written.

    @param sourcedir: directory where the local dump is stored.
    @type sourcedir: string
    @param destdir: directory where the dump should live.
    @type destdir: string
    @param number_to_keep: not used here -- TODO confirm intended meaning
    """
    dump_prefix = CFG_DATABASE_NAME + '-dbdump-'
    candidates = [entry for entry in os.listdir(sourcedir)
                  if entry.startswith(dump_prefix)]
    task_update_progress("Starting moving of database-dump")
    if len(candidates) != 1:
        write_message("... none or too many files found. Exiting.")
        return

    filename = candidates[0]
    full_path_source = os.sep.join((sourcedir, filename))
    write_message("... moving %s" % (full_path_source,))
    full_path_destination = os.sep.join((destdir, filename))
    try:
        shutil.copy(full_path_source, full_path_destination)
    except Exception as e:
        write_message("... could not move %s to %s: %s" %
                      (full_path_source, full_path_destination, str(e)))
        return
def fetch_concerned_records(name):
    """Return the (id, date) rows of the records this task should process.

    Which records are selected depends on the task options:
      - 'new': records created since the last run;
      - 'modified': records modified since the last run;
      - otherwise: the explicitly given 'recids' plus the records of every
        collection listed in 'collections' (the date column is NULL).

    @param name: key passed to ``fetch_last_updated`` to obtain the last
        processed record ID and date
    @return: the rows returned by ``run_sql``, or an empty list when no
        record was requested
    """
    task_update_progress("Fetching record ids")
    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = ("SELECT `id`, `creation_date` FROM `bibrec` "
               "WHERE `creation_date` >= %s "
               "AND `id` > %s "
               "ORDER BY `creation_date`")
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = ("SELECT `id`, `modification_date` FROM `bibrec` "
               "WHERE `modification_date` >= %s "
               "AND `id` > %s "
               "ORDER BY `modification_date`")
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        # Work on a copy so the task option itself is not modified.
        given_recids = set(task_get_option('recids') or ())
        for collection in task_get_option('collections') or ():
            # Merge the collection's record IDs one by one; add() would
            # insert the whole record list as a single element.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            # One %s placeholder per ID; the values themselves are still
            # passed as query parameters.
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` "
                              "WHERE `id` IN (%s) ORDER BY `id`" % format_strings,
                              list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")
    return records
def fetch_concerned_arxiv_records(name):
    """Return recently modified arXiv records whose PDF is newer than the last run.

    Candidates are records modified since the last run and created within
    the last 7 days (at most 5000). A candidate is kept when one of its
    037 $a values starts with 'arXiv' and its PDF document is more recent
    than the last run date.

    @param name: key passed to ``fetch_last_updated`` to obtain the last
        run date
    @return: list of (recid, modification_date) tuples
    """
    task_update_progress("Fetching arxiv record ids")
    dummy, last_date = fetch_last_updated(name)

    # Fetch all records modified since last run.
    # Every fragment ends with a space so that adjacent literals cannot
    # fuse two SQL tokens together.
    sql = ("SELECT `id`, `modification_date` FROM `bibrec` "
           "WHERE `modification_date` >= %s "
           "AND `creation_date` > NOW() - INTERVAL 7 DAY "
           "ORDER BY `modification_date` "
           "LIMIT 5000")
    records = run_sql(sql, [last_date.isoformat()])

    def check_arxiv(recid):
        """Return True when a 037 $a value of the record starts with 'arXiv'."""
        record = get_record(recid)
        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False

    def check_pdf_date(recid):
        """Return True when the record's PDF is more recent than the last run."""
        doc = get_pdf_doc(recid)
        if doc:
            return doc.md > last_date
        return False

    # The PDF check is only evaluated for records that passed the arXiv one.
    records = [(r, mod_date) for r, mod_date in records
               if check_arxiv(r) and check_pdf_date(r)]
    write_message("recids %s" % repr([(r, mod_date.isoformat())
                                      for r, mod_date in records]))
    task_update_progress("Done fetching arxiv record ids")
    return records
def fetch_updated_arxiv_records(date):
    """Return the arXiv records modified on or after *date*.

    @param date: lower bound; its ``isoformat()`` is compared against
        ``bibrec.modification_date``
    @return: list of (recid, modification_date) tuples, ordered by
        modification date, restricted to records having '037__9' equal
        to 'arXiv'
    """
    def check_arxiv(recid):
        """Tell whether one of the record's 037__9 values is 'arXiv'."""
        return any(report_number == 'arXiv'
                   for report_number in get_fieldvalues(recid, '037__9'))

    # Fetch all records modified since the last run.
    sql = ("SELECT `id`, `modification_date` FROM `bibrec` "
           "WHERE `modification_date` >= %s "
           "ORDER BY `modification_date`")
    modified_rows = run_sql(sql, [date.isoformat()])
    records = [(recid, mod_date) for recid, mod_date in modified_rows
               if check_arxiv(recid)]

    # Show all records for debugging purposes
    if task_get_option('verbose') >= 9:
        write_message('recids:', verbose=9)
        for recid, mod_date in records:
            write_message("* %s, %s" % (recid, mod_date), verbose=9)

    task_update_progress("Done fetching %s arxiv record ids" % len(records))
    return records
def shall_sleep(recid, i, tot, time_estimator):
    """Report progress on every 100th record and tell whether to pause.

    @param recid: record ID shown in the progress message
    @param i: zero-based position of the record
    @param tot: total number of records (denominator of the percentage)
    @param time_estimator: callable; item [1] of its return value is handed
        to ``time.localtime`` for the progress message
    @return: True when (i + 1) is a multiple of 100, False otherwise
    """
    # The estimator is invoked on every call, even when nothing is reported.
    estimated_time = time_estimator()[1]
    position = i + 1
    if position % 100:
        return False
    percent = position * 100 / tot
    when = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(estimated_time))
    task_update_progress("%s (%s%%) -> %s" % (recid, percent, when))
    return True
def match_missing_ids(remote_ids, batch_size):
    """Append remote IDs to local records for pairings that are missing.

    The missing IDs are split into batches. Each batch is handed to
    ``process_record_batch``; the resulting appends are submitted through
    ``start_bibupload_job`` and the problems are written to
    'missing_ids.txt'. A failing batch is logged and skipped.

    Parameters:
     remote_ids - a list of missing remote rec-ids
     batch_size - How many records to match at a time

    Returns:
     count_appends - number of records being appended
     count_problems - number of records which could not be matched at all
    """
    count_appends = 0
    count_problems = 0
    batches = [remote_ids[x:x+batch_size]
               for x in xrange(0, len(remote_ids), batch_size)]
    _print("Identified %d records which their remote IDs updating."
           % len(remote_ids))
    _print("Processing %d batches of size %d" % (len(batches), batch_size))
    for i, batch in enumerate(batches, 1):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Batch %d of %d" % (i, len(batches)))
        _print("Batch %d of %d" % (i, len(batches)))
        try:
            appends, problems = process_record_batch(batch)
            count_appends += len(appends)
            count_problems += len(problems)
            write_to_file('missing_ids.txt', problems, append=True)
            _print("Submitting batch #%d to BibUpload for appending..." % i, 4)
            start_bibupload_job(appends)
        except StandardError as e:
            # One bad batch must not abort the remaining ones.
            _print("Error occured during match of batch %d: %s\n%s"
                   % (i, e, traceback.format_exc()), 2)
    # Hand the totals back: the docstring promises them and callers unpack
    # the result into two values.
    return count_appends, count_problems
def percent_update(index, percent_last):
    """Compute the completion percentage and publish it when it has moved.

    ``remote_ids`` is not a parameter: it is resolved from the surrounding
    scope and only its length is used.

    @param index: number of items handled so far
    @param percent_last: percentage published by the previous update
    @return: the new percentage when it exceeds *percent_last* by more than
        0.5 (progress is then updated), otherwise *percent_last* unchanged
    """
    total = len(remote_ids)
    per = 100 * float(index) / float(total)
    if per <= percent_last + 0.5:
        # Not enough movement to be worth a progress update.
        return percent_last
    task_update_progress("Local matching %.1f%% (%d/%d)" % (per, index, total))
    return per
def bst_autoclaim():
    """Run ``autoclaim_paper`` on every paper returned by ``get_papers_with_orcid``.

    The ORCID-to-person map is fetched once up front. Progress is reported,
    and the scheduler given the chance to pause or stop the task, after
    every tenth paper (positions 0, 10, 20, ...).
    """
    orcid_personid_map = get_orcid_personid_map()
    papers = get_papers_with_orcid()
    for position, recid in enumerate(papers):
        autoclaim_paper(recid, orcid_personid_map)
        if position % 10:
            continue
        total = len(papers)
        task_update_progress("Done %s out of %s records (%s%%)"
                             % (position, total, 100 * position / total))
        task_sleep_now_if_required(can_stop_too=True)
def compute_cache(pids):
    """Compute the WebAuthorProfile cache for each of the given persons.

    @param pids: sequence of person identifiers; each one is passed to
        ``_compute_cache_for_person``
    """
    total = len(pids)
    bibtask.write_message("WebAuthorProfile: %s persons to go" % total,
                          stream=sys.stdout, verbose=0)
    # The running position comes from enumerate: looking it up with
    # pids.index() costs a scan per person and reports the first occurrence
    # when an identifier appears more than once.
    for position, pid in enumerate(pids, 1):
        bibtask.write_message("WebAuthorProfile: doing %s out of %s"
                              % (position, total))
        bibtask.task_update_progress("WebAuthorProfile: doing %s out of %s"
                                     % (position, total))
        _compute_cache_for_person(pid)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
def download_feed(feed, batch_size, delete_zip, new_sources, directory):
    """Download the feed and fetch/extract every package it lists.

    The feed is downloaded, read and its temporary file removed. For each
    ``entry`` element, the linked file is downloaded into *directory*
    (unless exactly one file of that name is already found under
    ``CFG_CONSYN_OUT_DIRECTORY``) and extracted.

    @param feed: URL of the feed to download
    @param batch_size: not used in this function
    @param delete_zip: passed through to ``extractAll``
    @param new_sources: list to which the path of every newly downloaded
        file is appended
    @param directory: where downloaded files are stored and extracted
    """
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed, retry_count=5, timeout=60.0)
        try:
            # Context manager: if open() itself fails there is no handle
            # to close, and the original error is what propagates.
            with open(result_path, 'r') as result_file:
                xmlString = result_file.read()
        finally:
            remove(result_path)
    except InvenioFileDownloadError as err:
        write_message("URL could not be opened: %s" % (feed,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return

    dom = xml.dom.minidom.parseString(xmlString)
    entries = dom.getElementsByTagName("entry")

    # Loop through entries
    for entry in entries:
        # Get URL and filename
        fileUrl = entry.getElementsByTagName("link")[0].getAttribute("href")
        fileName = entry.getElementsByTagName("title")[0].firstChild.data

        # Output location is directory + filename
        outFilename = join(directory, fileName).lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))
        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
            continue

        try:
            write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
            download_url(fileUrl, "zip", outFilename, 5, 60.0)
            new_sources.append(outFilename)
        except InvenioFileDownloadError as err:
            write_message("URL could not be opened: %s" % (fileUrl,))
            write_message(str(err))
            write_message(traceback.format_exc()[:-1])
            task_update_status("CERROR")
            continue

        try:
            extractAll(outFilename, delete_zip, directory)
        except BadZipfile:
            # Format the message with %; passing the tuple as a second
            # positional argument does not interpolate it.
            write_message("Error BadZipfile %s" % (outFilename,))
            task_update_status("CERROR")
            remove(outFilename)
def bst_refresh_author_profiles():
    """Delete the WebAuthorProfile cache.

    Removes every sub-directory of ``<CFG_WEBDIR>/img/tmp`` (errors are
    ignored; plain files are left alone) and truncates the ``wapCACHE``
    table.
    """
    task_update_progress("Deleting images...")
    tmp_img_dir = os.path.join(CFG_WEBDIR, 'img', 'tmp')
    for entry in os.listdir(tmp_img_dir):
        entry_path = os.path.join(tmp_img_dir, entry)
        if os.path.isdir(entry_path):
            rmtree(entry_path, ignore_errors=True)

    task_update_progress("Truncating DB cache...")
    run_sql("TRUNCATE wapCACHE")
def _task_update_overall_status(message):
    """Publish an overall status line for the BibEncode task.

    The message is prefixed with the current batch step counter
    ("[step/steps]"), sent as task progress and appended to the
    module-level ``_UPD_HISTORY`` list.

    @param message: the message that should be printed as task status
    @type message: string
    """
    status_line = "[%d/%d]%s" % (_BATCH_STEP, _BATCH_STEPS, message)
    task_update_progress(status_line)
    # Appending mutates the existing list, so no ``global`` is required.
    _UPD_HISTORY.append(status_line)
def step(msg_prefix, recid, done, total):
    """Do the per-record housekeeping of a long-running loop.

    @param msg_prefix: text put in front of the progress message
    @param recid: record being processed (logged at verbosity 9)
    @param done: number of records handled so far
    @param total: total number of records
    """
    if not done % 30:
        # Every 30 records, let the scheduler pause the task if it wants to.
        task_sleep_now_if_required()

    if not done % 1000:
        # Every 1000 records, log and publish the progress.
        mesg = "%s done %s of %s" % (msg_prefix, done, total)
        write_message(mesg)
        task_update_progress(mesg)

    write_message("Processing: %s" % recid, verbose=9)
def rebuild_tables(rank_method_code, config):
    """Rebuild the self-cites tables from scratch.

    Called by bibrank -w selfcites -R

    @param rank_method_code: passed through to ``fill_self_cites_tables``
    @param config: passed through to ``fill_self_cites_tables``
    @return: always True
    """
    # Step 1: wipe whatever is currently stored.
    task_update_progress('emptying tables')
    empty_self_cites_tables()

    # Step 2: recompute everything.
    # NOTE(review): a one-argument fill_self_cites_tables(config) is also
    # defined in this file -- confirm which signature is in scope here.
    task_update_progress('filling tables')
    fill_self_cites_tables(rank_method_code, config)

    return True
def task_run_core():
    """Run the daemon.

    With the "update-borrowers" option, the information of every borrower
    is refreshed from LDAP. With the "overdue-letters" option, each expired
    loan gets the recall letter matching the number of letters already
    sent (when one is due).

    @return: always 1
    """
    if task_get_option("update-borrowers"):
        borrowers = db.get_all_borrowers()
        total_borrowers = len(borrowers)
        for done, borrower in enumerate(borrowers, 1):
            update_user_info_from_ldap(borrower[0])
            task_update_progress("Done %d out of %d." % (done, total_borrowers))
            task_sleep_now_if_required(can_stop_too=True)

    if task_get_option("overdue-letters"):
        expired_loans = db.get_all_expired_loans()
        total_expired_loans = len(expired_loans)
        for done, loan in enumerate(expired_loans, 1):
            (borrower_id, _bor_name, recid, _barcode, _loaned_on,
             _due_date, _number_of_renewals, number_of_letters,
             date_letters, _notes, loan_id) = loan
            number_of_letters = int(number_of_letters)

            # Pick the recall template, if a letter is due at all.
            template_key = None
            if number_of_letters == 0:
                template_key = 'RECALL1'
            elif number_of_letters == 1:
                if must_send_second_recall(date_letters):
                    template_key = 'RECALL2'
            elif number_of_letters >= 2 and must_send_third_recall(date_letters):
                # Two or more letters already sent: third recall.
                template_key = 'RECALL3'

            content = ''
            if template_key is not None:
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES[template_key], loan_id)

            if content != '':
                subject = "LOAN RECALL: " + book_title_from_MARC(recid)
                update_expired_loan(loan_id)
                send_overdue_letter(borrower_id, subject, content)

            task_update_progress("Done %d out of %d." % (done, total_expired_loans))
            task_sleep_now_if_required(can_stop_too=True)
            time.sleep(1)

    return 1
def get_citation_weight(rank_method_code, config, chunk_size=25000):
    """Compute the citation weights used by the bibrank daemon.

    The records to process are either the ranges given with the "id" task
    option (forcing re-indexing even without new records) or the records
    modified between the last bibrank and bibindex updates.

    @param rank_method_code: rank method whose last update time is used
    @param config: ConfigParser-like object; the 'collections' option of
        the configured function is set to None when missing
    @param chunk_size: passed through to ``process_and_store``
    @return: tuple (weights, index_update_time); weights is None when
        there was nothing to process, index_update_time is None when the
        "id" option was used
    """
    def _preview(recids):
        """Render the record list, eliding the middle of very long ones."""
        if len(recids) > 10000:
            return str(recids[:10]) + ' ... ' + str(recids[-10:])
        return str(recids)

    quick = task_get_option("quick") != "no"

    # id option forces re-indexing a certain range
    # even if there are no new recs
    id_ranges = task_get_option("id")
    if id_ranges:
        # construct a range of records to index
        updated_recids = []
        for first, last in id_ranges:
            updated_recids += range(first, last+1)
        write_message('Records to process: %s' % _preview(updated_recids))
        index_update_time = None
    else:
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        # An index time lying in the future is replaced by the zero date.
        right_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if index_update_time > right_now:
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        write_message("%s records to update" % _preview(updated_recids))

    if not updated_recids:
        write_message("No new records added since last time this "
                      "rank method was executed")
        return None, index_update_time

    begin_time = time.time()
    try:
        function = config.get("rank_method", "function")
        config.get(function, 'collections')
    except ConfigParser.NoOptionError:
        config.set(function, 'collections', None)
    # Process fully the updated records
    weights = process_and_store(updated_recids, config, chunk_size)
    end_time = time.time()
    write_message("Total time of get_citation_weight(): %.2f sec" %
                  (end_time - begin_time))
    task_update_progress("citation analysis done")
    return weights, index_update_time
def download_feed(feed_url, batch_size, delete_zip, new_sources, directory, feed_location):
    """Download the feed and fetch/extract every package it lists.

    The feed is downloaded to *feed_location* and parsed with
    ``parse_feed``. For each (url, name) entry the file is downloaded into
    *directory* (unless exactly one file of that name is already found
    under ``CFG_CONSYN_OUT_DIRECTORY``) and extracted. Every failure is
    recorded in ``_errors_detected`` and sets the task status to "CERROR".

    @param feed_url: URL of the feed to download
    @param batch_size: not used in this function
    @param delete_zip: passed through to ``extractAll``
    @param new_sources: list to which the path of every newly downloaded
        file is appended
    @param directory: where downloaded files are stored and extracted
    @param feed_location: file the feed itself is downloaded to
    @return: list of what ``extractAll`` returned for the new packages, or
        None when the feed itself could not be downloaded
    """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return

    xml_files = []
    entries = parse_feed(result_path)
    for fileUrl, fileName in entries:
        task_sleep_now_if_required()

        # Output location is directory + filename
        outFilename = join(directory, fileName).lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))
        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
            continue

        fileUrl = fileUrl.replace(' ', '%20')
        try:
            write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
            download_url(fileUrl, "zip", outFilename, 5, 60.0)
            new_sources.append(outFilename)
        except InvenioFileDownloadError as err:
            _errors_detected.append(err)
            write_message("URL could not be opened: %s" % fileUrl)
            write_message(str(err))
            write_message(traceback.format_exc()[:-1])
            task_update_status("CERROR")
            continue

        try:
            xml_files.extend(extractAll(outFilename, delete_zip, directory))
        except BadZipfile as err:
            # Bind the exception here: without it, ``err`` would be a
            # leftover from an earlier handler (or not defined at all).
            _errors_detected.append(err)
            # Format the message with %; passing the tuple as a second
            # positional argument does not interpolate it.
            write_message("Error BadZipfile %s" % (outFilename,))
            task_update_status("CERROR")
            remove(outFilename)

    return xml_files
def bst_synchronize_recids(search_terms=SEARCH_TERMS, log_dir=None,
                           collection=COLLECTION, batch_size=BATCH_SIZE,
                           debug=False, remote_ids=None):
    """Synchronize record IDs between the CERN Document Server (CDS) and Inspire.

    This BibTasklet is intended as a general-purpose replacement for
    'bst_inspire_cds_synchro' and 'bst_update_cds_inspire_id'; it should be
    executable on both CDS and Inspire.

    Generally there should be no need to modify these parameters: the
    script uses CFG_INSPIRE_SITE and CFG_CERN_SITE from invenio.conf to
    determine what type of Invenio instance it is running on, and the
    defaults synchronise all IDs. You may still want to limit the records
    manually.

    Parameters:
     search_terms - The term to use to get record IDs
                    (Default "035:<LOCAL>)
     log_dir - The directory to store the log file in
               (Defaults to CFG_TMPSHAREDDIR)
     collection - What collection to take from
                  (Default is no collection)
     batch_size - How many records to try and amend at once
                  (Default 200)
     debug - If True, this script will run against the TEST instances
             (Default false)
     remote_ids - Comma separated values of remote IDs; if this is
                  specified, remote IDs will not be searched for.

    Returns:
     True
    """
    configure_globals(search_terms, log_dir, debug)
    _print("All messages will be logged to %s/%s" % (LOG_DIR, LOG_FILE))

    if remote_ids:
        # Explicit list supplied by the caller: no remote search needed.
        remote_ids = [int(rid) for rid in remote_ids.split(',')]
    else:
        task_update_progress("Finding remote records on %s with %s IDs"
                             % (REMOTE_INSTANCE, LOCAL_INSTANCE))
        remote_ids = get_remote_ids(search_terms, collection)

    task_sleep_now_if_required(can_stop_too=True)
    task_update_progress("Matching remote IDs to local records")
    missing_ids = match_remote_ids(remote_ids)
    count_appends, count_problems = match_missing_ids(missing_ids, batch_size)

    summary = (
        "======================== FINAL SCORE ========================",
        " Records matched: %d" % (len(remote_ids)-len(missing_ids)),
        " Records appended: %d" % count_appends,
        " IDs not matched (broken link!): %d" % count_problems,
        "=============================================================",
    )
    for line in summary:
        _print(line, 1)
    _print("Finishing, messages logged to: %s/%s" % (LOG_DIR, LOG_FILE))

    return True
def submit_records(records_filename, records_list, mode, directory,
                   taskid=0, silent=False, devmode=False, subject=None):
    """
    Performs the logic to submit given file (filepath) of records
    either by e-mail or using BibUpload with given mode.

    Taskid is given to indicate if the task submission should wait for any
    previously submitted tasks.

    The submission can also be made "silent" in the sense of not
    updating the modification date of the records.

    @param records_filename: filepath to XML file containing records.
    @type records_filename: string

    @param records_list: list of APSRecord objects for records
    @type records_list: list

    @param mode: which submission mode is it? "email" sends a mail,
                 any other value is handed to BibUpload as its mode.
    @type mode: string

    @param directory: where the records file is moved to before the
                      e-mail is sent (e-mail mode only)
    @type directory: string

    @param taskid: bibsched taskid, wait for task to complete before
                   submission
    @type taskid: int

    @param silent: do not update the modification date of the records
    @type silent: bool

    @param devmode: when true nothing is submitted and None is returned
    @type devmode: bool

    @param subject: e-mail subject; a timestamped default is used when
                    empty
    @type subject: string

    @return: returns the given taskid upon submission, or True/False
             from email.

    @raise IOError: in e-mail mode, when the records file cannot be
                    moved; the error e-mail is still sent first.
    """
    if devmode:
        return None

    # The timestamp is needed by the e-mail body as well as by the default
    # subject, so take it unconditionally.  (It used to be bound only when
    # no subject was given, which made e-mail mode crash with a NameError
    # for callers that supplied their own subject.)
    now = datetime.datetime.now()
    if not subject:
        subject = "APS harvest results: %s" % (
            now.strftime("%Y-%m-%d %H:%M:%S"),
        )

    # Check if we should create bibupload or e-mail
    if mode == "email":
        # We strip away the first part of each DOI for readability.
        list_of_dois = ['/'.join(record.doi.split('/')[1:])
                        for record in records_list]

        # We send an e-mail to CFG_APSHARVEST_EMAIL and put file on AFS.
        body = "Harvested new records: %s" % (records_filename,)
        try:
            try:
                shutil.move(records_filename, directory)
                records_filename = os.path.join(
                    directory, os.path.basename(records_filename))
                body = "Harvested new records on %s. They are located here:\n %s" % \
                       (now.strftime("%Y-%m-%d %H:%M:%S"), records_filename)
            except IOError as e:
                # Report the failure in the mail, then let it propagate.
                body = "Error while harvesting records: \nError saving %s - %s" % \
                       (records_filename, str(e))
                # Bare raise keeps the original traceback.
                raise
        finally:
            # The mail goes out on success and on failure alike.
            body = "%s\nRecords harvested (%s total):\n%s\n" % (
                body, str(len(list_of_dois)), "\n".join(list_of_dois))
            res = submit_records_via_mail(subject, body)
            write_message("Sent e-mail to %s with path to %s" %
                          (CFG_APSHARVEST_EMAIL, records_filename))
        return res
    else:
        # We submit a BibUpload task and wait for it to finish
        task_update_progress("Waiting for task to finish")

        if taskid != 0:
            write_message("Going to wait for %d to finish" % (taskid,))

            while not can_launch_bibupload(taskid):
                # Lets wait until the previously launched task exits.
                task_sleep_now_if_required(can_stop_too=False)
                time.sleep(5.0)

        taskid = submit_bibupload_for_records(mode, records_filename, silent)
        write_message("Submitted BibUpload task #%s with mode %s" %
                      (str(taskid), mode))
        return taskid
def run_bibsort_update(recids=None, method_list=None):
    """Updates bibsort tables for the methods in method_list
    and for the records in recids.

    If recids is None: recids = all records that have been modified
    or inserted since last update

    If method_list is None: method_list = all the methods available
    in bsrMETHOD table

    Returns True on success (including "nothing to do") and False when
    the method details cannot be loaded, when the modified records cannot
    be determined, or when deleting/updating a method fails.  When some
    methods need rebalancing, the result of run_bibsort_rebalance is
    returned instead and nothing else is done in this run.
    """
    # recids legitimately defaults to None, so it cannot be handed to
    # len() unguarded (that raised TypeError on the very first statement).
    if recids is None:
        recids_count = 0
    else:
        recids_count = len(recids)
    write_message('Initial data for run_bibsort_update method: '
                  'number of recids = %s; method_list=%s'
                  % (str(recids_count), method_list), verbose=5)
    write_message('Updating sorting data.')

    bibsort_methods, errors = get_bibsort_methods_details(method_list)
    if errors:
        return False
    method_list = bibsort_methods.keys()
    if not method_list:
        write_message('No methods found in bsrMETHOD table.. exiting.')
        return True

    #we could have 4 types of methods:
    #(i) RNK methods -> they should be rebalanced, not updated
    #(ii) RNK methods to delete -> we should delete their data
    #(iii) non RNK methods to update
    #(iv) non RNK methods that are new -> they should be rebalanced(sorted), not updated

    #check which of the methods are RNK methods (they do not need modified recids)
    rnk_methods = get_rnk_methods(bibsort_methods)
    rnk_methods_updated, rnk_methods_deleted = get_modified_rnk_methods(
        rnk_methods, bibsort_methods)

    #check which of the methods have no data, so they are actually new,
    #so they need balancing(sorting) instead of updating
    non_rnk_methods = [method for method in bibsort_methods.keys()
                       if method not in rnk_methods]
    non_rnk_methods_updated, non_rnk_methods_inserted = \
        get_modified_non_rnk_methods(non_rnk_methods)

    #(i) + (iv)
    methods_to_balance = rnk_methods_updated + non_rnk_methods_inserted
    if methods_to_balance:
        # several methods require rebalancing(sorting) and not updating
        return run_bibsort_rebalance(methods_to_balance)

    #(ii)
    #remove the data for the ranking methods that have been deleted
    for method in rnk_methods_deleted:
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Deleting data for method %s" % method)
        write_message('Starting deleting the data for RNK method %s' % method,
                      verbose=5)
        executed_ok = delete_bibsort_data_for_method(
            bibsort_methods[method]['id'])
        if not executed_ok:
            write_message('Method %s could not be deleted correctly, aborting..'
                          % method, sys.stderr)
            return False

    #(iii)
    #methods to actually update
    if non_rnk_methods_updated:
        # we want to update some 'normal'(not RNK) tables, so we need recids
        update_timestamp = False
        if not recids:
            recids = get_modified_or_inserted_recs(non_rnk_methods_updated)
            if recids == 0:  # error signal
                return False
            if not recids:
                write_message("No records inserted or modified in bibrec table "
                              "since the last update of bsrMETHODDATA.")
                return True
            write_message("These records have been recently modified/inserted: %s"
                          % str(recids), verbose=5)
            # The records were discovered here rather than supplied by the
            # caller, so the methods' timestamps get updated as well.
            update_timestamp = True
        recids_i = intbitset(recids)
        for method in non_rnk_methods_updated:
            task_sleep_now_if_required(can_stop_too=True)
            task_update_progress("Updating method %s" % method)
            write_message('Starting updating method %s' % method, verbose=5)
            executed_ok = update_bibsort_tables(recids_i, method,
                                                update_timestamp)
            if not executed_ok:
                write_message('Method %s could not be executed correctly, aborting..'
                              % method, sys.stderr)
                return False
    return True
def task_run_core():
    """
    Run the circulation daemon.

    Depending on the task options, this:
     - "update-borrowers": refreshes every borrower from LDAP;
     - "update-requests": updates the request statuses of every barcode
       attached to a request that is in the 'waiting' state;
     - "overdue-letters": sends recall letters for expired loans and then
       for expired Inter Library Loans.

    @return: always 1
    """

    def pick_recall_template(letters_sent, date_letters, template_names):
        """Return the name of the recall template due now, or None.

        template_names holds the (first, second, third) recall template
        names; the third one is reused for every later reminder.
        """
        if letters_sent == 0:
            return template_names[0]
        if letters_sent == 1 and must_send_second_recall(date_letters):
            return template_names[1]
        if letters_sent >= 2 and must_send_third_recall(date_letters):
            return template_names[2]
        return None

    write_message("Starting...")

    if task_get_option("update-borrowers"):
        write_message("Started update-borrowers")
        list_of_borrowers = db.get_all_borrowers()
        total_borrowers = len(list_of_borrowers)

        for done, borrower in enumerate(list_of_borrowers):
            user_id = borrower[0]
            update_user_info_from_ldap(user_id)
            if done % 10 == 0:
                task_update_progress("Borrower: updated %d out of %d."
                                     % (done, total_borrowers))
                task_sleep_now_if_required(can_stop_too=True)

        # The loop has no early exit, so everything has been processed.
        # Reporting the total (rather than the loop variable + 1) also
        # works when there are no borrowers at all.
        task_update_progress("Borrower: updated %d out of %d."
                             % (total_borrowers, total_borrowers))
        write_message("Updated %d out of %d total borrowers"
                      % (total_borrowers, total_borrowers))

    if task_get_option("update-requests"):
        write_message("Started update-requests")
        list_of_reqs = db.get_loan_request_by_status(
            CFG_BIBCIRCULATION_REQUEST_STATUS_WAITING)

        for (_request_id, recid, bc, _name, borrower_id, _library,
             _location, _date_from, _date_to, _request_date) in list_of_reqs:
            description = db.get_item_description(bc)
            list_of_barcodes = db.get_barcodes(recid, description)
            for barcode in list_of_barcodes:
                update_requests_statuses(barcode)
                task_sleep_now_if_required(can_stop_too=True)

        task_update_progress(
            "Requests due updated from 'waiting' to 'pending'.")
        write_message("Requests due updated from 'waiting' to 'pending'.")

    if task_get_option("overdue-letters"):
        write_message("Started overdue-letters")
        expired_loans = db.get_all_expired_loans()
        total_expired_loans = len(expired_loans)

        for done, (borrower_id, _bor_name, recid, _barcode, _loaned_on,
                   _due_date, _number_of_renewals, number_of_letters,
                   date_letters, _notes, loan_id) in enumerate(expired_loans):
            number_of_letters = int(number_of_letters)

            content = ''
            template = pick_recall_template(
                number_of_letters, date_letters,
                ('RECALL1', 'RECALL2', 'RECALL3'))
            if template is not None:
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES[template], loan_id)

            if content != '':
                title = book_title_from_MARC(recid)
                subject = "LOAN RECALL: " + title
                update_expired_loan(loan_id)
                send_overdue_letter(borrower_id,
                                    CFG_BIBCIRCULATION_LOANS_EMAIL,
                                    subject, content)

            if done % 10 == 0:
                task_update_progress("Loan recall: sent %d out of %d."
                                     % (done, total_expired_loans))
                task_sleep_now_if_required(can_stop_too=True)

        task_update_progress(
            "Loan recall: processed %d out of %d expires loans."
            % (total_expired_loans, total_expired_loans))
        write_message("Processed %d out of %d expired loans."
                      % (total_expired_loans, total_expired_loans))

        # Recalls for expired ILLs
        write_message("Started overdue-letters for Inter Library Loans")
        expired_ills = db.get_all_expired_ills()
        total_expired_ills = len(expired_ills)

        for done, (ill_id, borrower_id, item_info, number_of_letters,
                   date_letters) in enumerate(expired_ills):
            number_of_letters = int(number_of_letters)

            content = ''
            template = pick_recall_template(
                number_of_letters, date_letters,
                ('ILL_RECALL1', 'ILL_RECALL2', 'ILL_RECALL3'))
            if template is not None:
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES[template], ill_id, ill=1)

            if content != '' and looks_like_dictionary(item_info):
                # NOTE(review): eval() on a value read from the database.
                # It is only guarded by looks_like_dictionary(); consider
                # ast.literal_eval if item_info can hold untrusted text.
                item_info = eval(item_info)
                # Default to an empty title: previously a missing 'title'
                # key left book_title unbound (NameError) or carrying the
                # title of the previous ILL.
                book_title = item_info.get('title', '')
                subject = "ILL RECALL: " + str(book_title)
                update_expired_loan(loan_id=ill_id, ill=1)
                send_overdue_letter(borrower_id,
                                    CFG_BIBCIRCULATION_ILLS_EMAIL,
                                    subject, content)

            if done % 10 == 0:
                task_update_progress("ILL recall: sent %d out of %d."
                                     % (done, total_expired_ills))
                task_sleep_now_if_required(can_stop_too=True)

        task_update_progress(
            "ILL recall: processed %d out of %d expired ills."
            % (total_expired_ills, total_expired_ills))
        write_message("Processed %d out of %d expired ills."
                      % (total_expired_ills, total_expired_ills))

    return 1
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format,
                     process, recids):
    """
    BibReformat main task

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases
                (for selecting records). Some of these queries will be
                picked depending on the case
    @param sql_queries: a list of sql queries to be executed to select
                        records to reformat.
    @param cds_query: a search query to be executed to select records to
                      reformat; a mapping with the keys 'field',
                      'collection', 'pattern' and 'matching'
    @param process_format: when true, records without an existing cache
                           (as returned by without_fmt) are handled too
    @param process: when false the records are only selected and counted,
                    nothing is reformatted
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now()

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(
                perform_request_search(req=None, of='id',
                                       c=cds_query['collection'],
                                       p=cds_query['pattern'],
                                       f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        start_date = latest_bibrank_run
        # A separate name keeps the `sql` argument intact.
        mod_date_query = """SELECT id, modification_date FROM bibrec
                            WHERE id in (%s)""" % ','.join(
            str(r) for r in recIDs)

        def check_date(mod_date):
            return mod_date < latest_bibrank_run

        recIDs = intbitset([recid for recid, mod_date
                            in run_sql(mod_date_query)
                            if check_date(mod_date)])
        # Collect the citing records in a separate set: growing recIDs
        # while iterating over it makes the iteration unpredictable.
        citing_recids = intbitset([])
        for r in recIDs:
            citing_recids |= intbitset(get_cited_by(r))
        recIDs |= citing_recids

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs)
                                                       + len(without_format)))
        write_message("Out of it records without existing cache: %d"
                      % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop

    total_rec = 0  # Total number of records
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    ### Iterate over all records prepared in lists I (option) and,
    ### with process_format, II (no_format)
    if process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:
            # FIXME: remove this when migration from php to python
            # bibformat is done
            iterate = iterate_over_old
        else:
            iterate = iterate_over_new

        record_sets = [recIDs]
        if process_format:
            record_sets.append(without_format)

        for record_set in record_sets:
            (set_total, set_tbibformat, set_tbibupload) = \
                iterate(record_set, fmt)
            total_rec += set_total
            tbibformat += set_tbibformat
            tbibupload += set_tbibupload

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
def generate_sitemaps(sitemap_index_writer, collection_names,
                      export_fulltext=True):
    """
    Generate the sitemaps themselves.

    Every sitemap that gets opened is registered in sitemap_index_writer.
    URLs are written, in this order, for: the home page in every site
    language, the public records, the public collections in every site
    language, the fulltext pages (only when export_fulltext is true), the
    comments pages and the reviews pages.

    A new sitemap is started whenever, at a multiple of 100 URLs, the
    current one has reached MAX_SIZE or MAX_RECORDS.

    @param sitemap_index_writer: index that receives the URL of every
                                 sitemap opened here
    @param collection_names: passed on to get_all_public_records and
                             get_all_public_collections
    @param export_fulltext: whether to add the '/files' pages
    @return: None
    """

    def sitemap_is_full(current_writer, url_count):
        """Tell whether the next URL has to go into a new sitemap."""
        return url_count % 100 == 0 and (
            current_writer.get_size() >= MAX_SIZE
            or url_count >= MAX_RECORDS)

    def open_sitemap(new_id):
        """Create sitemap number new_id and register it in the index."""
        new_writer = SitemapWriter(new_id)
        sitemap_index_writer.add_url(new_writer.get_sitemap_url())
        return new_writer

    sitemap_id = 1
    writer = open_sitemap(sitemap_id)
    nb_urls = 0

    for lang in CFG_SITE_LANGS:
        writer.add_url(CFG_SITE_URL + '/?ln=%s' % lang,
                       lastmod=datetime.today(),
                       changefreq=DEFAULT_CHANGEFREQ_HOME,
                       priority=DEFAULT_PRIORITY_HOME,
                       alternate=True)
        nb_urls += 1

    write_message("... Getting all public records...")
    recids = get_all_public_records(collection_names)
    write_message("... Generating urls for %s records..." % len(recids))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(recids):
        if sitemap_is_full(writer, nb_urls):
            sitemap_id += 1
            writer = open_sitemap(sitemap_id)
        # From here on the URL count is the one reported by add_url().
        nb_urls = writer.add_url(
            CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
            lastmod=lastmod,
            changefreq=DEFAULT_CHANGEFREQ_RECORDS,
            priority=DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Sitemap for recid %s/%s"
                                 % (i + 1, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)

    write_message("... Generating urls for collections...")
    collections = get_all_public_collections(collection_names)
    for i, (collection, lastmod) in enumerate(collections):
        for lang in CFG_SITE_LANGS:
            if sitemap_is_full(writer, nb_urls):
                sitemap_id += 1
                writer = open_sitemap(sitemap_id)
            nb_urls = writer.add_url(
                '%s/collection/%s?ln=%s'
                % (CFG_SITE_URL, quote(collection), lang),
                lastmod=lastmod,
                changefreq=DEFAULT_CHANGEFREQ_COLLECTIONS,
                priority=DEFAULT_PRIORITY_COLLECTIONS,
                alternate=True)
        if i % 100 == 0:
            task_update_progress("Sitemap for collection %s/%s"
                                 % (i + 1, len(collections)))
            task_sleep_now_if_required(can_stop_too=True)

    # NOTE(review): each filter below narrows the result of the previous
    # one, so with export_fulltext the comments/reviews pages are drawn
    # from the fulltext-filtered records only.
    if export_fulltext:
        write_message("... Generating urls for fulltexts...")
        recids = filter_fulltexts(recids)
        for i, (recid, lastmod) in enumerate(recids):
            if sitemap_is_full(writer, nb_urls):
                sitemap_id += 1
                writer = open_sitemap(sitemap_id)
            nb_urls = writer.add_url(
                CFG_SITE_URL + '/%s/%s/files' % (CFG_SITE_RECORD, recid),
                lastmod=lastmod,
                changefreq=DEFAULT_CHANGEFREQ_FULLTEXTS,
                priority=DEFAULT_PRIORITY_FULLTEXTS)
            if i % 100 == 0:
                # 1-based, like the records and collections loops above.
                task_update_progress("Sitemap for files page %s/%s"
                                     % (i + 1, len(recids)))
                task_sleep_now_if_required(can_stop_too=True)

    write_message("... Generating urls for comments...")
    recids = filter_comments(recids)
    for i, (recid, lastmod) in enumerate(recids):
        if sitemap_is_full(writer, nb_urls):
            sitemap_id += 1
            writer = open_sitemap(sitemap_id)
        nb_urls = writer.add_url(
            CFG_SITE_URL + '/%s/%s/comments' % (CFG_SITE_RECORD, recid),
            lastmod=lastmod,
            changefreq=DEFAULT_CHANGEFREQ_COMMENTS,
            priority=DEFAULT_PRIORITY_COMMENTS)
        if i % 100 == 0:
            task_update_progress("Sitemap for comments page %s/%s"
                                 % (i + 1, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)

    write_message("... Generating urls for reviews")
    recids = filter_reviews(recids)
    for i, (recid, lastmod) in enumerate(recids):
        if sitemap_is_full(writer, nb_urls):
            sitemap_id += 1
            # NOTE(review): only this loop logs an empty message when a
            # new sitemap is started; kept as found.
            write_message("")
            writer = open_sitemap(sitemap_id)
        nb_urls = writer.add_url(
            CFG_SITE_URL + '/%s/%s/reviews' % (CFG_SITE_RECORD, recid),
            lastmod=lastmod,
            changefreq=DEFAULT_CHANGEFREQ_REVIEWS,
            priority=DEFAULT_PRIORITY_REVIEWS)
        if i % 100 == 0:
            task_update_progress("Sitemap for reviews page %s/%s"
                                 % (i + 1, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)
def task_run_core():
    """Update the collection caches when they are out of date.

    The caches are rebuilt when check_nbrecs_for_all_external_collections()
    is true, when the "force" option is set, or when the database timestamp
    is not older than the cache timestamp (within the configured
    tolerance).  The "collection" (optionally with "recursive") option
    restricts the work to one collection and its descendants; the "part"
    option restricts it to the reclist cache (1) or the webpage cache (2).

    The cache timestamp is only advanced when all collections were
    processed, i.e. when no "collection" option was given.

    Always returns True.
    """
    ##
    ## ------->--->time--->------>
    ##  (-1)  |   ( 0)    |  ( 1)
    ##        |     |     |
    ## [T.db] |  [T.fc]   | [T.db]
    ##        |     |     |
    ##        |<-tol|tol->|
    ##
    ## the above is the compare_timestamps_with_tolerance result "diagram"
    ## [T.db] stands for the database timestamp and [T.fc] for the file
    ##        cache timestamp
    ## ( -1, 0, 1) stand for the returned value
    ## tol stands for the tolerance in seconds
    ##
    ## When a record has been added or deleted from one of the collections
    ## the T.db becomes greater than the T.fc and when webcoll runs it is
    ## fully run. It recalculates the reclists and nbrecs, and since it
    ## updates the collections db table it also updates the T.db. The T.fc
    ## is set as the moment the task started running, thus slightly before
    ## the T.db (practically the time distance between the start of the
    ## task and the last call of update_reclist). Therefore when webcoll
    ## runs again, and even if no database changes have taken place in the
    ## meanwhile, it fully runs (because compare_timestamps_with_tolerance
    ## returns 0). This time though, and if no database changes have taken
    ## place, the T.db remains the same while T.fc is updated, and as a
    ## result if webcoll runs again it will not be fully run.
    ##
    # Remembered up front: this (not the finishing time) becomes the new
    # cache timestamp, as explained above.
    task_run_start_timestamp = get_current_time_timestamp()
    colls = []
    # decide whether we need to run or not, by comparing last updated timestamps:
    write_message("Database timestamp is %s." % get_database_last_updated_timestamp(), verbose=3)
    write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp(), verbose=3)
    if task_has_option("part"):
        write_message("Running cache update part %s only." % task_get_option("part"), verbose=3)
    if check_nbrecs_for_all_external_collections() or task_has_option("force") or \
       compare_timestamps_with_tolerance(get_database_last_updated_timestamp(),
                                         get_cache_last_updated_timestamp(),
                                         CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE) >= 0:
        ## either forced update was requested or cache is not up to date, so recreate it:
        # firstly, decide which collections to do:
        if task_has_option("collection"):
            coll = get_collection(task_get_option("collection"))
            colls.append(coll)
            if task_has_option("recursive"):
                # descendants of both relationship types ('r' and 'v')
                r_type_descendants = coll.get_descendants(type='r')
                colls += r_type_descendants
                v_type_descendants = coll.get_descendants(type='v')
                colls += v_type_descendants
        else:
            res = run_sql("SELECT name FROM collection ORDER BY id")
            for row in res:
                colls.append(get_collection(row[0]))
        # secondly, update collection reclist cache:
        # (runs when no "part" option is given, or when it is 1)
        if task_get_option('part', 1) == 1:
            i = 0
            for coll in colls:
                i += 1
                write_message("%s / reclist cache update" % coll.name)
                if str(coll.dbquery).startswith("hostedcollection:"):
                    coll.set_nbrecs_for_external_collection()
                else:
                    coll.calculate_reclist()
                    task_sleep_now_if_required()
                coll.update_reclist()
                task_update_progress("Part 1/2: done %d/%d" % (i, len(colls)))
                task_sleep_now_if_required(can_stop_too=True)
        # thirdly, update collection webpage cache:
        # (runs when no "part" option is given, or when it is 2)
        if task_get_option("part", 2) == 2:
            i = 0
            for coll in colls:
                i += 1
                write_message("%s / webpage cache update" % coll.name)
                coll.update_webpage_cache()
                task_update_progress("Part 2/2: done %d/%d" % (i, len(colls)))
                task_sleep_now_if_required(can_stop_too=True)

        # finally update the cache last updated timestamp:
        # (but only when all collections were updated, not when only
        # some of them were forced-updated as per admin's demand)
        if not task_has_option("collection"):
            set_cache_last_updated_timestamp(task_run_start_timestamp)
            write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp(), verbose=3)
    else:
        ## cache up to date, we don't have to run
        write_message("Collection cache is up to date, no need to run.")
    ## we are done:
    return True
def process_batch_job(batch_job_file):
    """
    Processes a batch job description dictionary

    The batch description is decoded and sanitised, the video master is
    located or attached, every job of the batch ('encode' or 'extract'
    mode) is run, the BibDoc/MARC of the record is fixed up, optional
    MARCXML is uploaded, and finally the input file is removed and
    notifications are sent, as requested by the batch description.

    @param batch_job_file: a fullpath to a batch job file
    @type batch_job_file: string
    @return: 1 once the batch has been handled. NOTE(review): failures of
             individual jobs are reported through the notifications, not
             through the return value; 1 is also returned when the video
             master cannot be found.
    @rtype: int
    @raise Exception: if the record does not exist, or if an encoding job
                      defines no container/extension
    """
    global _BATCH_STEPS
    global _BATCH_STEP

    def upload_marcxml_file(marcxml):
        """ Creates a temporary marcxml file and sends it to bibupload
        """
        xml_filename = 'bibencode_' + str(batch_job['recid']) + '_' + str(
            uuid.uuid4()) + '.xml'
        xml_filename = os.path.join(invenio.config.CFG_TMPSHAREDDIR,
                                    xml_filename)
        # The context manager closes the file even if write() fails.
        with open(xml_filename, 'w') as xml_file:
            xml_file.write(marcxml)
        targs = ['-c', xml_filename]
        task_low_level_submission('bibupload', 'bibencode', *targs)

    #---------#
    # GENERAL #
    #---------#

    _task_write_message("----------- Handling Master -----------")

    ## Check the validity of the batch file here
    batch_job = json_decode_file(batch_job_file)

    ## Sanitise batch description and raise errors
    batch_job = sanitise_batch_job(batch_job)

    ## Check if the record exists
    if record_exists(batch_job['recid']) < 1:
        raise Exception("Record not found")

    recdoc = BibRecDocs(batch_job['recid'])

    #--------------------#
    # UPDATE FROM MASTER #
    #--------------------#

    ## We want to add new stuff to the video's record, using the master as input
    if getval(batch_job, 'update_from_master'):
        found_master = False
        bibdocs = recdoc.list_bibdocs()
        for bibdoc in bibdocs:
            bibdocfiles = bibdoc.list_all_files()
            for bibdocfile in bibdocfiles:
                comment = bibdocfile.get_comment()
                description = bibdocfile.get_description()
                subformat = bibdocfile.get_subformat()
                # A criterion missing from the batch description defaults
                # to the file's own value, i.e. it always matches.
                m_comment = getval(batch_job, 'bibdoc_master_comment',
                                   comment)
                m_description = getval(batch_job,
                                       'bibdoc_master_description',
                                       description)
                m_subformat = getval(batch_job, 'bibdoc_master_subformat',
                                     subformat)
                if (comment == m_comment and
                        description == m_description and
                        subformat == m_subformat):
                    found_master = True
                    batch_job['input'] = bibdocfile.get_full_path()
                    ## Get the aspect of the video from the record
                    try:
                        ## Assumes pbcore metadata mapping
                        # Look the value up on the record being processed
                        # (this used to be hard-wired to record 124).
                        batch_job['aspect'] = get_fieldvalues(
                            batch_job['recid'],
                            CFG_BIBENCODE_ASPECT_RATIO_MARC_FIELD)[0]
                    except IndexError:
                        pass
                    break
            if found_master:
                break
        if not found_master:
            _task_write_message("Video master for record %d not found"
                                % batch_job['recid'])
            task_update_progress("Video master for record %d not found"
                                 % batch_job['recid'])
            ## Maybe send an email?
            return 1

    ## Clean the job to do no upscaling etc
    if getval(batch_job, 'assure_quality'):
        batch_job = clean_job_for_quality(batch_job)

    _BATCH_STEPS = len(batch_job['jobs'])

    ## Generate the docname from the input filename's name or given name
    bibdoc_video_docname, bibdoc_video_extension = decompose_file(
        batch_job['input'])[1:]
    if not bibdoc_video_extension or getval(batch_job,
                                            'bibdoc_master_extension'):
        bibdoc_video_extension = getval(batch_job, 'bibdoc_master_extension')
    if getval(batch_job, 'bibdoc_master_docname'):
        bibdoc_video_docname = getval(batch_job, 'bibdoc_master_docname')

    write_message("Creating BibDoc for %s" % bibdoc_video_docname)
    ## If the bibdoc exists, receive it
    if bibdoc_video_docname in recdoc.get_bibdoc_names():
        bibdoc_video = recdoc.get_bibdoc(bibdoc_video_docname)
    ## Create a new bibdoc if it does not exist
    else:
        bibdoc_video = recdoc.add_bibdoc(docname=bibdoc_video_docname)

    ## Get the directory of the newly created bibdoc to copy stuff there
    bibdoc_video_directory = bibdoc_video.get_base_dir()

    #--------#
    # MASTER #
    #--------#
    if not getval(batch_job, 'update_from_master'):
        if getval(batch_job, 'add_master'):
            ## Generate the right name for the master
            ## The master should be hidden first and then renamed
            ## when it is really available
            ## !!! FIX !!!
            _task_write_message("Adding %s master to the BibDoc"
                                % bibdoc_video_docname)
            master_format = compose_format(
                bibdoc_video_extension,
                getval(batch_job, 'bibdoc_master_subformat', 'master'))
            ## If a file of the same format is there, something is wrong, remove it!
            ## it might be caused by a previous corrupted submission etc.
            if bibdoc_video.format_already_exists_p(master_format):
                bibdoc_video.delete_file(master_format, 1)
            bibdoc_video.add_file_new_format(
                batch_job['input'],
                version=1,
                description=getval(batch_job, 'bibdoc_master_description'),
                comment=getval(batch_job, 'bibdoc_master_comment'),
                docformat=master_format)

    #-----------#
    # JOBS LOOP #
    #-----------#

    # 1 while every job so far succeeded; and-ed with each job's result.
    return_code = 1

    for job in batch_job['jobs']:
        _task_write_message("----------- Job %s of %s -----------"
                            % (_BATCH_STEP, _BATCH_STEPS))

        ## Try to substitute docname with master docname
        if getval(job, 'bibdoc_docname'):
            job['bibdoc_docname'] = Template(
                job['bibdoc_docname']).safe_substitute(
                    {'bibdoc_master_docname': bibdoc_video_docname})

        #-------------#
        # TRANSCODING #
        #-------------#

        if job['mode'] == 'encode':

            ## Skip the job if assure_quality is not set and marked as fallback
            if not getval(batch_job, 'assure_quality') and getval(
                    job, 'fallback'):
                continue

            if getval(job, 'profile'):
                profile = get_encoding_profile(job['profile'])
            else:
                profile = None
            ## We need an extension defined for the video container
            bibdoc_video_extension = getval(job, 'extension',
                                            getval(profile, 'extension'))
            if not bibdoc_video_extension:
                raise Exception("No container/extension defined")
            ## Get the docname and subformat
            bibdoc_video_subformat = getval(job, 'bibdoc_subformat')
            bibdoc_slave_video_docname = getval(job, 'bibdoc_docname',
                                                bibdoc_video_docname)
            ## The subformat is incompatible with ffmpegs name convention
            ## We do the encoding without and rename it afterwards
            bibdoc_video_fullpath = compose_file(bibdoc_video_directory,
                                                 bibdoc_slave_video_docname,
                                                 bibdoc_video_extension)
            _task_write_message(
                "Transcoding %s to %s;%s" % (bibdoc_slave_video_docname,
                                             bibdoc_video_extension,
                                             bibdoc_video_subformat))
            ## We encode now directly into the bibdocs directory
            encoding_result = encode_video(
                input_file=batch_job['input'],
                output_file=bibdoc_video_fullpath,
                acodec=getval(job, 'audiocodec'),
                vcodec=getval(job, 'videocodec'),
                # Audio bitrate feeds abitrate and video bitrate feeds
                # vbitrate (the two used to be crossed over).
                abitrate=getval(job, 'audiobitrate'),
                vbitrate=getval(job, 'videobitrate'),
                resolution=getval(job, 'resolution'),
                passes=getval(job, 'passes', 1),
                special=getval(job, 'special'),
                specialfirst=getval(job, 'specialfirst'),
                specialsecond=getval(job, 'specialsecond'),
                metadata=getval(job, 'metadata'),
                width=getval(job, 'width'),
                height=getval(job, 'height'),
                aspect=getval(batch_job, 'aspect'),  # Aspect for every job
                profile=getval(job, 'profile'),
                update_fnc=_task_update_overall_status,
                message_fnc=_task_write_message)
            return_code &= encoding_result
            ## only on success
            if encoding_result:
                ## Rename it, adding the subformat
                os.rename(
                    bibdoc_video_fullpath,
                    compose_file(bibdoc_video_directory,
                                 bibdoc_slave_video_docname,
                                 bibdoc_video_extension,
                                 bibdoc_video_subformat, 1))
                bibdoc_video._build_file_list()
                bibdoc_video_format = compose_format(bibdoc_video_extension,
                                                     bibdoc_video_subformat)
                if getval(job, 'bibdoc_comment'):
                    bibdoc_video.set_comment(getval(job, 'bibdoc_comment'),
                                             bibdoc_video_format)
                if getval(job, 'bibdoc_description'):
                    bibdoc_video.set_description(
                        getval(job, 'bibdoc_description'),
                        bibdoc_video_format)

        #------------#
        # EXTRACTION #
        #------------#

        # if there are multiple extraction jobs, all the produced files
        # with the same name will be in the same bibdoc! Make sure that
        # you use different subformats or docname templates to avoid
        # conflicts.

        if job['mode'] == 'extract':
            if getval(job, 'profile'):
                profile = get_extract_profile(job['profile'])
            else:
                profile = {}
            bibdoc_frame_subformat = getval(job, 'bibdoc_subformat')
            _task_write_message("Extracting frames to temporary directory")
            tmpdir = invenio.config.CFG_TMPDIR + "/" + str(uuid.uuid4())
            os.mkdir(tmpdir)
            # try/finally so the temporary directory never outlives the
            # job, whether extraction succeeds, fails or raises.
            try:
                #Move this to the batch description
                bibdoc_frame_docname = getval(job, 'bibdoc_docname',
                                              bibdoc_video_docname)
                tmpfname = (
                    tmpdir + "/" + bibdoc_frame_docname + '.' +
                    getval(profile, 'extension',
                           getval(job, 'extension', 'jpg')))
                extraction_result = extract_frames(
                    input_file=batch_job['input'],
                    output_file=tmpfname,
                    size=getval(job, 'size'),
                    positions=getval(job, 'positions'),
                    numberof=getval(job, 'numberof'),
                    width=getval(job, 'width'),
                    height=getval(job, 'height'),
                    aspect=getval(batch_job, 'aspect'),
                    profile=getval(job, 'profile'),
                    update_fnc=_task_update_overall_status,
                )
                return_code &= extraction_result

                ## only on success:
                if extraction_result:
                    ## for every filename in the directory, create a bibdoc
                    ## that contains all sizes of the frame
                    files = os.listdir(tmpdir)
                    for filename in files:
                        ## The docname was altered by BibEncode extract through substitution
                        ## Retrieve it from the filename again
                        bibdoc_frame_docname, bibdoc_frame_extension = \
                            os.path.splitext(filename)
                        _task_write_message("Creating new bibdoc for %s"
                                            % bibdoc_frame_docname)
                        ## If the bibdoc exists, receive it
                        if bibdoc_frame_docname in recdoc.get_bibdoc_names():
                            bibdoc_frame = recdoc.get_bibdoc(
                                bibdoc_frame_docname)
                        ## Create a new bibdoc if it does not exist
                        else:
                            bibdoc_frame = recdoc.add_bibdoc(
                                docname=bibdoc_frame_docname)

                        ## The filename including path from tmpdir
                        fname = os.path.join(tmpdir, filename)

                        bibdoc_frame_format = compose_format(
                            bibdoc_frame_extension, bibdoc_frame_subformat)
                        ## Same as with the master, if the format already exists,
                        ## override it, because something went wrong before
                        if bibdoc_frame.format_already_exists_p(
                                bibdoc_frame_format):
                            bibdoc_frame.delete_file(bibdoc_frame_format, 1)
                        _task_write_message(
                            "Adding %s jpg;%s to BibDoc"
                            % (bibdoc_frame_docname,
                               getval(job, 'bibdoc_subformat')))
                        bibdoc_frame.add_file_new_format(
                            fname,
                            version=1,
                            description=getval(job, 'bibdoc_description'),
                            comment=getval(job, 'bibdoc_comment'),
                            docformat=bibdoc_frame_format)
            finally:
                ## Remove the temporary folders
                _task_write_message("Removing temporary directory")
                shutil.rmtree(tmpdir)

        _BATCH_STEP = _BATCH_STEP + 1

    #-----------------#
    # FIX BIBDOC/MARC #
    #-----------------#

    _task_write_message("----------- Handling MARCXML -----------")

    ## Fix the BibDoc for all the videos previously created
    _task_write_message("Updating BibDoc of %s" % bibdoc_video_docname)
    bibdoc_video._build_file_list()

    ## Fix the MARC
    _task_write_message("Fixing MARC")
    cli_fix_marc({}, [batch_job['recid']], False)

    if getval(batch_job, 'collection'):
        ## Make the record visible by moving in from the collection
        marcxml = ("<record><controlfield tag=\"001\">%d</controlfield>"
                   "<datafield tag=\"980\" ind1=\" \" ind2=\" \">"
                   "<subfield code=\"a\">%s</subfield></datafield></record>"
                   ) % (batch_job['recid'], batch_job['collection'])
        upload_marcxml_file(marcxml)

    #---------------------#
    # ADD MASTER METADATA #
    #---------------------#

    if getval(batch_job, 'add_master_metadata'):
        _task_write_message("Adding master metadata")
        pbcore = pbcore_metadata(input_file=getval(batch_job, 'input'),
                                 pbcoreIdentifier=batch_job['recid'],
                                 aspect_override=getval(batch_job, 'aspect'))
        marcxml = format(pbcore, CFG_BIBENCODE_PBCORE_MARC_XSLT)
        upload_marcxml_file(marcxml)

    #------------------#
    # ADD MARC SNIPPET #
    #------------------#

    if getval(batch_job, 'marc_snippet'):
        with open(getval(batch_job, 'marc_snippet')) as marc_snippet:
            marcxml = marc_snippet.read()
        upload_marcxml_file(marcxml)

    #--------------#
    # DELETE INPUT #
    #--------------#

    if getval(batch_job, 'delete_input'):
        _task_write_message("Deleting input file")
        # only if successful: return_code is still 1 when every job
        # succeeded.  (The test used to be negated, which removed the
        # input after a failure and kept it after a success.)
        if return_code:
            # only if input matches pattern
            if getval(batch_job, 'delete_input_pattern', '') in getval(
                    batch_job, 'input'):
                try:
                    os.remove(getval(batch_job, 'input'))
                except OSError:
                    # Best effort: a missing/undeletable input is not fatal.
                    pass

    #--------------#
    # NOTIFICATION #
    #--------------#

    ## Send Notification emails on errors
    if not return_code:
        if getval(batch_job, 'notify_user'):
            _notify_error_user(
                getval(batch_job, 'notify_user'),
                getval(batch_job, 'submission_filename', batch_job['input']),
                getval(batch_job, 'recid'),
                getval(batch_job, 'submission_title', ""))
            _task_write_message("Notify user because of an error")
        if getval(batch_job, 'notify_admin'):
            _task_write_message("Notify admin because of an error")
            notify_admin = getval(batch_job, 'notify_admin')
            # Only a string is an explicit recipient; any other truthy
            # value (e.g. True) falls back to the default one.
            # type(u"") also accepts Python 2 unicode strings.  (The old
            # test had a misplaced parenthesis and was always true.)
            if isinstance(notify_admin, (str, type(u""))):
                _notify_error_admin(batch_job, notify_admin)
            else:
                _notify_error_admin(batch_job)
    else:
        if getval(batch_job, 'notify_user'):
            _task_write_message("Notify user because of success")
            _notify_success_user(
                getval(batch_job, 'notify_user'),
                getval(batch_job, 'submission_filename', batch_job['input']),
                getval(batch_job, 'recid'),
                getval(batch_job, 'submission_title', ""))
    return 1
def task_run_core():
    """
    Main daemon task.

    Loads the plugins and rules, runs every record selected by a rule
    through that rule (batch rules get all matching records of a batch at
    once, the other rules get one record at a time), then uploads amended
    records and submits tickets for invalid ones in groups of
    CFG_BATCH_SIZE.

    Returns True when run successfully.
    """
    reset_requested = task_get_option("reset_rules")
    if reset_requested:
        write_message("Resetting the following rules: %s" % reset_requested)
        for rule_to_reset in reset_requested:
            reset_rule_last_run(rule_to_reset)

    plugins = load_plugins()
    rules = load_rules(plugins)
    write_message("Loaded rules: %s" % rules, verbose=9)
    task_set_option('plugins', plugins)
    recids_for_rules = get_recids_for_rules(rules)
    write_message("recids for rules: %s" % recids_for_rules, verbose=9)

    # The last-run dates are only persisted for a regular run, i.e. when
    # none of the record_ids / no_upload / no_tickets options is in use.
    update_database = not (task_has_option('record_ids')
                           or task_get_option('no_upload', False)
                           or task_get_option('no_tickets', False))

    if update_database:
        next_starting_dates = dict(
            (name, get_next_starting_date(definition))
            for name, definition in rules.iteritems())

    # Collect every record to visit and split the rules by plugin kind.
    all_recids = intbitset([])
    single_rules = set()
    batch_rules = set()
    for name, selected_recids in recids_for_rules.iteritems():
        all_recids.union_update(selected_recids)
        is_batch_plugin = plugins[rules[name]["check"]]["batch"]
        (batch_rules if is_batch_plugin else single_rules).add(name)

    total_records = len(all_recids)
    pending_holdingpen = []
    pending_replace = []
    pending_tickets = []

    for batch in iter_batches(all_recids, CFG_BATCH_SIZE):
        # Batch rules first: they receive all their records of this batch.
        for name in batch_rules:
            rule = rules[name]
            selected_recids = recids_for_rules[name]
            task_sleep_now_if_required(can_stop_too=True)
            matching = [rec for _, rec_id, rec in batch
                        if rec_id in selected_recids]
            if matching:
                check_records(rule, matching)

        # Then run them through normal rules
        for position, record_id, record in batch:
            percent_done = int(float(position) / total_records * 100)
            task_update_progress("Processing record %s/%s (%i%%)."
                                 % (position, total_records, percent_done))
            write_message("Processing record %s" % record_id)

            for name in single_rules:
                rule = rules[name]
                selected_recids = recids_for_rules[name]
                task_sleep_now_if_required(can_stop_too=True)
                if record_id in selected_recids:
                    check_record(rule, record)

            if record.amended:
                if record.holdingpen:
                    pending_holdingpen.append(record)
                else:
                    pending_replace.append(record)

            if not record.valid:
                pending_tickets.append(record)

        # Flush whichever queue has reached a full batch.
        if len(pending_tickets) >= CFG_BATCH_SIZE:
            Tickets(pending_tickets).submit()
            pending_tickets = []
        if len(pending_holdingpen) >= CFG_BATCH_SIZE:
            upload_amendments(pending_holdingpen, True)
            pending_holdingpen = []
        if len(pending_replace) >= CFG_BATCH_SIZE:
            upload_amendments(pending_replace, False)
            pending_replace = []

    ## In case there are still some remaining amended records
    if pending_tickets:
        Tickets(pending_tickets).submit()
    if pending_holdingpen:
        upload_amendments(pending_holdingpen, True)
    if pending_replace:
        upload_amendments(pending_replace, False)

    # Update the database with the last time each rule was run
    if update_database:
        for name in rules.iterkeys():
            update_rule_last_run(name, next_starting_dates[name])

    return True
def bst_apsharvest(dois="", recids="", query="", records="", new_mode="email",
                   update_mode="email", from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes", match="no",
                   reportonly="no", threshold_date=None, devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           This means that new records are being looked for at APS servers.
           Active when from_date and until_date is given, in addition when
           a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will
           harvest any new records since last run.

           If match is set to "yes" the records harvested will be matched
           against the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exist in the system,
           the task only harvests the fulltexts themselves and attaches
           them to the records.

    All "yes"/"no" switches are plain strings (compared case-insensitively)
    and are converted to booleans before the collected parameters are handed
    to the harvesting helpers.

    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide
    fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new
    records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")

    @param dois: comma-separated list of DOIs to download fulltext/metadata
                 for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext
                  for.
    @type query: string

    @param records: get any records modified, created or both since last
                    time in the database to download fulltext for, can be
                    either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in
                     (presumably applied to records classified as "new"):
                "email" - does NOT run bibupload and sends an email instead.
                          Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached
                           files
                "correct" - corrects existing attached fulltext files, or
                            adds new
                "replace" - replaces all attached files with new fulltext
                            file
                The one-letter forms "a", "c", "r", "i" and "o" are accepted
                by the validation as well.

                The fulltext is appended by default to new records.
    @type new_mode: string

    @param update_mode: which mode should the fulltext files be submitted in
                        (presumably applied to records classified as
                        "updated"); accepts the same values as new_mode.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from.
                      Ex. 2013-01-01
                      If the value is "last" it means to get records since
                      last harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until.
                       Ex. 2013-01-01
    @type until_date: string

    @param metadata: should the record metadata be harvested? "yes" or "no"
    @type metadata: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done?
                  "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit?
                       "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since.
                           Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string

    @raise Exception: when new_mode or update_mode is not a known mode.
    """
    task_update_progress("Parsing input parameters")

    # Validate modes (both long names and their one-letter shortcuts)
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o", "replace", "r",
                        "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid."
                            % (mode, ))

    # We hide fulltext by default: anything other than "no" means hidden
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # Devmode is off by default; switching it on raises verbosity to 9
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not reportonly by default
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    # Unify all parameters into a dict using locals.
    # NOTE: this snapshot holds every local name bound so far (including the
    # loop variable "mode"), so adding or renaming locals above this line
    # changes what the helpers below receive.
    parameters = locals()

    # 1: We analyze parameters and fetch all requested records from APS
    final_record_list, new_harvest_date = get_records_to_harvest(parameters)
    write_message("Found %d record(s) to download."
                  % (len(final_record_list), ))

    if reportonly:
        write_message("'Report-only' mode. We exit now.")
        return

    if not final_record_list:
        # No records to harvest, quit.
        write_message("Nothing to harvest.")
        return

    # 2: Extract fulltext/metadata XML and upload bunches of
    #    records as configured
    job = APSHarvestJob(CFG_APSHARVEST_DIR)
    count = process_records(job, parameters, final_record_list)

    if parameters.get("from_date") == "last":
        # Harvest of new records from APS successful
        # we update last harvested date
        store_last_updated(None, new_harvest_date,
                           name="apsharvest_api_download")

    # We are done
    write_message("Harvested %d records. (%d failed)"
                  % (count, len(job.records_failed)))
def perform_fulltext_harvest(record_list, add_metadata, attach_fulltext,
                             hidden_fulltext, out_folder, threshold_date=None,
                             journal_mappings=None):
    """
    For every record in given list APSRecord(record ID, DOI, date last
    updated), yield a APSRecord with added FFT dictionary containing URL to
    fulltext/metadata XML downloaded locally.

    If a download is unsuccessful, an error message is given.

    This is a generator. In the code visible here a (record, message) pair
    is yielded when the record has no DOI or when the download fails; each
    package is saved as "<out_folder>/<DOI with '/' replaced by '_'>.zip".

    NOTE(review): the visible body never assigns request_end, never yields
    a success value and does not reference add_metadata, attach_fulltext,
    hidden_fulltext, threshold_date or journal_mappings - the function looks
    truncated here; confirm against the complete source.

    @return: tuple of (APSRecord, error_message)
    """
    count = 0
    request_end = None
    request_start = None
    for record in record_list:
        task_sleep_now_if_required(can_stop_too=False)
        # Unless this is the first request, lets sleep a bit
        if request_end and request_start:
            request_dt = request_end - request_start
            write_message("Checking request time (%d)" % (request_dt, ),
                          verbose=3)
            # NOTE(review): the sleep lasts as long as the previous request
            # took (request_dt), not the remainder up to
            # CFG_APSHARVEST_REQUEST_TIMEOUT - confirm this is intended.
            if count and request_dt > 0 and \
               request_dt < CFG_APSHARVEST_REQUEST_TIMEOUT:
                write_message("Initiating sleep for %.1f seconds"
                              % (request_dt, ), verbose=3)
                time.sleep(request_dt)

        count += 1
        task_update_progress("Harvesting record (%d/%d)"
                             % (count, len(record_list)))

        if not record.doi:
            # NOTE(review): with a falsy recid the "%d" placeholder receives
            # "" and the formatting itself raises TypeError.
            msg = "No DOI found for record %d" % (record.recid or "", )
            write_message("Error: %s" % (msg, ), stream=sys.stderr)
            yield record, msg
            continue

        url = CFG_APSHARVEST_FULLTEXT_URL % {'doi': record.doi}
        result_file = os.path.join(out_folder,
                                   "%s.zip" % (record.doi.replace('/', '_')))
        try:
            request_start = time.time()
            if os.path.exists(result_file):
                # File already downloaded recently, lets see if it is the same
                file_last_modified = get_file_modified_date(result_file)
                if not compare_datetime_to_iso8601_date(
                        file_last_modified, record.last_modified):
                    # File is not older than APS version, we should not
                    # download. The exception is used as a jump to the
                    # handler below that only logs the fact.
                    raise APSHarvesterFileExits

            write_message("Trying to save to %s" % (result_file, ), verbose=5)

            result_file = download_url(url=url,
                                       download_to_file=result_file,
                                       content_type="zip",
                                       retry_count=5,
                                       timeout=60.0)
            write_message("Downloaded %s to %s" % (url, result_file),
                          verbose=2)
        except InvenioFileDownloadError, e:
            # Download failed: report it to the consumer and move on.
            msg = "URL could not be opened: %s" % (url, )
            write_message("Error: %s" % (msg, ), stream=sys.stderr)
            yield record, msg
            continue
        except APSHarvesterFileExits:
            # The local copy is kept; nothing is downloaded for this record.
            write_message("File exists at %s" % (result_file, ), verbose=2)
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.

    Up to three independent steps are executed, each one enabled by its own
    task option: 'update_personid', 'disambiguate' and 'merge'.
    Always returns 1.
    """
    get_option = bibtask.task_get_option
    report = bibtask.task_update_progress

    if get_option('update_personid'):
        requested_ids = get_option('record_ids')
        # Record ids are converted to integers, an empty value is passed on
        # unchanged.
        record_ids = map(int, requested_ids) if requested_ids else requested_ids
        all_records = get_option('all_records')

        report('Updating personid...')
        run_rabbit(record_ids, all_records)
        report('PersonID update finished!')

    if get_option("disambiguate"):
        report('Performing full disambiguation...')
        from_scratch = bool(get_option("from_scratch"))
        run_tortoise(from_scratch)
        report('Full disambiguation finished!')

    if get_option("merge"):
        report('Merging results...')
        run_merge()
        report('Merging finished!')

    return 1
def bst_arxiv_doi_update(input_uri=None, log_dir=CFG_TMPSHAREDDIR,
                         logging=True):
    """
    bst_arxiv_doi_update
    Updates DOIs on documents harvested from ArXiv.

    Every <article> element of the feed is looked up by its arXiv id; when
    exactly one record matches and append_to_record() returns MARCXML for
    it, that XML is queued for bibupload and the record for bibindex.

    Parameters:
    * input_uri - Link to new URI data
        DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
        NOTE: Test data can be taken from
        http://arxiv.org/schemas/doi_feed_test.xml
    * log_dir - Directory to store log files in
    * logging - True or False, default True

    Returns True on success, False when the feed could not be opened.
    """
    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI...")

    # Queues for the uploads and re-indexing produced by this run
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME,
                                 notimechange=True)
    bibindex = ChunkedBibIndex(indexes='year,global,journal',
                               user=SCRIPT_NAME)

    # Fetch and parse the DOI feed
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retreiving DOI data")
        task_update_status("FAILED")
        return False
    root = tree.getroot()

    doi_count = 0
    new_count = 0
    missing_count = 0

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for doi_count, item in enumerate(root.getiterator('article'), 1):
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)

        matches = get_record_by_arxiv_id(arxiv)
        match_count = len(matches)

        if match_count > 1:
            _print('ERROR: %d records found with matching arXiv ID %s'
                   % (match_count, arxiv))
            continue

        if match_count != 1:
            missing_count += 1
            _print('No record found matching arxiv ID: ' + arxiv, 9)
            continue

        # Exactly one record carries this arXiv id
        rec_id = matches[0]
        record_xml = append_to_record(rec_id, doi, published_date)
        if not record_xml:
            continue
        new_count += 1
        _print("* Now we will run the bibupload and bibindex for "
               + str(rec_id) + " record", 5)
        _print("** We will upload the following xml code "
               + repr(record_xml), 9)
        bibupload.add(record_xml)
        bibindex.add(rec_id)

    _print("======================== FINAL SCORE ========================", 1)
    _print("DOIs found and processed: " + str(doi_count), 1)
    _print("Arxiv IDs without corresponding records: " + str(missing_count),
           1)
    _print("Records requiring appends: " + str(new_count), 1)

    if logging:
        task_update_progress("Logging...")
        write_list_to_file(log_dir, 'errors', ERRORS)
        write_list_to_file(log_dir, 'messages', MESSAGES)

    summary = " finished. %s DOIs processed, %s to add" % (str(doi_count),
                                                          str(new_count))
    task_update_progress(SCRIPT_NAME + summary)
    task_update_status("DONE")

    # The finalizers of both queues are invoked explicitly before returning
    bibupload.__del__()
    bibindex.__del__()

    return True
def _published_online_date(doi_metadata):
    """Return the 'published-online' date of a DOI entry as a datetime.

    None is returned when the entry has no 'published-online' key or when
    its first 'date-parts' item does not have exactly three components.
    """
    if 'published-online' not in doi_metadata:
        return None
    if len(doi_metadata['published-online']['date-parts'][0]) != 3:
        return None
    date_parts = doi_metadata['published-online']['date-parts'][0]
    return datetime.strptime('-'.join(map(str, date_parts)), "%Y-%m-%d")


def bst_doi_timestamp(reset=0):
    """Record newly discovered DOIs, per publisher, in the "doi" table.

    DOIs modified since four days before the newest creation_date in the
    table (or before 2014-01-01 when the table is empty or reset is
    non-zero) are retrieved for every publisher of CFG_SCOAP3_DOIS. Unknown
    DOIs are inserted with the current time as creation date; for publisher
    "10.1093" the online publication date is stored too, and filled in for
    already known DOIs that lack one.

    Connection problems, timeouts and value errors are only logged.
    """
    prepate_doi_table()
    now = datetime.now()
    lookback = timedelta(days=4)
    newest_creation = run_sql("SELECT max(creation_date) FROM doi")[0][0]
    since = (newest_creation or datetime(2014, 1, 1)) - lookback
    last_run = since.strftime("%Y-%m-%d")
    if int(reset):
        last_run = (datetime(2014, 1, 1) - lookback).strftime("%Y-%m-%d")
    write_message("Retrieving DOIs modified since %s" % last_run)

    # Nothing sets the flag back to True at the moment, so the loop body is
    # executed exactly once.
    restart_on_error = True
    while restart_on_error:
        restart_on_error = False
        for publisher, re_match in CFG_SCOAP3_DOIS.items():
            task_update_progress("Retrieving DOIs for %s" % publisher)
            write_message("Retriving DOIs for %s" % publisher)
            try:
                res = get_all_modified_dois(publisher, last_run, re_match,
                                            debug=True)
                for doi in res:
                    if publisher != "10.1093":
                        # Other publishers: only the DOI itself is tracked
                        if run_sql("SELECT doi FROM doi WHERE doi=%s",
                                   (doi, )):
                            continue
                        write_message(
                            "New DOI discovered for publisher %s: %s"
                            % (publisher, doi))
                        run_sql(
                            "INSERT INTO doi(doi, creation_date) VALUES(%s, %s)",
                            (doi, now))
                        continue

                    db_entry = run_sql(
                        "SELECT doi, publication_date FROM doi WHERE doi=%s",
                        (doi, ))
                    pub_date = _published_online_date(res[doi])
                    write_message(db_entry)

                    if db_entry:
                        # Known DOI: only fill in a missing publication date
                        if not db_entry[0][1] and pub_date:
                            run_sql(
                                "UPDATE doi SET publication_date = %s WHERE doi=%s",
                                (pub_date, doi))
                        continue

                    write_message(
                        "New DOI discovered for publisher %s: %s, publication: %s"
                        % (publisher, doi, pub_date))
                    if pub_date:
                        run_sql(
                            "INSERT INTO doi(doi, creation_date, publication_date) VALUES(%s, %s, %s)",
                            (doi, now, pub_date))
                    else:
                        run_sql(
                            "INSERT INTO doi(doi, creation_date) VALUES(%s, %s)",
                            (doi, now))
            except URLError as e:
                write_message("%s %s %s" % (publisher, last_run, re_match))
                write_message("Problem with connection! %s" % (e, ))
            except socket.timeout as e:
                write_message("Timeout error %s" % (e, ))
                write_message("Finishing and rescheduling")
            except ValueError as e:
                write_message("Value error in JSON string! %s" % (e, ))
def download_feed(feed_url, delete_zip, new_sources, directory,
                  feed_location):
    """Download an atom feed and every package it lists, then extract them.

    The feed is saved to feed_location and parsed into (url, file name)
    entries. Entries whose file name already exists somewhere below
    CFG_CONSYN_OUT_DIRECTORY are skipped; the others are downloaded into
    directory and extracted.

    Failures do not abort the run: the error is appended to the module-level
    _errors_detected list, the task status is set to "CERROR" and the next
    entry is processed.

    :param feed_url: URL of the atom feed.
    :param delete_zip: forwarded to extractAll() for each downloaded package.
    :param new_sources: list extended in place with the path of every
                        package that was downloaded.
    :param directory: folder the packages are downloaded to and extracted in.
    :param feed_location: file the feed itself is stored in.
    :return: list of the extracted XML files, or None when the feed itself
             could not be downloaded.
    """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url, ))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return

    xml_files = []
    entries = parse_feed(result_path)
    if not entries:
        return xml_files

    # look what files already exist
    # there are currently O(10^5) files in the directory tree rooted
    # at CFG_CONSYN_OUT_DIRECTORY and it is on AFS and takes upwards
    # of 5 minutes to walk.
    # might make sense to have a db table with already harvested files
    task_sleep_now_if_required()
    allfilenames = find_names_of_existing_files(CFG_CONSYN_OUT_DIRECTORY)
    task_sleep_now_if_required()

    for fileUrl, fileName in entries:
        if fileName in allfilenames:
            write_message(
                "Not downloading %s, found file with same name in %s"
                % (fileName, CFG_CONSYN_OUT_DIRECTORY))
            continue
        task_sleep_now_if_required()

        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        fileUrl = fileUrl.replace(' ', '%20')
        try:
            write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
            download_url(fileUrl, "zip", outFilename, 5, 60.0)
            new_sources.append(outFilename)
        except InvenioFileDownloadError as err:
            _errors_detected.append(err)
            write_message("URL could not be opened: %s" % fileUrl)
            write_message(str(err))
            write_message(traceback.format_exc()[:-1])
            task_update_status("CERROR")
            continue

        try:
            xml_files.extend(extractAll(outFilename, delete_zip, directory))
        except BadZipfile as err:
            # Bind the exception here: no name from the download handler
            # above is available (or still valid) at this point.
            _errors_detected.append(err)
            # The file name has to be interpolated into the message, not
            # handed over as a second positional argument.
            write_message("Error BadZipfile %s" % (outFilename, ))
            task_update_status("CERROR")
            remove(outFilename)

    return xml_files
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Three situations are handled:
      * 'slave' requested (and not running as helper): the slave is checked
        and detached, a helper dbdump task is scheduled and the function
        returns at once;
      * running as the slave helper: the dump is taken from
        CFG_DATABASE_SLAVE and the slave is re-attached afterwards, even if
        the dump fails;
      * otherwise: the dump is taken from CFG_DATABASE_HOST.
    After a dump, old dump files are pruned.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # (option name, command line flag, flag is followed by the value?)
    forwarded_options = (
        ("number", "--number", True),
        ("output", "--output", True),
        ("params", "--params", True),
        ("ignore_tables", "--ignore-tables", True),
        ("compress", "--compress", False),
        ("slave", "--slave", True),
    )

    db_host = CFG_DATABASE_HOST
    db_port = CFG_DATABASE_PORT
    connection = None
    try:
        if task_get_option('slave') and \
           not task_get_option('dump_on_slave_helper_mode'):
            connection = get_connection_for_dump_on_slave()
            write_message("Dump on slave requested")
            write_message("... checking if slave is well up...")
            check_slave_is_up(connection)
            write_message("... checking if slave is in consistent state...")
            check_slave_is_in_consistent_state(connection)
            write_message("... detaching slave database...")
            detach_slave(connection)
            write_message("... scheduling dump on slave helper...")

            # Pass the options of this task on to the helper task
            helper_arguments = []
            for option_name, flag, takes_value in forwarded_options:
                if task_get_option(option_name):
                    helper_arguments.append(flag)
                    if takes_value:
                        helper_arguments.append(
                            str(task_get_option(option_name)))
            helper_arguments.extend(
                ['-N', 'slavehelper', '--dump-on-slave-helper'])

            task_id = task_low_level_submission('dbdump',
                                                task_get_task_param('user'),
                                                '-P4',
                                                *helper_arguments)
            write_message("Slave scheduled with ID %s" % task_id)
            task_update_progress("DONE")
            return True
        elif task_get_option('dump_on_slave_helper_mode'):
            write_message("Dumping on slave mode")
            connection = get_connection_for_dump_on_slave()
            write_message("... checking if slave is well down...")
            check_slave_is_down(connection)
            db_host = CFG_DATABASE_SLAVE

        # read params:
        task_update_progress("Reading parameters")
        write_message("Reading parameters started")
        output_dir = task_get_option('output', CFG_LOGDIR)
        output_num = task_get_option('number', 5)
        params = task_get_option('params', None)
        compress = task_get_option('compress', False)
        slave = task_get_option('slave', False)
        tables_option = task_get_option('ignore_tables', None)
        ignore_tables = get_table_names(tables_option) if tables_option \
            else None
        starting_time = task_get_task_param('task_starting_time')
        output_file_suffix = starting_time.replace(' ', '_') + '.sql'
        if compress:
            output_file_suffix = "%s.gz" % (output_file_suffix,)
        write_message("Reading parameters ended")

        # make dump:
        task_update_progress("Dumping database")
        write_message("Database dump started")
        prefix_template = 'slave-%s-dbdump-' if slave else '%s-dbdump-'
        output_file_prefix = prefix_template % (CFG_DATABASE_NAME,)
        output_file = output_file_prefix + output_file_suffix
        dump_path = output_dir + os.sep + output_file
        dump_database(dump_path,
                      host=db_host,
                      port=db_port,
                      params=params,
                      compress=compress,
                      ignore_tables=ignore_tables)
        write_message("Database dump ended")
    finally:
        # Only the helper re-attaches the slave, whatever happened above
        if connection and task_get_option('dump_on_slave_helper_mode'):
            write_message("Reattaching slave")
            attach_slave(connection)

    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_file_prefix, output_num)
    write_message("Pruning old dump files ended")

    # we are done:
    task_update_progress("Done.")
    return True
def bst_consyn_harvest(feed_url=None, package=None, feed_file=None,
                       package_list_file=None, batch_size='500',
                       delete_zip='False', submit='False',
                       threshold_date=None):
    """ Task to convert xml files from consyn.elsevier.com to Marc xml files.
    There are four execution modes:
    1. Download from an atom feed url.
    2. Extract and convert a zip package.
    3. Download from an atom feed file.
    4. Extract and convert a list of zip packages.

    The mode is selected by the first argument given among package,
    package_list_file and feed_file; without any of them the feed is
    downloaded from feed_url.

    The feed is stored to the file system under the folder feeds.
    If no errors occur during the execution of the tasklet the feed
    is deleted. Records may be recovered running the tasklet again with
    the modes 2, 3 or 4.

    :param feed_url: A URL to the atom feed. When empty, the default consyn
                     URL built from CFG_CONSYN_ATOM_KEY is used.
    :type feed_url: string.

    :param package: A path to a zip package.
    :type package: string.

    :param feed_file: A path to an atom feed file.
    :type feed_file: string.

    :param package_list_file: A path to a file with a list of paths
                              to zip packages. The file must contain
                              the path to each package in a different
                              line.
    :type package_list_file: string.

    :param batch_size: The number of records contained in each output file.
                       Falls back to 500 when it is not a valid integer.
    :type batch_size: string representation of an integer.

    :param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not.
    :type delete_zip: string representation of a boolean.

    :param submit: Flag to indicate whether the result files
                       should be submited by email and uploaded
                       to FTP server.
    :type submit: string representation of a boolean.

    :param threshold_date: threshold date only converts records that they
                      were published after threshold_date
    :type threshold_date: string in the format YYYY-MM-DD
    """
    if not feed_url:
        feed_url = "https://consyn.elsevier.com/batch/atom?key=%s" % \
                   (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    feed_location = ''

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning batch_size parameter is not a valid integer\n'
                      'the default value \'500\' has been used!\n')

    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning delete_zip parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')

    if submit.lower() == 'true':
        submit = True
    elif submit.lower() == 'false':
        submit = False
    else:
        submit = False
        # The warning names the parameter the caller actually passed.
        write_message('Warning submit parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')

    if threshold_date:
        # Validate the date by parsing it and normalise it to YYYY-MM-DD.
        date_format = "%Y-%m-%d"
        try:
            date = datetime.strptime(threshold_date, date_format)
            threshold_date = date.strftime(date_format)
        except ValueError:
            write_message('Error threshold_date parameter is not '
                          'in the right format. It should be in '
                          'form "YYYY-MM-DD".')
            task_update_status("ERROR")
            return

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        makedirs(CFG_CONSYN_OUT_DIRECTORY)
    out_folder = CFG_CONSYN_OUT_DIRECTORY
    journal_mappings = get_kbs()['journals'][1]
    els = ElsevierPackage(CONSYN=True,
                          journal_mappings=journal_mappings)

    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if package:
        xml_files = extract_package(package, delete_zip, out_folder,
                                    new_sources)
    elif package_list_file:
        package_list = []
        with open(package_list_file, 'r') as package_file:
            for line in package_file:
                line = line.strip()
                if line:
                    package_list.append(line)
        xml_files = extract_multiple_packages(
            package_list, delete_zip, new_sources, out_folder)
    elif feed_file:
        entries = parse_feed(feed_file)
        links = [entry[0] for entry in entries]
        package_list = [join(CFG_CONSYN_OUT_DIRECTORY, entry[1])
                        for entry in entries]
        # Walk links and target paths in lockstep: each package keeps its
        # own link even when file names repeat, and the list handed to the
        # extraction is not modified while iterating, so no package is
        # queued (and extracted) twice.
        for link, package_path in zip(links, package_list):
            task_sleep_now_if_required()
            if exists(package_path):
                continue
            link = link.replace(' ', '%20')
            try:
                message = ("Downloading %s to %s\n" % (link, package_path))
                write_message(message)
                download_url(link, "zip", package_path, 5, 60.0)
            except InvenioFileDownloadError as err:
                message = "URL could not be opened: " + link
                write_message(message)
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
        xml_files = extract_multiple_packages(
            package_list, delete_zip, new_sources, out_folder)
    else:
        feeds_folder = join(CFG_CONSYN_OUT_DIRECTORY, 'feeds')
        if not exists(feeds_folder):
            makedirs(feeds_folder)
        date = datetime.now().strftime("%Y.%m.%d")
        feed_location = "feed-%s.xml" % date
        feed_location = join(feeds_folder, feed_location)
        xml_files = download_feed(feed_url, delete_zip, new_sources,
                                  out_folder, feed_location)

    task_update_progress("Converting files 2/3...")
    task_sleep_now_if_required()
    results = convert_files(xml_files, els, prefix=consyn_files,
                            threshold_date=threshold_date)
    for dummy, (status_code, result) in results.iteritems():
        if status_code == StatusCodes.OK:
            new_files.append(result)

    task_update_progress("Compiling output 3/3...")
    task_sleep_now_if_required()
    create_collection(batch_size, new_files, new_sources,
                      out_folder, submit)

    # The stored feed is only discarded after an error-free run, so that a
    # failed run can be repeated from the feed file.
    if feed_location and not _errors_detected:
        remove(feed_location)
    for error in _errors_detected:
        write_message(str(error))
def task_run_core():
    """Run the OAI harvesting task for the configured repositories.

    The repositories come from the "repository" task option when it is
    set, otherwise from every row returned by get_all_rows_from_db().
    When the "dates" task option is set, a plain from/until harvest is
    done and the last-run timestamp is left untouched.

    For each repository the post-process mode string decides which of the
    following phases run on the harvested files: convert ('c'), plot
    extraction ('e'), bibfilter ('f') and upload ('u').

    Columns of repos[0] as they are used below: [0] source id, [1] base
    URL, [2] metadata prefix, [5] bibconvert configuration, [6] source
    name, [7] last run timestamp, [8] update frequency (multiplied by
    3600 to get seconds), [9] post-process mode, [10] set specs,
    [11] argument handed to bibfilter.

    Return True when every repository was processed without error and
    False as soon as at least one error was recorded.
    """
    reposlist = []
    datelist = []
    dateflag = 0
    possible_postmodes = [code for code, dummy in CFG_OAI_POSSIBLE_POSTMODES]
    filepath_prefix = tmpHARVESTpath + "_" + str(
        task_get_task_param("task_id"))

    ### go ahead: build up the reposlist
    if task_get_option("repository") is not None:
        ### user requests harvesting from selected repositories
        write_message("harvesting from selected repositories")
        for reposname in task_get_option("repository"):
            row = get_row_from_reposname(reposname)
            if row == []:
                write_message("source name " + reposname + " is not valid")
                continue
            # Reuse the row fetched above instead of looking it up again.
            reposlist.append(row)
    else:
        ### user requests harvesting from all repositories
        write_message("harvesting from all repositories in the database")
        reposlist = get_all_rows_from_db()

    ### go ahead: check if user requested from-until harvesting
    if task_get_option("dates"):
        ### for each repos simply perform a from-until date harvesting...
        ### no need to update anything
        dateflag = 1
        for element in task_get_option("dates"):
            datelist.append(element)

    error_happened_p = False
    j = 0
    for repos in reposlist:
        j += 1
        task_sleep_now_if_required()
        reponame = str(repos[0][6])
        postmode = str(repos[0][9])
        setspecs = str(repos[0][10])
        harvested_files_list = []

        if postmode in possible_postmodes:
            # Harvest phase
            harvestpath = filepath_prefix + "_" + str(j) + "_" + \
                time.strftime("%Y%m%d%H%M%S") + "_harvested"
            if dateflag == 1:
                task_update_progress("Harvesting %s from %s to %s (%i/%i)" %
                                     (reponame,
                                      str(datelist[0]),
                                      str(datelist[1]),
                                      j,
                                      len(reposlist)))
                exit_code, file_list = oai_harvest_get(
                    prefix=repos[0][2],
                    baseurl=repos[0][1],
                    harvestpath=harvestpath,
                    fro=str(datelist[0]),
                    until=str(datelist[1]),
                    setspecs=setspecs)
                if exit_code == 1:
                    write_message("source " + reponame +
                                  " was harvested from " + str(datelist[0])
                                  + " to " + str(datelist[1]))
                    harvested_files_list = file_list
                else:
                    write_message("an error occurred while harvesting "
                                  "from source " +
                                  reponame + " for the dates chosen")
                    error_happened_p = True
                    continue

            elif dateflag != 1 and repos[0][7] is None and repos[0][8] != 0:
                write_message("source " + reponame +
                              " was never harvested before - harvesting whole "
                              "repository")
                task_update_progress("Harvesting %s (%i/%i)" %
                                     (reponame,
                                      j,
                                      len(reposlist)))
                exit_code, file_list = oai_harvest_get(
                    prefix=repos[0][2],
                    baseurl=repos[0][1],
                    harvestpath=harvestpath,
                    setspecs=setspecs)
                if exit_code == 1:
                    update_lastrun(repos[0][0])
                    harvested_files_list = file_list
                else:
                    write_message("an error occurred while harvesting from "
                                  "source " + reponame)
                    error_happened_p = True
                    continue

            elif dateflag != 1 and repos[0][8] != 0:
                ### check that update is actually needed,
                ### i.e. lastrun+frequency>today
                timenow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # remove trailing .00
                lastrundate = re.sub(r'\.[0-9]+$', '', str(repos[0][7]))
                timeinsec = int(repos[0][8]) * 60 * 60
                updatedue = add_timestamp_and_timelag(lastrundate, timeinsec)
                proceed = compare_timestamps_with_tolerance(updatedue, timenow)
                if proceed == 0 or proceed == -1:  # update needed!
                    write_message("source " + reponame +
                                  " is going to be updated")
                    fromdate = str(repos[0][7])
                    # get rid of time of the day for the moment
                    fromdate = fromdate.split()[0]
                    task_update_progress("Harvesting %s (%i/%i)" %
                                         (reponame,
                                          j,
                                          len(reposlist)))
                    exit_code, file_list = oai_harvest_get(
                        prefix=repos[0][2],
                        baseurl=repos[0][1],
                        harvestpath=harvestpath,
                        fro=fromdate,
                        setspecs=setspecs)
                    if exit_code == 1:
                        update_lastrun(repos[0][0])
                        harvested_files_list = file_list
                    else:
                        write_message("an error occurred while harvesting "
                                      "from source " + reponame)
                        error_happened_p = True
                        continue
                else:
                    write_message("source " + reponame +
                                  " does not need updating")
                    continue

            elif dateflag != 1 and repos[0][8] == 0:
                write_message("source " + reponame +
                              " has frequency set to 'Never' so it will not be updated")
                continue

            # Harvesting done, now convert/extract/filter/upload as requested
            if len(harvested_files_list) < 1:
                write_message("No records harvested for %s" % (reponame, ))
                continue
            active_files_list = harvested_files_list

            # Convert phase
            if 'c' in postmode:
                converted_files_list = []
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Converting material harvested from %s (%i/%i)" %
                                         (reponame,
                                          i,
                                          len(active_files_list)))
                    converted_file = filepath_prefix + "_" + str(i) + "_" + \
                        time.strftime("%Y%m%d%H%M%S") + "_converted"
                    converted_files_list.append(converted_file)
                    (exitcode, err_msg) = call_bibconvert(
                        config=str(repos[0][5]),
                        harvestpath=active_file,
                        convertpath=converted_file)
                    if exitcode == 0:
                        write_message("material harvested from source " +
                                      reponame + " was successfully converted")
                    else:
                        write_message(
                            "an error occurred while converting from " +
                            reponame + ': \n' + err_msg)
                        error_happened_p = True
                        # A failed file does not stop the remaining files.
                        continue
                # print stats:
                for converted_file in converted_files_list:
                    write_message("File %s contains %i records." %
                                  (converted_file,
                                   get_nb_records_in_file(converted_file)))
                active_files_list = converted_files_list

            if 'e' in postmode:
                # Download tarball for each harvested/converted record,
                # then run plotextractor.
                # Update converted xml files with generated xml or add it
                # for upload
                extracted_files_list = []
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Extracting material harvested from %s (%i/%i)" %
                                         (reponame, i, len(active_files_list)))
                    extracted_file = filepath_prefix + "_" + str(i) + "_" + \
                        time.strftime("%Y%m%d%H%M%S") + "_extracted"
                    extracted_files_list.append(extracted_file)
                    (exitcode, err_msg) = call_plotextractor(active_file,
                                                             extracted_file)
                    if exitcode == 0:
                        write_message("material harvested from source " +
                                      reponame + " was successfully extracted")
                    else:
                        write_message(
                            "an error occurred while extracting from " +
                            reponame + ': \n' + err_msg)
                        error_happened_p = True
                        continue
                # print stats:
                for extracted_file in extracted_files_list:
                    write_message("File %s contains %i records." %
                                  (extracted_file,
                                   get_nb_records_in_file(extracted_file)))
                active_files_list = extracted_files_list

            # Filter-phase
            if 'f' in postmode:
                # first call bibfilter:
                res = 0
                uploaded = False
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Filtering material harvested from %s (%i/%i)" %
                                         (reponame,
                                          i,
                                          len(active_files_list)))
                    res += call_bibfilter(str(repos[0][11]), active_file)
                if len(active_files_list) > 0:
                    if res == 0:
                        write_message("material harvested from source " +
                                      reponame +
                                      " was successfully bibfiltered")
                    else:
                        write_message("an error occurred while bibfiltering "
                                      "harvest from " + reponame)
                        error_happened_p = True
                        continue
                # print stats:
                for active_file in active_files_list:
                    write_message("File %s contains %i records." %
                                  (active_file + ".insert.xml",
                                   get_nb_records_in_file(active_file + ".insert.xml")))
                    write_message("File %s contains %i records." %
                                  (active_file + ".correct.xml",
                                   get_nb_records_in_file(active_file + ".correct.xml")))
                    write_message("File %s contains %i records." %
                                  (active_file + ".append.xml",
                                   get_nb_records_in_file(active_file + ".append.xml")))
                    write_message("File %s contains %i records." %
                                  (active_file + ".holdingpen.xml",
                                   get_nb_records_in_file(active_file + ".holdingpen.xml")))

            # Upload files
            if "u" in postmode:
                if 'f' in postmode:
                    # upload filtered files; `res` and `uploaded` carry over
                    # from the filter phase above (res is 0 at this point)
                    i = 0
                    for active_file in active_files_list:
                        task_sleep_now_if_required()
                        i += 1
                        if get_nb_records_in_file(active_file + ".insert.xml") > 0:
                            task_update_progress("Uploading new records harvested from %s (%i/%i)" %
                                                 (reponame,
                                                  i,
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".insert.xml",
                                                  ["-i"],
                                                  oai_src_id=repos[0][0])
                            uploaded = True
                        task_sleep_now_if_required()
                        if get_nb_records_in_file(active_file + ".correct.xml") > 0:
                            task_update_progress("Uploading corrections for records harvested from %s (%i/%i)" %
                                                 (reponame,
                                                  i,
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".correct.xml",
                                                  ["-c"],
                                                  oai_src_id=repos[0][0])
                            uploaded = True
                        if get_nb_records_in_file(active_file + ".append.xml") > 0:
                            task_update_progress("Uploading additions for records harvested from %s (%i/%i)" %
                                                 (reponame,
                                                  i,
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".append.xml",
                                                  ["-a"],
                                                  oai_src_id=repos[0][0])
                            uploaded = True
                        if get_nb_records_in_file(active_file + ".holdingpen.xml") > 0:
                            task_update_progress("Uploading records harvested from %s to holding pen (%i/%i)" %
                                                 (reponame,
                                                  i,
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".holdingpen.xml",
                                                  ["-o"],
                                                  oai_src_id=repos[0][0])
                            uploaded = True
                    if len(active_files_list) > 0:
                        if res == 0:
                            if uploaded:
                                write_message(
                                    "material harvested from source " +
                                    reponame + " was successfully uploaded")
                            else:
                                write_message("nothing to upload")
                        else:
                            write_message("an error occurred while uploading "
                                          "harvest from " + reponame)
                            error_happened_p = True
                            continue
                else:
                    # upload files normally
                    res = 0
                    i = 0
                    uploaded = False
                    for active_file in active_files_list:
                        i += 1
                        task_sleep_now_if_required()
                        if get_nb_records_in_file(active_file) > 0:
                            task_update_progress("Uploading records harvested from %s (%i/%i)" %
                                                 (reponame,
                                                  i,
                                                  len(active_files_list)))
                            res += call_bibupload(active_file,
                                                  oai_src_id=repos[0][0])
                            uploaded = True
                    # Report once per repository, like the filtered branch.
                    if res == 0:
                        if uploaded:
                            write_message(
                                "material harvested from source " +
                                reponame + " was successfully uploaded")
                        else:
                            write_message("nothing to upload")
                    else:
                        write_message("an error occurred while uploading "
                                      "harvest from " + reponame)
                        error_happened_p = True
                        continue
        else:
            ### this should not happen
            write_message("invalid postprocess mode: " + postmode +
                          " skipping repository")
            error_happened_p = True
            continue

    return not error_happened_p
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR, logging=True,
                         asana_key=CFG_ASANA_API_KEY,
                         asana_parent_id=ASANA_PARENT_TASK_ID,
                         skip_result_types='missing'):
    """Update DOIs on documents harvested from ArXiv.

    Downloads and parses the DOI feed, looks up the record for every
    <article> entry by its arXiv preprint id, queues an append upload for
    each record that needs one and finally reports the problematic
    entries through notify_on_errors().

    Parameters:
    :param input_uri: Link to new URI data
        DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
        NOTE: Test data can be taken from
        http://arxiv.org/schemas/doi_feed_test.xml
    :param log_dir: Directory to store log files in
    :param logging: True or False, default True
        NOTE(review): not referenced anywhere in this function body.
    :param asana_key: The Asana API, by default uses the value of
        CFG_ASANA_API_KEY
        NOTE: Passing the value of None for this parameter will skip
        writing to Asana and instead email the instance admin
    :param asana_parent_id: The taskID of the task in Asana to log
        subtasks to
    :param skip_result_types: Error messages to not bother with during
        reporting, input as comma separated values (CSV).
        The problem categories collected here are: missing, ambiguous,
        incorrect (the value is checked by verify_skip_results).

    :return: False when the feed cannot be fetched or parsed, True
        otherwise.
    """
    skip_results = verify_skip_results(skip_result_types)

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI: %s" % (input_uri, ))

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME,
                                 notimechange=False)

    # open url and parse xml
    try:
        feed = urllib.urlopen(input_uri)
        try:
            tree = ET.parse(feed)
        finally:
            # Always release the connection, even when parsing fails.
            feed.close()
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retrieving DOI data")
        return False
    except ExpatError:
        _print("FATAL ERROR: Could not parse XML from: " + input_uri, 1)
        task_update_progress("Failed parsing DOI data")
        return False

    root = tree.getroot()
    try:
        # root.find() returns None when there is no <date> element; the
        # resulting AttributeError is what the handler below catches.
        date_el = root.find('date')
        date_str = '%s-%s-%s' % (date_el.get('year'), date_el.get('month'),
                                 date_el.get('day'))
        _print("Processing DOIs last updated on date %s" % date_str)
    except AttributeError:
        _print("Warning: Couldn't get last published date of Arxiv DOI feed.")

    doi_count = 0
    new_count = 0

    # Stores any DOIs we have issues with, in the structure:
    # Missing: (doi, arxiv preprint_id, published date)
    # Ambiguous: (doi, arxiv preprint_id, rec_ids)
    # Incorrect: (rec_id, old-doi, new-doi)
    problem_dois = {'missing': [], 'ambiguous': [], 'incorrect': []}

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            try:
                record_xml = append_to_record(rec_id, doi, published_date)
            except DOIError as ex:
                problem_dois['incorrect'].append((rec_id, ex.message, doi))
                continue
            if record_xml:
                new_count += 1
                _print("* Now we will run the bibupload for " +
                       "%s record" % rec_id, 5)
                _print("** We will upload the following xml code %s" %
                       repr(record_xml), 9)
                bibupload.add(record_xml)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
            problem_dois['ambiguous'].append((doi, arxiv, repr(rec_id)))
        else:
            _print('No record found matching arxiv ID: %s' % arxiv, 9)
            problem_dois['missing'].append((doi, arxiv, published_date))

    _print("========================| FINAL SCORE |=======================", 1)
    _print("DOIs found and processed: %d" % doi_count, 1)
    _print("Arxiv IDs without corresponding records: %d" %
           len(problem_dois['missing']), 1)
    _print("Arxiv IDs corresponding to multiple records (duplicates): %d" %
           len(problem_dois['ambiguous']), 1)
    _print("Inspire records with an incorrect DOI: %d" %
           len(problem_dois['incorrect']), 1)
    _print("Records without DOIs requiring appends: %d" % new_count, 1)
    _print("==============================================================", 1)

    bibupload.cleanup()

    notify_on_errors(problem_dois, log_dir, doi_count, new_count,
                     asana_key, asana_parent_id, skip_results)

    return True
def generate_sitemaps(collection_names, fulltext_filter=''):
    """
    Generate sitemaps themselves. Return list of generated sitemaps files

    URLs are written, in this order, for: the home page in every language,
    every public record, every public collection in every language, and
    the fulltext, comment and review pages of the records selected by the
    corresponding filter functions.

    @param collection_names: passed to get_all_public_records() and
        get_all_public_collections() to select what is listed.
    @param fulltext_filter: passed to filter_fulltexts() together with
        the public records.
    @return: names of all sitemap files written, in creation order.
    """
    sitemap_id = 1
    writer = SitemapWriter(CFG_WEBDIR + '/sitemap-%s.xml' % sitemap_id)
    sitemaps = [writer.get_name()]
    nb_urls = 0

    def rotate_if_full(current_writer, current_id, url_count):
        """Close the current sitemap and open the next one when needed.

        The check only runs every 100 URLs (and never once url_count
        exceeds MAX_RECORDS); a new file is started when the current one
        is larger than MAX_SIZE or url_count is exactly MAX_RECORDS.
        Returns the writer and sitemap id to keep using.
        """
        if url_count <= MAX_RECORDS and url_count % 100 == 0:
            if (current_writer.get_size() > MAX_SIZE
                    or url_count == MAX_RECORDS):
                current_writer.close()
                current_id += 1
                current_writer = SitemapWriter(
                    '%s/sitemap-%s.xml' % (CFG_WEBDIR, current_id))
                sitemaps.append(current_writer.get_name())
        return current_writer, current_id

    # NOTE: the task_sleep_now_if_required() calls that used to separate
    # the phases below are disabled in this function.
    for [lang, dummy_lang_name] in language_list_long():
        writer.add_url(CFG_SITE_URL + '/?ln=%s' % lang,
                       lastmod=datetime.today(),
                       changefreq=DEFAULT_CHANGEFREQ_HOME,
                       priority=DEFAULT_PRIORITY_HOME)
        nb_urls += 1

    recids = get_all_public_records(collection_names)
    task_update_progress("Generating urls for %s records" % len(recids))
    for (recid, lastmod) in recids:
        writer, sitemap_id = rotate_if_full(writer, sitemap_id, nb_urls)
        # From here on the running count is whatever add_url() returns.
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_RECORDS,
                                 priority=DEFAULT_PRIORITY_RECORDS)

    task_update_progress("Generating urls for collections")
    for (collection, lastmod) in get_all_public_collections(collection_names):
        for [lang, dummy_lang_name] in language_list_long():
            writer, sitemap_id = rotate_if_full(writer, sitemap_id, nb_urls)
            nb_urls = writer.add_url(
                '%s/collection/%s?ln=%s' % (CFG_SITE_URL,
                                            quote(collection), lang),
                lastmod=lastmod,
                changefreq=DEFAULT_CHANGEFREQ_COLLECTIONS,
                priority=DEFAULT_PRIORITY_COLLECTIONS)

    task_update_progress("Generating urls for fulltexts")
    for (recid, lastmod) in filter_fulltexts(recids, fulltext_filter):
        writer, sitemap_id = rotate_if_full(writer, sitemap_id, nb_urls)
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s/files' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_FULLTEXTS,
                                 priority=DEFAULT_PRIORITY_FULLTEXTS)

    task_update_progress("Generating urls for comments")
    for (recid, lastmod) in filter_comments(recids):
        writer, sitemap_id = rotate_if_full(writer, sitemap_id, nb_urls)
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s/comments' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_COMMENTS,
                                 priority=DEFAULT_PRIORITY_COMMENTS)

    task_update_progress("Generating urls for reviews")
    for (recid, lastmod) in filter_reviews(recids):
        writer, sitemap_id = rotate_if_full(writer, sitemap_id, nb_urls)
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s/reviews' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_REVIEWS,
                                 priority=DEFAULT_PRIORITY_REVIEWS)

    try:
        writer.close()
    except Exception:
        # Closing the last writer stays best effort, but interpreter-exit
        # signals (KeyboardInterrupt, SystemExit) are no longer swallowed.
        pass
    return sitemaps
def bst_apsharvest(dois="", recids="", query="", records="", new_mode="email",
                   update_mode="email", from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes", match="no",
                   reportonly="no", threshold_date=None, devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           This means that new records are being looked for at APS servers.
           Active when from_date and until_date is given, in addition when
           a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will
           harvest any new records since last run.

           If match is set to "yes" the records harvested will be matched
           against the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exists in the system,
           the task only harvests the fulltext's themselves and attaches
           them to the records.

    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide
    fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")

    @param dois: comma-separated list of DOIs to download fulltext/metadata
                 for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext
                  for.
    @type query: string

    @param records: get any records modified, created or both since last
                    time in the database to download fulltext for, can be
                    either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead.
                          Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or
                            adds new
                "replace" - replaces all attached files with new fulltext file
                The short forms "i", "a", "c", "r" and "o" are accepted by
                the validation below as well.

                The fulltext is appended by default to new records.
    @type new_mode: string

    @param update_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead.
                          Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or
                            adds new
                "replace" - replaces all attached files with new fulltext file
                The short forms "i", "a", "c", "r" and "o" are accepted by
                the validation below as well.

                The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since
                      last harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until.
                       Ex. 2013-01-01
    @type until_date: string

    @param metadata: should the record have metadata attached? "yes" or "no"
    @type metadata: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done?
                  "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit?
                       "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since.
                           Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string

    @return: 1 when threshold_date cannot be parsed.
    @raise Exception: when new_mode or update_mode is not a known mode.
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()

    task_update_progress("Parsing input parameters")

    # Validate modes
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o",
                        "replace", "r", "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid." %
                            (mode, ))

    # We hide fulltext by default
    hidden = hidden.lower() != "no"

    # We attach fulltext by default
    fulltext = fulltext.lower() != "no"

    # We attach meta-data by default
    metadata = metadata.lower() != "no"

    # We do not match records by default
    match = match.lower() == "yes"

    # We do not run in devmode by default; devmode forces full verbosity
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not reportonly by default
    reportonly = reportonly.lower() == "yes"

    if threshold_date:
        # Input from user. Validate date
        try:
            harvest_from_date = validate_date(threshold_date)
        except ValueError as e:
            write_message("Error parsing threshold_date, use (YYYY-MM-DD): %s" %
                          (str(e), ),
                          stream=sys.stderr)
            return 1
def rabbit(bibrecs, check_invalid_papers=False,
           personids_to_update_extids=None, verbose=False):
    '''
    Bring the author signatures stored for each paper in line with the
    author/coauthor references currently found on that paper.

    For every record the stored signatures are compared with the refs
    from fields 100 and 700: stored signatures with a sufficiently similar
    name are moved to the new ref, the remaining old ones are removed, and
    each unmatched new signature is attached to an existing person (via
    external id or exact name) or to a newly created one.

    @param bibrecs: an iterable full of bibrecs; when empty, all valid
        papers are processed instead
    @type bibrecs: an iterable of ints
    @param check_invalid_papers: when true, filter_bibrecs_outside() is
        called with the list of all valid papers
    @param personids_to_update_extids: extra person ids merged into the
        set of persons whose canonical names and external ids are updated
        at the end
    @param verbose: when true every log line is written at verbosity 1
        instead of 9
    @return: none
    '''
    # The log file only ever receives this header line, so it is closed
    # right away instead of staying open for the whole run.
    with open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w') as logfile:
        logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                # zip() over a single iterable wraps each pid in a 1-tuple,
                # the shape indexed as p[0] further down.
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
            len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):

        logwrite("\nConsidering %s" % str(rec), False)

        if idx % 200 == 0:
            task_sleep_now_if_required(True)

            update_status(float(idx) / len(bibrecs),
                          "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" %
                                 (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(" - Record was deleted, removing from pid and continuing with next record", True)
            remove_papers([rec])
            continue

        # (table, ref) pairs currently present on the paper
        markrefs = frozenset(
            chain(izip(cycle([100]),
                       imap(itemgetter(0), get_author_refs_of_paper(rec))),
                  izip(cycle([700]),
                       imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]]
                         for row in get_signatures_of_paper(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3])
                                  for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new, create_normalized_name(
                split_name_parts(get_name_by_bibref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new],
                                 personidrefs_names[old])
                   for old in old_signatures]
                  for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures),
                 bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures),
                 bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(" - - Moving signature: %s on %s to %s as %s" %
                     (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        pids_having_rec = set(int(row[0])
                              for row in get_signatures_of_paper(rec))
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = list()
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id_of_signature(sig + (rec, ))
                    if inspire_id:
                        matched_pids = list(
                            get_author_by_external_id(inspire_id[0]))
                        # A person already holding a signature on this
                        # record is not a valid target.
                        if (matched_pids and
                                int(matched_pids[0][0]) in pids_having_rec):
                            matched_pids = list()
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids
                            if int(p[0]) not in used_pids]

            if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)
            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])
                pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
def ref_analyzer(citation_informations, dicts,
                 updated_recids, tags, do_catchup=True):
    """Resolve citation links for the updated records and update ``dicts``.

    The dictionaries found in ``dicts`` are modified in place and are also
    returned.

    @param citation_informations: pair ``(records_info, references_info)``;
        each of them maps 'report-numbers', 'journals' and 'doi' to a
        dictionary ``recid -> list of values``
    @param dicts: dictionary holding the keys 'cites_weight', 'cites',
        'refs', 'selfcites', 'selfrefs' and 'authorcites'
    @param updated_recids: record ids whose entries have to be rebuilt
    @param tags: dictionary providing 'refs_report_number', 'refs_journal'
        and 'refs_doi', i.e. the fields searched by the catchup phases
    @param do_catchup: when true, the "cited by" list (and weight) of each
        updated record is reset in phase 0 so that phases 4-6 can refill it
    @return: tuple (citations_weight, citations, references, selfcites,
        selfrefs, authorcites)
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        """Give the scheduler a chance to pause us and report progress."""
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def finish_phase():
        """Report that the current phase is complete."""
        mesg = "done fully"
        write_message(mesg)
        task_update_progress(mesg)

    def add_to_dicts(citer, cited):
        """Record that `citer` cites `cited` in all the dictionaries."""
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return

        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    def alert_admin(msg):
        """Log `msg` on stderr and register it as an exception."""
        write_message(msg, stream=sys.stderr)
        # register_exception() is called from inside an except block, so
        # the error is raised only to provide it with an exception context.
        try:
            raise ValueError(msg)
        except ValueError:
            register_exception(alert_admin=True)

    def link_first_match(citer, value, recids, what):
        """Cite the first record of `recids` from `citer`.

        Keeps the missing-references bookkeeping up to date and alerts the
        admin when `value` matches more than one record.
        """
        if not recids:
            insert_into_missing(citer, value)
        else:
            remove_from_missing(value)

        if len(recids) > 1:
            alert_admin("Whoops: record '%d' %s '%s' "
                        "matches many records; taking only the first one. %s"
                        % (citer, what, value, repr(recids)))

        for recid in list(recids)[:1]:  # take only the first one
            add_to_dicts(citer, recid)

    def drop_empty_lists(mapping):
        """Delete the keys of `mapping` whose value is empty."""
        # Collect the keys first: deleting while iterating over the mapping
        # itself is not allowed.
        for key in [k for k in mapping if not mapping[k]]:
            del mapping[key]

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from "
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            citations.pop(somerecid, None)
            # The weight is incremented every time a citer is (re)added, so
            # it has to restart from zero together with the list; otherwise
            # the citers found again in phases 4-6 would be counted on top
            # of the previous weight.
            if somerecid in citations_weight:
                citations_weight[somerecid] = 0

    for somerecid in updated_recids:
        references.pop(somerecid, None)

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    total = len(references_info['report-numbers'])
    for done, (thisrecid, refnumbers) in \
            enumerate(references_info['report-numbers'].iteritems()):
        step("Report numbers references", thisrecid, done, total)

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" %
                          (refnumber, field, list(recids)), verbose=9)
            link_first_match(thisrecid, refnumber, recids,
                             "report number value")

    finish_phase()

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    total = len(references_info['journals'])
    for done, (thisrecid, refs) in \
            enumerate(references_info['journals'].iteritems()):
        step("Journal references", thisrecid, done, total)

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                alert_admin("Whoops, record '%d' reference value '%s' "
                            "is not well formed; skipping it."
                            % (thisrecid, p))
                continue  # skip this ill-formed value

            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s"
                          % (reference, field, list(recids)), verbose=9)
            link_first_match(thisrecid, p, recids, "reference value")

    finish_phase()

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    total = len(references_info['doi'])
    for done, (thisrecid, refs) in \
            enumerate(references_info['doi'].iteritems()):
        step("DOI references", thisrecid, done, total)

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s"
                          % (reference, field, list(recids)), verbose=9)
            link_first_match(thisrecid, p, recids, "DOI value")

    finish_phase()

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    total = len(records_info['report-numbers'])
    for done, (thisrecid, reportcodes) in \
            enumerate(records_info['report-numbers'].iteritems()):
        step("Report numbers catchup", thisrecid, done, total)

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # arXiv numbers may be followed by a "[category]" suffix in
                # reference lists, hence the regular expression search.
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                    re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    finish_phase()

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    t5 = os.times()[4]
    total = len(records_info['journals'])
    for done, (thisrecid, rec_journals) in \
            enumerate(records_info['journals'].iteritems()):
        step("Journals catchup", thisrecid, done, total)

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s"
                          % (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    finish_phase()

    write_message("Phase 6: DOI catchup")
    t6 = os.times()[4]
    total = len(records_info['doi'])
    for done, (thisrecid, dois) in \
            enumerate(records_info['doi'].iteritems()):
        step("DOI catchup", thisrecid, done, total)

        for doi in dois:
            # Search this record's DOI in the DOI subfield of the other
            # records' reference lists
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s"
                          % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    finish_phase()

    write_message("Phase 7: remove empty lists from dicts")
    # Remove empty lists in citation and reference
    drop_empty_lists(citations)
    drop_empty_lists(references)

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        dumps = (
            ("citation_list (x is cited by y):", citations),
            ("reference_list (x cites y):", references),
            ("selfcitedbydic (x is cited by y and one of the "
             "authors of x same as y's):", selfcites),
            ("selfdic (x cites y and one of the authors of x "
             "same as y's):", selfrefs),
            ("authorcitdic (author is cited in recs):", authorcites),
        )
        for title, mapping in dumps:
            write_message(title)
            write_message(dict(islice(mapping.iteritems(), 10)))
            write_message("size: %s" % len(mapping))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
    write_message("... checking rec journals: %.2f sec" % (t6-t5))
    write_message("... checking rec DOI: %.2f sec" % (t7-t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

    return citations_weight, citations, references, selfcites, \
        selfrefs, authorcites
try: bibarchive = BibRecDocs(recid) except Exception, e: write_message("Could not instantiate record #%s: %s" % (recid, e)) return 0 write_message("Going to create related file formats for record #%s" % recid) i = 0 for docname in docnames: i += 1 task_sleep_now_if_required() msg = "Processing %s (%i/%i)" % (docname, i, len(docnames)) write_message(msg) task_update_progress(msg) try: bibdoc = bibarchive.get_bibdoc(docname) except Exception, e: write_message("Could not process docname %s: %s" % (docname, e)) continue (prev_desc, prev_comment) = \ get_description_and_comment(bibarchive.get_bibdoc(docname).list_latest_files()) # List all files that are not icons or subformats current_files = [bibdocfile.get_path() for bibdocfile in bibdoc.list_latest_files() if \ not bibdocfile.get_subformat() and not bibdocfile.is_icon()] ## current_files = [] ## if not force:
def rabbit(bibrecs=None, check_invalid_papers=False,
           personids_to_update_extids=None, verbose=False):
    """Align the stored author signatures with the current author references.

    For every record: the record is dropped via remove_papers() when it is
    listed as deleted; otherwise its author references (tagged 100) and
    coauthor references (tagged 700) are compared with the signatures
    currently stored for it. Stored signatures are moved onto new references
    when the name comparison score exceeds the threshold, all signatures that
    are no longer referenced are removed, and every remaining new signature
    is attached to a person found by external id or by matchable name, or to
    a newly created person. At the end the canonical names and external ids
    of the touched persons are updated and empty authors are removed.

    @param bibrecs: record ids to process; when empty or None, the result
        of get_all_valid_bibrecs() is processed instead
    @param check_invalid_papers: when true, filter_bibrecs_outside() is
        called with all the valid record ids
    @param personids_to_update_extids: optional person ids added to the set
        of persons refreshed at the end
    @param verbose: when true, sets ``logger.verbose``
    """
    logger = Logger("Rabbit")

    if verbose:
        logger.verbose = True

    if not bibrecs:
        logger.log("Running on all records")
    else:
        logger.log("Running on %s " % (str(bibrecs)))

    populate_mnames_pids_cache()

    global M_NAME_PIDS_CACHE

    memoized_compare_names = memoized(comp_names)
    # The two names are sorted so that (x, y) and (y, x) share one memoized
    # entry.
    compare_names = lambda x, y: memoized_compare_names(*sorted((x, y)))

    def find_pids_by_matchable_name_with_cache(matchable_name):
        # Returns a list: either the single cached entry for the name or,
        # on a cache miss, whatever get_authors_by_name() returns (its first
        # element is then cached).
        try:
            matched_pids = [M_NAME_PIDS_CACHE[matchable_name]]
        except KeyError:
            matched_pids = get_authors_by_name(matchable_name,
                                               use_matchable_name=True)
            if matched_pids:
                M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0]
        return matched_pids

    if USE_EXT_IDS:

        def get_matched_pids_by_external_ids(sig, rec, pids_having_rec):
            '''
            Return the persons matched through the first external id getter
            that yields an id for the signature.

            The result is emptied when the first matched person is already in
            pids_having_rec. When no getter yields an external id the
            function falls off the end and returns None.
            '''
            for get_external_id_of_signature in external_id_getters:
                external_id = get_external_id_of_signature(sig + (rec, ))
                if external_id:
                    matched_pids = list(
                        get_author_by_external_id(external_id[0]))
                    if matched_pids and int(
                            matched_pids[0][0]) in pids_having_rec:
                        matched_pids = list()
                    return matched_pids

    # Minimum name comparison score for moving an old signature onto a new
    # reference.
    threshold = 0.8

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    bibrecs = list(bibrecs)
    for idx, rec in enumerate(bibrecs):

        logger.log("Considering %s" % str(rec))

        if idx % 100 == 0:
            task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec))

        if idx % 1000 == 0:
            # Rebuild the partial MARC caches for the next 1000 records.
            destroy_partial_marc_caches()
            populate_partial_marc_caches(bibrecs[idx:idx + 1000])

            # NOTE(review): log() receives (fraction, text) here while every
            # other call passes a single message; this looks like the
            # argument list of a status update -- confirm against Logger.
            logger.log(
                float(idx) / len(bibrecs), "%d/%d" % (idx, len(bibrecs)))

        if rec in deleted:
            remove_papers([rec])
            continue

        author_refs = get_author_refs_of_paper(rec)
        coauthor_refs = get_coauthor_refs_of_paper(rec)

        # Signatures currently present on the record, as (table, ref) pairs:
        # 100 for the author references, 700 for the coauthor references.
        markrefs = frozenset(
            chain(izip(cycle([100]), imap(itemgetter(0), author_refs)),
                  izip(cycle([700]), imap(itemgetter(0), coauthor_refs))))

        # Each row is a list [int, int, int, row[4]]; the code below uses
        # element 0 as the person id, elements 1-2 as the signature and
        # element 3 as the stored name.
        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_of_paper(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new, get_name_by_bibref(new)) for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        logger.log(" - Deleted signatures: %s" % str(old_signatures))
        logger.log(" - Added signatures: %s" % str(new_signatures))
        logger.log(" - Matrix: %s" % str(matrix))

        #[new_signatures, old_signatures]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logger.log(" - Best match: %s " % str(best_match))

        for new, old in best_match:
            logger.log(" - - Moving signature: %s on %s to %s as %s" %
                       (old, rec, new, new_signatures_names[new]))
            modify_signature(old, rec, new, new_signatures_names[new])

        # remove_signatures() is called with every old signature, including
        # the ones passed to modify_signature() just above.
        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)
        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        # NOTE(review): x[1:3] is a list slice whereas old_signatures holds
        # tuples (the keys of personidrefs_names), and a list never compares
        # equal to a tuple, so this filter appears to always yield an empty
        # list and pids_having_rec starts empty. Confirm the intended
        # condition before changing it.
        remaining_personid_rows = ([
            x for x in personid_rows if x[1:3] in old_signatures
        ])

        pids_having_rec = set([int(row[0]) for row in remaining_personid_rows])
        logger.log(" - Not matched: %s" % str(not_matched))

        if not_matched:
            # Persons that had a signature on this record before the update;
            # they are excluded from the name based matching below.
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matchable_name = create_matchable_name(name)
            matched_pids = list()
            if USE_EXT_IDS:
                matched_pids = get_matched_pids_by_external_ids(
                    sig, rec, pids_having_rec)

                if matched_pids:
                    # An external id match wins over any name based match.
                    add_signature(list(sig) + [rec], name,
                                  matched_pids[0][0], m_name=matchable_name)
                    M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0][0]
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_matchable_name_with_cache(
                matchable_name)
            if not matched_pids:
                # Fall back on the remaining matchable name functions, in
                # order, until one of them yields candidates.
                for matching_function in M_NAME_FUNCTIONS[1:]:
                    matchable_name = matching_function(name)
                    matched_pids = find_pids_by_matchable_name_with_cache(
                        matchable_name)
                    if matched_pids:
                        break

            matched_pids = [p for p in matched_pids if int(p) not in used_pids]

            best_matched_pid = None
            for matched_pid in matched_pids:
                # Because of the wrongly labeled data in the db, all
                # of the possible choices have to be checked. If one of the
                # coauthors, who had his signature already considered, claimed
                # in the past one of the signatures of currently considered
                # author, the algorithm will think that two signatures belong
                # to the same person, and, will create an unnecessary new
                # profile.
                if not int(matched_pid) in pids_having_rec:
                    best_matched_pid = matched_pid
                    break

            if not best_matched_pid:
                # No suitable existing person: create a new one.
                new_pid = new_person_from_signature(
                    list(sig) + [rec], name, matchable_name)
                M_NAME_PIDS_CACHE[matchable_name] = new_pid
                used_pids.add(new_pid)
                updated_pids.add(new_pid)
            else:
                add_signature(list(sig) + [rec], name, best_matched_pid,
                              m_name=matchable_name)
                M_NAME_PIDS_CACHE[matchable_name] = best_matched_pid
                used_pids.add(best_matched_pid)
                updated_pids.add(best_matched_pid)
                pids_having_rec.add(best_matched_pid)

        logger.log('Finished with %s' % str(rec))

    logger.update_status_final()

    destroy_partial_marc_caches()

    if personids_to_update_extids:
        updated_pids |= set(personids_to_update_extids)

    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS,
            force_cache_tables=True)

    destroy_partial_marc_caches()
    destroy_mnames_pids_cache()

    remove_empty_authors()

    task_update_progress("Done!")
def task_run_core():
    """
    Main daemon task: run every rule on the records selected for it.

    Batch rules receive, for each batch, the list of the records they
    apply to; the other rules are applied record by record. Amended
    records are queued and uploaded, invalid records get a ticket, and
    the last run time of every rule is finally stored.

    Returns True when run successfully.
    """
    plugins = load_plugins()
    rules = load_rules(plugins)
    task_set_option('plugins', plugins)
    recids_for_rules = get_recids_for_rules(rules)

    # Collect all the records to visit and split the rules in two groups.
    all_recids = intbitset([])
    single_rules = set()
    batch_rules = set()
    for name, recids in recids_for_rules.iteritems():
        all_recids.union_update(recids)
        group = batch_rules if plugins[rules[name]["check"]]["batch"] \
            else single_rules
        group.add(name)

    pending_holdingpen = []
    pending_replace = []
    for batch in iter_batches(all_recids, CFG_BATCH_SIZE):

        # First the rules working on many records at once
        for name in batch_rules:
            rule = rules[name]
            wanted_recids = recids_for_rules[name]
            task_sleep_now_if_required(can_stop_too=True)
            selected = [record for _, record_id, record in batch
                        if record_id in wanted_recids]
            if len(selected):
                check_records(rule, selected)

        # Then the rules working on one record at a time
        for position, record_id, record in batch:
            percent = int(float(position) / len(all_recids) * 100)
            task_update_progress("Processing record %s/%s (%i%%)." %
                                 (position, len(all_recids), percent))
            write_message("Processing record %s" % record_id)

            for name in single_rules:
                rule = rules[name]
                wanted_recids = recids_for_rules[name]
                task_sleep_now_if_required(can_stop_too=True)
                if record_id in wanted_recids:
                    check_record(rule, record)

            if record.amended:
                queue = pending_holdingpen if record.holdingpen \
                    else pending_replace
                queue.append(record)

            if not record.valid:
                submit_ticket(record, record_id)

        # Upload a queue as soon as it holds a full batch of amendments.
        if len(pending_holdingpen) >= CFG_BATCH_SIZE:
            upload_amendments(pending_holdingpen, True)
            pending_holdingpen = []
        if len(pending_replace) >= CFG_BATCH_SIZE:
            upload_amendments(pending_replace, False)
            pending_replace = []

    ## Upload whatever is left in the queues
    if pending_holdingpen:
        upload_amendments(pending_holdingpen, True)
    if pending_replace:
        upload_amendments(pending_replace, False)

    # Store in the database the time at which the rules were last run
    for name in rules.keys():
        update_rule_last_run(name)

    return True
def get_citation_informations(recid_list, tags, fetch_catchup_info=True):
    """Collect the reference and identifier values of the given records.

    Scans the records of recid_list and reads their reference fields
    (999C5x) and, unless fetch_catchup_info is false, their own
    identifiers (report numbers, DOI, publication info).

    Records found in INTBITSET_OF_DELETED_RECORDS are skipped.

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.

    @param recid_list: record ids to scan
    @param tags: dictionary of the configured tags; the keys read here are
        'refs_report_number', 'refs_journal', 'refs_doi',
        'record_pri_number', 'record_add_number', 'doi' and 'publication'.
        A false value disables the corresponding lookup.
    @param fetch_catchup_info: when false, only the reference fields are
        read and records_info stays empty
    @return: tuple (records_info, references_info); both dictionaries have
        the keys 'report-numbers', 'journals' and 'doi', each mapping a
        recid to a list of values, e.g.
        references_info['report-numbers'] = {93: ['astro-ph/9812088']}
        references_info['journals'] = {93: ['Phys. Rev. Lett. 96 (2006) 081301']}
    """
    begin_time = os.times()[4]

    records_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
    }

    references_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
    }

    def has_values(tag):
        """Tell whether at least one value is stored for `tag`."""
        # A tag may be unconfigured (false value): slicing it to build the
        # table name would fail or produce an invalid table name.
        if not tag:
            return False
        return bool(run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1"
                            % tag[0:2], (tag, )))

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index. The DOI reference tag is probed too
    # since DOI references are harvested below as well.
    probed_tags = (tags['refs_journal'],
                   tags['refs_report_number'],
                   tags.get('refs_doi'))

    if any(has_values(tag) for tag in probed_tags):

        # `done` is only used for status reporting
        for done, recid in enumerate(recid_list):
            if done % 10 == 0:
                task_sleep_now_if_required()
                # in fact we can sleep any time here

            if done % 1000 == 0:
                mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
                write_message(mesg)
                task_update_progress(mesg)

            if recid in INTBITSET_OF_DELETED_RECORDS:
                # do not treat this record since it was deleted; we
                # skip it like this in case it was only soft-deleted
                # e.g. via bibedit (i.e. when collection tag 980 is
                # DELETED but other tags like report number or journal
                # publication info remained the same, so the calls to
                # get_fieldvalues() below would return old values)
                continue

            if tags['refs_report_number']:
                references_info['report-numbers'][recid] \
                    = get_fieldvalues(recid,
                                      tags['refs_report_number'],
                                      sort=False)
                msg = "references_info['report-numbers'][%s] = %r" \
                    % (recid, references_info['report-numbers'][recid])
                write_message(msg, verbose=9)

            if tags['refs_journal']:
                journal_refs = []
                references_info['journals'][recid] = journal_refs
                for ref in get_fieldvalues(recid,
                                           tags['refs_journal'],
                                           sort=False):
                    try:
                        # Inspire specific parsing
                        journal, volume, page = ref.split(',')
                    except ValueError:
                        # Not a "journal,volume,page" value: no alternative
                        # form can be derived, keep the value as it is.
                        pass
                    else:
                        alt_volume = get_alt_volume(volume)
                        if alt_volume:
                            journal_refs.append(
                                ','.join([journal, alt_volume, page]))
                    journal_refs.append(ref)
                msg = "references_info['journals'][%s] = %r" \
                    % (recid, references_info['journals'][recid])
                write_message(msg, verbose=9)

            if tags['refs_doi']:
                references_info['doi'][recid] \
                    = get_fieldvalues(recid, tags['refs_doi'], sort=False)
                msg = "references_info['doi'][%s] = %r" \
                    % (recid, references_info['doi'][recid])
                write_message(msg, verbose=9)

            if not fetch_catchup_info:
                # We do not need the extra info
                continue

            if tags['record_pri_number'] or tags['record_add_number']:
                report_numbers = []
                records_info['report-numbers'][recid] = report_numbers

                if tags['record_pri_number']:
                    report_numbers += get_fieldvalues(
                        recid, tags['record_pri_number'], sort=False)
                if tags['record_add_number']:
                    report_numbers += get_fieldvalues(
                        recid, tags['record_add_number'], sort=False)

                msg = "records_info[%s]['report-numbers'] = %r" \
                    % (recid, records_info['report-numbers'][recid])
                write_message(msg, verbose=9)

            if tags['doi']:
                dois = []
                records_info['doi'][recid] = dois
                for tag in tags['doi']:
                    dois += get_fieldvalues(recid, tag, sort=False)
                msg = "records_info[%s]['doi'] = %r" \
                    % (recid, records_info['doi'][recid])
                write_message(msg, verbose=9)

            # get a combination of
            # journal vol (year) pages
            if tags['publication']:
                records_info['journals'][recid] = get_journal_info(recid, tags)
                msg = "records_info[%s]['journals'] = %r" \
                    % (recid, records_info['journals'][recid])
                write_message(msg, verbose=9)

    else:
        mesg = "Warning: there are no records with tag values for " \
               "%s or %s. Nothing to do." % \
               (tags['refs_journal'], tags['refs_report_number'])
        write_message(mesg)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    end_time = os.times()[4]
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % (end_time - begin_time))

    return records_info, references_info
def get_citation_weight(rank_method_code, config, chunk_size=20000):
    """Compute the citation weights used by the bibrank daemon.

    The records to process come either from the "id" task option (ranges
    of record ids) or from the records modified since the last run of the
    rank method.

    @return: tuple (cites_weight, index_update_time); cites_weight is an
        empty dictionary when there is nothing to process and
        index_update_time is None when the "id" option is used
    """
    def preview(recids):
        """Render recids as text, eliding the middle of very long lists."""
        if len(recids) > 10000:
            return str(recids[:10]) + ' ... ' + str(recids[-10:])
        return str(recids)

    begin_time = time.time()

    quick = task_get_option("quick") != "no"

    # id option forces re-indexing a certain range
    # even if there are no new recs
    if task_get_option("id"):
        # construct a range of records to index
        updated_recids = []
        for lower, upper in task_get_option("id"):
            updated_recids += range(lower, upper + 1)
        write_message('Records to process: %s' % preview(updated_recids))
        index_update_time = None
    else:
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            # A full run considers every record as modified.
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        # An index update time lying in the future is not trusted.
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if index_update_time > now:
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        write_message("%s records to update" % preview(updated_recids))

    if not updated_recids:
        write_message("No new records added since last time this "
                      "rank method was executed")
        return {}, index_update_time

    # result_intermediate should be warranted to exists!
    # but if the user entered a "-R" (do all) option, we need to
    # make an empty start set
    if quick:
        dicts = {
            'cites_weight': last_updated_result(rank_method_code),
            'cites': get_cit_dict("citationdict"),
            'refs': get_cit_dict("reversedict"),
            'selfcites': get_cit_dict("selfcitdict"),
            'selfrefs': get_cit_dict("selfcitedbydict"),
            'authorcites': get_initial_author_dict(),
        }
    else:
        dicts = {
            'cites_weight': {},
            'cites': {},
            'refs': {},
            'selfcites': {},
            'selfrefs': {},
            'authorcites': {},
        }

    # Process fully the updated records
    process_and_store(updated_recids, config, dicts, chunk_size, quick)

    elapsed = time.time() - begin_time
    write_message("Total time of get_citation_weight(): %.2f sec" % elapsed)
    task_update_progress("citation analysis done")

    return dicts['cites_weight'], index_update_time
def oairepositoryupdater_task():
    """Main business logic code of oai_archive.

    Works out which records must gain or lose OAI set membership or an OAI
    identifier, writes the corresponding MARCXML corrections to temporary
    files and, unless the "no_upload" option is set, submits them to
    bibupload. Returns True.
    """
    def marc_parts(field):
        """Split a 6 character MARC field spec into keyword arguments."""
        return {'tag': field[:3], 'ind1': field[3],
                'ind2': field[4], 'code': field[5]}

    def open_chunk():
        """Create a new temporary output file and start the collection."""
        handle, path = mkstemp(dir=CFG_TMPDIR,
                               prefix='oairepository_' +
                               time.strftime("%Y%m%d_%H%M%S_",
                                             time.localtime()))
        stream = os.fdopen(handle, "w")
        stream.write("<collection>")
        return stream, path

    def close_chunk(stream, path):
        """Terminate the collection and close the output file."""
        stream.write("</collection>")
        stream.close()
        write_message("Wrote to file %s" % path)

    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    initial_snapshot = dict((spec, get_set_definitions(spec))
                            for spec in all_set_specs())
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    oai_out, filename = open_chunk()

    # Number of records written to the current output file
    tot = 0
    for position, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." %
                             (position, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Reuse the OAI identifier of the record or forge a new one.
        oai_id_entry = record_get_field_value(record,
                                              **marc_parts(CFG_OAI_ID_FIELD))
        assign_oai_id_entry = not oai_id_entry
        if assign_oai_id_entry:
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)

        # Sets to which the record belongs according to its metadata
        current_oai_sets = set(
            record_get_field_values(record, **marc_parts(CFG_OAI_SET_FIELD)))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(
                record, **marc_parts(CFG_OAI_PREVIOUS_SET_FIELD)))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Sets to which the record should belong according to the settings
        updated_oai_sets = set(
            spec for spec, members in recids_for_set.iteritems()
            if recid in members)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(
            spec for spec in
            (current_previous_oai_sets - updated_oai_sets) |
            (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # The record is left alone when its sets are already right and it
        # does not need a new OAI identifier.
        if not assign_oai_id_entry and current_oai_sets == updated_oai_sets:
            write_message("Nothing has changed for record %s, let's move on!"
                          % recid, verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!"
                      % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        subfields.extend((CFG_OAI_SET_FIELD[5], oai_set)
                         for oai_set in updated_oai_sets)
        subfields.extend((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set)
                         for oai_set in updated_previous_oai_sets)

        record_add_field(new_record, tag="001",
                         controlfield_value=str(recid))
        record_add_field(new_record,
                         tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            # The current file is full: ship it and start a new one.
            close_chunk(oai_out, filename)
            if not no_upload:
                task_low_level_submission('bibupload', 'oairepository',
                                          '-c', filename)
            oai_out, filename = open_chunk()
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    close_chunk(oai_out, filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        if tot > 0:
            task_low_level_submission('bibupload', 'oairepository',
                                      '-c', filename, '-n')
        else:
            # Nothing was written to the last file
            os.remove(filename)

    return True
def _analyze_documents(records, taxonomy_name, collection,
                       output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.

    For every record, the files returned by
    BibRecDocs(record).list_latest_files() are inspected. Each PDF is passed
    to bibclassify_engine.get_keywords_from_local_file() and the results of
    all PDFs of the record are merged. Non-PDF files are reported with a
    warning and skipped. The module-level _INDEX counter is incremented once
    per record and used for progress reporting.

    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name (only used in the warning that is
        emitted when records is empty)
    @keyword output_limit: int, max number of keywords to extract
        (defaults to bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER)
    @return: str, marcxml output format of results (one <record> element per
        record with at least one keyword), or False when records is empty
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message("WARNING: No records were found in collection %s." %
                              collection, stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files()  # TODO: why this doesn't call list_all_files() ?
        keywords = {}
        akws = {}
        acro = {}

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if not bibclassify_text_extractor.is_pdf(doc.get_full_path()):
                # Skip right away: there is nothing to merge for this file,
                # and falling through would either merge None (TypeError) or
                # re-merge the results of a previously processed PDF.
                bibtask.write_message('WARNING: BibClassify does not know how to process '
                                      'doc: %s (type: %s) -- ignoring it.' %
                                      (doc.fullpath, doc.doctype),
                                      stream=sys.stderr, verbose=3)
                continue

            bibtask.write_message('INFO: Generating keywords for record %d.' %
                                  record, stream=sys.stderr, verbose=3)
            fulltext = doc.get_path()

            single_keywords, composite_keywords, author_keywords, acronyms = \
                bibclassify_engine.get_keywords_from_local_file(
                    fulltext, taxonomy_name, with_author_keywords=True,
                    output_mode="raw", output_limit=output_limit,
                    match_mode='partial')

            if single_keywords or composite_keywords:
                cleaned_single = bibclassify_engine.clean_before_output(single_keywords)
                cleaned_composite = bibclassify_engine.clean_before_output(composite_keywords)
                # merge the groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            # Guard against the engine handing back None for these groups.
            acro.update(acronyms or {})
            akws.update(author_keywords or {})

        if keywords:
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(bibclassify_engine._output_marc(keywords.items(), (), akws, acro,
                                                          spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                                  record, stream=sys.stderr, verbose=0)

        _INDEX += 1

        bibtask.task_update_progress('Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task.

    Collects the recids to work on (either the 'recids' task option or the
    records fetched relative to the last stored run date), calls
    process_one() on each of them, and finally submits the fixmarc and
    refextract tasks for every record that was updated.

    @param name: identifier passed to fetch_last_updated() and
        store_last_updated()
    @return: True
    """
    # First gather recids to process
    recids = task_get_option('recids')
    if not recids:
        start_date = datetime.now()
        dummy, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)
        if task_get_option('missing'):
            recids |= fetch_records_missing_arxiv_fulltext()
        else:
            recids |= fetch_records_missing_arxiv_fulltext() & \
                      fetch_records_modified_since(last_date)
    else:
        # Explicit recids were given: the last-run date must not be touched.
        start_date = None

    updated_recids = set()

    try:
        for position, recid in enumerate(recids):
            if position % 50 == 0:
                progress = 'Done %s of %s' % (position, len(recids))
                write_message(progress)
                task_update_progress(progress)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message('processing %s' % recid, verbose=9)
            # Seconds to wait before the next record: 6 after a record was
            # handled or recognised as already done, 20 after a failure.
            pause = 6
            try:
                if process_one(recid):
                    updated_recids.add(recid)
            except AlreadyHarvested:
                write_message('already harvested successfully')
            except FoundExistingPdf:
                write_message('pdf already attached (matching md5)')
            except PdfNotAvailable:
                write_message("no pdf available")
                pause = 20
            except InvenioFileDownloadError as e:
                write_message("failed to download: %s" % e)
                pause = 20
            time.sleep(pause)
    finally:
        # We want to process updated records even in case we are interrupted
        summary = 'Updated %s records' % len(updated_recids)
        write_message(summary)
        task_update_progress(summary)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True