def call_bibupload(marcxmlfile, mode=None, oai_src_id=-1, sequence_id=None):
    """
    Creates a bibupload task for the task scheduler in given mode
    on given file. Returns the generated task id and logs the event
    in oaiHARVESTLOGS, also adding any given oai source identifier.

    :param marcxmlfile: base-marcxmlfilename to upload
    :param mode: mode to upload in
    :param oai_src_id: id of current source config
    :param sequence_id: sequence-number, if relevant
    :return: task_id if successful, otherwise None.
    """
    if mode is None:
        mode = ["-r", "-i"]
    if os.path.exists(marcxmlfile):
        try:
            args = mode
            # Add job with priority 6 (above normal bibedit tasks)
            # and file to upload to arguments
            args.extend(["-P", "6", marcxmlfile])
            if sequence_id:
                args.extend(['-I', str(sequence_id)])
            task_id = task_low_level_submission("bibupload", "oaiharvest",
                                                *tuple(args))
            create_oaiharvest_log(task_id, oai_src_id, marcxmlfile)
        except Exception as msg:
            write_message("An exception occurred while submitting the "
                          "oaiharvest task: %s" % (str(msg),))
            return None
        return task_id
    else:
        write_message("marcxmlfile %s does not exist" % (marcxmlfile,))
        return None
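# Illustrative usage sketch (not part of the original module): scheduling an
# upload for a freshly harvested file. The path and oai_src_id below are
# hypothetical; mode defaults to ["-r", "-i"] (replace or insert) when omitted.
harvested_file = "/tmp/oaiharvest_example.xml"  # hypothetical path
task_id = call_bibupload(harvested_file, mode=["-r", "-i"], oai_src_id=3)
if task_id is None:
    write_message("bibupload task could not be scheduled for %s" % harvested_file)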
def perform_insert_record(data_dict, data_dict_ordered, data_list_sorted, value,
                          recid, spacing=CFG_BIBSORT_WEIGHT_DISTANCE):
    """Inserts a new record into all the data structures"""
    #data_dict
    data_dict[recid] = value
    #data_dict_ordered & data_list_sorted
    #calculate at which index the rec should be inserted in data_list_sorted
    index_for_insert = binary_search(data_list_sorted, value, data_dict)
    #we have to calculate the weight of this record in data_dict_ordered
    #and it will be the midpoint between its neighbours in data_list_sorted
    if index_for_insert == len(data_list_sorted):
        #insert at the end of the list
        data_list_sorted.append(recid)
        #weight = highest weight + the distance
        data_dict_ordered[recid] = data_dict_ordered[data_list_sorted[index_for_insert - 1]] + spacing
    else:
        if index_for_insert == 0:
            #insert at the beginning of the list
            left_neighbor_weight = 0
        else:
            left_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert - 1]]
        right_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert]]
        #the recid's weight will be the midpoint between left and right
        weight = (right_neighbor_weight - left_neighbor_weight)/2
        if weight < 1:
            #there is no more space to insert, we have to create some space
            data_list_sorted.insert(index_for_insert, recid)
            data_dict_ordered[recid] = left_neighbor_weight + spacing
            create_space_for_new_weight(index_for_insert, data_dict_ordered, data_list_sorted, spacing)
        else:
            data_list_sorted.insert(index_for_insert, recid)
            data_dict_ordered[recid] = left_neighbor_weight + weight
    write_message("Record %s done." % recid, verbose=5)
    return index_for_insert
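# Minimal worked example (hypothetical data): inserting recid 7 with value "b"
# between two existing records, assuming the module's binary_search() helper
# returns the insertion position. The weights stay spaced out so that later
# insertions rarely need create_space_for_new_weight().
data_dict = {1: "a", 2: "c"}
data_dict_ordered = {1: 10, 2: 20}   # recid -> weight
data_list_sorted = [1, 2]            # recids ordered by value
perform_insert_record(data_dict, data_dict_ordered, data_list_sorted, "b", 7)
# data_list_sorted is now [1, 7, 2] and recid 7 gets weight 15,
# i.e. left_neighbor_weight + (right - left) / 2.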
def iterate_over_new(list, fmt):
    """
    Iterate over list of IDs

    @param list: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    global total_rec

    formatted_records = ''      # (string-)List of formatted record of an iteration
    tbibformat = 0      # time taken up by external call
    tbibupload = 0      # time taken up by external call
    start_date = task_get_task_param('task_starting_time')  # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message(" ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
def run_sorting_method(recids, method_name, method_id, definition, washer): """Does the actual sorting for the method_name for all the records in the database""" run_sorting_for_rnk = False if definition.startswith('RNK'): run_sorting_for_rnk = True field_data_dictionary = get_field_data(recids, method_name, definition) if not field_data_dictionary: write_message("POSSIBLE ERROR: The sorting method --%s-- has no data!" \ %method_name) return True apply_washer(field_data_dictionary, washer) #do we have any locale constraint? sorting_locale = locale_for_sorting(washer) sorted_data_list, sorted_data_dict = \ sort_dict(field_data_dictionary, CFG_BIBSORT_WEIGHT_DISTANCE, run_sorting_for_rnk, sorting_locale) executed = write_to_methoddata_table(method_id, field_data_dictionary, \ sorted_data_dict, sorted_data_list) if not executed: return False if CFG_BIBSORT_BUCKETS > 1: bucket_dict, bucket_last_rec_dict = split_into_buckets(sorted_data_list, len(sorted_data_list)) for idx in bucket_dict: executed = write_to_buckets_table(method_id, idx, bucket_dict[idx], \ sorted_data_dict[bucket_last_rec_dict[idx]]) if not executed: return False else: executed = write_to_buckets_table(method_id, 1, intbitset(sorted_data_list), \ sorted_data_list[-1]) if not executed: return False return True
def get_modified_or_inserted_recs(method_list): """Returns a list of recids that have been inserted or modified since the last update of the bibsort methods in method_list method_list should already contain a list of methods that SHOULD be updated, if it contains new methods, an error will be thrown""" if not method_list: #just to be on the safe side return 0 try: query = """SELECT min(d.last_updated) from "bsrMETHODDATA" d, "bsrMETHOD" m WHERE m.name in (%s) AND d."id_bsrMETHOD" = m.id""" % \ ("%s," * len(method_list))[:-1] last_updated = str(run_sql(query, tuple(method_list))[0][0]) except Error as err: write_message("Error when trying to get the last_updated date " \ "from bsrMETHODDATA: [%s]" %err, sys.stderr) return 0 recids = [] try: results = run_sql("SELECT id from bibrec \ where modification_date >= %s", (last_updated, )) if results: recids = [result[0] for result in results] except Error as err: write_message("Error when trying to get the list of " \ "modified records: [%s]" %err, sys.stderr) return 0 return recids
def create_ticket(queue, subject, text=""): """ This function will submit a ticket using the configured BibCatalog system. :param queue: the ticketing queue to send a ticket to :type queue: string :param subject: subject of the ticket :type subject: string :param text: the main text or body of the ticket. Optional. :type text: string :return: return the ID of the created ticket, or None on failure :rtype: int or None """ # Initialize BibCatalog connection as default user, if possible if bibcatalog_system is not None: bibcatalog_response = bibcatalog_system.check_system() else: bibcatalog_response = "No ticket system configured" if bibcatalog_response != "": write_message("BibCatalog error: %s\n" % (bibcatalog_response,)) return None ticketid = bibcatalog_system.ticket_submit(subject=subject, queue=queue) if text: comment = bibcatalog_system.ticket_comment(None, ticketid, text) if comment is None: write_message("Error: commenting on ticket %s failed." % (str(ticketid),)) return ticketid
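# Hedged usage sketch (queue name, subject and body are invented): open a ticket
# for a record that needs manual curation, assuming bibcatalog_system has been
# configured for this module.
ticket_id = create_ticket("Harvesting", "Record 1234 needs manual review",
                          text="Automatic matching was ambiguous.")
if ticket_id is None:
    write_message("Ticket creation failed; see the BibCatalog error above.")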
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name): """ Generate sitemaps themselves. @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps @param records: the list of (recid, modification_date) tuples to process @param output_directory: directory where to store the sitemaps @param sitemap_name: the name (prefix) of the sitemap files(s) """ sitemap_id = 1 writer = SitemapWriter(sitemap_id, output_directory, sitemap_name) sitemap_index_writer.add_url(writer.get_sitemap_url()) nb_urls = 0 write_message("... Getting sitemap '%s'..." % sitemap_name) write_message("... Generating urls for %s records..." % len(records)) task_sleep_now_if_required(can_stop_too=True) for i, (recid, lastmod) in enumerate(records): if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS): sitemap_id += 1 writer = SitemapWriter(sitemap_id, output_directory, sitemap_name) sitemap_index_writer.add_url(writer.get_sitemap_url()) nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid), lastmod = lastmod, changefreq = DEFAULT_CHANGEFREQ_RECORDS, priority = DEFAULT_PRIORITY_RECORDS) if i % 100 == 0: task_update_progress("Google Scholar sitemap '%s' for recid %s/%s" % (sitemap_name, i + 1, len(records))) task_sleep_now_if_required(can_stop_too=True)
def update_sorting(methods, recids):
    """Runs the updating of the sorting tables for methods and recids
    Recids is a list of integer numbers(record ids)
    but can also contain intervals"""
    method_list = []
    if methods:
        method_list = methods.strip().split(',')

    recid_list = []
    if recids:
        cli_recid_list = recids.strip().split(',')
        for recid in cli_recid_list:
            if recid.find('-') > 0:
                rec_range = recid.split('-')
                try:
                    recid_min = int(rec_range[0])
                    recid_max = int(rec_range[1])
                    for rec in range(recid_min, recid_max + 1):
                        recid_list.append(rec)
                except ValueError as err:
                    write_message("Error: [%s] occurred while trying "
                                  "to parse the recids argument." % err,
                                  sys.stderr)
                    return False
            else:
                recid_list.append(int(recid))
    return run_bibsort_update(recid_list, method_list)
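# Illustrative call (method names and recid ranges are hypothetical): update the
# bibsort tables for two methods, limiting the run to recids 1-100 plus 205.
# Interval strings like "1-100" are expanded by update_sorting() itself.
update_sorting("title,author", "1-100,205")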
def get_config_parameter(jobname, parameter_name, is_parameter_collection = False): """Detect export method of JOBNAME. Basically, parse JOBNAME.cfg and return export_method. Return None if problem found.""" jobconfig = ConfigParser() jobconffile = CFG_ETCDIR + os.sep + 'bibexport' + os.sep + jobname + '.cfg' if not os.path.exists(jobconffile): write_message("ERROR: cannot find config file %s." % jobconffile) return None jobconfig.read(jobconffile) if is_parameter_collection: all_items = jobconfig.items(section='export_job') parameters = [] for item_name, item_value in all_items: if item_name.startswith(parameter_name): parameters.append(item_value) return parameters else: parameter = jobconfig.get('export_job', parameter_name) return parameter
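# Usage sketch, assuming a hypothetical bibexport job file
# CFG_ETCDIR/bibexport/sitemap.cfg with an [export_job] section such as:
#
#   [export_job]
#   export_method = sitemap
#   collection_1 = Articles
#   collection_2 = Preprints
#
# The job name and parameter names above are examples, not a documented schema.
export_method = get_config_parameter("sitemap", "export_method")
collections = get_config_parameter("sitemap", "collection",
                                    is_parameter_collection=True)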
def fetch_updated_arxiv_records(date): """Fetch all the arxiv records modified since the last run""" def check_arxiv(recid): """Returns True for arxiv papers""" for report_number in get_fieldvalues(recid, "037__9"): if report_number == "arXiv": return True return False # Fetch all records inserted since last run sql = ( "SELECT `id`, `modification_date` FROM `bibrec` " "WHERE `modification_date` >= %s " "ORDER BY `modification_date`" ) records = run_sql(sql, [date.isoformat()]) records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)] # Show all records for debugging purposes if task_get_option("verbose") >= 9: write_message("recids:", verbose=9) for recid, mod_date in records: write_message("* %s, %s" % (recid, mod_date), verbose=9) task_update_progress("Done fetching %s arxiv record ids" % len(records)) return records
def get_external_links_from_db(ref, dict_of_ids, reference_indicator): """returns a dictionary containing the number of external links for each recid external link=citation that is not in our database """ ext_links = {} dict_all_ref = {} for recid in dict_of_ids: dict_all_ref[recid] = 0 ext_links[dict_of_ids[recid]] = 0 reference_db_id = reference_indicator[0:2] reference_tag_regex = reference_indicator + "[a-z]" tag_list = run_sql("select id from bib" + reference_db_id + \ "x where tag RLIKE %s", (reference_tag_regex, )) tag_set = set() for tag in tag_list: tag_set.add(tag[0]) ref_list = run_sql("select id_bibrec, id_bibxxx, field_number from \ bibrec_bib" + reference_db_id + "x group by \ id_bibrec, field_number") for item in ref_list: recid = int(item[0]) id_bib = int(item[1]) if recid in dict_of_ids and id_bib in tag_set: dict_all_ref[recid] += 1 for recid in dict_of_ids: total_links = dict_all_ref[recid] internal_links = ref[dict_of_ids[recid]] ext_links[dict_of_ids[recid]] = total_links - internal_links if ext_links[dict_of_ids[recid]] < 0: ext_links[dict_of_ids[recid]] = 0 write_message("External link information extracted", verbose=2) write_message("External links: %s" % str(ext_links), verbose=9) return ext_links
def pagerank_ext(conv_threshold, check_point, len_, sparse, semi_sparse):
    """the core function of the PAGERANK_EXT method
    returns an array with the ranks corresponding to each recid"""
    weights_old = ones((len_), float32)
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in (range(check_point)):
            weights_new = zeros((len_), float32)
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)]*weights_old[j]
            total_sum = 0.0
            for j in semi_sparse:
                total_sum += semi_sparse[j]*weights_old[j]
            weights_new[1:len_] = weights_new[1:len_] + total_sum
            if step == check_point - 1:
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff))/len_
                write_message("Finished step: %s, %s " \
                    % (str(check_point*(nr_of_check_points-1) + step), \
                       str(difference)), verbose=5)
            weights_old = weights_new.copy()
        converged = (difference < conv_threshold)
    write_message("PageRank calculation for all recids finished in %s steps. "
                  "The final difference was %s" % (str(nr_of_check_points),
                                                   str(difference)), verbose=2)
    #return weights_old[1:len_]/(len_ - weights_old[0])
    return weights_old[1:len_]
def get_data_for_definition_marc(tags, recids): '''Having a list of tags and a list of recids, it returns a dictionary with the values correspondig to the tags''' #x = all_recids; [get_fieldvalues(recid, '037__a') for recid in x] #user: 140s, sys: 21s, total: 160s - cdsdev if isinstance(recids, (int, long)): recids = intbitset([recids, ]) # for each recid we need only one value #on which we sort, so we can stop looking for a value # as soon as we find one tag_index = 0 field_data_dict = {} while len(recids) > 0 and tag_index < len(tags): write_message('%s records queried for values for tags %s.' \ %(len(recids), tags), verbose=5) res = _get_values_from_marc_tag(tags[tag_index], recids) res_dict = dict(res) #field_data_dict.update(res_dict) #we can not use this, because res_dict might contain recids #that are already in field_data_dict, and we should not overwrite their value field_data_dict = dict(res_dict, **field_data_dict) #there might be keys that we do not want (ex: using 'between') #so we should remove them res_dict_keys = intbitset(res_dict.keys()) recids_not_needed = res_dict_keys.difference(recids) for recid in recids_not_needed: del field_data_dict[recid] #update the recids to contain only the recid that do not have values yet recids.difference_update(res_dict_keys) tag_index += 1 return field_data_dict
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appropriate CLI options, we can query here for additional
    records.
    """
    write_message("Querying database (records query)...")
    res = intbitset()
    if params['field'] or params['collection'] or params['pattern']:
        if not params['collection']:
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=params['pattern'],
                                 f=params['field'],
                                 m=params['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None, of='id',
                                                   c=params['collection'],
                                                   p=params['pattern'],
                                                   f=params['field']))
    return res
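# Example parameter dictionary (keys taken from the function body, values
# hypothetical) exercising the search_pattern() branch; setting 'collection'
# would switch to the perform_request_search() branch instead.
params = {
    'field': 'title',
    'pattern': 'higgs',
    'matching': '',      # default matching mode
    'collection': '',    # empty -> search_pattern() is used
}
matched_recids = query_records(params)
write_message("%s records matched" % len(matched_recids))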
def solr_add_ranges(id_ranges): sub_range_length = task_get_option("flush") id_ranges_to_index = [] for id_range in id_ranges: lower_recid = id_range[0] upper_recid = id_range[1] i_low = lower_recid while i_low <= upper_recid: i_up = min(i_low + sub_range_length - 1, upper_recid) id_ranges_to_index.append((i_low, i_up)) i_low += sub_range_length tags_to_index = get_tags() # Indexes latest records first by reversing # This allows the ranker to return better results during long indexing # runs as the ranker cuts the hitset using latest records id_ranges_to_index.reverse() next_commit_counter = 0 for id_range_to_index in id_ranges_to_index: lower_recid = id_range_to_index[0] upper_recid = id_range_to_index[1] status_msg = "Solr ranking indexer called for %s-%s" % (lower_recid, upper_recid) write_message(status_msg) task_update_progress(status_msg) next_commit_counter = solr_add_range(lower_recid, upper_recid, tags_to_index, next_commit_counter) solr_commit_if_necessary(next_commit_counter, final_commit=True)
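# Standalone illustration of the sub-range splitting done above, with the
# "flush" task option hard-coded to 100 for the example:
sub_range_length = 100
id_ranges_to_index = []
for lower_recid, upper_recid in [(1, 250)]:
    i_low = lower_recid
    while i_low <= upper_recid:
        i_up = min(i_low + sub_range_length - 1, upper_recid)
        id_ranges_to_index.append((i_low, i_up))
        i_low += sub_range_length
# id_ranges_to_index == [(1, 100), (101, 200), (201, 250)]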
def upload_amendments(records, holdingpen): """ Upload a modified record """ if task_get_option("no_upload", False) or len(records) == 0: return xml = '<collection xmlns="http://www.loc.gov/MARC21/slim">' for record in records: xml += record_xml_output(record) xml += "</collection>" tmp_file_fd, tmp_file = mkstemp( suffix='.xml', prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"), dir=CFG_TMPSHAREDDIR ) os.write(tmp_file_fd, xml) os.close(tmp_file_fd) os.chmod(tmp_file, 0644) if holdingpen: flag = "-o" else: flag = "-r" task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file) write_message("Submitted bibupload task %s" % task)
def load_configuration(): """Loads the configuration for the bibsort.cfg file into the database""" config_file = cfg.get('CFG_SORTER_CONFIGURATION', pkg_resources.resource_filename( 'invenio.legacy.bibsort', 'bibsort.cfg')) write_message('Reading config data from: %s' % (config_file, )) config = ConfigParser.ConfigParser() try: config.readfp(open(config_file)) except StandardError as err: write_message("Cannot find configuration file: %s" \ %config_file, stream=sys.stderr) return False to_insert = [] for section in config.sections(): try: name = config.get(section, "name") definition = config.get(section, "definition") washer = config.get(section, "washer") except (ConfigParser.NoOptionError, StandardError) as err: write_message("For each sort_field you need to define at least \ the name, the washer and the definition. \ [error: %s]" %err, stream=sys.stderr) return False to_insert.append((name, definition, washer)) # all the values were correctly read from the config file truncate_table("bsrMETHOD") write_message('Old data has been deleted from bsrMETHOD table', verbose=5) for row in to_insert: run_sql("""INSERT INTO "bsrMETHOD"(name, definition, washer) VALUES (%s, %s, %s)""", (row[0], row[1], row[2])) write_message('Method %s has been inserted into bsrMETHOD table' \ %row[0], verbose=5) return True
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.
    """
    write_message("... writing %s" % dirname + os.sep + filename)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        msg = "ERROR: cannot find %s." % cmd
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s | %s -c " % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME),
            CFG_PATH_GZIP)
    exit_code, stdout, stderr = run_shell_command(cmd, None, dirname + os.sep + filename)
    if exit_code:
        msg = "ERROR: mysqldump exit code is %s." % repr(exit_code)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    if stdout:
        msg = "ERROR: mysqldump stdout is %s." % repr(stdout)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    if stderr:
        msg = "ERROR: mysqldump stderr is %s." % repr(stderr)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
def fun(): try: return task_run_core(name, core_func, extra_vars) except Exception: # Remove extra '\n' write_message(traceback.format_exc()[:-1]) raise
def print_missing(num): """ Print the contents of rnkCITATIONDATAEXT table containing external records that were cited by NUM or more internal records. NUM is by default taken from the -E command line option. """ if not num: num = task_get_option("print-extcites") write_message( "Listing external papers cited by %i or more \ internal records:" % num ) res = run_sql( """SELECT COUNT(id_bibrec), extcitepubinfo FROM rnkCITATIONDATAEXT GROUP BY extcitepubinfo HAVING COUNT(id_bibrec) >= %s ORDER BY COUNT(id_bibrec) DESC""", (num,), ) for cnt, brec in res: print(str(cnt), "\t", brec) write_message("Listing done.")
def fetch_concerned_arxiv_records(name):
    task_update_progress("Fetching arxiv record ids")

    dummy, last_date = fetch_last_updated(name)

    # Fetch all records inserted since last run
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
          "WHERE `modification_date` >= %s " \
          "AND `creation_date` > NOW() - INTERVAL 7 DAY " \
          "ORDER BY `modification_date` " \
          "LIMIT 5000"
    records = run_sql(sql, [last_date.isoformat()])

    def check_arxiv(recid):
        record = get_record(recid)
        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False

    def check_pdf_date(recid):
        doc = get_pdf_doc(recid)
        if doc:
            return doc.md > last_date
        return False

    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]
    records = [(r, mod_date) for r, mod_date in records if check_pdf_date(r)]
    write_message("recids %s" % repr([(r, mod_date.isoformat()) \
                                      for r, mod_date in records]))
    task_update_progress("Done fetching arxiv record ids")
    return records
def replace_cites(recid, new_cites): """ Given a set of citations, replaces the citations of given recid in the database. The changes are logged into rnkCITATIONLOG. See @replace_refs """ old_cites = set(row[0] for row in run_sql("""SELECT citer FROM rnkCITATIONDICT WHERE citee = %s""", [recid])) cites_to_add = new_cites - old_cites cites_to_delete = old_cites - new_cites for cite in cites_to_add: write_message('adding cite %s %s' % (recid, cite), verbose=1) now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("""INSERT INTO rnkCITATIONDICT (citee, citer, last_updated) VALUES (%s, %s, %s)""", (recid, cite, now)) run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date) VALUES (%s, %s, %s, %s)""", (recid, cite, 'added', now)) for cite in cites_to_delete: write_message('deleting cite %s %s' % (recid, cite), verbose=1) now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("""DELETE FROM rnkCITATIONDICT WHERE citee = %s and citer = %s""", (recid, cite)) run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date) VALUES (%s, %s, %s, %s)""", (recid, cite, 'removed', now))
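# Hedged example (recid and citer ids are invented): make the stored citations
# of record 42 exactly {10, 11, 12}. replace_cites() computes the rows to add
# and delete itself and logs every change in rnkCITATIONLOG.
replace_cites(42, set([10, 11, 12]))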
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None): # Counter full or final commit if counter set if next_commit_counter == task_get_option("flush") - 1 or (final_commit and next_commit_counter > 0): recid_info = '' if recid: recid_info = ' for recid=%s' % recid status_msg = 'Solr ranking indexer COMMITTING' + recid_info write_message(status_msg) task_update_progress(status_msg) try: # Commits might cause an exception, most likely a # timeout while hitting a background merge # Changes will then be committed later by the # calling (periodical) task # Also, autocommits can be used in the solrconfig SOLR_CONNECTION.commit() except: register_exception(alert_admin=True) next_commit_counter = 0 task_sleep_now_if_required(can_stop_too=True) else: next_commit_counter = next_commit_counter + 1 return next_commit_counter
def iterate_over_new(recIDs, fmt): """Iterate over list of IDs. @param list: the list of record IDs to format @param fmt: the output format to use @return: tuple (total number of records, time taken to format, time taken to insert) """ tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call tot = len(recIDs) reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get( fmt.lower(), _update_format) for count, recID in enumerate(recIDs): t1 = os.times()[4] reformat_function(recID, fmt) t2 = os.times()[4] tbibformat += t2 - t1 if count % 100 == 0: write_message(" ... formatted %s records out of %s" % (count, tot)) task_update_progress('Formatted %s out of %s' % (count, tot)) task_sleep_now_if_required(can_stop_too=True) if tot % 100 != 0: write_message(" ... formatted %s records out of %s" % (tot, tot)) return tot, tbibformat, tbibupload
def set_amended(self, message): """ Mark the record as amended """ write_message("Amended record %s by rule %s: %s" % (self.record_id, self.rule["name"], message)) self.amendments.append("Rule %s: %s" % (self.rule["name"], message)) self.amended = True if self.rule["holdingpen"]: self.holdingpen = True
def insert_into_cit_db(dic, name): """Stores citation dictionary in the database""" ndate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) s = serialize_via_marshal(dic) write_message("size of %s %s" % (name, len(s))) # check that this column really exists run_sql("""REPLACE INTO rnkCITATIONDATA(object_name, object_value, last_updated) VALUES (%s, %s, %s)""", (name, s, ndate))
def _task_write_message(message): """ Stores the messages in a global list for notifications @param message: the message that should be printed as task status @type message: string """ write_message(message) global _MSG_HISTORY _MSG_HISTORY.append(message)
def calculate_time_weights(len_, time_decay, dates):
    """calculates the time coefficients for each paper"""
    current_year = int(datetime.datetime.now().strftime("%Y"))
    date_coef = {}
    for j in range(len_):
        date_coef[j] = exp(time_decay*(dates[j] - current_year))
    write_message("Time weights calculated", verbose=5)
    write_message("Time weights: %s" % str(date_coef), verbose=9)
    return date_coef
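# Worked example (numbers are illustrative): with time_decay = 0.5 a paper from
# two years ago is weighted exp(0.5 * (-2)) ~= 0.37, while a current-year paper
# keeps weight exp(0) == 1.0. `dates` maps matrix index -> publication year.
this_year = datetime.datetime.now().year
example_weights = calculate_time_weights(2, 0.5, {0: this_year - 2, 1: this_year})
# example_weights == {0: ~0.37, 1: 1.0}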
def statistics_on_sparse(sparse): """returns the number of papers that cite themselves""" count_diag = 0 for (i, j) in sparse.keys(): if i == j: count_diag += 1 write_message("The number of papers that cite themselves: %s" % \ str(count_diag), verbose=3) return count_diag
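# The `sparse` argument is a dictionary keyed by (citing, cited) index pairs, as
# built elsewhere in the citerank code. Hypothetical example with one self-citation:
sparse_example = {(0, 1): 0.5, (1, 1): 0.5, (2, 0): 1.0}
assert statistics_on_sparse(sparse_example) == 1   # only (1, 1) lies on the diagonal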
def leaves(ref):
    """returns the number of papers that do not cite any other paper"""
    nr_of_leaves = 0
    for i in ref:
        if i == 0:
            nr_of_leaves += 1
    write_message("The number of papers that do not cite any other papers: %s"
                  % str(nr_of_leaves), verbose=3)
    return nr_of_leaves
def run_bibsort_update(recids=None, method_list=None): """Updates bibsort tables for the methods in method_list and for the records in recids. If recids is None: recids = all records that have been modified or inserted since last update If method_list is None: method_list = all the methods available in bsrMETHOD table""" write_message('Initial data for run_bibsort_update method: ' \ 'number of recids = %s; method_list=%s' \ %(str(len(recids)), method_list), verbose=5) write_message('Updating sorting data.') bibsort_methods, errors = get_bibsort_methods_details(method_list) if errors: return False method_list = bibsort_methods.keys() if not method_list: write_message('No methods found in bsrMETHOD table.. exiting.') return True #we could have 4 types of methods: #(i) RNK methods -> they should be rebalanced, not updated #(ii) RNK methods to delete -> we should delete their data #(iii) non RNK methods to update #(iv) non RNK methods that are new -> they should be rebalanced(sorted), not updated #check which of the methods are RNK methods (they do not need modified recids) rnk_methods = get_rnk_methods(bibsort_methods) rnk_methods_updated, rnk_methods_deleted = get_modified_rnk_methods( rnk_methods, bibsort_methods) #check which of the methods have no data, so they are actually new, #so they need balancing(sorting) instead of updating non_rnk_methods = [ method for method in bibsort_methods.keys() if method not in rnk_methods ] non_rnk_methods_updated, non_rnk_methods_inserted = get_modified_non_rnk_methods( non_rnk_methods) #(i) + (iv) methods_to_balance = rnk_methods_updated + non_rnk_methods_inserted if methods_to_balance: # several methods require rebalancing(sorting) and not updating return run_bibsort_rebalance(methods_to_balance) #(ii) #remove the data for the ranking methods that have been deleted for method in rnk_methods_deleted: task_sleep_now_if_required(can_stop_too=True) task_update_progress("Deleting data for method %s" % method) write_message('Starting deleting the data for RNK method %s' % method, verbose=5) executed_ok = delete_bibsort_data_for_method( bibsort_methods[method]['id']) if not executed_ok: write_message('Method %s could not be deleted correctly, aborting..' \ %method, sys.stderr) return False #(iii) #methods to actually update if non_rnk_methods_updated: # we want to update some 'normal'(not RNK) tables, so we need recids update_timestamp = False if not recids: recids = get_modified_or_inserted_recs(non_rnk_methods_updated) if recids == 0: #error signal return False if not recids: write_message("No records inserted or modified in bibrec table " \ "since the last update of bsrMETHODDATA.") return True write_message("These records have been recently modified/inserted: %s" \ %str(recids), verbose=5) update_timestamp = True recids_i = intbitset(recids) for method in non_rnk_methods_updated: task_sleep_now_if_required(can_stop_too=True) task_update_progress("Updating method %s" % method) write_message('Starting updating method %s' % method, verbose=5) executed_ok = update_bibsort_tables(recids_i, method, update_timestamp) if not executed_ok: write_message('Method %s could not be executed correctly, aborting..' \ %method, sys.stderr) return False return True
def run_bibsort_rebalance(method_list=None): """Rebalances all buckets for the methods in method_list""" bibsort_methods, errors = get_bibsort_methods_details(method_list) if errors: return False if not bibsort_methods: write_message('No methods found.. exiting rebalancing.') return True #check if there are only ranking methods -> no need for recids rnk_methods = get_rnk_methods(bibsort_methods) non_rnk_method = [ method for method in bibsort_methods.keys() if method not in rnk_methods ] write_message('Running rebalancing for methods: %s' % bibsort_methods.keys()) if non_rnk_method: # we have also 'normal' (no RNK) methods, so we need the recids recids = get_all_recids(including_deleted=False) write_message('Rebalancing will run for %s records.' \ %str(len(recids)), verbose=5) task_sleep_now_if_required(can_stop_too=True) else: recids = intbitset([]) write_message('Rebalancing will run only for RNK methods') for name in bibsort_methods: task_update_progress('Rebalancing %s method.' % name) write_message('Starting sorting the data for %s method ... ' \ %name.upper()) executed_ok = run_sorting_method(recids, name, bibsort_methods[name]['id'], bibsort_methods[name]['definition'], bibsort_methods[name]['washer']) if not executed_ok: write_message('Method %s could not be executed correctly.' \ %name, sys.stderr) return False write_message('Done.') task_sleep_now_if_required(can_stop_too=True) task_update_progress('Rebalancing done.') return True
def get_words_from_fulltext(self, url_direct_or_indirect): """Returns all the words contained in the document specified by URL_DIRECT_OR_INDIRECT with the words being split by various SRE_SEPARATORS regexp set earlier. If FORCE_FILE_EXTENSION is set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF file. (This is interesting to index Indico for example.) Note also that URL_DIRECT_OR_INDIRECT may be either a direct URL to the fulltext file or an URL to a setlink-like page body that presents the links to be indexed. In the latter case the URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs to fulltext documents, for all knows file extensions as specified by global CONV_PROGRAMS config variable. """ write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2) try: if bibdocfile_url_p(url_direct_or_indirect): write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2) try: bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect) except InvenioBibDocFileError: # Outdated 8564 tag return [] indexer = get_idx_indexer('fulltext') if indexer != 'native': # A document might belong to multiple records for rec_link in bibdoc.bibrec_links: recid = rec_link["recid"] # Adds fulltexts of all files once per records if not recid in fulltext_added: bibrecdocs = BibRecDocs(recid) try: text = bibrecdocs.get_text() except InvenioBibDocFileError: # Invalid PDF continue if indexer == 'SOLR' and CFG_SOLR_URL: solr_add_fulltext(recid, text) elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED: xapian_add(recid, 'fulltext', text) fulltext_added.add(recid) # we are relying on an external information retrieval system # to provide full-text indexing, so dispatch text to it and # return nothing here: return [] else: text = "" if hasattr(bibdoc, "get_text"): text = bibdoc.get_text() return self.tokenize_for_words_default(text) else: if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY: write_message( "... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2) return [] write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2) urls_to_index = set() for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES): if re.match(splash_re, url_direct_or_indirect): write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2) html = urllib2.urlopen(url_direct_or_indirect).read() urls = get_links_in_html_page(html) write_message( "... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3) for url in urls: if re.match(url_re, url): write_message( "... will index %s (matched by %s)" % (url, url_re), verbose=2) urls_to_index.add(url) if not urls_to_index: urls_to_index.add(url_direct_or_indirect) write_message("... 
will extract words from %s" % ', '.join(urls_to_index), verbose=2) words = {} for url in urls_to_index: tmpdoc = download_url(url) file_converter_logger = get_file_converter_logger() old_logging_level = file_converter_logger.getEffectiveLevel( ) if self.verbose > 3: file_converter_logger.setLevel(logging.DEBUG) try: try: tmptext = convert_file(tmpdoc, output_format='.txt') text = open(tmptext).read() os.remove(tmptext) indexer = get_idx_indexer('fulltext') if indexer != 'native': if indexer == 'SOLR' and CFG_SOLR_URL: solr_add_fulltext( None, text) # FIXME: use real record ID if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED: #xapian_add(None, 'fulltext', text) # FIXME: use real record ID pass # we are relying on an external information retrieval system # to provide full-text indexing, so dispatch text to it and # return nothing here: tmpwords = [] else: tmpwords = self.tokenize_for_words_default( text) words.update(dict(map(lambda x: (x, 1), tmpwords))) except Exception as e: message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % ( url, url_direct_or_indirect, e) register_exception(prefix=message, alert_admin=True) write_message(message, stream=sys.stderr) finally: os.remove(tmpdoc) if self.verbose > 3: file_converter_logger.setLevel(old_logging_level) return words.keys() except Exception as e: message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % ( url_direct_or_indirect, e) register_exception(prefix=message, alert_admin=True) write_message(message, stream=sys.stderr) return []
def warn(self, msg): """ Add a warning to the record """ self.warnings.append("Rule %s: %s" % (self.rule["name"], msg)) write_message("[WARN] record %s by rule %s: %s" % (self.record_id, self.rule["name"], msg))
def perform_update_buckets(recids_current_ordered, recids_to_insert, recids_old_ordered, method_id, update_timestamp=True): """Updates the buckets""" bucket_insert = {} bucket_delete = {} write_message("Updating the buckets for method_id = %s" % method_id, verbose=5) buckets = run_sql( "SELECT bucket_no, bucket_last_value \ FROM bsrMETHODDATABUCKET \ WHERE id_bsrMETHOD = %s", (method_id, )) if not buckets: write_message("No bucket data found for method_id %s." \ %method_id, sys.stderr) raise Exception #sort the buckets to be sure we are iterating them in order(1 to max): buckets_dict = dict(buckets) for recid in recids_to_insert: for bucket_no in buckets_dict: if recids_current_ordered[recid] <= buckets_dict[bucket_no]: bucket_insert.setdefault(bucket_no, []).append(recid) break for recid in recids_old_ordered: record_inserted = 0 record_deleted = 0 for bucket_no in buckets_dict: bucket_value = int(buckets_dict[bucket_no]) if record_inserted and record_deleted: #both insertion and deletion have been registered break if recids_current_ordered[recid] <= bucket_value and \ recids_old_ordered[recid] <= bucket_value and \ not record_inserted and \ not record_deleted: #both before and after the modif, #recid should be in the same bucket -> nothing to do break if recids_current_ordered[ recid] <= bucket_value and not record_inserted: #recid should be, after the modif, here, so insert bucket_insert.setdefault(bucket_no, []).append(recid) record_inserted = 1 if recids_old_ordered[recid] <= bucket_value and not record_deleted: #recid was here before modif, must be removed bucket_delete.setdefault(bucket_no, []).append(recid) record_deleted = 1 for bucket_no in buckets_dict: if (bucket_no in bucket_insert) or (bucket_no in bucket_delete): res = run_sql("SELECT bucket_data FROM bsrMETHODDATABUCKET \ where id_bsrMETHOD = %s AND bucket_no = %s" , \ (method_id, bucket_no, )) bucket_data = intbitset(res[0][0]) for recid in bucket_insert.get(bucket_no, []): bucket_data.add(recid) for recid in bucket_delete.get(bucket_no, []): if recid in bucket_data: bucket_data.remove(recid) if update_timestamp: date = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("UPDATE bsrMETHODDATABUCKET \ SET bucket_data = %s, last_updated = %s \ WHERE id_bsrMETHOD = %s AND bucket_no = %s" , \ (bucket_data.fastdump(), date, method_id, bucket_no, )) else: run_sql("UPDATE bsrMETHODDATABUCKET \ SET bucket_data = %s \ WHERE id_bsrMETHOD = %s AND bucket_no = %s" , \ (bucket_data.fastdump(), method_id, bucket_no, )) write_message("Updating bucket %s for method %s." % (bucket_no, method_id), verbose=5)
def write_to_buckets_table(id_method, bucket_no, bucket_data, bucket_last_value, update_timestamp=True): """Serialize the date and write it to the bsrMEHODDATA_BUCKETS""" write_message('Writing the data for bucket number %s for ' \ 'method_id=%s to the database' \ %(bucket_no, id_method), verbose=5) write_message('Serializing data for bucket number %s' % bucket_no, verbose=5) serialized_bucket_data = bucket_data.fastdump() date = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) if not update_timestamp: try: date = run_sql('SELECT last_updated from bsrMETHODDATABUCKET WHERE id_bsrMETHOD = %s and bucket_no = %s', \ (id_method, bucket_no))[0][0] except IndexError: pass # keep the generated date try: write_message('Deleting old data.', verbose=5) run_sql("DELETE FROM bsrMETHODDATABUCKET \ WHERE id_bsrMETHOD = %s AND bucket_no = %s" , \ (id_method, bucket_no, )) write_message('Inserting new data.', verbose=5) run_sql("INSERT into bsrMETHODDATABUCKET \ (id_bsrMETHOD, bucket_no, bucket_data, bucket_last_value, last_updated) \ VALUES (%s, %s, %s, %s, %s)" , \ (id_method, bucket_no, serialized_bucket_data, bucket_last_value, date, )) except Error as err: write_message("The error [%s] occured when inserting new bibsort data " \ "into bsrMETHODATA_BUCKETS table" %err, sys.stderr) return False write_message('Writing to bsrMETHODDATABUCKET for ' \ 'bucket number %s completed.' %bucket_no, verbose=5) return True
def update_bibsort_tables(recids, method, update_timestamp=True): """Updates the data structures for sorting method: method for the records in recids""" res = run_sql( "SELECT id, definition, washer \ from bsrMETHOD where name = %s", (method, )) if res and res[0]: method_id = res[0][0] definition = res[0][1] washer = res[0][2] else: write_message('No sorting method called %s could be found ' \ 'in bsrMETHOD table.' %method, sys.stderr) return False res = run_sql( "SELECT data_dict, data_dict_ordered, data_list_sorted \ FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, )) if res and res[0]: data_dict = deserialize_via_marshal(res[0][0]) data_dict_ordered = {} data_list_sorted = [] else: write_message('No data could be found for the sorting method %s.' \ %method) return False #since this case should have been treated earlier #get the values for the recids that need to be recalculated field_data = get_field_data(recids, method, definition) if not field_data: write_message("Possible error: the method %s has no data for records %s." \ %(method, str(recids))) else: apply_washer(field_data, washer) #if a recid is not in field_data that is because no value was found for it #so it should be marked for deletion recids_to_delete = list(recids.difference(intbitset(field_data.keys()))) recids_to_insert = [] recids_to_modify = {} for recid in field_data: if recid in data_dict: if data_dict[recid] != field_data[recid]: #we store the old value recids_to_modify[recid] = data_dict[recid] else: # recid is new, and needs to be inserted recids_to_insert.append(recid) #remove the recids that were not previously in bibsort recids_to_delete = [ recid for recid in recids_to_delete if recid in data_dict ] #dicts to keep the ordered values for the recids - useful bor bucket insertion recids_current_ordered = {} recids_old_ordered = {} if recids_to_insert or recids_to_modify or recids_to_delete: data_dict_ordered = deserialize_via_marshal(res[0][1]) data_list_sorted = deserialize_via_marshal(res[0][2]) if recids_to_modify: write_message("%s records have been modified." \ %len(recids_to_modify), verbose=5) for recid in recids_to_modify: recids_old_ordered[recid] = data_dict_ordered[recid] perform_modify_record(data_dict, data_dict_ordered, \ data_list_sorted, field_data[recid], recid) if recids_to_insert: write_message("%s records have been inserted." \ %len(recids_to_insert), verbose=5) for recid in recids_to_insert: perform_insert_record(data_dict, data_dict_ordered, \ data_list_sorted, field_data[recid], recid) if recids_to_delete: write_message("%s records have been deleted." \ %len(recids_to_delete), verbose=5) for recid in recids_to_delete: perform_delete_record(data_dict, data_dict_ordered, data_list_sorted, recid) for recid in recids_to_modify: recids_current_ordered[recid] = data_dict_ordered[recid] for recid in recids_to_insert: recids_current_ordered[recid] = data_dict_ordered[recid] #write the modifications to db executed = write_to_methoddata_table(method_id, data_dict, \ data_dict_ordered, data_list_sorted, update_timestamp) if not executed: return False #update buckets try: perform_update_buckets(recids_current_ordered, recids_to_insert, recids_old_ordered, method_id, update_timestamp) except Error as err: write_message("[%s] The bucket data for method %s has not been updated" \ %(method, err), sys.stderr) return False return True
def oairepositoryupdater_task(): """Main business logic code of oai_archive""" no_upload = task_get_option("no_upload") report = task_get_option("report") if report > 1: print_repository_status(verbose=report) return True if run_sql( """SELECT id FROM "schTASK" WHERE proc='bibupload:oairepository' AND status='WAITING'""" ): write_message( "Previous requests of oairepository still being elaborated. Let's skip this execution." ) return True initial_snapshot = {} for set_spec in all_set_specs(): initial_snapshot[set_spec] = get_set_definitions(set_spec) write_message("Initial set snapshot: %s" % pformat(initial_snapshot), verbose=2) task_update_progress("Fetching records to process") recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, m='e') write_message("%s recids have an OAI ID" % len(recids_with_oaiid), verbose=2) all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, m='e') no_more_exported_recids = intbitset(all_current_recids) write_message("%s recids are currently exported" % (len(all_current_recids)), verbose=2) all_affected_recids = intbitset() all_should_recids = intbitset() recids_for_set = {} for set_spec in all_set_specs(): if not set_spec: set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC should_recids = get_recids_for_set_spec(set_spec) recids_for_set[set_spec] = should_recids no_more_exported_recids -= should_recids all_should_recids |= should_recids current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, m='e') write_message( "%s recids should be in %s. Currently %s are in %s" % (len(should_recids), set_spec, len(current_recids), set_spec), verbose=2) to_add = should_recids - current_recids write_message("%s recids should be added to %s" % (len(to_add), set_spec), verbose=2) to_remove = current_recids - should_recids write_message("%s recids should be removed from %s" % (len(to_remove), set_spec), verbose=2) affected_recids = to_add | to_remove write_message("%s recids should be hence updated for %s" % (len(affected_recids), set_spec), verbose=2) all_affected_recids |= affected_recids missing_oaiid = all_should_recids - recids_with_oaiid write_message("%s recids are missing an oaiid" % len(missing_oaiid)) write_message("%s recids should no longer be exported" % len(no_more_exported_recids)) ## Let's add records with missing OAI ID all_affected_recids |= missing_oaiid | no_more_exported_recids write_message("%s recids should updated" % (len(all_affected_recids)), verbose=2) if not all_affected_recids: write_message("Nothing to do!") return True # Prepare to save results in a tmp file (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR, prefix='oairepository_' + \ time.strftime("%Y%m%d_%H%M%S_", time.localtime())) oai_out = os.fdopen(fd, "w") oai_out.write("<collection>") tot = 0 # Iterate over the recids for i, recid in enumerate(all_affected_recids): task_sleep_now_if_required(can_stop_too=True) task_update_progress("Done %s out of %s records." % \ (i, len(all_affected_recids))) write_message("Elaborating recid %s" % recid, verbose=3) record = get_record(recid) if not record: write_message("Record %s seems empty. Let's skip it." % recid, verbose=3) continue new_record = {} # Check if an OAI identifier is already in the record or # not. 
assign_oai_id_entry = False oai_id_entry = record_get_field_value(record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], code=CFG_OAI_ID_FIELD[5]) if not oai_id_entry: assign_oai_id_entry = True oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid) write_message("Setting new oai_id %s for record %s" % (oai_id_entry, recid), verbose=3) else: write_message("Already existing oai_id %s for record %s" % (oai_id_entry, recid), verbose=3) # Get the sets to which this record already belongs according # to the metadata current_oai_sets = set( record_get_field_values(record, tag=CFG_OAI_SET_FIELD[:3], ind1=CFG_OAI_SET_FIELD[3], ind2=CFG_OAI_SET_FIELD[4], code=CFG_OAI_SET_FIELD[5])) write_message("Record %s currently belongs to these oai_sets: %s" % (recid, ", ".join(current_oai_sets)), verbose=3) current_previous_oai_sets = set( record_get_field_values(record, tag=CFG_OAI_PREVIOUS_SET_FIELD[:3], ind1=CFG_OAI_PREVIOUS_SET_FIELD[3], ind2=CFG_OAI_PREVIOUS_SET_FIELD[4], code=CFG_OAI_PREVIOUS_SET_FIELD[5])) write_message( "Record %s currently doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(current_previous_oai_sets)), verbose=3) # Get the sets that should be in this record according to # settings updated_oai_sets = set(_set for _set, _recids in iteritems(recids_for_set) if recid in _recids) write_message("Record %s now belongs to these oai_sets: %s" % (recid, ", ".join(updated_oai_sets)), verbose=3) updated_previous_oai_sets = set( _set for _set in (current_previous_oai_sets - updated_oai_sets) | (current_oai_sets - updated_oai_sets)) write_message( "Record %s now doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(updated_previous_oai_sets)), verbose=3) # Ok, we have the old sets and the new sets. If they are equal # and oai ID does not need to be added, then great, nothing to # change . Otherwise apply the new sets. if current_oai_sets == updated_oai_sets and not assign_oai_id_entry: write_message("Nothing has changed for record %s, let's move on!" % recid, verbose=3) continue # Jump to next recid write_message("Something has changed for record %s, let's update it!" 
% recid, verbose=3) subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)] for oai_set in updated_oai_sets: subfields.append((CFG_OAI_SET_FIELD[5], oai_set)) for oai_set in updated_previous_oai_sets: subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set)) record_add_field(new_record, tag="001", controlfield_value=str(recid)) record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], subfields=subfields) oai_out.write(record_xml_output(new_record)) tot += 1 if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE: oai_out.write("</collection>") oai_out.close() write_message("Wrote to file %s" % filename) if not no_upload: if task_get_option("notimechange"): task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n', '-Noairepository', '-P', '-1') else: task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-Noairepository', '-P', '-1') # Prepare to save results in a tmp file (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR, prefix='oairepository_' + \ time.strftime("%Y%m%d_%H%M%S_", time.localtime())) oai_out = os.fdopen(fd, "w") oai_out.write("<collection>") tot = 0 task_sleep_now_if_required(can_stop_too=True) oai_out.write("</collection>") oai_out.close() write_message("Wrote to file %s" % filename) if tot > 0: if not no_upload: task_sleep_now_if_required(can_stop_too=True) if task_get_option("notimechange"): task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n') else: task_low_level_submission('bibupload', 'oairepository', '-c', filename) else: os.remove(filename) return True
def write_to_methoddata_table(id_method, data_dict, data_dict_ordered, data_list_sorted, update_timestamp=True): """Serialize the date and write it to the bsrMETHODDATA""" write_message('Starting serializing the data..', verbose=5) serialized_data_dict = serialize_via_marshal(data_dict) serialized_data_dict_ordered = serialize_via_marshal(data_dict_ordered) serialized_data_list_sorted = serialize_via_marshal(data_list_sorted) write_message('Serialization completed.', verbose=5) date = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) if not update_timestamp: try: date = run_sql( 'SELECT last_updated from bsrMETHODDATA WHERE id_bsrMETHOD = %s', (id_method, ))[0][0] except IndexError: pass # keep the generated date write_message("Starting writing the data for method_id=%s " \ "to the database (table bsrMETHODDATA)" %id_method, verbose=5) try: write_message('Deleting old data..', verbose=5) run_sql("DELETE FROM bsrMETHODDATA WHERE id_bsrMETHOD = %s", (id_method, )) write_message('Inserting new data..', verbose=5) run_sql("INSERT into bsrMETHODDATA \ (id_bsrMETHOD, data_dict, data_dict_ordered, data_list_sorted, last_updated) \ VALUES (%s, %s, %s, %s, %s)" , \ (id_method, serialized_data_dict, serialized_data_dict_ordered, \ serialized_data_list_sorted, date, )) except Error as err: write_message("The error [%s] occured when inserting new bibsort data "\ "into bsrMETHODATA table" %err, sys.stderr) return False write_message('Writing to the bsrMETHODDATA successfully completed.', \ verbose=5) return True
def citerank(rank_method_code): """new ranking method based on the citation graph""" write_message("Running rank method: %s" % rank_method_code, verbose=0) if not import_numpy: write_message('The numpy package could not be imported. \ This package is compulsory for running the citerank methods.') return try: file_ = configuration.get(rank_method_code + '.cfg', '') config = ConfigParser.ConfigParser() config.readfp(open(file_)) except StandardError: write_message("Cannot find configuration file: %s" % file_, sys.stderr) raise StandardError # the file for citations needs to have the following format: #each line needs to be x[tab]y, where x cites y; x,y are recids function = config.get("rank_method", "function") try: file_for_citations = config.get(function, "file_with_citations") cit, dict_of_ids = get_citations_from_file(file_for_citations) except (ConfigParser.NoOptionError, StandardError) as err: write_message("If you want to read the citation data from file set up \ the file_for_citations parameter in the config file [%s]" % err, verbose=2) cit, dict_of_ids = get_citations_from_db() len_ = len(dict_of_ids.keys()) write_message("Number of nodes(papers) to rank : %s" % str(len_), verbose=3) if len_ == 0: write_message("No citation data found, nothing to be done.") return try: method = config.get(function, "citerank_method") except ConfigParser.NoOptionError as err: write_message("Exception: %s " % err, sys.stderr) raise Exception write_message("Running %s method." % method, verbose=2) dates = get_dates(function, config, dict_of_ids) if method == "citation_time": try: time_decay = float(config.get(function, "time_decay")) except (ConfigParser.NoOptionError, ValueError) as err: write_message("Exception: %s" % err, sys.stderr) raise Exception date_coef = calculate_time_weights(len_, time_decay, dates) #cit = remove_loops(cit, dates, dict_of_ids) dict_of_ranks = \ run_citation_rank_time(cit, dict_of_ids, date_coef, dates) else: try: conv_threshold = float(config.get(function, "conv_threshold")) check_point = int(config.get(function, "check_point")) damping_factor = float(config.get(function, "damping_factor")) write_message("Parameters: d = %s, conv_threshold = %s, \ check_point = %s" %(str(damping_factor), \ str(conv_threshold), str(check_point)), verbose=5) except (ConfigParser.NoOptionError, StandardError) as err: write_message("Exception: %s" % err, sys.stderr) raise Exception if method == "pagerank_classic": ref = construct_ref_array(cit, dict_of_ids, len_) use_ext_cit = "" try: use_ext_cit = config.get(function, "use_external_citations") write_message("Pagerank will use external citations: %s" \ %str(use_ext_cit), verbose=5) except (ConfigParser.NoOptionError, StandardError) as err: write_message("%s" % err, verbose=2) if use_ext_cit == "yes": try: ext_citation_file = config.get(function, "ext_citation_file") ext_links = get_external_links_from_file( ext_citation_file, ref, dict_of_ids) except (ConfigParser.NoOptionError, StandardError): write_message("If you want to read the external citation \ data from file set up the ext_citation_file parameter in the config. 
file" , \ verbose=3) try: reference_tag = config.get(function, "ext_reference_tag") dummy = int(reference_tag[0:3]) except (ConfigParser.NoOptionError, StandardError): write_message( "You need to set up correctly the \ reference_tag in the cfg file", sys.stderr) raise Exception ext_links = get_external_links_from_db(ref, \ dict_of_ids, reference_tag) avg = avg_ext_links_with_0(ext_links) if avg < 1: write_message("This method can't be ran. There is not \ enough information about the external citation. Hint: check the reference tag" , \ sys.stderr) raise Exception avg_ext_links_without_0(ext_links) try: alpha = float(config.get(function, "ext_alpha")) beta = float(config.get(function, "ext_beta")) except (ConfigParser.NoOptionError, StandardError) as err: write_message("Exception: %s" % err, sys.stderr) raise Exception dict_of_ranks = run_pagerank_ext(cit, dict_of_ids, ref, \ ext_links, conv_threshold, check_point, alpha, beta, dates) else: dict_of_ranks = run_pagerank(cit, dict_of_ids, len_, ref, \ damping_factor, conv_threshold, check_point, dates) elif method == "pagerank_time": try: time_decay = float(config.get(function, "time_decay")) write_message("Parameter: time_decay = %s" \ %str(time_decay), verbose=5) except (ConfigParser.NoOptionError, StandardError) as err: write_message("Exception: %s" % err, sys.stderr) raise Exception date_coef = calculate_time_weights(len_, time_decay, dates) cit = remove_loops(cit, dates, dict_of_ids) ref = construct_ref_array(cit, dict_of_ids, len_) dict_of_ranks = run_pagerank_time(cit, dict_of_ids, len_, ref, \ damping_factor, conv_threshold, check_point, date_coef, dates) else: write_message( "Error: Unknown ranking method. \ Please check the ranking_method parameter in the config. file.", sys.stderr) raise Exception try: filename_ranks = config.get(function, "output_ranks_to_filename") max_ranks = config.get(function, "output_rank_limit") if not max_ranks.isdigit(): max_ranks = len_ else: max_ranks = int(max_ranks) if max_ranks > len_: max_ranks = len_ ranks = sort_weights(dict_of_ranks) write_message("Ranks: %s" % str(ranks), verbose=9) write_first_ranks_to_file(ranks, dict_of_ranks, \ max_ranks, filename_ranks) except (ConfigParser.NoOptionError, StandardError): write_message("If you want the ranks to be printed in a file you have \ to set output_ranks_to_filename and output_rank_limit \ parameters in the configuration file", verbose=3) normalize_weights(dict_of_ranks) into_db(dict_of_ranks, rank_method_code)
def tweet_to_record(tweet, query):
    """
    Transform a tweet into a record.
    @note: you may want to highly customize this.
    """
    rec = {}
    ## Let's normalize the body of the tweet.
    text = tweet.text.encode('UTF-8')
    text = text.replace('&gt;', '>')
    text = text.replace('&lt;', '<')
    text = text.replace('&quot;', "'")
    text = text.replace('&amp;', '&')

    ## Let's add the creation date
    try:
        creation_date = time.strptime(tweet.created_at, '%a, %d %b %Y %H:%M:%S +0000')
    except ValueError:
        creation_date = time.strptime(tweet.created_at, '%a %b %d %H:%M:%S +0000 %Y')
    record_add_field(rec, '260__c', time.strftime('%Y-%m-%dT%H:%M:%SZ', creation_date))

    ## Let's add the Tweet ID
    record_add_field(rec, '970', subfields=[('a', str(tweet.id))])

    ## Let's add the body of the tweet as an abstract
    record_add_field(rec, '520', subfields=[('a', text)])

    ## Let's re-add the body of the tweet as a title.
    record_add_field(rec, '245', subfields=[('a', text)])

    ## Let's fetch information about the user
    try:
        user = _TWITTER_API.GetUser(tweet.from_user)

        ## Let's add the user name as author of the tweet
        record_add_field(rec, '100', subfields=[('a', str(user.name.encode('UTF-8')))])

        ## Let's fetch the icon of the user profile, and let's upload it as
        ## an image (and an icon of itself)
        record_add_field(rec, 'FFT', subfields=[('a', user.profile.image_url.encode('UTF-8')),
                                                ('x', user.profile.image_url.encode('UTF-8'))])
    except Exception as err:
        write_message("WARNING: issue when fetching the user: %s" % err, stream=sys.stderr)

    if hasattr(tweet, 'iso_language_code'):
        ## Let's add the language of the Tweet if available (also this depends)
        ## on the kind of Twitter API call we used
        record_add_field(rec, '045', subfields=[('a', tweet.iso_language_code.encode('UTF-8'))])

    ## Let's tag this record as a TWEET so that later we can build a collection
    ## out of these records.
    record_add_field(rec, '980', subfields=[('a', 'TWEET'), ('b', query)])

    ## Some smart manipulations: let's parse out URLs and tags from the body
    ## of the Tweet.
    for url in _RE_GET_HTTP.findall(text):
        url = url[0]
        record_add_field(rec, '856', '4', subfields=[('u', url)])

    for tag in _RE_TAGS.findall(text):
        ## And here we add the keywords.
        record_add_field(rec, '653', '1', subfields=[('a', tag), ('9', 'TWITTER')])

    ## Finally we shall serialize everything to MARCXML
    return record_xml_output(rec)
def bst_weblinkback_updater(mode): """ Update linkbacks @param mode: 1 delete rejected, broken and pending linkbacks whose URLs is on blacklist 2 update page titles of new linkbacks 3 update page titles of old linkbacks 4 update manually set page titles 5 detect and disable broken linkbacks 6 send notification email for all pending linkbacks @type mode: int """ mode = int(mode) if mode == 1: write_message( "Starting to delete rejected and pending linkbacks URLs on blacklist" ) delete_linkbacks_on_blacklist() write_message( "Completed to delete rejected and pending linkbacks URLs on blacklist" ) elif mode == 2: write_message("Starting to update the page titles of new linkbacks") update_linkbacks(1) write_message("Completed to update the page titles of new linkbacks") elif mode == 3: write_message("Starting to update the page titles of old linkbacks") update_linkbacks(2) write_message("Completed to update the page titles of old linkbacks") elif mode == 4: write_message("Starting to update manually set page titles") update_linkbacks(3) write_message("Completed to update manually set page titles") elif mode == 5: write_message("Starting to detect and disable broken linkbacks") update_linkbacks(4) write_message("Completed to detect and disable broken linkbacks") elif mode == 6: write_message("Starting to send notification email") send_pending_linkbacks_notification(CFG_WEBLINKBACK_TYPE['TRACKBACK']) write_message("Completed to send notification email")
def get_dates_from_db(dict_of_ids, publication_year_tag, creation_date_tag): """Returns the year of the publication for each paper. In case the year is not in the db, the year of the submission is taken""" current_year = int(datetime.datetime.now().strftime("%Y")) publication_year_db_id = publication_year_tag[0:2] creation_date_db_id = creation_date_tag[0:2] total = 0 count = 0 dict_of_dates = {} for recid in dict_of_ids: dict_of_dates[recid] = 0 date_list = run_sql("select id, tag, value from bib" + \ publication_year_db_id + "x where tag=%s", \ (publication_year_tag, )) date_dict = {} for item in date_list: date_dict[int(item[0])] = item[2] pattern = re.compile('.*(\d{4}).*') date_list = run_sql("select id_bibrec, id_bibxxx, field_number \ from bibrec_bib" + publication_year_db_id + "x") for item in date_list: recid = int(item[0]) id_ = int(item[1]) if id_ in date_dict and recid in dict_of_dates: reg = pattern.match(date_dict[id_]) if reg: date = int(reg.group(1)) if date > 1000 and date <= current_year: dict_of_dates[recid] = date total += date count += 1 not_covered = [] for recid in dict_of_dates: if dict_of_dates[recid] == 0: not_covered.append(recid) date_list = run_sql("select id, tag, value from bib" + \ creation_date_db_id + "x where tag=%s", \ (creation_date_tag, )) date_dict = {} for item in date_list: date_dict[int(item[0])] = item[2] date_list = run_sql("select id_bibrec, id_bibxxx, field_number \ from bibrec_bib" + creation_date_db_id + "x") for item in date_list: recid = int(item[0]) id_ = int(item[1]) if id_ in date_dict and recid in not_covered: date = int(str(date_dict[id_])[0:4]) if date > 1000 and date <= current_year: dict_of_dates[recid] = date total += date count += 1 dates = {} med = total / count for recid in dict_of_dates: if dict_of_dates[recid] == 0: dates[dict_of_ids[recid]] = med else: dates[dict_of_ids[recid]] = dict_of_dates[recid] write_message("Dates extracted", verbose=2) write_message("Dates dictionary %s" % str(dates), verbose=9) return dates