Example #1
def call_bibupload(marcxmlfile, mode=None, oai_src_id=-1, sequence_id=None):
    """
    Creates a bibupload task for the task scheduler in given mode
    on given file. Returns the generated task id and logs the event
    in oaiHARVESTLOGS, also adding any given oai source identifier.


    :param marcxmlfile: base-marcxmlfilename to upload
    :param mode: mode to upload in
    :param oai_src_id: id of current source config
    :param sequence_id: sequence-number, if relevant

    :return: task_id if successful, otherwise None.
    """
    if mode is None:
        mode = ["-r", "-i"]
    if os.path.exists(marcxmlfile):
        try:
            args = mode
            # Add job with priority 6 (above normal bibedit tasks)
            # and file to upload to arguments
            args.extend(["-P", "6", marcxmlfile])
            if sequence_id:
                args.extend(['-I', str(sequence_id)])
            task_id = task_low_level_submission("bibupload", "oaiharvest", *tuple(args))
            create_oaiharvest_log(task_id, oai_src_id, marcxmlfile)
        except Exception as msg:
            write_message("An exception during submitting oaiharvest task occured : %s " % (str(msg)))
            return None
        return task_id
    else:
        write_message("marcxmlfile %s does not exist" % (marcxmlfile,))
        return None
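As a quick illustration of the argument list that ends up in task_low_level_submission above, here is a minimal standalone sketch of the same assembly logic; the helper name, file path and sequence id are made up, and nothing from Invenio is needed:

def build_bibupload_args(marcxmlfile, mode=None, sequence_id=None):
    # Same defaults as call_bibupload: replace/insert mode.
    if mode is None:
        mode = ["-r", "-i"]
    args = list(mode)
    # Priority 6 (above normal bibedit tasks), then the file to upload.
    args.extend(["-P", "6", marcxmlfile])
    if sequence_id:
        args.extend(["-I", str(sequence_id)])
    return args

print(build_bibupload_args("/tmp/sample.xml", sequence_id=42))
# ['-r', '-i', '-P', '6', '/tmp/sample.xml', '-I', '42']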
Example #2
def perform_insert_record(data_dict, data_dict_ordered, data_list_sorted, value, recid, spacing=CFG_BIBSORT_WEIGHT_DISTANCE):
    """Inserts a new record into all the data structures"""
    #data_dict
    data_dict[recid] = value
    #data_dict_ordered & data_list_sorted
    #calculate at which index the rec should be inserted in data_list_sorted
    index_for_insert = binary_search(data_list_sorted, value, data_dict)
    #we have to calculate the weight of this record in data_dict_ordered
    #and it will be the midpoint between its neighbours in data_list_sorted
    if index_for_insert == len(data_list_sorted):#insert at the end of the list
        #append at the end of the list
        data_list_sorted.append(recid)
        #weight = highest weight + the distance
        data_dict_ordered[recid] = data_dict_ordered[data_list_sorted[index_for_insert - 1]] + spacing
    else:
        if index_for_insert == 0: #insert at the beginning of the list
            left_neighbor_weight = 0
        else:
            left_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert - 1]]
        right_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert]]
        #the recid's weight will end up as the midpoint between left and right
        weight = (right_neighbor_weight - left_neighbor_weight)/2
        if weight < 1: #there is no more space to insert, we have to create some space
            data_list_sorted.insert(index_for_insert, recid)
            data_dict_ordered[recid] = left_neighbor_weight + spacing
            create_space_for_new_weight(index_for_insert, data_dict_ordered, data_list_sorted, spacing)
        else:
            data_list_sorted.insert(index_for_insert, recid)
            data_dict_ordered[recid] = left_neighbor_weight + weight
    write_message("Record %s done." %recid, verbose=5)
    return index_for_insert
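To make the spaced-weight bookkeeping concrete, here is a self-contained sketch of the same midpoint insertion with toy data; bisect stands in for the binary_search helper and the spacing constant replaces CFG_BIBSORT_WEIGHT_DISTANCE, so this illustrates the idea rather than the Invenio code path:

from bisect import bisect_left

spacing = 8                                    # stands in for CFG_BIBSORT_WEIGHT_DISTANCE
data_dict = {1: 'apple', 2: 'cherry'}          # recid -> sort value
data_list_sorted = [1, 2]                      # recids ordered by value
data_dict_ordered = {1: 8, 2: 16}              # recid -> spaced weight

# Insert recid 3 with value 'banana': it sorts between recids 1 and 2.
values = [data_dict[r] for r in data_list_sorted]
index = bisect_left(values, 'banana')
left = data_dict_ordered[data_list_sorted[index - 1]] if index else 0
right = data_dict_ordered[data_list_sorted[index]]
data_dict[3] = 'banana'
data_list_sorted.insert(index, 3)
data_dict_ordered[3] = left + (right - left) // 2   # midpoint weight, 12 here
print(data_list_sorted, data_dict_ordered)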
Example #3
def iterate_over_new(list, fmt):
    """
    Iterate over list of IDs

    @param list: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    global total_rec

    formatted_records = ''      # (string-)List of formatted record of an iteration
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
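Two details above are easy to miss: os.times()[4] is the elapsed wall-clock time (so tbibformat accumulates real seconds), and the formatted XML is stored zlib-compressed in bibfmt.value. A standalone sketch of that pattern with a dummy formatter (no database; all names below are made up):

import os
import zlib

def fake_format_record(recid):
    # Stand-in for format_record(recid, fmt, on_the_fly=True).
    return ("<record id='%s'>...</record>" % recid) * 100

tbibformat = 0
for recid in range(1, 4):
    t1 = os.times()[4]                        # elapsed wall-clock time
    xm = fake_format_record(recid)
    blob = zlib.compress(xm.encode("utf-8"))  # what would go into bibfmt.value
    t2 = os.times()[4]
    tbibformat += (t2 - t1)
    print("recid %s: %s compressed bytes" % (recid, len(blob)))
print("formatted 3 records in %.4f seconds" % tbibformat)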
Example #4
def run_sorting_method(recids, method_name, method_id, definition, washer):
    """Does the actual sorting for the method_name
    for all the records in the database"""
    run_sorting_for_rnk = False
    if definition.startswith('RNK'):
        run_sorting_for_rnk = True
    field_data_dictionary = get_field_data(recids, method_name, definition)
    if not field_data_dictionary:
        write_message("POSSIBLE ERROR: The sorting method --%s-- has no data!" \
                      %method_name)
        return True
    apply_washer(field_data_dictionary, washer)
    #do we have any locale constraint?
    sorting_locale = locale_for_sorting(washer)
    sorted_data_list, sorted_data_dict = \
                sort_dict(field_data_dictionary, CFG_BIBSORT_WEIGHT_DISTANCE, run_sorting_for_rnk, sorting_locale)
    executed = write_to_methoddata_table(method_id, field_data_dictionary, \
                                         sorted_data_dict, sorted_data_list)

    if not executed:
        return False
    if CFG_BIBSORT_BUCKETS > 1:
        bucket_dict, bucket_last_rec_dict = split_into_buckets(sorted_data_list, len(sorted_data_list))
        for idx in bucket_dict:
            executed = write_to_buckets_table(method_id, idx, bucket_dict[idx], \
                                              sorted_data_dict[bucket_last_rec_dict[idx]])
            if not executed:
                return False
    else:
        executed = write_to_buckets_table(method_id, 1, intbitset(sorted_data_list), \
                                          sorted_data_list[-1])
        if not executed:
            return False
    return True
Example #5
def get_modified_or_inserted_recs(method_list):
    """Returns a list of recids that have been inserted or
    modified since the last update of the bibsort methods in method_list
    method_list should already contain the list of methods that
    SHOULD be updated; if it contains new methods, an error will be thrown"""

    if not method_list: #just to be on the safe side
        return 0

    try:
        query = """SELECT min(d.last_updated) from "bsrMETHODDATA" d,
                                                   "bsrMETHOD" m
                WHERE m.name in (%s) AND d."id_bsrMETHOD" = m.id""" % \
                ("%s," * len(method_list))[:-1]
        last_updated = str(run_sql(query, tuple(method_list))[0][0])
    except Error as err:
        write_message("Error when trying to get the last_updated date " \
                      "from bsrMETHODDATA: [%s]" %err, sys.stderr)
        return 0
    recids = []
    try:
        results = run_sql("SELECT id from bibrec \
                          where modification_date >= %s", (last_updated, ))
        if results:
            recids = [result[0] for result in results]
    except Error as err:
        write_message("Error when trying to get the list of " \
                      "modified records: [%s]" %err, sys.stderr)
        return 0
    return recids
Example #6
def create_ticket(queue, subject, text=""):
    """
    This function will submit a ticket using the configured BibCatalog system.

    :param queue: the ticketing queue to send a ticket to
    :type queue: string

    :param subject: subject of the ticket
    :type subject: string

    :param text: the main text or body of the ticket. Optional.
    :type text: string

    :return: return the ID of the created ticket, or None on failure
    :rtype: int or None
    """
    # Initialize BibCatalog connection as default user, if possible
    if bibcatalog_system is not None:
        bibcatalog_response = bibcatalog_system.check_system()
    else:
        bibcatalog_response = "No ticket system configured"
    if bibcatalog_response != "":
        write_message("BibCatalog error: %s\n" % (bibcatalog_response,))
        return None

    ticketid = bibcatalog_system.ticket_submit(subject=subject, queue=queue)
    if text:
        comment = bibcatalog_system.ticket_comment(None, ticketid, text)
        if comment is None:
            write_message("Error: commenting on ticket %s failed." % (str(ticketid),))
    return ticketid
Example #7
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """
    Generate sitemaps themselves.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap files(s)
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
                                lastmod = lastmod,
                                changefreq = DEFAULT_CHANGEFREQ_RECORDS,
                                priority = DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s" % (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
Example #8
def update_sorting(methods, recids):
    """Runs the updating of the sorting tables for methods and recids
    Recids is a list of integer numbers(record ids)
    but can also contain intervals"""
    method_list = []
    if methods:
        method_list = methods.strip().split(',')

    recid_list = []
    if recids:
        cli_recid_list = recids.strip().split(',')
        for recid in cli_recid_list:
            if recid.find('-') > 0:
                rec_range = recid.split('-')
                try:
                    recid_min = int(rec_range[0])
                    recid_max = int(rec_range[1])
                    for rec in range(recid_min, recid_max + 1):
                        recid_list.append(rec)
                except ValueError as err:
                    write_message("Error: [%s] occurred while trying "
                                  "to parse the recids argument." % err, sys.stderr)
                    return False
            else:
                recid_list.append(int(recid))
    return run_bibsort_update(recid_list, method_list)
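The recids argument accepts both single ids and dash-separated intervals; here is a standalone sketch of just that parsing step (the helper name and sample input are invented):

def parse_recid_arg(recids):
    """Expand a CLI string like '1-5,10' into a flat list of ints."""
    recid_list = []
    for chunk in recids.strip().split(','):
        if chunk.find('-') > 0:
            low, high = chunk.split('-')
            recid_list.extend(range(int(low), int(high) + 1))
        else:
            recid_list.append(int(chunk))
    return recid_list

print(parse_recid_arg("1-5,10"))   # [1, 2, 3, 4, 5, 10]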
Example #9
def get_config_parameter(jobname, parameter_name, is_parameter_collection = False):
    """Detect export method of JOBNAME.  Basically, parse JOBNAME.cfg
       and return export_method.  Return None if problem found."""
    jobconfig = ConfigParser()
    jobconffile = CFG_ETCDIR + os.sep + 'bibexport' + os.sep + jobname + '.cfg'

    if not os.path.exists(jobconffile):
        write_message("ERROR: cannot find config file %s." % jobconffile)
        return None

    jobconfig.read(jobconffile)

    if is_parameter_collection:
        all_items = jobconfig.items(section='export_job')

        parameters = []

        for item_name, item_value in all_items:
            if item_name.startswith(parameter_name):
                parameters.append(item_value)

        return parameters
    else:
        parameter = jobconfig.get('export_job', parameter_name)
        return parameter
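To show what the is_parameter_collection branch does, here is a self-contained sketch that reads a made-up [export_job] section from a string; every option name and value below is hypothetical, only the section name and the prefix-matching behaviour come from the function above:

try:
    from configparser import ConfigParser      # Python 3
    from io import StringIO
except ImportError:
    from ConfigParser import ConfigParser      # Python 2
    from StringIO import StringIO

SAMPLE_CFG = """
[export_job]
export_method = sitemap
collection_1 = Articles
collection_2 = Preprints
"""

cfg = ConfigParser()
cfg.readfp(StringIO(SAMPLE_CFG))
# is_parameter_collection=True: gather every option whose name starts
# with the requested prefix, in file order.
print([value for name, value in cfg.items('export_job')
       if name.startswith('collection')])      # ['Articles', 'Preprints']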
Example #10
def fetch_updated_arxiv_records(date):
    """Fetch all the arxiv records modified since the last run"""

    def check_arxiv(recid):
        """Returns True for arxiv papers"""
        for report_number in get_fieldvalues(recid, "037__9"):
            if report_number == "arXiv":
                return True
        return False

    # Fetch all records inserted since last run
    sql = (
        "SELECT `id`, `modification_date` FROM `bibrec` "
        "WHERE `modification_date` >= %s "
        "ORDER BY `modification_date`"
    )
    records = run_sql(sql, [date.isoformat()])
    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]

    # Show all records for debugging purposes
    if task_get_option("verbose") >= 9:
        write_message("recids:", verbose=9)
        for recid, mod_date in records:
            write_message("* %s, %s" % (recid, mod_date), verbose=9)

    task_update_progress("Done fetching %s arxiv record ids" % len(records))
    return records
Example #11
def get_external_links_from_db(ref, dict_of_ids, reference_indicator):
    """returns a dictionary containing the number of
    external links for each recid
    external link=citation that is not in our database """
    ext_links = {}
    dict_all_ref = {}
    for recid in dict_of_ids:
        dict_all_ref[recid] = 0
        ext_links[dict_of_ids[recid]] = 0
    reference_db_id = reference_indicator[0:2]
    reference_tag_regex = reference_indicator + "[a-z]"
    tag_list = run_sql("select id from bib" + reference_db_id + \
                         "x where tag RLIKE %s", (reference_tag_regex, ))
    tag_set = set()
    for tag in tag_list:
        tag_set.add(tag[0])
    ref_list = run_sql("select id_bibrec, id_bibxxx, field_number from \
                       bibrec_bib" + reference_db_id + "x group by \
                       id_bibrec, field_number")
    for item in ref_list:
        recid = int(item[0])
        id_bib = int(item[1])
        if recid in dict_of_ids and id_bib in tag_set:
            dict_all_ref[recid] += 1
    for recid in dict_of_ids:
        total_links = dict_all_ref[recid]
        internal_links = ref[dict_of_ids[recid]]
        ext_links[dict_of_ids[recid]] = total_links - internal_links
        if ext_links[dict_of_ids[recid]] < 0:
            ext_links[dict_of_ids[recid]] = 0
    write_message("External link information extracted", verbose=2)
    write_message("External links: %s" % str(ext_links), verbose=9)
    return ext_links
Example #12
def pagerank_ext(conv_threshold, check_point, len_, sparse, semi_sparse):
    """the core function of the PAGERANK_EXT method
    returns an array with the ranks corresponding to each recid"""
    weights_old = array((), float32)
    weights_old = ones((len_), float32)
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in (range(check_point)):
            weights_new = zeros((len_), float32)
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)]*weights_old[j]
            total_sum = 0.0
            for j in semi_sparse:
                total_sum += semi_sparse[j]*weights_old[j]
            weights_new[1:len_] = weights_new[1:len_] + total_sum
            if step == check_point - 1:
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff))/len_
                write_message("Finished step: %s, %s " \
                    % (str(check_point*(nr_of_check_points-1) + step), \
                        str(difference)), verbose=5)
            weights_old = weights_new.copy()
            converged = (difference < conv_threshold)
    write_message("PageRank calculated for all recids finnished in %s steps. \
The threshold was %s" % (str(nr_of_check_points), \
            str(difference)), verbose=2)
    #return weights_old[1:len_]/(len_ - weights_old[0])
    return weights_old[1:len_]
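A toy, self-contained version of the same power iteration for a three-node graph, to make the sparse / semi_sparse bookkeeping concrete; the link weights and the damping contribution are invented, and numpy is the only dependency:

from numpy import zeros, ones, float32, sqrt, dot

# sparse[(i, j)] = weight of the link j -> i (toy values)
sparse = {(1, 0): 0.85, (2, 1): 0.85, (1, 2): 0.425, (0, 2): 0.425}
semi_sparse = {0: 0.05, 1: 0.05, 2: 0.05}   # uniform leak added to rows 1..len_-1
len_ = 3
weights_old = ones((len_), float32)
for step in range(200):
    weights_new = zeros((len_), float32)
    for (i, j) in sparse:
        weights_new[i] += sparse[(i, j)] * weights_old[j]
    total_sum = sum(semi_sparse[j] * weights_old[j] for j in semi_sparse)
    weights_new[1:len_] = weights_new[1:len_] + total_sum
    diff = weights_new - weights_old
    weights_old = weights_new.copy()
    if sqrt(dot(diff, diff)) / len_ < 1e-6:
        break
print(weights_old)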
Example #13
def get_data_for_definition_marc(tags, recids):
    '''Having a list of tags and a list of recids, it returns a dictionary
    with the values corresponding to the tags'''
    #x = all_recids; [get_fieldvalues(recid, '037__a') for recid in x]
    #user: 140s, sys: 21s, total: 160s - cdsdev
    if isinstance(recids, (int, long)):
        recids = intbitset([recids, ])
    # for each recid we need only one value
    #on which we sort, so we can stop looking for a value
    # as soon as we find one
    tag_index = 0
    field_data_dict = {}
    while len(recids) > 0 and tag_index < len(tags):
        write_message('%s records queried for values for tags %s.' \
                      %(len(recids), tags), verbose=5)
        res = _get_values_from_marc_tag(tags[tag_index], recids)
        res_dict = dict(res)
        #field_data_dict.update(res_dict)
        #we can not use this, because res_dict might contain recids
        #that are already in field_data_dict, and we should not overwrite their value
        field_data_dict = dict(res_dict, **field_data_dict)
        #there might be keys that we do not want (ex: using 'between')
        #so we should remove them
        res_dict_keys = intbitset(res_dict.keys())
        recids_not_needed = res_dict_keys.difference(recids)
        for recid in recids_not_needed:
            del field_data_dict[recid]
        #update the recids to contain only the recids that do not have values yet
        recids.difference_update(res_dict_keys)
        tag_index += 1
    return field_data_dict
Example #14
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appropriate CLI options, we can query here for additional
    records.
    """
    write_message("Querying database (records query)...")
    res = intbitset()
    if params['field'] or params['collection'] or params['pattern']:

        if not params['collection']:
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=params['pattern'],
                                 f=params['field'],
                                 m=params['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None,
                                                   of='id',
                                                   c=params['collection'],
                                                   p=params['pattern'],
                                                   f=params['field']))
    return res
Example #15
def solr_add_ranges(id_ranges):
    sub_range_length = task_get_option("flush")
    id_ranges_to_index = []
    for id_range in id_ranges:
        lower_recid = id_range[0]
        upper_recid = id_range[1]
        i_low = lower_recid
        while i_low <= upper_recid:
            i_up = min(i_low + sub_range_length - 1, upper_recid)
            id_ranges_to_index.append((i_low, i_up))
            i_low += sub_range_length

    tags_to_index = get_tags()
    # Indexes latest records first by reversing
    # This allows the ranker to return better results during long indexing
    # runs as the ranker cuts the hitset using latest records
    id_ranges_to_index.reverse()
    next_commit_counter = 0
    for id_range_to_index in id_ranges_to_index:
        lower_recid = id_range_to_index[0]
        upper_recid = id_range_to_index[1]
        status_msg = "Solr ranking indexer called for %s-%s" % (lower_recid, upper_recid)
        write_message(status_msg)
        task_update_progress(status_msg)
        next_commit_counter = solr_add_range(lower_recid, upper_recid, tags_to_index, next_commit_counter)

    solr_commit_if_necessary(next_commit_counter, final_commit=True)
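The splitting of each (lower, upper) range into flush-sized sub-ranges, shown in isolation with made-up numbers (the helper name is invented):

def split_ranges(id_ranges, sub_range_length):
    chunks = []
    for lower_recid, upper_recid in id_ranges:
        i_low = lower_recid
        while i_low <= upper_recid:
            i_up = min(i_low + sub_range_length - 1, upper_recid)
            chunks.append((i_low, i_up))
            i_low += sub_range_length
    return chunks

print(split_ranges([(1, 7), (10, 12)], 3))
# [(1, 3), (4, 6), (7, 7), (10, 12)] -- reversed before indexing in solr_add_ranges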
Example #16
def upload_amendments(records, holdingpen):
    """ Upload a modified record """

    if task_get_option("no_upload", False) or len(records) == 0:
        return

    xml = '<collection xmlns="http://www.loc.gov/MARC21/slim">'
    for record in records:
        xml += record_xml_output(record)
    xml += "</collection>"

    tmp_file_fd, tmp_file = mkstemp(
        suffix='.xml',
        prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"),
        dir=CFG_TMPSHAREDDIR
    )
    os.write(tmp_file_fd, xml)
    os.close(tmp_file_fd)
    os.chmod(tmp_file, 0644)
    if holdingpen:
        flag = "-o"
    else:
        flag = "-r"
    task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file)
    write_message("Submitted bibupload task %s" % task)
Example #17
def load_configuration():
    """Loads the configuration for the bibsort.cfg file into the database"""
    config_file = cfg.get('CFG_SORTER_CONFIGURATION',
                          pkg_resources.resource_filename(
                              'invenio.legacy.bibsort', 'bibsort.cfg'))
    write_message('Reading config data from: %s' % (config_file, ))
    config = ConfigParser.ConfigParser()
    try:
        config.readfp(open(config_file))
    except StandardError as err:
        write_message("Cannot find configuration file: %s" \
                      %config_file, stream=sys.stderr)
        return False
    to_insert = []
    for section in config.sections():
        try:
            name = config.get(section, "name")
            definition = config.get(section, "definition")
            washer = config.get(section, "washer")
        except (ConfigParser.NoOptionError, StandardError) as err:
            write_message("For each sort_field you need to define at least \
                          the name, the washer and the definition. \
                          [error: %s]" %err, stream=sys.stderr)
            return False
        to_insert.append((name, definition, washer))
    # all the values were correctly read from the config file
    truncate_table("bsrMETHOD")
    write_message('Old data has been deleted from bsrMETHOD table', verbose=5)
    for row in to_insert:
        run_sql("""INSERT INTO "bsrMETHOD"(name, definition, washer)
                VALUES (%s, %s, %s)""", (row[0], row[1], row[2]))
        write_message('Method %s has been inserted into bsrMETHOD table' \
                      %row[0], verbose=5)
    return True
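For orientation, a minimal sketch of the kind of section load_configuration expects in bibsort.cfg, read back with the same ConfigParser calls; the section name and the three option values are purely illustrative, only the required keys (name, definition, washer) come from the code above:

try:
    from configparser import ConfigParser      # Python 3
    from io import StringIO
except ImportError:
    from ConfigParser import ConfigParser      # Python 2
    from StringIO import StringIO

SAMPLE_BIBSORT_CFG = """
[sort_field_1]
name = title
definition = MARC: 245__a
washer = NOOP
"""

config = ConfigParser()
config.readfp(StringIO(SAMPLE_BIBSORT_CFG))
for section in config.sections():
    # The three options load_configuration insists on for every sort field.
    print((config.get(section, "name"),
           config.get(section, "definition"),
           config.get(section, "washer")))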
Example #18
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.
    """
    write_message("... writing %s" % dirname + os.sep + filename)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        msg = "ERROR: cannot find %s." % cmd
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)

    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s | %s -c " % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME),
            CFG_PATH_GZIP)
    dummy1, dummy2, dummy3 = run_shell_command(cmd, None, dirname + os.sep + filename)
    if dummy1:
        msg = "ERROR: mysqldump exit code is %s." % repr(dummy1)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    if dummy2:
        msg = "ERROR: mysqldump stdout is %s." % repr(dummy1)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    if dummy3:
        msg = "ERROR: mysqldump stderr is %s." % repr(dummy1)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
Example #19
    def fun():
        try:
            return task_run_core(name, core_func, extra_vars)
        except Exception:
            # Remove extra '\n'
            write_message(traceback.format_exc()[:-1])
            raise
Example #20
def print_missing(num):
    """
    Print the contents of rnkCITATIONDATAEXT table containing external
    records that were cited by NUM or more internal records.

    NUM is by default taken from the -E command line option.
    """
    if not num:
        num = task_get_option("print-extcites")

    write_message(
        "Listing external papers cited by %i or more \
                                                      internal records:"
        % num
    )

    res = run_sql(
        """SELECT COUNT(id_bibrec), extcitepubinfo
                     FROM rnkCITATIONDATAEXT
                     GROUP BY extcitepubinfo HAVING COUNT(id_bibrec) >= %s
                     ORDER BY COUNT(id_bibrec) DESC""",
        (num,),
    )
    for cnt, brec in res:
        print(str(cnt), "\t", brec)

    write_message("Listing done.")
Example #21
def fetch_concerned_arxiv_records(name):
    task_update_progress("Fetching arxiv record ids")

    dummy, last_date = fetch_last_updated(name)

    # Fetch all records inserted since last run
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
        "WHERE `modification_date` >= %s " \
        "AND `creation_date` > NOW() - INTERVAL 7 DAY " \
        "ORDER BY `modification_date`" \
        "LIMIT 5000"
    records = run_sql(sql, [last_date.isoformat()])

    def check_arxiv(recid):
        record = get_record(recid)

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False

    def check_pdf_date(recid):
        doc = get_pdf_doc(recid)
        if doc:
            return doc.md > last_date
        return False

    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]
    records = [(r, mod_date) for r, mod_date in records if check_pdf_date(r)]
    write_message("recids %s" % repr([(r, mod_date.isoformat()) \
                                               for r, mod_date in records]))
    task_update_progress("Done fetching arxiv record ids")
    return records
Example #22
def replace_cites(recid, new_cites):
    """
    Given a set of citations, replaces the citations of given recid
    in the database.
    The changes are logged into rnkCITATIONLOG.

    See @replace_refs
    """
    old_cites = set(row[0] for row in run_sql("""SELECT citer
                                                FROM rnkCITATIONDICT
                                                WHERE citee = %s""", [recid]))

    cites_to_add = new_cites - old_cites
    cites_to_delete = old_cites - new_cites

    for cite in cites_to_add:
        write_message('adding cite %s %s' % (recid, cite), verbose=1)
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        run_sql("""INSERT INTO rnkCITATIONDICT (citee, citer, last_updated)
                   VALUES (%s, %s, %s)""", (recid, cite, now))
        run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date)
                   VALUES (%s, %s, %s, %s)""", (recid, cite, 'added', now))

    for cite in cites_to_delete:
        write_message('deleting cite %s %s' % (recid, cite), verbose=1)
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        run_sql("""DELETE FROM rnkCITATIONDICT
                   WHERE citee = %s and citer = %s""", (recid, cite))
        run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date)
                   VALUES (%s, %s, %s, %s)""", (recid, cite, 'removed', now))
Example #23
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)

        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except:
            register_exception(alert_admin=True)
        next_commit_counter = 0

        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
Example #24
def iterate_over_new(recIDs, fmt):
    """Iterate over list of IDs.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken
        to insert)
    """
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call

    tot = len(recIDs)
    reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get(
        fmt.lower(), _update_format)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        reformat_function(recID, fmt)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" %
                          (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
Example #25
    def set_amended(self, message):
        """ Mark the record as amended """
        write_message("Amended record %s by rule %s: %s" %
                      (self.record_id, self.rule["name"], message))
        self.amendments.append("Rule %s: %s" % (self.rule["name"], message))
        self.amended = True
        if self.rule["holdingpen"]:
            self.holdingpen = True
Example #26
def insert_into_cit_db(dic, name):
    """Stores citation dictionary in the database"""
    ndate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    s = serialize_via_marshal(dic)
    write_message("size of %s %s" % (name, len(s)))
    # check that this column really exists
    run_sql("""REPLACE INTO rnkCITATIONDATA(object_name, object_value,
               last_updated) VALUES (%s, %s, %s)""", (name, s, ndate))
Example #27
def _task_write_message(message):
    """ Stores the messages in a global list for notifications
        @param message: the message that should be printed as task status
        @type message: string
    """
    write_message(message)
    global _MSG_HISTORY
    _MSG_HISTORY.append(message)
Example #28
def calculate_time_weights(len_, time_decay, dates):
    """calculates the time coeficients for each paper"""
    current_year = int(datetime.datetime.now().strftime("%Y"))
    date_coef = {}
    for j in range(len_):
        date_coef[j] = exp(time_decay*(dates[j] - current_year))
    write_message("Time weights calculated", verbose=5)
    write_message("Time weights: %s" % str(date_coef), verbose=9)
    return date_coef
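A standalone sketch of the exponential time-decay weights computed above, with a made-up decay constant and publication years:

import datetime
from math import exp

time_decay = 0.2                        # invented decay constant
dates = {0: 2004, 1: 2009, 2: 2014}     # paper index -> publication year (invented)
current_year = int(datetime.datetime.now().strftime("%Y"))

date_coef = {}
for j in range(len(dates)):
    # Older papers get exponentially smaller coefficients.
    date_coef[j] = exp(time_decay * (dates[j] - current_year))
print(date_coef)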
Example #29
def statistics_on_sparse(sparse):
    """returns the number of papers that cite themselves"""
    count_diag = 0
    for (i, j) in sparse.keys():
        if i == j:
            count_diag += 1
    write_message("The number of papers that cite themselves: %s" % \
        str(count_diag), verbose=3)
    return count_diag
Example #30
def leaves(ref):
    """returns the number of papers that do not cite any other paper"""
    nr_of_leaves = 0
    for i in ref:
        if i == 0:
            nr_of_leaves += 1
    write_message("The number of papers that do not cite \
any other papers: %s" % str(leaves), verbose=3)
    return nr_of_leaves
Example #31
def run_bibsort_update(recids=None, method_list=None):
    """Updates bibsort tables for the methods in method_list
    and for the records in recids.

    If recids is None: recids = all records that have been modified
    or inserted since last update

    If method_list is None: method_list = all the methods available
    in bsrMETHOD table"""

    write_message('Initial data for run_bibsort_update method: ' \
                  'number of recids = %s; method_list=%s' \
                  %(len(recids) if recids else 0, method_list), verbose=5)
    write_message('Updating sorting data.')

    bibsort_methods, errors = get_bibsort_methods_details(method_list)
    if errors:
        return False
    method_list = bibsort_methods.keys()
    if not method_list:
        write_message('No methods found in bsrMETHOD table.. exiting.')
        return True

    #we could have 4 types of methods:
    #(i) RNK methods -> they should be rebalanced, not updated
    #(ii) RNK methods to delete -> we should delete their data
    #(iii) non RNK methods to update
    #(iv) non RNK methods that are new -> they should be rebalanced(sorted), not updated
    #check which of the methods are RNK methods (they do not need modified recids)
    rnk_methods = get_rnk_methods(bibsort_methods)
    rnk_methods_updated, rnk_methods_deleted = get_modified_rnk_methods(
        rnk_methods, bibsort_methods)
    #check which of the methods have no data, so they are actually new,
    #so they need balancing(sorting) instead of updating
    non_rnk_methods = [
        method for method in bibsort_methods.keys()
        if method not in rnk_methods
    ]
    non_rnk_methods_updated, non_rnk_methods_inserted = get_modified_non_rnk_methods(
        non_rnk_methods)

    #(i) + (iv)
    methods_to_balance = rnk_methods_updated + non_rnk_methods_inserted
    if methods_to_balance:  # several methods require rebalancing(sorting) and not updating
        return run_bibsort_rebalance(methods_to_balance)

    #(ii)
    #remove the data for the ranking methods that have been deleted
    for method in rnk_methods_deleted:
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Deleting data for method %s" % method)
        write_message('Starting deleting the data for RNK method %s' % method,
                      verbose=5)
        executed_ok = delete_bibsort_data_for_method(
            bibsort_methods[method]['id'])
        if not executed_ok:
            write_message('Method %s could not be deleted correctly, aborting..' \
                          %method, sys.stderr)
            return False

    #(iii)
    #methods to actually update
    if non_rnk_methods_updated:  # we want to update some 'normal'(not RNK) tables, so we need recids
        update_timestamp = False
        if not recids:
            recids = get_modified_or_inserted_recs(non_rnk_methods_updated)
            if recids == 0:  #error signal
                return False
            if not recids:
                write_message("No records inserted or modified in bibrec table " \
                          "since the last update of bsrMETHODDATA.")
                return True
            write_message("These records have been recently modified/inserted: %s" \
                  %str(recids), verbose=5)
            update_timestamp = True
        recids_i = intbitset(recids)
        for method in non_rnk_methods_updated:
            task_sleep_now_if_required(can_stop_too=True)
            task_update_progress("Updating method %s" % method)
            write_message('Starting updating method %s' % method, verbose=5)
            executed_ok = update_bibsort_tables(recids_i, method,
                                                update_timestamp)
            if not executed_ok:
                write_message('Method %s could not be executed correctly, aborting..' \
                          %method, sys.stderr)
                return False
    return True
Example #32
def run_bibsort_rebalance(method_list=None):
    """Rebalances all buckets for the methods in method_list"""
    bibsort_methods, errors = get_bibsort_methods_details(method_list)
    if errors:
        return False
    if not bibsort_methods:
        write_message('No methods found.. exiting rebalancing.')
        return True
    #check if there are only ranking methods -> no need for recids
    rnk_methods = get_rnk_methods(bibsort_methods)
    non_rnk_method = [
        method for method in bibsort_methods.keys()
        if method not in rnk_methods
    ]

    write_message('Running rebalancing for methods: %s' %
                  bibsort_methods.keys())

    if non_rnk_method:  # we have also 'normal' (no RNK) methods, so we need the recids
        recids = get_all_recids(including_deleted=False)
        write_message('Rebalancing will run for %s records.' \
                      %str(len(recids)), verbose=5)
        task_sleep_now_if_required(can_stop_too=True)
    else:
        recids = intbitset([])
        write_message('Rebalancing will run only for RNK methods')
    for name in bibsort_methods:
        task_update_progress('Rebalancing %s method.' % name)
        write_message('Starting sorting the data for %s method ... ' \
                          %name.upper())
        executed_ok = run_sorting_method(recids, name,
                                         bibsort_methods[name]['id'],
                                         bibsort_methods[name]['definition'],
                                         bibsort_methods[name]['washer'])
        if not executed_ok:
            write_message('Method %s could not be executed correctly.' \
                          %name, sys.stderr)
            return False
        write_message('Done.')
        task_sleep_now_if_required(can_stop_too=True)
    task_update_progress('Rebalancing done.')
    return True
Example #33
    def get_words_from_fulltext(self, url_direct_or_indirect):
        """Returns all the words contained in the document specified by
           URL_DIRECT_OR_INDIRECT with the words being split by various
           SRE_SEPARATORS regexp set earlier.  If FORCE_FILE_EXTENSION is
           set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF
           file.  (This is interesting to index Indico for example.)  Note
           also that URL_DIRECT_OR_INDIRECT may be either a direct URL to
           the fulltext file or an URL to a setlink-like page body that
           presents the links to be indexed.  In the latter case the
           URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
           to fulltext documents, for all known file extensions as
           specified by global CONV_PROGRAMS config variable.
        """
        write_message("... reading fulltext files from %s started" %
                      url_direct_or_indirect,
                      verbose=2)
        try:
            if bibdocfile_url_p(url_direct_or_indirect):
                write_message("... %s is an internal document" %
                              url_direct_or_indirect,
                              verbose=2)
                try:
                    bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
                except InvenioBibDocFileError:
                    # Outdated 8564 tag
                    return []
                indexer = get_idx_indexer('fulltext')
                if indexer != 'native':
                    # A document might belong to multiple records
                    for rec_link in bibdoc.bibrec_links:
                        recid = rec_link["recid"]
                        # Adds fulltexts of all files once per records
                        if not recid in fulltext_added:
                            bibrecdocs = BibRecDocs(recid)
                            try:
                                text = bibrecdocs.get_text()
                            except InvenioBibDocFileError:
                                # Invalid PDF
                                continue
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(recid, text)
                            elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                xapian_add(recid, 'fulltext', text)

                        fulltext_added.add(recid)
                    # we are relying on an external information retrieval system
                    # to provide full-text indexing, so dispatch text to it and
                    # return nothing here:
                    return []
                else:
                    text = ""
                    if hasattr(bibdoc, "get_text"):
                        text = bibdoc.get_text()
                    return self.tokenize_for_words_default(text)
            else:
                if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                    write_message(
                        "... %s is external URL but indexing only local files"
                        % url_direct_or_indirect,
                        verbose=2)
                    return []
                write_message("... %s is an external URL" %
                              url_direct_or_indirect,
                              verbose=2)
                urls_to_index = set()
                for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES):
                    if re.match(splash_re, url_direct_or_indirect):
                        write_message("... %s is a splash page (%s)" %
                                      (url_direct_or_indirect, splash_re),
                                      verbose=2)
                        html = urllib2.urlopen(url_direct_or_indirect).read()
                        urls = get_links_in_html_page(html)
                        write_message(
                            "... found these URLs in %s splash page: %s" %
                            (url_direct_or_indirect, ", ".join(urls)),
                            verbose=3)
                        for url in urls:
                            if re.match(url_re, url):
                                write_message(
                                    "... will index %s (matched by %s)" %
                                    (url, url_re),
                                    verbose=2)
                                urls_to_index.add(url)
                if not urls_to_index:
                    urls_to_index.add(url_direct_or_indirect)
                write_message("... will extract words from %s" %
                              ', '.join(urls_to_index),
                              verbose=2)
                words = {}
                for url in urls_to_index:
                    tmpdoc = download_url(url)
                    file_converter_logger = get_file_converter_logger()
                    old_logging_level = file_converter_logger.getEffectiveLevel(
                    )
                    if self.verbose > 3:
                        file_converter_logger.setLevel(logging.DEBUG)
                    try:
                        try:
                            tmptext = convert_file(tmpdoc,
                                                   output_format='.txt')
                            text = open(tmptext).read()
                            os.remove(tmptext)

                            indexer = get_idx_indexer('fulltext')
                            if indexer != 'native':
                                if indexer == 'SOLR' and CFG_SOLR_URL:
                                    solr_add_fulltext(
                                        None,
                                        text)  # FIXME: use real record ID
                                if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                    #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                    pass
                                # we are relying on an external information retrieval system
                                # to provide full-text indexing, so dispatch text to it and
                                # return nothing here:
                                tmpwords = []
                            else:
                                tmpwords = self.tokenize_for_words_default(
                                    text)
                            words.update(dict(map(lambda x: (x, 1), tmpwords)))
                        except Exception as e:
                            message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (
                                url, url_direct_or_indirect, e)
                            register_exception(prefix=message,
                                               alert_admin=True)
                            write_message(message, stream=sys.stderr)
                    finally:
                        os.remove(tmpdoc)
                        if self.verbose > 3:
                            file_converter_logger.setLevel(old_logging_level)
                return words.keys()
        except Exception as e:
            message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (
                url_direct_or_indirect, e)
            register_exception(prefix=message, alert_admin=True)
            write_message(message, stream=sys.stderr)
            return []
Example #34
    def warn(self, msg):
        """ Add a warning to the record """
        self.warnings.append("Rule %s: %s" % (self.rule["name"], msg))
        write_message("[WARN] record %s by rule %s: %s" %
                      (self.record_id, self.rule["name"], msg))
Example #35
def perform_update_buckets(recids_current_ordered,
                           recids_to_insert,
                           recids_old_ordered,
                           method_id,
                           update_timestamp=True):
    """Updates the buckets"""
    bucket_insert = {}
    bucket_delete = {}
    write_message("Updating the buckets for method_id = %s" % method_id,
                  verbose=5)
    buckets = run_sql(
        "SELECT bucket_no, bucket_last_value \
                      FROM bsrMETHODDATABUCKET \
                      WHERE id_bsrMETHOD = %s", (method_id, ))
    if not buckets:
        write_message("No bucket data found for method_id %s." \
                      %method_id, sys.stderr)
        raise Exception
    #sort the buckets to be sure we are iterating them in order(1 to max):
    buckets_dict = dict(buckets)
    for recid in recids_to_insert:
        for bucket_no in sorted(buckets_dict):
            if recids_current_ordered[recid] <= buckets_dict[bucket_no]:
                bucket_insert.setdefault(bucket_no, []).append(recid)
                break

    for recid in recids_old_ordered:
        record_inserted = 0
        record_deleted = 0
        for bucket_no in sorted(buckets_dict):
            bucket_value = int(buckets_dict[bucket_no])
            if record_inserted and record_deleted:
                #both insertion and deletion have been registered
                break
            if recids_current_ordered[recid] <= bucket_value and \
                recids_old_ordered[recid] <= bucket_value and \
                not record_inserted and \
                not record_deleted:
                #both before and after the modif,
                #recid should be in the same bucket -> nothing to do
                break
            if recids_current_ordered[
                    recid] <= bucket_value and not record_inserted:
                #recid should be, after the modif, here, so insert
                bucket_insert.setdefault(bucket_no, []).append(recid)
                record_inserted = 1
            if recids_old_ordered[recid] <= bucket_value and not record_deleted:
                #recid was here before modif, must be removed
                bucket_delete.setdefault(bucket_no, []).append(recid)
                record_deleted = 1

    for bucket_no in buckets_dict:
        if (bucket_no in bucket_insert) or (bucket_no in bucket_delete):
            res = run_sql("SELECT bucket_data FROM bsrMETHODDATABUCKET \
                          where id_bsrMETHOD = %s AND bucket_no = %s", \
                          (method_id, bucket_no, ))
            bucket_data = intbitset(res[0][0])
            for recid in bucket_insert.get(bucket_no, []):
                bucket_data.add(recid)
            for recid in bucket_delete.get(bucket_no, []):
                if recid in bucket_data:
                    bucket_data.remove(recid)
            if update_timestamp:
                date = strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                run_sql("UPDATE bsrMETHODDATABUCKET \
                    SET bucket_data = %s, last_updated = %s \
                    WHERE id_bsrMETHOD = %s AND bucket_no = %s", \
                    (bucket_data.fastdump(), date, method_id, bucket_no, ))
            else:
                run_sql("UPDATE bsrMETHODDATABUCKET \
                    SET bucket_data = %s \
                    WHERE id_bsrMETHOD = %s AND bucket_no = %s", \
                    (bucket_data.fastdump(), method_id, bucket_no, ))
            write_message("Updating bucket %s for method %s." %
                          (bucket_no, method_id),
                          verbose=5)
Example #36
def write_to_buckets_table(id_method,
                           bucket_no,
                           bucket_data,
                           bucket_last_value,
                           update_timestamp=True):
    """Serialize the date and write it to the bsrMEHODDATA_BUCKETS"""
    write_message('Writing the data for bucket number %s for ' \
                  'method_id=%s to the database' \
                  %(bucket_no, id_method), verbose=5)
    write_message('Serializing data for bucket number %s' % bucket_no,
                  verbose=5)
    serialized_bucket_data = bucket_data.fastdump()
    date = strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if not update_timestamp:
        try:
            date = run_sql('SELECT last_updated from bsrMETHODDATABUCKET WHERE id_bsrMETHOD = %s and bucket_no = %s', \
                           (id_method, bucket_no))[0][0]
        except IndexError:
            pass  # keep the generated date
    try:
        write_message('Deleting old data.', verbose=5)
        run_sql("DELETE FROM bsrMETHODDATABUCKET \
                WHERE id_bsrMETHOD = %s AND bucket_no = %s", \
                (id_method, bucket_no, ))
        write_message('Inserting new data.', verbose=5)
        run_sql("INSERT into bsrMETHODDATABUCKET \
            (id_bsrMETHOD, bucket_no, bucket_data, bucket_last_value, last_updated) \
            VALUES (%s, %s, %s, %s, %s)"                                        , \
            (id_method, bucket_no, serialized_bucket_data, bucket_last_value, date, ))
    except Error as err:
        write_message("The error [%s] occured when inserting new bibsort data " \
                      "into bsrMETHODATA_BUCKETS table" %err, sys.stderr)
        return False
    write_message('Writing to bsrMETHODDATABUCKET for ' \
                  'bucket number %s completed.' %bucket_no, verbose=5)
    return True
Example #37
def update_bibsort_tables(recids, method, update_timestamp=True):
    """Updates the data structures for sorting method: method
    for the records in recids"""

    res = run_sql(
        "SELECT id, definition, washer \
                  from bsrMETHOD where name = %s", (method, ))
    if res and res[0]:
        method_id = res[0][0]
        definition = res[0][1]
        washer = res[0][2]
    else:
        write_message('No sorting method called %s could be found ' \
                      'in bsrMETHOD table.' %method, sys.stderr)
        return False
    res = run_sql(
        "SELECT data_dict, data_dict_ordered, data_list_sorted \
                  FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, ))
    if res and res[0]:
        data_dict = deserialize_via_marshal(res[0][0])
        data_dict_ordered = {}
        data_list_sorted = []
    else:
        write_message('No data could be found for the sorting method %s.' \
                      %method)
        return False  #since this case should have been treated earlier
    #get the values for the recids that need to be recalculated
    field_data = get_field_data(recids, method, definition)
    if not field_data:
        write_message("Possible error: the method %s has no data for records %s." \
                      %(method, str(recids)))
    else:
        apply_washer(field_data, washer)

    #if a recid is not in field_data that is because no value was found for it
    #so it should be marked for deletion
    recids_to_delete = list(recids.difference(intbitset(field_data.keys())))
    recids_to_insert = []
    recids_to_modify = {}
    for recid in field_data:
        if recid in data_dict:
            if data_dict[recid] != field_data[recid]:
                #we store the old value
                recids_to_modify[recid] = data_dict[recid]
        else:  # recid is new, and needs to be inserted
            recids_to_insert.append(recid)

    #remove the recids that were not previously in bibsort
    recids_to_delete = [
        recid for recid in recids_to_delete if recid in data_dict
    ]

    #dicts to keep the ordered values for the recids - useful for bucket insertion
    recids_current_ordered = {}
    recids_old_ordered = {}

    if recids_to_insert or recids_to_modify or recids_to_delete:
        data_dict_ordered = deserialize_via_marshal(res[0][1])
        data_list_sorted = deserialize_via_marshal(res[0][2])
        if recids_to_modify:
            write_message("%s records have been modified." \
                          %len(recids_to_modify), verbose=5)
            for recid in recids_to_modify:
                recids_old_ordered[recid] = data_dict_ordered[recid]
                perform_modify_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_insert:
            write_message("%s records have been inserted." \
                          %len(recids_to_insert), verbose=5)
            for recid in recids_to_insert:
                perform_insert_record(data_dict, data_dict_ordered, \
                                data_list_sorted, field_data[recid], recid)
        if recids_to_delete:
            write_message("%s records have been deleted." \
                          %len(recids_to_delete), verbose=5)
            for recid in recids_to_delete:
                perform_delete_record(data_dict, data_dict_ordered,
                                      data_list_sorted, recid)

        for recid in recids_to_modify:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        for recid in recids_to_insert:
            recids_current_ordered[recid] = data_dict_ordered[recid]

        #write the modifications to db
        executed = write_to_methoddata_table(method_id, data_dict, \
                                         data_dict_ordered, data_list_sorted, update_timestamp)
        if not executed:
            return False

        #update buckets
        try:
            perform_update_buckets(recids_current_ordered, recids_to_insert,
                                   recids_old_ordered, method_id,
                                   update_timestamp)
        except Error as err:
            write_message("[%s] The bucket data for method %s has not been updated" \
                          %(method, err), sys.stderr)
            return False
    return True
Example #38
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    if run_sql(
            """SELECT id FROM "schTASK" WHERE proc='bibupload:oairepository' AND status='WAITING'"""
    ):
        write_message(
            "Previous requests of oairepository still being elaborated. Let's skip this execution."
        )
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, m='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*',
                                               f=CFG_OAI_SET_FIELD,
                                               m='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)),
                  verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD,
                                               m='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec),
                      verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec),
                      verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec),
                      verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                             prefix='oairepository_' +
                             time.strftime("%Y%m%d_%H%M%S_", time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)),
                      verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)),
            verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)),
                      verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets)
            | (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)),
            verbose=3)

        # OK, we have the old sets and the new sets. If they are equal
        # and the OAI ID does not need to be added, then there is nothing
        # to change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" %
                          recid,
                          verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" %
                      recid,
                      verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record,
                         tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n',
                                              '-Noairepository', '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename,
                                              '-Noairepository', '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                     prefix='oairepository_' +
                                     time.strftime("%Y%m%d_%H%M%S_",
                                                   time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename)
    else:
        os.remove(filename)

    return True
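The task above writes at most CFG_OAI_REPOSITORY_MARCXML_SIZE records per MARCXML file and hands each file to bibupload before starting the next one. Below is a minimal sketch of that batching pattern, assuming a generic emit_chunk callback in place of task_low_level_submission; every name here is hypothetical, not Invenio API.

import os
from tempfile import mkstemp

# Sketch of the "write N records per file, then hand the file off" pattern.
# BATCH_SIZE mirrors CFG_OAI_REPOSITORY_MARCXML_SIZE; emit_chunk stands in
# for submitting a bibupload task. Purely illustrative.
BATCH_SIZE = 3

def flush_records(record_snippets, emit_chunk, tmpdir='/tmp'):
    fd, filename = mkstemp(dir=tmpdir, prefix='oairepository_')
    out = os.fdopen(fd, 'w')
    out.write('<collection>')
    written = 0
    for snippet in record_snippets:
        out.write(snippet)
        written += 1
        if written == BATCH_SIZE:
            out.write('</collection>')
            out.close()
            emit_chunk(filename)           # e.g. submit one upload task
            fd, filename = mkstemp(dir=tmpdir, prefix='oairepository_')
            out = os.fdopen(fd, 'w')
            out.write('<collection>')
            written = 0
    out.write('</collection>')
    out.close()
    if written:
        emit_chunk(filename)               # final partial chunk
    else:
        os.remove(filename)                # nothing left to upload

# Example: flush_records(['<record/>'] * 7, lambda path: None) hands off three files.

The final partial chunk is flushed after the loop, mirroring the tot > 0 check in the task above.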
Exemple #39
0
def write_to_methoddata_table(id_method,
                              data_dict,
                              data_dict_ordered,
                              data_list_sorted,
                              update_timestamp=True):
    """Serialize the date and write it to the bsrMETHODDATA"""
    write_message('Starting serializing the data..', verbose=5)
    serialized_data_dict = serialize_via_marshal(data_dict)
    serialized_data_dict_ordered = serialize_via_marshal(data_dict_ordered)
    serialized_data_list_sorted = serialize_via_marshal(data_list_sorted)
    write_message('Serialization completed.', verbose=5)
    date = strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if not update_timestamp:
        try:
            date = run_sql(
                'SELECT last_updated from bsrMETHODDATA WHERE id_bsrMETHOD = %s',
                (id_method, ))[0][0]
        except IndexError:
            pass  # keep the generated date
    write_message("Starting writing the data for method_id=%s " \
                  "to the database (table bsrMETHODDATA)" %id_method, verbose=5)
    try:
        write_message('Deleting old data..', verbose=5)
        run_sql("DELETE FROM bsrMETHODDATA WHERE id_bsrMETHOD = %s",
                (id_method, ))
        write_message('Inserting new data..', verbose=5)
        run_sql("INSERT into bsrMETHODDATA \
            (id_bsrMETHOD, data_dict, data_dict_ordered, data_list_sorted, last_updated) \
            VALUES (%s, %s, %s, %s, %s)"                                        , \
            (id_method, serialized_data_dict, serialized_data_dict_ordered, \
             serialized_data_list_sorted, date, ))
    except Error as err:
        write_message("The error [%s] occured when inserting new bibsort data "\
                      "into bsrMETHODATA table" %err, sys.stderr)
        return False
    write_message('Writing to the bsrMETHODDATA successfully completed.', \
                  verbose=5)
    return True
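write_to_methoddata_table leans on serialize_via_marshal / deserialize_via_marshal to turn the sorting structures into blobs that fit into the bsrMETHODDATA columns. Here is a minimal round-trip sketch of such a serializer, assuming marshal plus zlib compression; the compression step is an assumption for illustration, not a statement about Invenio's actual implementation.

import marshal
import zlib

# Hypothetical marshal-based (de)serialization round trip.
def my_serialize_via_marshal(obj):
    return zlib.compress(marshal.dumps(obj))

def my_deserialize_via_marshal(blob):
    return marshal.loads(zlib.decompress(blob))

data_dict_ordered = {1: 10, 2: 20, 3: 30}
blob = my_serialize_via_marshal(data_dict_ordered)
assert my_deserialize_via_marshal(blob) == data_dict_ordered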
def citerank(rank_method_code):
    """new ranking method based on the citation graph"""
    write_message("Running rank method: %s" % rank_method_code, verbose=0)
    if not import_numpy:
        write_message('The numpy package could not be imported. \
This package is compulsory for running the citerank methods.')
        return
    try:
        file_ = configuration.get(rank_method_code + '.cfg', '')
        config = ConfigParser.ConfigParser()
        config.readfp(open(file_))
    except StandardError:
        write_message("Cannot find configuration file: %s" % file_, sys.stderr)
        raise StandardError
    # the file for citations needs to have the following format:
    #each line needs to be x[tab]y, where x cites y; x,y are recids
    function = config.get("rank_method", "function")
    try:
        file_for_citations = config.get(function, "file_with_citations")
        cit, dict_of_ids = get_citations_from_file(file_for_citations)
    except (ConfigParser.NoOptionError, StandardError) as err:
        write_message("If you want to read the citation data from file set up \
the file_for_citations parameter in the config file [%s]" % err,
                      verbose=2)
        cit, dict_of_ids = get_citations_from_db()
    len_ = len(dict_of_ids.keys())
    write_message("Number of nodes(papers) to rank : %s" % str(len_),
                  verbose=3)
    if len_ == 0:
        write_message("No citation data found, nothing to be done.")
        return
    try:
        method = config.get(function, "citerank_method")
    except ConfigParser.NoOptionError as err:
        write_message("Exception: %s " % err, sys.stderr)
        raise Exception
    write_message("Running %s method." % method, verbose=2)
    dates = get_dates(function, config, dict_of_ids)
    if method == "citation_time":
        try:
            time_decay = float(config.get(function, "time_decay"))
        except (ConfigParser.NoOptionError, ValueError) as err:
            write_message("Exception: %s" % err, sys.stderr)
            raise Exception
        date_coef = calculate_time_weights(len_, time_decay, dates)
        #cit = remove_loops(cit, dates, dict_of_ids)
        dict_of_ranks = \
            run_citation_rank_time(cit, dict_of_ids, date_coef, dates)
    else:
        try:
            conv_threshold = float(config.get(function, "conv_threshold"))
            check_point = int(config.get(function, "check_point"))
            damping_factor = float(config.get(function, "damping_factor"))
            write_message("Parameters: d = %s, conv_threshold = %s, \
check_point = %s"                  %(str(damping_factor), \
str(conv_threshold), str(check_point)), verbose=5)
        except (ConfigParser.NoOptionError, StandardError) as err:
            write_message("Exception: %s" % err, sys.stderr)
            raise Exception
        if method == "pagerank_classic":
            ref = construct_ref_array(cit, dict_of_ids, len_)
            use_ext_cit = ""
            try:
                use_ext_cit = config.get(function, "use_external_citations")
                write_message("Pagerank will use external citations: %s" \
                   %str(use_ext_cit), verbose=5)
            except (ConfigParser.NoOptionError, StandardError) as err:
                write_message("%s" % err, verbose=2)
            if use_ext_cit == "yes":
                try:
                    ext_citation_file = config.get(function,
                                                   "ext_citation_file")
                    ext_links = get_external_links_from_file(
                        ext_citation_file, ref, dict_of_ids)
                except (ConfigParser.NoOptionError, StandardError):
                    write_message("If you want to read the external citation \
data from file set up the ext_citation_file parameter in the config. file"                                                                          , \
verbose=3)
                    try:
                        reference_tag = config.get(function,
                                                   "ext_reference_tag")
                        dummy = int(reference_tag[0:3])
                    except (ConfigParser.NoOptionError, StandardError):
                        write_message(
                            "You need to set up correctly the \
reference_tag in the cfg file", sys.stderr)
                        raise Exception
                    ext_links = get_external_links_from_db(ref, \
                            dict_of_ids, reference_tag)
                    avg = avg_ext_links_with_0(ext_links)
                    if avg < 1:
                        write_message("This method can't be ran. There is not \
enough information about the external citation. Hint: check the reference tag"                                                                              , \
sys.stderr)
                        raise Exception
                    avg_ext_links_without_0(ext_links)
                try:
                    alpha = float(config.get(function, "ext_alpha"))
                    beta = float(config.get(function, "ext_beta"))
                except (ConfigParser.NoOptionError, StandardError) as err:
                    write_message("Exception: %s" % err, sys.stderr)
                    raise Exception
                dict_of_ranks = run_pagerank_ext(cit, dict_of_ids, ref, \
                ext_links, conv_threshold, check_point, alpha, beta, dates)
            else:
                dict_of_ranks = run_pagerank(cit, dict_of_ids, len_, ref, \
                    damping_factor, conv_threshold, check_point, dates)
        elif method == "pagerank_time":
            try:
                time_decay = float(config.get(function, "time_decay"))
                write_message("Parameter: time_decay = %s" \
                              %str(time_decay), verbose=5)
            except (ConfigParser.NoOptionError, StandardError) as err:
                write_message("Exception: %s" % err, sys.stderr)
                raise Exception
            date_coef = calculate_time_weights(len_, time_decay, dates)
            cit = remove_loops(cit, dates, dict_of_ids)
            ref = construct_ref_array(cit, dict_of_ids, len_)
            dict_of_ranks = run_pagerank_time(cit, dict_of_ids, len_, ref, \
             damping_factor, conv_threshold, check_point, date_coef, dates)
        else:
            write_message(
                "Error: Unknown ranking method. \
Please check the ranking_method parameter in the config. file.", sys.stderr)
            raise Exception
    try:
        filename_ranks = config.get(function, "output_ranks_to_filename")
        max_ranks = config.get(function, "output_rank_limit")
        if not max_ranks.isdigit():
            max_ranks = len_
        else:
            max_ranks = int(max_ranks)
            if max_ranks > len_:
                max_ranks = len_
        ranks = sort_weights(dict_of_ranks)
        write_message("Ranks: %s" % str(ranks), verbose=9)
        write_first_ranks_to_file(ranks, dict_of_ranks, \
                max_ranks, filename_ranks)
    except (ConfigParser.NoOptionError, StandardError):
        write_message("If you want the ranks to be printed in a file you have \
to set output_ranks_to_filename and output_rank_limit \
parameters in the configuration file",
                      verbose=3)
    normalize_weights(dict_of_ranks)
    into_db(dict_of_ranks, rank_method_code)
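citerank() is driven entirely by a per-method .cfg file: a [rank_method] section names the function section, and that section carries the method-specific parameters read above. Below is a hedged example of a configuration that would exercise the pagerank_classic branch; the option names follow what the code reads, but every value and the section name are invented.

# Made-up configuration for illustration only; parsed the same way citerank() does.
import ConfigParser  # Python 2 module name, matching the code above
from StringIO import StringIO

SAMPLE_CFG = """\
[rank_method]
function = citerank_pagerank

[citerank_pagerank]
citerank_method = pagerank_classic
damping_factor = 0.85
conv_threshold = 0.0001
check_point = 10
use_external_citations = no
output_ranks_to_filename = /tmp/citerank_ranks.txt
output_rank_limit = 100
"""

config = ConfigParser.ConfigParser()
config.readfp(StringIO(SAMPLE_CFG))
function = config.get("rank_method", "function")
print(config.get(function, "citerank_method"))   # -> pagerank_classic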
Exemple #41
0
def tweet_to_record(tweet, query):
    """
    Transform a tweet into a record.
    @note: you may want to customize this heavily.
    """
    rec = {}
    ## Let's normalize the body of the tweet.
    text = tweet.text.encode('UTF-8')
    text = text.replace('&gt;', '>')
    text = text.replace('&lt;', '<')
    text = text.replace('&quot;', '"')
    text = text.replace('&amp;', '&')

    ## Let's add the creation date
    try:
        creation_date = time.strptime(tweet.created_at,
                                      '%a, %d %b %Y %H:%M:%S +0000')
    except ValueError:
        creation_date = time.strptime(tweet.created_at,
                                      '%a %b %d %H:%M:%S +0000 %Y')
    record_add_field(rec, '260__c',
                     time.strftime('%Y-%m-%dT%H:%M:%SZ', creation_date))

    ## Let's add the Tweet ID
    record_add_field(rec, '970', subfields=[('a', str(tweet.id))])

    ## Let's add the body of the tweet as an abstract
    record_add_field(rec, '520', subfields=[('a', text)])

    ## Let's re-add the body of the tweet as a title.
    record_add_field(rec, '245', subfields=[('a', text)])

    ## Let's fetch information about the user
    try:
        user = _TWITTER_API.GetUser(tweet.from_user)

        ## Let's add the user name as author of the tweet
        record_add_field(rec,
                         '100',
                         subfields=[('a', str(user.name.encode('UTF-8')))])

        ## Let's fetch the icon of the user profile, and let's upload it as
        ## an image (and an icon of itself)
        record_add_field(rec,
                         'FFT',
                         subfields=[
                             ('a', user.profile.image_url.encode('UTF-8')),
                             ('x', user.profile.image_url.encode('UTF-8'))
                         ])
    except Exception as err:
        write_message("WARNING: issue when fetching the user: %s" % err,
                      stream=sys.stderr)
    if hasattr(tweet, 'iso_language_code'):
        ## Let's add the language of the Tweet if available (this also depends
        ## on the kind of Twitter API call we used)
        record_add_field(rec,
                         '045',
                         subfields=[('a',
                                     tweet.iso_language_code.encode('UTF-8'))])

    ## Let's tag this record as a TWEET so that later we can build a collection
    ## out of these records.
    record_add_field(rec, '980', subfields=[('a', 'TWEET'), ('b', query)])

    ## Some smart manipulations: let's parse out URLs and tags from the body
    ## of the Tweet.
    for url in _RE_GET_HTTP.findall(text):
        url = url[0]
        record_add_field(rec, '856', '4', subfields=[('u', url)])

    for tag in _RE_TAGS.findall(text):
        ## And here we add the keywords.
        record_add_field(rec,
                         '653',
                         '1',
                         subfields=[('a', tag), ('9', 'TWITTER')])

    ## Finally we shall serialize everything to MARCXML
    return record_xml_output(rec)
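tweet_to_record assumes a few module-level helpers (_TWITTER_API, _RE_GET_HTTP, _RE_TAGS) defined elsewhere. The regexes below are plausible stand-ins, given only to make the URL and hashtag extraction above concrete; the real patterns may differ.

import re

# Hypothetical definitions of the module-level regexes used by tweet_to_record.
_RE_GET_HTTP = re.compile(r'((https?://[^\s<>"]+)+)')
_RE_TAGS = re.compile(r'#(\w+)')

text = 'Reading http://cds.cern.ch about #openaccess and #invenio'
print([url[0] for url in _RE_GET_HTTP.findall(text)])  # ['http://cds.cern.ch']
print(_RE_TAGS.findall(text))                          # ['openaccess', 'invenio']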
def bst_weblinkback_updater(mode):
    """
    Update linkbacks
    @param mode: 1 delete rejected, broken and pending linkbacks whose URLs are on the blacklist
                 2 update page titles of new linkbacks
                 3 update page titles of old linkbacks
                 4 update manually set page titles
                 5 detect and disable broken linkbacks
                 6 send notification email for all pending linkbacks
    @type mode: int
    """
    mode = int(mode)
    if mode == 1:
        write_message(
            "Starting to delete rejected and pending linkbacks whose URLs are on the blacklist"
        )
        delete_linkbacks_on_blacklist()
        write_message(
            "Finished deleting rejected and pending linkbacks whose URLs are on the blacklist"
        )
    elif mode == 2:
        write_message("Starting to update the page titles of new linkbacks")
        update_linkbacks(1)
        write_message("Finished updating the page titles of new linkbacks")
    elif mode == 3:
        write_message("Starting to update the page titles of old linkbacks")
        update_linkbacks(2)
        write_message("Finished updating the page titles of old linkbacks")
    elif mode == 4:
        write_message("Starting to update manually set page titles")
        update_linkbacks(3)
        write_message("Finished updating manually set page titles")
    elif mode == 5:
        write_message("Starting to detect and disable broken linkbacks")
        update_linkbacks(4)
        write_message("Finished detecting and disabling broken linkbacks")
    elif mode == 6:
        write_message("Starting to send notification email")
        send_pending_linkbacks_notification(CFG_WEBLINKBACK_TYPE['TRACKBACK'])
        write_message("Finished sending notification email")
def get_dates_from_db(dict_of_ids, publication_year_tag, creation_date_tag):
    """Returns the year of the publication for each paper.
    In case the year is not in the db, the year of the submission is taken"""
    current_year = int(datetime.datetime.now().strftime("%Y"))
    publication_year_db_id = publication_year_tag[0:2]
    creation_date_db_id = creation_date_tag[0:2]
    total = 0
    count = 0
    dict_of_dates = {}
    for recid in dict_of_ids:
        dict_of_dates[recid] = 0
    date_list = run_sql("select id, tag, value from bib" + \
                        publication_year_db_id + "x where tag=%s", \
                        (publication_year_tag, ))
    date_dict = {}
    for item in date_list:
        date_dict[int(item[0])] = item[2]
    pattern = re.compile(r'.*(\d{4}).*')
    date_list = run_sql("select id_bibrec, id_bibxxx, field_number \
                        from bibrec_bib" + publication_year_db_id + "x")
    for item in date_list:
        recid = int(item[0])
        id_ = int(item[1])
        if id_ in date_dict and recid in dict_of_dates:
            reg = pattern.match(date_dict[id_])
            if reg:
                date = int(reg.group(1))
                if date > 1000 and date <= current_year:
                    dict_of_dates[recid] = date
                    total += date
                    count += 1
    not_covered = []
    for recid in dict_of_dates:
        if dict_of_dates[recid] == 0:
            not_covered.append(recid)
    date_list = run_sql("select id, tag, value from bib" + \
                        creation_date_db_id + "x where tag=%s", \
                        (creation_date_tag, ))
    date_dict = {}
    for item in date_list:
        date_dict[int(item[0])] = item[2]
    date_list = run_sql("select id_bibrec, id_bibxxx, field_number \
                        from bibrec_bib" + creation_date_db_id + "x")
    for item in date_list:
        recid = int(item[0])
        id_ = int(item[1])
        if id_ in date_dict and recid in not_covered:
            date = int(str(date_dict[id_])[0:4])
            if date > 1000 and date <= current_year:
                dict_of_dates[recid] = date
                total += date
                count += 1
    dates = {}
    # average (not median) year over the records with a known date;
    # fall back to the current year if no dates could be extracted
    med = total / count if count else current_year
    for recid in dict_of_dates:
        if dict_of_dates[recid] == 0:
            dates[dict_of_ids[recid]] = med
        else:
            dates[dict_of_ids[recid]] = dict_of_dates[recid]
    write_message("Dates extracted", verbose=2)
    write_message("Dates dictionary %s" % str(dates), verbose=9)
    return dates
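The year extraction above combines a four-digit regex over the publication-year field with a creation-date fallback, and assigns the mean year to records where neither source yields a plausible value. Below is a small self-contained sketch of that logic on made-up values; there is no database access, and every name and value is illustrative.

import re

# Toy illustration of the year extraction and mean-year fallback used by
# get_dates_from_db; record ids and raw values are invented.
pattern = re.compile(r'.*(\d{4}).*')
raw_years = {10: '2004-07-01', 11: 'Published in 1998', 12: 'n.d.'}
current_year = 2024

years = {}
total = count = 0
for recid, value in raw_years.items():
    match = pattern.match(value)
    year = int(match.group(1)) if match else 0
    if 1000 < year <= current_year:
        years[recid] = year
        total += year
        count += 1

mean_year = total // count if count else current_year
for recid in raw_years:
    years.setdefault(recid, mean_year)

print(years)  # e.g. {10: 2004, 11: 1998, 12: 2001}

On this toy input, record 12 has no usable date and ends up with the mean of the other two years.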