def solr_commit_if_necessary(next_commit_counter,
                             final_commit=False,
                             recid=None):
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (
            final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)

        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except:
            register_exception(alert_admin=True)
        next_commit_counter = 0

        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)

        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except:
            register_exception(alert_admin=True)
        next_commit_counter = 0

        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
Example #3
def iterate_over_new(recIDs, fmt):
    """Iterate over list of IDs.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken
        to insert)
    """
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call

    tot = len(recIDs)
    reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get(
        fmt.lower(), _update_format)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        reformat_function(recID, fmt)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" %
                          (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
Example #4
def iterate_over_new(recIDs, fmt):
    """Iterate over list of IDs.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken
        to insert)
    """
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    tot = len(recIDs)
    reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get(
        fmt.lower(), _update_format)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        reformat_function(recID, fmt)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" %
                          (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
Example #5
def bst_fibonacci(n=30):
    """
    Small tasklet that prints the Fibonacci sequence for n.
    @param n: how many Fibonacci numbers to print.
    @type n: int
    """
    ## Since it's a tasklet, the parameter might be passed as a string;
    ## it should then be converted to an int.
    n = int(n)
    write_message("Printing %d Fibonacci numbers." % n, verbose=9)
    for i in range(0, n):
        if i > 0 and i % 4 == 0:
            write_message("Error: water in the CPU.  Ignoring and continuing.",
                          sys.stderr,
                          verbose=3)
        elif i > 0 and i % 5 == 0:
            write_message(
                "Error: floppy drive dropped on the floor.  Ignoring and continuing.",
                sys.stderr)
        write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)
    task_update_progress("Done %d out of %d." % (n, n))
    return 1
Example #6
def task_run_core():
    """Run the indexing task. The row argument is the BibSched task
    queue row, containing if, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    if not task_get_option("run"):
        task_set_option("run", [name[0] for name in run_sql("SELECT name from rnkMETHOD")])

    for key in task_get_option("run"):
        task_sleep_now_if_required(can_stop_too=True)
        write_message("")
        filename = configuration.get(key + '.cfg', '')
        write_message("Getting configuration from file: %s" % filename,
            verbose=9)
        config = ConfigParser.ConfigParser()
        try:
            config.readfp(open(filename))
        except StandardError:
            write_message("Cannot find configuration file: %s. "
                "The rankmethod may also not be registered using "
                "the BibRank Admin Interface." % filename, sys.stderr)
            raise

        #Using the function variable to call the function related to the
        #rank method
        cfg_function = config.get("rank_method", "function")
        func_object = globals().get(cfg_function)
        if func_object:
            func_object(key)
        else:
            write_message("Cannot run method '%s', no function to call"
                % key)

    return True
Example #7
def fill_self_cites_tables(config):
    """
    This will fill the self-cites tables with data

    The purpose of this function is to fill these tables on a website that
    never ran the self-cites daemon
    """
    algorithm = config['algorithm']
    tags = get_authors_tags()
    all_ids = [r[0] for r in run_sql('SELECT id FROM bibrec ORDER BY id')]
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)
    if algorithm == 'friends':
        # We only need this table for the friends algorithm or similar
        # Fill intermediary tables
        for index, recid in enumerate(all_ids):
            if index % 1000 == 0:
                msg = 'intermediate %d/%d' % (index, len(all_ids))
                task_update_progress(msg)
                write_message(msg)
                task_sleep_now_if_required()
            update_self_cites_tables(recid, config, tags)
    # Fill self-cites table
    for index, recid in enumerate(all_ids):
        if index % 1000 == 0:
            msg = 'final %d/%d' % (index, len(all_ids))
            task_update_progress(msg)
            write_message(msg)
            task_sleep_now_if_required()
        compute_and_store_self_citations(recid, tags, citations_fun)
Example #8
def task_run_core():
    """Runs the task by fetching arguments from the BibSched task queue.  This is
    what BibSched will be invoking via daemon call.
    The task prints Fibonacci numbers for up to NUM on the stdout, and some
    messages on stderr.
    Return 1 in case of success and 0 in case of failure."""
    n = int(task_get_option('number'))
    write_message("Printing %d Fibonacci numbers." % n, verbose=9)
    for i in range(0, n):
        if i > 0 and i % 4 == 0:
            write_message("Error: water in the CPU.  Ignoring and continuing.",
                          sys.stderr,
                          verbose=3)
        elif i > 0 and i % 5 == 0:
            write_message(
                "Error: floppy drive dropped on the floor.  Ignoring and continuing.",
                sys.stderr)
            if task_get_option('error'):
                1 / 0
        write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)
    task_update_progress("Done %d out of %d." % (n, n))
    return 1
Example #9
def iterate_over_new(list, fmt):
    """
    Iterate over list of IDs

    @param list: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    global total_rec

    formatted_records = ''      # (string-)List of formatted record of an iteration
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
Example #10
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """
    Generate sitemaps themselves.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap file(s)
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
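        # Roll over to a new sitemap file once the current one reaches the size or URL limit (checked every 100 URLs)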
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
                                lastmod = lastmod,
                                changefreq = DEFAULT_CHANGEFREQ_RECORDS,
                                priority = DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s" % (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
Example #11
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """
    Generate sitemaps themselves.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap file(s)
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
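        # Roll over to a new sitemap file once the current one reaches the size or URL limit (checked every 100 URLs)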
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
                                lastmod = lastmod,
                                changefreq = DEFAULT_CHANGEFREQ_RECORDS,
                                priority = DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s" % (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
Example #12
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given"""
    write_message("Loading knowledgebase file", verbose=9)
    kb_data = {}
    records = []

    write_message("Reading knowledgebase file: %s" % \
                   config.get(config.get("rank_method", "function"), "kb_src"))

    kb_src = config.get(config.get("rank_method", "function"), "kb_src").strip()
    # Find path from configuration registry by knowledge base name.
    kb_src_clean = configuration.get(kb_src)

    with open(kb_src_clean, 'r') as kb_file:
        data = kb_file.readlines()

    for line in data:
        if not line.startswith("#"):
            parts = line.strip().split("---")
            kb_data[parts[0].strip()] = parts[1]
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(config.get("rank_method", "function"), "tag")
    tags = config.get(config.get("rank_method", "function"), "check_mandatory_tags").split(", ")
    if tags == ['']:
        tags = ""

    records = []
    for (recids, recide) in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (tag, recids, recide))
        valid = intbitset(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = intbitset()
            newset += [recid[0] for recid in (run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (key, recids, recide)))]
            valid.intersection_update(newset)
        if tags:
            recs = filter(lambda x: x[0] in valid, recs)
        records = records + list(recs)
        write_message("Number of records found with the necessary tags: %s" % len(records))

    records = filter(lambda x: x[0] in options["validset"], records)
    rnkset = {}
    for key, value in records:
        if value in kb_data:
            if key not in rnkset:
                rnkset[key] = float(kb_data[value])
            else:
                if float(kb_data[value]) > rnkset[key]:
                    rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0

    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset
Example #13
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    # Dictionary of "plugin_name" -> func
    tickets_to_apply = task_get_option('tickets')
    write_message("Ticket plugins found: %s" % (str(tickets_to_apply), ),
                  verbose=9)

    task_update_progress("Loading records")
    records_concerned = get_recids_to_load()
    write_message("%i record(s) found" % (len(records_concerned), ))

    records_processed = 0
    for record, last_date in load_records_from_id(records_concerned):
        records_processed += 1
        recid = record_id_from_record(record)
        task_update_progress(
            "Processing records %s/%s (%i%%)" %
            (records_processed, len(records_concerned),
             int(float(records_processed) / len(records_concerned) * 100)))
        task_sleep_now_if_required(can_stop_too=True)
        for ticket_name, plugin in tickets_to_apply.items():
            if plugin:
                write_message("Running template %s for %s" %
                              (ticket_name, recid),
                              verbose=5)
                try:
                    ticket = BibCatalogTicket(recid=int(recid))
                    if plugin['check_record'](ticket, record):
                        ticket = plugin['generate_ticket'](ticket, record)
                        write_message("Ticket to be generated: %s" %
                                      (ticket, ),
                                      verbose=5)
                        res = ticket.submit()
                        if res:
                            write_message("Ticket #%s created for %s" %
                                          (ticket.ticketid, recid))
                        else:
                            write_message("Ticket already exists for %s" %
                                          (recid, ))
                    else:
                        write_message("Skipping record %s", (recid, ))
                except Exception, e:
                    write_message("Error submitting ticket for record %s:" %
                                  (recid, ))
                    write_message(traceback.format_exc())
                    raise e
            else:
                raise BibCatalogPluginException("Plugin not valid in %s" %
                                                (ticket_name, ))

        if last_date:
            store_last_updated(recid, last_date, name="bibcatalog")
Example #14
    def step(msg_prefix, recid, done, total):
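        # Yield to BibSched every 30 records and report progress every 1000 records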
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)
Example #15
    def step(msg_prefix, recid, done, total):
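        # Yield to BibSched every 30 records and report progress every 1000 records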
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)
Example #16
def process_records(name, records, func, extra_vars):
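    # Run the extraction function on each record, yielding to BibSched between records and remembering the last update date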
    count = 1
    total = len(records)
    for recid, date in records:
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count, total)
        task_update_progress(msg)
        write_message(msg)
        func(recid, **extra_vars)
        if date:
            store_last_updated(recid, date, name)
        count += 1
Example #17
def process_records(name, records, func, extra_vars):
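    # Run the extraction function on each record, yielding to BibSched between records and remembering the last update date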
    count = 1
    total = len(records)
    for recid, date in records:
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count, total)
        task_update_progress(msg)
        write_message(msg)
        func(recid, **extra_vars)
        if date:
            store_last_updated(recid, date, name)
        count += 1
Example #18
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    # Dictionary of "plugin_name" -> func
    tickets_to_apply = task_get_option('tickets')
    write_message("Ticket plugins found: %s" %
                  (str(tickets_to_apply),), verbose=9)

    task_update_progress("Loading records")
    records_concerned = get_recids_to_load()
    write_message("%i record(s) found" %
                  (len(records_concerned),))

    records_processed = 0
    for record, last_date in load_records_from_id(records_concerned):
        records_processed += 1
        recid = record_id_from_record(record)
        task_update_progress("Processing records %s/%s (%i%%)"
                             % (records_processed, len(records_concerned),
                                int(float(records_processed) / len(records_concerned) * 100)))
        task_sleep_now_if_required(can_stop_too=True)
        for ticket_name, plugin in tickets_to_apply.items():
            if plugin:
                write_message("Running template %s for %s" % (ticket_name, recid),
                              verbose=5)
                try:
                    ticket = BibCatalogTicket(recid=int(recid))
                    if plugin['check_record'](ticket, record):
                        ticket = plugin['generate_ticket'](ticket, record)
                        write_message("Ticket to be generated: %s" % (ticket,), verbose=5)
                        res = ticket.submit()
                        if res:
                            write_message("Ticket #%s created for %s" %
                                         (ticket.ticketid, recid))
                        else:
                            write_message("Ticket already exists for %s" %
                                          (recid,))
                    else:
                        write_message("Skipping record %s", (recid,))
                except Exception, e:
                    write_message("Error submitting ticket for record %s:" % (recid,))
                    write_message(traceback.format_exc())
                    raise e
            else:
                raise BibCatalogPluginException("Plugin not valid in %s" % (ticket_name,))

        if last_date:
            store_last_updated(recid, last_date, name="bibcatalog")
Example #19
def process_and_store(recids, config, chunk_size):
    # Limit on the number of citations we can lose in one chunk
    function = config.get("rank_method", "function")
    citation_loss_limit = int(config.get(function, "citation_loss_limit"))
    # If we have nothing to process
    # Do not update the weights dictionary
    modified = False
    # Process recent records first
    # The older records were most likely added by the above steps
    # to be reprocessed so they only have minor changes
    recids_iter = iter(sorted(recids, reverse=True))

    # Split records to process into chunks so that we do not
    # fill up too much memory
    while True:
        task_sleep_now_if_required()

        chunk = list(islice(recids_iter, chunk_size))
        if not chunk:
            break

        write_message("Processing chunk #%s to #%s" % (chunk[0], chunk[-1]))
        # The core work
        cites, refs = process_chunk(chunk, config)
        # Check that we haven't lost too many citations
        cites_diff = compute_dicts_diff(chunk, refs, cites)
        write_message("Citations balance %s" % cites_diff)
        if citation_loss_limit and cites_diff <= -citation_loss_limit:
            raise Exception("Lost too many references, aborting")

        # Store processed citations/references
        store_dicts(chunk, refs, cites)
        modified = True

    # Compute new weights dictionary
    if modified:
        weights = compute_weights()
    else:
        weights = None

    store_weights_cache(weights)

    return weights
Example #20
def process_and_store(recids, config, chunk_size):
    # Limit on the number of citations we can lose in one chunk
    function = config.get("rank_method", "function")
    citation_loss_limit = int(config.get(function, "citation_loss_limit"))
    # If we have nothing to process
    # Do not update the weights dictionary
    modified = False
    # Process recent records first
    # The older records were most likely added by the above steps
    # to be reprocessed so they only have minor changes
    recids_iter = iter(sorted(recids, reverse=True))

    # Split records to process into chunks so that we do not
    # fill up too much memory
    while True:
        task_sleep_now_if_required()

        chunk = list(islice(recids_iter, chunk_size))
        if not chunk:
            break

        write_message("Processing chunk #%s to #%s" % (chunk[0], chunk[-1]))
        # The core work
        cites, refs = process_chunk(chunk, config)
        # Check that we haven't lost too many citations
        cites_diff = compute_dicts_diff(chunk, refs, cites)
        write_message("Citations balance %s" % cites_diff)
        if citation_loss_limit and cites_diff <= -citation_loss_limit:
            raise Exception('Lost too many references, aborting')

        # Store processed citations/references
        store_dicts(chunk, refs, cites)
        modified = True

    # Compute new weights dictionary
    if modified:
        weights = compute_weights()
    else:
        weights = None

    store_weights_cache(weights)

    return weights
Example #21
def run_bibsort_rebalance(method_list=None):
    """Rebalances all buckets for the methods in method_list"""
    bibsort_methods, errors = get_bibsort_methods_details(method_list)
    if errors:
        return False
    if not bibsort_methods:
        write_message('No methods found.. exiting rebalancing.')
        return True
    #check if there are only ranking methods -> no need for recids
    rnk_methods = get_rnk_methods(bibsort_methods)
    non_rnk_method = [
        method for method in bibsort_methods.keys()
        if method not in rnk_methods
    ]

    write_message('Running rebalancing for methods: %s' %
                  bibsort_methods.keys())

    if non_rnk_method:  # we also have 'normal' (non-RNK) methods, so we need the recids
        recids = get_all_recids(including_deleted=False)
        write_message('Rebalancing will run for %s records.' \
                      %str(len(recids)), verbose=5)
        task_sleep_now_if_required(can_stop_too=True)
    else:
        recids = intbitset([])
        write_message('Rebalancing will run only for RNK methods')
    for name in bibsort_methods:
        task_update_progress('Rebalancing %s method.' % name)
        write_message('Starting sorting the data for %s method ... ' \
                          %name.upper())
        executed_ok = run_sorting_method(recids, name,
                                         bibsort_methods[name]['id'],
                                         bibsort_methods[name]['definition'],
                                         bibsort_methods[name]['washer'])
        if not executed_ok:
            write_message('Method %s could not be executed correctly.' \
                          %name, sys.stderr)
            return False
        write_message('Done.')
        task_sleep_now_if_required(can_stop_too=True)
    task_update_progress('Rebalancing done.')
    return True
Example #22
def calculate_index_term_count(config):
    """Calculate the weight of a record set based on number of enries of a
    tag from the record in another index...useful for authority files"""

    records = []

    if config.has_section("index_term_count"):
        index = config.get("index_term_count", "index_table_name")
        tag = config.get("index_term_count", "index_term_value_from_tag")
        # check against possible SQL injection:
        dummy = get_table_update_time(index)
        tag = wash_table_column_name(tag)
    else:
        raise Exception("Config file " + config +
                        " does not have index_term_count section")
        return ()

    task_sleep_now_if_required(can_stop_too=True)
    write_message("......Processing all records")
    query = "SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id" % \
            (tag[0:2], tag[0:2]) # we checked that tag is safe
    records = list(run_sql(query, (tag, )))
    write_message("Number of records found with the necessary tags: %s" %
                  len(records))

    rnkset = {}
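    # For each record value, count its hits in the other index; the stored hitlist is a serialized intbitset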
    for key, value in records:
        hits = 0
        if len(value):
            query = "SELECT hitlist from %s where term = %%s" % index  # we checked that index is a table
            row = run_sql(query, (value, ))
            if row and row[0] and row[0][0]:
                #has to be prepared for corrupted data!
                try:
                    hits = len(intbitset(row[0][0]))
                except:
                    hits = 0
        rnkset[key] = hits
    write_message("Number of records available in rank method: %s" %
                  len(rnkset))
    return rnkset
Example #23
def process_updates(rank_method_code):
    """
    This is what gets executed first when the task is started.
    It handles the --rebuild option. If that option is not specified
    we fall back to process_one().
    """
    write_message("Running rank method: %s" % rank_method_code, verbose=0)

    selfcites_config = read_configuration(rank_method_code)
    config = {
        'algorithm':
        selfcites_config.get(rank_method_code, "algorithm"),
        'friends_threshold':
        selfcites_config.get(rank_method_code, "friends_threshold")
    }
    quick = task_get_option("quick") != "no"
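    # If --quick is 'no', rebuild the tables from scratch; otherwise update only the concerned records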
    if not quick:
        return rebuild_tables(rank_method_code, config)

    tags = get_authors_tags()
    recids, end_date = fetch_concerned_records(rank_method_code,
                                               task_get_option("id"))
    citations_fun = get_citations_fun(config['algorithm'])
    weights = fromDB(rank_method_code)

    write_message("recids %s" % str(recids))

    total = len(recids)
    for count, recid in enumerate(recids):
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count + 1, total)
        task_update_progress(msg)
        write_message(msg)

        process_one(recid, tags, citations_fun, weights)

    intoDB(weights, end_date, rank_method_code)
    store_weights_cache(weights)

    write_message("Complete")
    return True
Example #24
def bst_fibonacci(n=30):
    """
    Small tasklet that prints the Fibonacci sequence for n.
    @param n: how many Fibonacci numbers to print.
    @type n: int
    """
    ## Since it's a tasklet, the parameter might be passed as a string;
    ## it should then be converted to an int.
    n = int(n)
    write_message("Printing %d Fibonacci numbers." % n, verbose=9)
    for i in range(0, n):
        if i > 0 and i % 4 == 0:
            write_message("Error: water in the CPU.  Ignoring and continuing.", sys.stderr, verbose=3)
        elif i > 0 and i % 5 == 0:
            write_message("Error: floppy drive dropped on the floor.  Ignoring and continuing.", sys.stderr)
        write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)
    task_update_progress("Done %d out of %d." % (n, n))
    return 1
Example #25
def task_run_core():
    """Runs the task by fetching arguments from the BibSched task queue.  This is
    what BibSched will be invoking via daemon call.
    The task prints Fibonacci numbers for up to NUM on the stdout, and some
    messages on stderr.
    Return 1 in case of success and 0 in case of failure."""
    n = int(task_get_option('number'))
    write_message("Printing %d Fibonacci numbers." % n, verbose=9)
    for i in range(0, n):
        if i > 0 and i % 4 == 0:
            write_message("Error: water in the CPU.  Ignoring and continuing.", sys.stderr, verbose=3)
        elif i > 0 and i % 5 == 0:
            write_message("Error: floppy drive dropped on the floor.  Ignoring and continuing.", sys.stderr)
            if task_get_option('error'):
                1 / 0
        write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)
    task_update_progress("Done %d out of %d." % (n, n))
    return 1
Example #26
def fill_self_cites_tables(rank_method_code, config):
    """
    This will fill the self-cites tables with data

    The purpose of this function is to fill these tables on a website that
    never ran the self-cites daemon

    This is an optimization when running on empty tables, and we hope the
    result is the same as compute_and_store_self_citations.
    """
    begin_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    algorithm = config['algorithm']
    tags = get_authors_tags()
    selfcites_dic = {}
    all_ids = intbitset(run_sql('SELECT id FROM bibrec ORDER BY id'))
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)
    if algorithm == 'friends':
        # We only need this table for the friends algorithm or similar
        # Fill intermediary tables
        for index, recid in enumerate(all_ids):
            if index % 1000 == 0:
                msg = 'intermediate %d/%d' % (index, len(all_ids))
                task_update_progress(msg)
                write_message(msg)
                task_sleep_now_if_required()
            update_self_cites_tables(recid, config, tags)
    # Fill self-cites table
    for index, recid in enumerate(all_ids):
        if index % 1000 == 0:
            msg = 'final %d/%d' % (index, len(all_ids))
            task_update_progress(msg)
            write_message(msg)
            task_sleep_now_if_required()
        compute_and_store_self_citations(recid,
                                         tags,
                                         citations_fun,
                                         selfcites_dic)
    intoDB(selfcites_dic, begin_date, rank_method_code)
    store_weights_cache(selfcites_dic)
Example #27
def calculate_index_term_count(config):
    """Calculate the weight of a record set based on number of enries of a
    tag from the record in another index...useful for authority files"""

    records = []

    if config.has_section("index_term_count"):
        index = config.get("index_term_count","index_table_name")
        tag = config.get("index_term_count","index_term_value_from_tag")
        # check against possible SQL injection:
        dummy = get_table_update_time(index)
        tag = wash_table_column_name(tag)
    else:
        raise Exception("Config file " + config + " does not have index_term_count section")
        return()

    task_sleep_now_if_required(can_stop_too=True)
    write_message("......Processing all records")
    query = "SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id" % \
            (tag[0:2], tag[0:2]) # we checked that tag is safe
    records = list(run_sql(query, (tag,)))
    write_message("Number of records found with the necessary tags: %s" % len(records))


    rnkset = {}
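    # For each record value, count its hits in the other index; the stored hitlist is a serialized intbitset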
    for key, value in records:
        hits = 0
        if len(value):
            query = "SELECT hitlist from %s where term = %%s" % index # we checked that index is a table
            row = run_sql(query, (value,))
            if row and row[0] and row[0][0]:
                #has to be prepared for corrupted data!
                try:
                    hits = len(intbitset(row[0][0]))
                except:
                    hits = 0
        rnkset[key] = hits
    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset
Example #28
def process_updates(rank_method_code):
    """
    This is what gets executed first when the task is started.
    It handles the --rebuild option. If that option is not specified
    we fall back to process_one().
    """
    write_message("Running rank method: %s" % rank_method_code, verbose=0)

    selfcites_config = read_configuration(rank_method_code)
    config = {
        'algorithm': selfcites_config.get(rank_method_code, "algorithm"),
        'friends_threshold': selfcites_config.get(rank_method_code, "friends_threshold")
    }
    quick = task_get_option("quick") != "no"
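    # If --quick is 'no', rebuild the tables from scratch; otherwise update only the concerned records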
    if not quick:
        return rebuild_tables(rank_method_code, config)

    tags = get_authors_tags()
    recids, end_date = fetch_concerned_records(rank_method_code,
                                               task_get_option("id"))
    citations_fun = get_citations_fun(config['algorithm'])
    weights = fromDB(rank_method_code)

    write_message("recids %s" % str(recids))

    total = len(recids)
    for count, recid in enumerate(recids):
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count + 1, total)
        task_update_progress(msg)
        write_message(msg)

        process_one(recid, tags, citations_fun, weights)

    intoDB(weights, end_date, rank_method_code)
    store_weights_cache(weights)

    write_message("Complete")
    return True
Example #29
def fill_self_cites_tables(rank_method_code, config):
    """
    This will fill the self-cites tables with data

    The purpose of this function is to fill these tables on a website that
    never ran the self-cites daemon

    This is an optimization when running on empty tables, and we hope the
    result is the same as compute_and_store_self_citations.
    """
    begin_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    algorithm = config['algorithm']
    tags = get_authors_tags()
    selfcites_dic = {}
    all_ids = intbitset(run_sql('SELECT id FROM bibrec ORDER BY id'))
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)
    if algorithm == 'friends':
        # We only need this table for the friends algorithm or similar
        # Fill intermediary tables
        for index, recid in enumerate(all_ids):
            if index % 1000 == 0:
                msg = 'intermediate %d/%d' % (index, len(all_ids))
                task_update_progress(msg)
                write_message(msg)
                task_sleep_now_if_required()
            update_self_cites_tables(recid, config, tags)
    # Fill self-cites table
    for index, recid in enumerate(all_ids):
        if index % 1000 == 0:
            msg = 'final %d/%d' % (index, len(all_ids))
            task_update_progress(msg)
            write_message(msg)
            task_sleep_now_if_required()
        compute_and_store_self_citations(recid, tags, citations_fun,
                                         selfcites_dic)
    intoDB(selfcites_dic, begin_date, rank_method_code)
    store_weights_cache(selfcites_dic)
Example #30
def run_bibsort_rebalance(method_list=None):
    """Rebalances all buckets for the methods in method_list"""
    bibsort_methods, errors = get_bibsort_methods_details(method_list)
    if errors:
        return False
    if not bibsort_methods:
        write_message('No methods found.. exiting rebalancing.')
        return True
    #check if there are only ranking methods -> no need for recids
    rnk_methods = get_rnk_methods(bibsort_methods)
    non_rnk_method = [method for method in bibsort_methods.keys() if method not in rnk_methods]

    write_message('Running rebalancing for methods: %s' %bibsort_methods.keys())

    if non_rnk_method:  # we also have 'normal' (non-RNK) methods, so we need the recids
        recids = get_all_recids(including_deleted=False)
        write_message('Rebalancing will run for %s records.' \
                      %str(len(recids)), verbose=5)
        task_sleep_now_if_required(can_stop_too=True)
    else:
        recids = intbitset([])
        write_message('Rebalancing will run only for RNK methods')
    for name in bibsort_methods:
        task_update_progress('Rebalancing %s method.' %name)
        write_message('Starting sorting the data for %s method ... ' \
                          %name.upper())
        executed_ok = run_sorting_method(recids, name,
                                bibsort_methods[name]['id'],
                                bibsort_methods[name]['definition'],
                                bibsort_methods[name]['washer'])
        if not executed_ok:
            write_message('Method %s could not be executed correctly.' \
                          %name, sys.stderr)
            return False
        write_message('Done.')
        task_sleep_now_if_required(can_stop_too=True)
    task_update_progress('Rebalancing done.')
    return True
Example #31
def process_updates(rank_method_code):
    """
    This is what gets executed first when the task is started.
    It handles the --rebuild option. If that option is not specified
    we fall back to process_one().
    """
    selfcites_config = read_configuration(rank_method_code)
    config = {
        'algorithm': selfcites_config.get(rank_method_code, "algorithm"),
        'friends_threshold': selfcites_config.get(rank_method_code, "friends_threshold")
    }
    begin_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    quick = task_get_option("quick") != "no"
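    # If --quick is 'no', rebuild the tables from scratch; otherwise update only the concerned records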
    if not quick:
        return rebuild_tables(config)

    write_message("Starting")

    tags = get_authors_tags()
    recids = fetch_concerned_records(rank_method_code)
    citations_fun = get_citations_fun(config['algorithm'])

    write_message("recids %s" % str(recids))

    total = len(recids)
    for count, recid in enumerate(recids):
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count + 1, total)
        task_update_progress(msg)
        write_message(msg)

        process_one(recid, tags, citations_fun)

    store_last_updated(rank_method_code, begin_date)

    write_message("Complete")
    return True
Example #32
def process_and_store(recids, config, dicts, chunk_size, quick):
    # Process recent records first
    # The older records were most likely added by the above steps
    # to be reprocessed so they only have minor changes
    recids_iter = iter(sorted(recids, reverse=True))

    # Split records to process into chunks so that we do not
    # fill up too much memory
    while True:
        task_sleep_now_if_required()
        chunk = list(islice(recids_iter, chunk_size))
        if not chunk:
            if not quick:
                store_dicts(dicts)
            break
        write_message("Processing chunk #%s to #%s" % (chunk[0], chunk[-1]))

        # dicts are modified in-place
        process_chunk(chunk, config, dicts)

        if quick:
            # Store partial result as it is just an update and not
            # a creation from scratch
            store_dicts(dicts)
Example #33
def wait_for_task(task_id):
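    # Poll the BibSched task table until the given task reaches a terminal state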
    sql = 'select status from schTASK where id = %s'
    while run_sql(sql, [task_id])[0][0] not in ('DONE', 'ACK', 'ACK DONE'):
        task_sleep_now_if_required(True)
        time.sleep(5)
Example #34
def bibrank_engine(run):
    """Run the indexing task.
    Return 1 in case of success and 0 in case of failure.
    """
    startCreate = time.time()

    options["run"] = []
    options["run"].append(run)
    for rank_method_code in options["run"]:
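        # For each rank method: load its configuration, work out which record IDs to process,
        # and dispatch to the configured function for the requested command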
        task_sleep_now_if_required(can_stop_too=True)
        cfg_name = getName(rank_method_code)
        write_message("Running rank method: %s." % cfg_name)
        config = load_config(rank_method_code)
        cfg_short = rank_method_code
        cfg_function = config.get("rank_method", "function") + "_exec"
        cfg_repair_function = config.get("rank_method", "function") + "_repair_exec"
        cfg_name = getName(cfg_short)
        options["validset"] = get_valid_range(rank_method_code)

        if task_get_option("collection"):
            l_of_colls = string.split(task_get_option("collection"), ", ")
            recIDs = perform_request_search(c=l_of_colls)
            recIDs_range = []
            for recID in recIDs:
                recIDs_range.append([recID, recID])
            options["recid_range"] = recIDs_range
        elif task_get_option("id"):
            options["recid_range"] = task_get_option("id")
        elif task_get_option("modified"):
            options["recid_range"] = add_recIDs_by_date(rank_method_code, task_get_option("modified"))
        elif task_get_option("last_updated"):
            options["recid_range"] = add_recIDs_by_date(rank_method_code)
        else:
            write_message("No records specified, updating all", verbose=2)
            min_id = run_sql("SELECT min(id) from bibrec")[0][0]
            max_id = run_sql("SELECT max(id) from bibrec")[0][0]
            options["recid_range"] = [[min_id, max_id]]

        if task_get_option("quick") == "no":
            write_message("Recalculate parameter not used, parameter ignored.", verbose=9)

        if task_get_option("cmd") == "del":
            del_recids(cfg_short, options["recid_range"])
        elif task_get_option("cmd") == "add":
            func_object = globals().get(cfg_function)
            func_object(rank_method_code, cfg_name, config)
        elif task_get_option("cmd") == "stat":
            rank_method_code_statistics(rank_method_code)
        elif task_get_option("cmd") == "check":
            check_method(rank_method_code)
        elif task_get_option("cmd") == "print-missing":
            func_object = globals().get(cfg_function)
            func_object(rank_method_code, cfg_name, config)
        elif task_get_option("cmd") == "repair":
            func_object = globals().get(cfg_repair_function)
            func_object()
        else:
            write_message("Invalid command found processing %s" % rank_method_code, sys.stderr)
            raise StandardError

    if task_get_option("verbose"):
        showtime((time.time() - startCreate))
    return 1
Example #35
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')

    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                              len(onto_rec['recIDs'])), stream=sys.stderr,
                verbose=3)
        else:
            bibtask.write_message('INFO: Applying taxonomy %s to recIDs %s. ' %
                                  (onto_rec['ontology'],
                                   ', '.join([str(recid) for recid in
                                              onto_rec['recIDs']])),
                                  stream=sys.stderr, verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'],
                                     onto_rec['ontology'],
                                     onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr, verbose=0)
    else:
        bibtask.write_message(
            "WARNING: No keywords found, recids: %s" % onto_recids,
            stream=sys.stderr, verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
Example #36
def task_run_core():
    """ Walks through all directories where metadata files are located
        and uploads them.
        Files are then moved to the corresponding DONE folders.
    """
    daemon_dir = CFG_BATCHUPLOADER_DAEMON_DIR[0] == '/' and CFG_BATCHUPLOADER_DAEMON_DIR \
                 or CFG_PREFIX + '/' + CFG_BATCHUPLOADER_DAEMON_DIR
    # Check if directory /batchupload exists
    if not task_get_option('documents'):
        # Metadata upload
        parent_dir = daemon_dir + "/metadata/"
        progress = 0
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        list_of_folders = [
            "insert", "append", "correct", "replace", "holdingpen"
        ]
        for folder in list_of_folders:
            files_dir = os.path.join(parent_dir, folder)
            files_done_dir = os.path.join(files_dir, "DONE")
            try:
                files = os.listdir(files_dir)
            except OSError as e:
                os.mkdir(files_dir)
                files = []
                write_message(e, sys.stderr)
                write_message("Created new folder %s" % (files_dir, ))
            # Create directory DONE/ if doesn't exist
            try:
                os.mkdir(files_done_dir)
            except OSError:
                # Directory exists
                pass
            for metafile in files:
                if os.path.isfile(os.path.join(files_dir, metafile)):
                    # Create temporary file to be uploaded
                    (fd, filename) = tempfile.mkstemp(
                        prefix=metafile + "_" +
                        time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
                        dir=CFG_TMPSHAREDDIR)
                    shutil.copy(os.path.join(files_dir, metafile), filename)
                    # Send bibsched task
                    mode = "--" + folder
                    jobid = str(
                        task_low_level_submission('bibupload', 'batchupload',
                                                  mode, filename))
                    # Move file to done folder
                    filename = metafile + "_" + time.strftime(
                        "%Y%m%d%H%M%S", time.localtime()) + "_" + jobid
                    os.rename(os.path.join(files_dir, metafile),
                              os.path.join(files_done_dir, filename))
                    task_sleep_now_if_required(can_stop_too=True)
            progress += 1
            task_update_progress("Done %d out of %d." %
                                 (progress, len(list_of_folders)))
    else:
        # Documents upload
        parent_dir = daemon_dir + "/documents/"
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        matching_order = CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY
        for folder in ["append/", "revise/"]:
            try:
                os.mkdir(parent_dir + folder)
            except:
                pass
            for matching in matching_order:
                errors = document_upload(folder=parent_dir + folder,
                                         matching=matching,
                                         mode=folder[:-1])[0]
                if not errors:
                    break  # All documents succeeded with that matching
                for error in errors:
                    write_message(
                        "File: %s - %s with matching %s" %
                        (error[0], error[1], matching), sys.stderr)
            task_sleep_now_if_required(can_stop_too=True)
    return 1
Example #37
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot), verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid), verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" % (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
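    # For every OAI set, work out which records should be added or removed and collect all affected record IDs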
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s" % (len(should_recids), set_spec, len(current_recids), set_spec), verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" % (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" % (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" % (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" % len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)), verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid, verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" % (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" % (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_SET_FIELD[:3], ind1=CFG_OAI_SET_FIELD[3], ind2=CFG_OAI_SET_FIELD[4], code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" % (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_PREVIOUS_SET_FIELD[:3], ind1=CFG_OAI_PREVIOUS_SET_FIELD[3], ind2=CFG_OAI_PREVIOUS_SET_FIELD[4], code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set for _set, _recids in iteritems(recids_for_set)
             if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" % (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(_set for _set in (current_previous_oai_sets - updated_oai_sets) |
             (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" % recid, verbose=3)
            continue # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
                else:
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename)
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c', filename)
    else:
        os.remove(filename)

    return True
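
# Hedged sketch (illustration only, not part of the task above): the chunked-upload
# pattern used by oairepositoryupdater_task, in isolation. Records are streamed into
# a temporary MARCXML file and, once CHUNK_SIZE records are written, the chunk is
# handed to a submit callback (e.g. a bibupload '-c' task) and a fresh file is opened.
# write_record_xml and submit_chunk are hypothetical callables supplied by the caller.
import os
import time
from tempfile import mkstemp

CHUNK_SIZE = 1000  # stands in for CFG_OAI_REPOSITORY_MARCXML_SIZE

def _open_chunk(tmpdir):
    fd, path = mkstemp(dir=tmpdir, prefix='oairepository_' +
                       time.strftime("%Y%m%d_%H%M%S_", time.localtime()))
    out = os.fdopen(fd, "w")
    out.write("<collection>")
    return out, path

def upload_in_chunks(records, write_record_xml, submit_chunk, tmpdir='/tmp'):
    out, path = _open_chunk(tmpdir)
    written = 0
    for record in records:
        out.write(write_record_xml(record))
        written += 1
        if written == CHUNK_SIZE:
            out.write("</collection>")
            out.close()
            submit_chunk(path)
            out, path = _open_chunk(tmpdir)
            written = 0
    out.write("</collection>")
    out.close()
    if written:
        submit_chunk(path)
    else:
        os.remove(path)
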
Beispiel #38
0
def _analyze_documents(
        records,
        taxonomy_name,
        collection,
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message(
            "WARNING: No records were found in collection %s." % collection,
            stream=sys.stderr,
            verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files(
        )  # TODO: why doesn't this call list_all_files()?
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message(
                    'INFO: Generating keywords for record %d.' % record,
                    stream=sys.stderr,
                    verbose=3)
                fulltext = doc.get_path()

                single_keywords, composite_keywords, author_keywords, acronyms = \
                    engine.get_keywords_from_local_file(fulltext,
                                                        taxonomy_name,
                                                        with_author_keywords=True,
                                                        output_mode="raw",
                                                        output_limit=output_limit,
                                                        match_mode='partial')
            else:
                bibtask.write_message(
                    'WARNING: BibClassify does not know how to process \
                    doc: %s (type: %s) -- ignoring it.' %
                    (doc.fullpath, doc.doctype),
                    stream=sys.stderr,
                    verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = engine.clean_before_output(single_keywords)
                cleaned_composite = engine.clean_before_output(
                    composite_keywords)
                # merge the groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            acro.update(acronyms)
            akws.update(author_keywords)

        if len(keywords):
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(
                engine._output_marc(keywords.items(), (),
                                    akws,
                                    acro,
                                    spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                                  record,
                                  stream=sys.stderr,
                                  verbose=0)

        _INDEX += 1

        bibtask.task_update_progress('Done %d out of %d.' %
                                     (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
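
# Hedged usage sketch: _analyze_documents returns a single MARCXML string covering all
# processed records (or False when no records were found), so callers such as
# _task_run_core wrap it in a <collection> envelope before uploading. The recids,
# taxonomy and collection name below are purely illustrative.
xml = _analyze_documents([1, 2, 3], 'HEP', 'Preprints')
if xml:
    marcxml = ('<?xml version="1.0" encoding="UTF-8"?>\n'
               '<collection xmlns="http://www.loc.gov/MARC21/slim">\n'
               + xml + '\n</collection>\n')
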
Beispiel #39
0
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task"""

    # First gather recids to process
    recids = task_get_option("recids")
    if recids:
        start_date = None
        recids = [(recid, None) for recid in recids]
    else:
        start_date = datetime.now()
        dummy, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)

    updated_recids = set()

    try:

        for count, (recid, dummy) in enumerate(recids):
            if count % 50 == 0:
                msg = "Done %s of %s" % (count, len(recids))
                write_message(msg)
                task_update_progress(msg)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message("processing %s" % recid, verbose=9)
            try:
                if process_one(recid):
                    updated_recids.add(recid)
                time.sleep(6)
            except AlreadyHarvested:
                write_message("already harvested successfully")
                time.sleep(6)
            except FoundExistingPdf:
                write_message("pdf already attached (matching md5)")
                time.sleep(6)
            except PdfNotAvailable:
                write_message("no pdf available")
                time.sleep(20)
            except InvenioFileDownloadError, e:
                write_message("failed to download: %s" % e)
                time.sleep(20)

    finally:
        # We want to process updated records even in case we are interrupted
        msg = "Updated %s records" % len(updated_recids)
        write_message(msg)
        task_update_progress(msg)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True
Beispiel #40
0
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task"""

    # First gather recids to process
    recids = task_get_option('recids')
    if recids:
        start_date = None
        recids = [(recid, None) for recid in recids]
    else:
        start_date = datetime.now()
        dummy, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)

    updated_recids = set()

    try:

        for count, (recid, dummy) in enumerate(recids):
            if count % 50 == 0:
                msg = 'Done %s of %s' % (count, len(recids))
                write_message(msg)
                task_update_progress(msg)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message('processing %s' % recid, verbose=9)
            try:
                if process_one(recid):
                    updated_recids.add(recid)
                time.sleep(6)
            except AlreadyHarvested:
                write_message('already harvested successfully')
                time.sleep(6)
            except FoundExistingPdf:
                write_message('pdf already attached (matching md5)')
                time.sleep(6)
            except PdfNotAvailable:
                write_message("no pdf available")
                time.sleep(20)
            except InvenioFileDownloadError, e:
                write_message("failed to download: %s" % e)
                time.sleep(20)

    finally:
        # We want to process updated records even in case we are interrupted
        msg = 'Updated %s records' % len(updated_recids)
        write_message(msg)
        task_update_progress(msg)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True
Beispiel #41
0
    elif docnames is None:
        docnames = []

    try:
        bibarchive = BibRecDocs(recid)
    except Exception, e:
        write_message("Could not instantiate record #%s: %s" % (recid, e))
        return 0

    write_message("Going to create related file formats for record #%s" %
                  recid)

    i = 0
    for docname in docnames:
        i += 1
        task_sleep_now_if_required()
        msg = "Processing %s (%i/%i)" % (docname, i, len(docnames))
        write_message(msg)
        task_update_progress(msg)
        try:
            bibdoc = bibarchive.get_bibdoc(docname)
        except Exception, e:
            write_message("Could not process docname %s: %s" % (docname, e))
            continue

        (prev_desc, prev_comment) = \
                    get_description_and_comment(bibarchive.get_bibdoc(docname).list_latest_files())

        # List all files that are not icons or subformats
        current_files = [bibdocfile.get_path() for bibdocfile in bibdoc.list_latest_files() if \
                         not bibdocfile.get_subformat() and not bibdocfile.is_icon()]
Beispiel #42
0
def generate_sitemaps(sitemap_index_writer, collection_names, fulltext_filter=''):
    """
    Generate sitemaps themselves. Return list of generated sitemaps files
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    for lang in CFG_SITE_LANGS:
        writer.add_url(CFG_SITE_URL + '/?ln=%s' % lang,
                       lastmod=datetime.today(),
                       changefreq=DEFAULT_CHANGEFREQ_HOME,
                       priority=DEFAULT_PRIORITY_HOME)
        nb_urls += 1
    write_message("... Getting all public records...")
    recids = get_all_public_records(collection_names)
    write_message("... Generating urls for %s records..." % len(recids))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
                                lastmod = lastmod,
                                changefreq = DEFAULT_CHANGEFREQ_RECORDS,
                                priority = DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Sitemap for recid %s/%s" % (i + 1, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)
    write_message("... Generating urls for collections...")
    collections = get_all_public_collections(collection_names)
    for i, (collection, lastmod) in enumerate(collections):
        for lang in CFG_SITE_LANGS:
            if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
                sitemap_id += 1
                writer = SitemapWriter(sitemap_id)
                sitemap_index_writer.add_url(writer.get_sitemap_url())
            nb_urls = writer.add_url('%s/collection/%s?ln=%s' % (CFG_SITE_URL, quote(collection), lang),
                        lastmod = lastmod,
                        changefreq = DEFAULT_CHANGEFREQ_COLLECTIONS,
                        priority = DEFAULT_PRIORITY_COLLECTIONS,
                        alternate=True)
        if i % 100 == 0:
            task_update_progress("Sitemap for collection %s/%s" % (i + 1, len(collections)))
            task_sleep_now_if_required(can_stop_too=True)
    write_message("... Generating urls for fulltexts...")
    recids = filter_fulltexts(recids, fulltext_filter)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s/files' % (CFG_SITE_RECORD, recid),
                                 lastmod = lastmod,
                                 changefreq = DEFAULT_CHANGEFREQ_FULLTEXTS,
                                 priority = DEFAULT_PRIORITY_FULLTEXTS)
        if i % 100 == 0:
            task_update_progress("Sitemap for files page %s/%s" % (i, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)

    write_message("... Generating urls for comments...")
    recids = filter_comments(recids)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s/comments' % (CFG_SITE_RECORD, recid),
                                 lastmod = lastmod,
                                 changefreq = DEFAULT_CHANGEFREQ_COMMENTS,
                                 priority = DEFAULT_PRIORITY_COMMENTS)
        if i % 100 == 0:
            task_update_progress("Sitemap for comments page %s/%s" % (i, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)
    write_message("... Generating urls for reviews")
    recids = filter_reviews(recids)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            write_message("")
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s/reviews' % (CFG_SITE_RECORD, recid),
                                 lastmod = lastmod,
                                 changefreq = DEFAULT_CHANGEFREQ_REVIEWS,
                                 priority = DEFAULT_PRIORITY_REVIEWS)
        if i % 100 == 0:
            task_update_progress("Sitemap for reviews page %s/%s" % (i, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)
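
# Hedged sketch of the rollover rule applied throughout generate_sitemaps: every 100
# URLs the current sitemap file is checked against the size and record limits and,
# when either is exceeded, a new numbered SitemapWriter is opened and registered in
# the sitemap index. The helper name and packaging are illustrative.
def rotate_writer_if_needed(writer, sitemap_id, sitemap_index_writer, nb_urls):
    if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                               or nb_urls >= MAX_RECORDS):
        sitemap_id += 1
        writer = SitemapWriter(sitemap_id)
        sitemap_index_writer.add_url(writer.get_sitemap_url())
    return writer, sitemap_id
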
Beispiel #43
0
def bibrank_engine(run):
    """Run the indexing task.
    Return 1 in case of success and 0 in case of failure.
    """
    startCreate = time.time()

    options["run"] = []
    options["run"].append(run)
    for rank_method_code in options["run"]:
        task_sleep_now_if_required(can_stop_too=True)
        cfg_name = getName(rank_method_code)
        write_message("Running rank method: %s." % cfg_name)
        config = load_config(rank_method_code)
        cfg_short = rank_method_code
        cfg_function = config.get("rank_method", "function") + "_exec"
        cfg_repair_function = config.get("rank_method",
                                         "function") + "_repair_exec"
        cfg_name = getName(cfg_short)
        options["validset"] = get_valid_range(rank_method_code)

        if task_get_option("collection"):
            l_of_colls = string.split(task_get_option("collection"), ", ")
            recIDs = perform_request_search(c=l_of_colls)
            recIDs_range = []
            for recID in recIDs:
                recIDs_range.append([recID, recID])
            options["recid_range"] = recIDs_range
        elif task_get_option("id"):
            options["recid_range"] = task_get_option("id")
        elif task_get_option("modified"):
            options["recid_range"] = add_recIDs_by_date(
                rank_method_code, task_get_option("modified"))
        elif task_get_option("last_updated"):
            options["recid_range"] = add_recIDs_by_date(rank_method_code)
        else:
            write_message("No records specified, updating all", verbose=2)
            min_id = run_sql("SELECT min(id) from bibrec")[0][0]
            max_id = run_sql("SELECT max(id) from bibrec")[0][0]
            options["recid_range"] = [[min_id, max_id]]

        if task_get_option("quick") == "no":
            write_message("Recalculate parameter not used, parameter ignored.",
                          verbose=9)

        if task_get_option("cmd") == "del":
            del_recids(cfg_short, options["recid_range"])
        elif task_get_option("cmd") == "add":
            func_object = globals().get(cfg_function)
            func_object(rank_method_code, cfg_name, config)
        elif task_get_option("cmd") == "stat":
            rank_method_code_statistics(rank_method_code)
        elif task_get_option("cmd") == "check":
            check_method(rank_method_code)
        elif task_get_option("cmd") == "print-missing":
            func_object = globals().get(cfg_function)
            func_object(rank_method_code, cfg_name, config)
        elif task_get_option("cmd") == "repair":
            func_object = globals().get(cfg_repair_function)
            func_object()
        else:
            write_message(
                "Invalid command found processing %s" % rank_method_code,
                sys.stderr)
            raise StandardError

    if task_get_option("verbose"):
        showtime((time.time() - startCreate))
    return 1
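
# Hedged sketch of the config-driven dispatch used above: the [rank_method] section
# of each method's config names a base function, and bibrank_engine resolves
# "<base>_exec" (or "<base>_repair_exec") from the module globals before calling it.
# The names below are purely illustrative.
def demo_method_exec(rank_method_code, cfg_name, config):
    print "running %s (%s)" % (cfg_name, rank_method_code)

def dispatch_rank_function(base_name, rank_method_code, cfg_name, config):
    func_object = globals().get(base_name + "_exec")
    if func_object is None:
        raise StandardError("No such rank function: %s_exec" % base_name)
    return func_object(rank_method_code, cfg_name, config)

# dispatch_rank_function("demo_method", "dem", "demo method", config)
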
def load_kbs(cfg, run_sql, in_task=False):
    for kb, query in cfg.iteritems():
        task_sleep_now_if_required(can_stop_too=True)
        if not kb_exists(kb):
            add_kb(kb)
        if in_task:
            write_message("Updating %s KB..." % kb)
        try:
            if not in_task:
                print "kb:", kb
                print "kb beginning:", len(get_kb_mappings(kb))
            if kb.startswith('json_'):
                encoder = ComplexEncoder()
                mapping, description = run_sql(query, with_desc=True)
                if kb in CFG_ADDITIONAL_ENTRIES:
                    mapping += CFG_ADDITIONAL_ENTRIES[kb]
                    if not in_task:
                        print CFG_ADDITIONAL_ENTRIES[kb]
                column_counter = {}
                new_description = []
                for column in description[1:]:
                    column = column[0]
                    counter = column_counter[column] = column_counter.get(
                        column, 0) + 1
                    if counter > 1:
                        new_description.append('%s%d' % (column, counter))
                    else:
                        new_description.append(column)
                description = new_description
            else:
                mapping = run_sql(query)
                if kb in CFG_ADDITIONAL_ENTRIES:
                    mapping += CFG_ADDITIONAL_ENTRIES[kb]
                    if not in_task:
                        print CFG_ADDITIONAL_ENTRIES[kb]
                if not in_task:
                    print "mapping:", len(mapping)
                if kb == 'projects':
                    mapping += [('000000', 'NO PROJECT')]
            original_keys = set([key[0] for key in get_kbr_keys(kb)])
            if not in_task:
                print "original_keys before:", len(original_keys)

            updated = 0
            added = 0
            for i, row in enumerate(mapping):
                key, value = row[0], row[1:]
                if kb.startswith('json_'):
                    value = encoder.encode(dict(zip(description, value)))
                else:
                    value = value[0]
                if value:
                    if key in original_keys:
                        original_keys.remove(key)
                    if in_task:
                        task_update_progress("%s - %s%%" %
                                             (kb, i * 100 / len(mapping)))
                    if kb_mapping_exists(kb, key):
                        updated += 1
                        update_kb_mapping(kb, key, key, value)
                    else:
                        added += 1
                        add_kb_mapping(kb, key, value)
            if not in_task:
                print "updated:", updated, "added:", added
                print "kb after update:", len(get_kb_mappings(kb))
                print "original_keys after:", len(original_keys)
            if in_task:
                task_update_progress("Cleaning %s" % kb)
            for key in original_keys:
                remove_kb_mapping(kb, key)
            if not in_task:
                print "kb after remove:", len(get_kb_mappings(kb))
        except:
            register_exception(alert_admin=True,
                               prefix="Error when updating KB %s" % kb)
            continue
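
# Hedged usage sketch for load_kbs: cfg maps a knowledge-base name to the SQL query
# whose rows populate it. Keys starting with "json_" get each row JSON-encoded using
# the query's column names; other KBs take the column following the key as the value.
# Table and column names below are purely illustrative.
CFG_KBS = {
    'experiments': "SELECT code, name FROM experiment",
    'json_institutions': "SELECT code, name, city, country FROM institution",
}

# load_kbs(CFG_KBS, run_sql)                 # interactive run, prints statistics
# load_kbs(CFG_KBS, run_sql, in_task=True)   # inside a bibtask, reports progress instead
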
Beispiel #45
0
    updated = 0
    for i, recid in enumerate(recids):
        done, exceptions = create_icons_for_record(recid, icon_sizes,
                                                   icon_format_mappings,
                                                   docnames, add_default_icon,
                                                   inherit_moreinfo)
        updated += done
        if exceptions:
            for ex in exceptions:
                write_message(ex)
        else:
            write_message("Recid %s DONE." % recid)

        task_update_progress("Done %d out of %d." % (i, len(recids)))
        task_sleep_now_if_required(can_stop_too=True)

    if updated:
        cli_fix_marc(None, explicit_recid_set=recids, interactive=False)

    return 1


def _create_icon(file_path,
                 icon_size,
                 docname,
                 icon_format='gif',
                 verbosity=9):
    """
    Creates icon of given file.
Beispiel #46
0
def get_citation_informations(recid_list, tags, config, fetch_catchup_info=True):
    """Scans the collections searching references (999C5x -fields) and
       citations for items in the recid_list.
       Returns two dictionaries (records_info, references_info), each keyed by
       info type (report-numbers, journals, doi, hdl, isbn, record_id) and
       mapping recids to the extracted values, e.g.:
                 { 5: ['SUT-DP-92-70-5'] },
                 { 93: ['astro-ph/9812088'] },
                 { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }
        NB: stuff here is for analysing new or changed records.
        See "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = {"report-numbers": {}, "journals": {}, "doi": {}, "hdl": {}, "isbn": {}, "record_id": {}}

    references_info = {"report-numbers": {}, "journals": {}, "doi": {}, "record_id": {}, "isbn": {}, "hdl": {}}

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:

    for done, recid in enumerate(recid_list):
        if done % 10 == 0:
            task_sleep_now_if_required()

        if done % 50 == 0:
            mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
            write_message(mesg)
            task_update_progress(mesg)

        record = get_record(recid)
        records_info["record_id"][recid] = [unicode(recid)]

        function = config.get("rank_method", "function")
        if config.get(function, "collections"):
            if recid not in recids_cache(config.get(function, "collections")):
                # do not treat this record since it is not in the collections
                # we want to process
                continue
        elif recid in deleted_recids_cache():
            # do not treat this record since it was deleted; we
            # skip it like this in case it was only soft-deleted
            # e.g. via bibedit (i.e. when collection tag 980 is
            # DELETED but other tags like report number or journal
            # publication info remained the same, so the calls to
            # get_fieldvalues() below would return old values)
            continue

        if tags["refs_report_number"]:
            references_info["report-numbers"][recid] = [
                t.value for t in record.find_subfields(tags["refs_report_number"])
            ]
            msg = "references_info['report-numbers'][%s] = %r" % (recid, references_info["report-numbers"][recid])
            write_message(msg, verbose=9)
        if tags["refs_journal"]:
            references_info["journals"][recid] = []
            for ref in record.find_subfields(tags["refs_journal"]):
                try:
                    # Inspire specific parsing
                    journal, volume, page = ref.value.split(",")
                except ValueError:
                    pass
                else:
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        alt_ref = ",".join([journal, alt_volume, page])
                        references_info["journals"][recid] += [alt_ref]
                references_info["journals"][recid] += [ref.value]
            msg = "references_info['journals'][%s] = %r" % (recid, references_info["journals"][recid])
            write_message(msg, verbose=9)
        if tags["refs_doi"]:
            references = [t.value for t in record.find_subfields(tags["refs_doi"])]
            dois = []
            hdls = []
            for ref in references:
                if ref.startswith("hdl:"):
                    hdls.append(ref[4:])
                elif ref.startswith("doi:"):
                    dois.append(ref[4:])
                else:
                    dois.append(ref)
            references_info["doi"][recid] = dois
            references_info["hdl"][recid] = hdls

            msg = "references_info['doi'][%s] = %r" % (recid, dois)
            write_message(msg, verbose=9)
            msg = "references_info['hdl'][%s] = %r" % (recid, hdls)
            write_message(msg, verbose=9)

        if tags["refs_record_id"]:
            references_info["record_id"][recid] = [t.value for t in record.find_subfields(tags["refs_record_id"])]
            msg = "references_info['record_id'][%s] = %r" % (recid, references_info["record_id"][recid])
            write_message(msg, verbose=9)
        if tags["refs_isbn"]:
            references_info["isbn"][recid] = [t.value for t in record.find_subfields(tags["refs_isbn"])]
            msg = "references_info['isbn'][%s] = %r" % (recid, references_info["isbn"][recid])
            write_message(msg, verbose=9)

        if not fetch_catchup_info:
            # We do not need the extra info
            continue

        if tags["record_pri_number"] or tags["record_add_number"]:
            records_info["report-numbers"][recid] = []

            if tags["record_pri_number"]:
                records_info["report-numbers"][recid] += [
                    t.value for t in record.find_subfields(tags["record_pri_number"])
                ]

            if tags["record_add_number"]:
                records_info["report-numbers"][recid] += [
                    t.value for t in record.find_subfields(tags["record_add_number"])
                ]

            msg = "records_info[%s]['report-numbers'] = %r" % (recid, records_info["report-numbers"][recid])
            write_message(msg, verbose=9)

        if tags["doi"]:
            records_info["doi"][recid] = []
            records_info["hdl"][recid] = []
            for tag in tags["doi"]:
                for field in record.find_fields(tag[:5]):
                    if "DOI" in field.get_subfield_values("2"):
                        dois = field.get_subfield_values("a")
                        records_info["doi"][recid].extend(dois)
                    elif "HDL" in field.get_subfield_values("2"):
                        hdls = field.get_subfield_values("a")
                        records_info["hdl"][recid].extend(hdls)

            msg = "records_info[%s]['doi'] = %r" % (recid, records_info["doi"][recid])
            write_message(msg, verbose=9)
            msg = "records_info[%s]['hdl'] = %r" % (recid, records_info["hdl"][recid])
            write_message(msg, verbose=9)

        if tags["isbn"]:
            records_info["isbn"][recid] = []
            for tag in tags["isbn"]:
                values = [t.value for t in record.find_subfields(tag)]
                records_info["isbn"][recid] += values

            msg = "records_info[%s]['isbn'] = %r" % (recid, records_info["isbn"][recid])
            write_message(msg, verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags["publication"]:
            records_info["journals"][recid] = get_journal_info(record, tags)
            msg = "records_info[%s]['journals'] = %r" % (recid, records_info["journals"][recid])
            write_message(msg, verbose=9)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    end_time = os.times()[4]
    write_message("Execution time for generating citation info " "from record: %.2f sec" % (end_time - begin_time))

    return records_info, references_info
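
# Hedged usage sketch: get_citation_informations expects `tags` to be a dict of MARC
# field locations keyed by role, normally assembled from the rank method config. The
# concrete tag values below are illustrative placeholders, not necessarily the
# configured ones.
tags = {
    "refs_report_number": "999C5r",
    "refs_journal": "999C5s",
    "refs_doi": "999C5a",
    "refs_record_id": "999C50",
    "refs_isbn": "999C5i",
    "record_pri_number": "037__a",
    "record_add_number": "088__a",
    "doi": ["0247_a"],
    "isbn": ["020__a"],
    "publication": None,  # placeholder; the real shape is consumed by get_journal_info()
}
# records_info, references_info = get_citation_informations(recids, tags, config)
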
Beispiel #47
0
def bst_notify_url(url,
                   data=None,
                   content_type='text/plain',
                   attempt_times=1,
                   attempt_sleeptime=10,
                   admin_emails=None):
    """
    Access given URL, and post given data if specified.

    @param url: the URL to access
    @type url: string
    @param data: the data to be posted to the given URL
    @type data: string
    @param content_type: the content-type header to use when posting data
    @type content_type: string
    @param attempt_times: number of tries
    @type attempt_times: int
    @param attempt_sleeptime: seconds in between tries
    @type attempt_sleeptime: int
    @param admin_emails: a comma-separated list of emails to notify in case of failure
    @type admin_emails: string or list (as accepted by mailutils.send_email)

    If accessing fails, try to send it ATTEMPT_TIMES, and wait for
    ATTEMPT_SLEEPTIME seconds in between tries. When the maximum
    number of attempts is reached, send an email notification to the
    recipients specified in ADMIN_EMAILS.
    """
    attempt_times = int(attempt_times)
    attempt_sleeptime = int(attempt_sleeptime)
    remaining_attempts = attempt_times

    success_p = False
    reason_failure = ""

    write_message("Going to notify URL: %(url)s" % {'url': url})

    while not success_p and remaining_attempts > 0:
        ## <scheme>://<netloc>/<path>?<query>#<fragment>
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
        ## See: http://stackoverflow.com/questions/111945/is-there-any-way-to-do-http-put-in-python
        if scheme == 'http':
            opener = urllib2.build_opener(urllib2.HTTPHandler)
        elif scheme == 'https':
            opener = urllib2.build_opener(urllib2.HTTPSHandler)
        else:
            raise ValueError("Scheme not handled %s for url %s" %
                             (scheme, url))
        request = urllib2.Request(url, data=data)
        if data:
            request.add_header('Content-Type', content_type)
            request.get_method = lambda: 'POST'
        try:
            opener.open(request)
            success_p = True
        except urllib2.URLError as e:
            success_p = False
            reason_failure = repr(e)
        if not success_p:
            remaining_attempts -= 1
            if remaining_attempts > 0:  # sleep only if we shall retry again
                task_sleep_now_if_required(can_stop_too=True)
                time.sleep(attempt_sleeptime)

        # Report about success/failure
        if success_p:
            write_message("URL successfully notified")
        else:
            write_message("Failed at notifying URL. Reason:\n%(reason_failure)s" % \
                          {'reason_failure': reason_failure})

    if not success_p and admin_emails:
        # We could not access the specified URL. Send an email to the
        # specified contacts.
        write_message("Notifying by email %(admin_emails)s" % \
                      {'admin_emails': str(admin_emails)})
        subject = "%(CFG_SITE_NAME)s could not contact %(url)s" % \
                  {'CFG_SITE_NAME': CFG_SITE_NAME,
                   'url': url}
        content = """\n%(CFG_SITE_NAME)s unsuccessfully tried to contact %(url)s.

Number of attempts: %(attempt_times)i. No further attempts will be made.

""" % \
                  {'CFG_SITE_NAME': CFG_SITE_NAME,
                   'url': url,
                   'attempt_times': attempt_times}
        if data:
            max_data_length = 10000
            content += "The following data should have been posted:\n%(data)s%(extension)s" % \
                      {'data': data[:max_data_length],
                       'extension': len(data) > max_data_length and ' [...]' or ''}
        # Send email. If sending fails, we will stop the queue
        return send_email(fromaddr=CFG_SITE_ADMIN_EMAIL,
                          toaddr=admin_emails,
                          subject=subject,
                          content=content)

    # We do not really want to stop the queue now, even in case of
    # failure as an email would have been sent if necessary.
    return 1
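
# Hedged usage sketch for the tasklet above: POST a small JSON payload, retry up to
# three times with 30 seconds between attempts, and fall back to notifying an admin
# address on failure. The URL and e-mail address are illustrative.
bst_notify_url('https://example.org/webhook',
               data='{"status": "harvest-finished"}',
               content_type='application/json',
               attempt_times=3,
               attempt_sleeptime=30,
               admin_emails='admin@example.org')
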
Beispiel #48
0
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')

    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                              len(onto_rec['recIDs'])),
                stream=sys.stderr,
                verbose=3)
        else:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to recIDs %s. ' %
                (onto_rec['ontology'], ', '.join(
                    [str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr,
                verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'], onto_rec['ontology'],
                                     onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr,
                verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" %
                              onto_recids,
                              stream=sys.stderr,
                              verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
Beispiel #49
0
def generate_sitemaps(sitemap_index_writer,
                      collection_names,
                      fulltext_filter=''):
    """
    Generate sitemaps themselves. Return list of generated sitemaps files
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    for lang in CFG_SITE_LANGS:
        writer.add_url(CFG_SITE_URL + '/?ln=%s' % lang,
                       lastmod=datetime.today(),
                       changefreq=DEFAULT_CHANGEFREQ_HOME,
                       priority=DEFAULT_PRIORITY_HOME,
                       alternate=True)
        nb_urls += 1
    write_message("... Getting all public records...")
    recids = get_all_public_records(collection_names)
    write_message("... Generating urls for %s records..." % len(recids))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                   or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' %
                                 (CFG_SITE_RECORD, recid),
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_RECORDS,
                                 priority=DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Sitemap for recid %s/%s" %
                                 (i + 1, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)
    write_message("... Generating urls for collections...")
    collections = get_all_public_collections(collection_names)
    for i, (collection, lastmod) in enumerate(collections):
        for lang in CFG_SITE_LANGS:
            if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                       or nb_urls >= MAX_RECORDS):
                sitemap_id += 1
                writer = SitemapWriter(sitemap_id)
                sitemap_index_writer.add_url(writer.get_sitemap_url())
            nb_urls = writer.add_url('%s/collection/%s?ln=%s' %
                                     (CFG_SITE_URL, quote(collection), lang),
                                     lastmod=lastmod,
                                     changefreq=DEFAULT_CHANGEFREQ_COLLECTIONS,
                                     priority=DEFAULT_PRIORITY_COLLECTIONS,
                                     alternate=True)
        if i % 100 == 0:
            task_update_progress("Sitemap for collection %s/%s" %
                                 (i + 1, len(collections)))
            task_sleep_now_if_required(can_stop_too=True)
    write_message("... Generating urls for fulltexts...")
    recids = filter_fulltexts(recids, fulltext_filter)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                   or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s/files' %
                                 (CFG_SITE_RECORD, recid),
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_FULLTEXTS,
                                 priority=DEFAULT_PRIORITY_FULLTEXTS)
        if i % 100 == 0:
            task_update_progress("Sitemap for files page %s/%s" %
                                 (i, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)

    write_message("... Generating urls for comments...")
    recids = filter_comments(recids)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                   or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s/comments' %
                                 (CFG_SITE_RECORD, recid),
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_COMMENTS,
                                 priority=DEFAULT_PRIORITY_COMMENTS)
        if i % 100 == 0:
            task_update_progress("Sitemap for comments page %s/%s" %
                                 (i, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)
    write_message("... Generating urls for reviews")
    recids = filter_reviews(recids)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                   or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            write_message("")
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s/reviews' %
                                 (CFG_SITE_RECORD, recid),
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_REVIEWS,
                                 priority=DEFAULT_PRIORITY_REVIEWS)
        if i % 100 == 0:
            task_update_progress("Sitemap for reviews page %s/%s" %
                                 (i, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)
Beispiel #50
0
    updated = 0
    for i, recid in enumerate(recids):
        done, exceptions = create_icons_for_record(recid, icon_sizes,
                                                   icon_format_mappings,
                                                   docnames, add_default_icon,
                                                   inherit_moreinfo)
        updated += done
        if exceptions:
            for ex in exceptions:
                write_message(ex)
        else:
            write_message("Recid %s DONE." % recid)

        task_update_progress("Done %d out of %d." % (i, len(recids)))
        task_sleep_now_if_required(can_stop_too=True)

    if updated:
        cli_fix_marc(None, explicit_recid_set=recids, interactive=False)

    return 1

def _create_icon(file_path, icon_size, docname, icon_format='gif', verbosity=9):
    """
    Creates icon of given file.

    Returns path to the icon. If creation fails, return None, and
    register exception (send email to admin).


    @param file_path: full path to icon
Beispiel #51
0
def _analyze_documents(records, taxonomy_name, collection,
                       output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message(
            "WARNING: No records were found in collection %s." %
            collection, stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(
            record).list_latest_files()  # TODO: why doesn't this call list_all_files()?
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message(
                    'INFO: Generating keywords for record %d.' %
                    record, stream=sys.stderr, verbose=3)
                fulltext = doc.get_path()

                single_keywords, composite_keywords, author_keywords, acronyms = \
                    engine.get_keywords_from_local_file(fulltext,
                                                        taxonomy_name,
                                                        with_author_keywords=True,
                                                        output_mode="raw",
                                                        output_limit=output_limit,
                                                        match_mode='partial')
            else:
                bibtask.write_message('WARNING: BibClassify does not know how to process \
                    doc: %s (type: %s) -- ignoring it.' %
                                      (doc.fullpath, doc.doctype),
                                      stream=sys.stderr, verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = engine.clean_before_output(single_keywords)
                cleaned_composite = engine.clean_before_output(
                    composite_keywords)
                # merge the groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            acro.update(acronyms)
            akws.update(author_keywords)

        if len(keywords):
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(engine._output_marc(keywords.items(), (), akws, acro,
                                              spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                                  record, stream=sys.stderr, verbose=0)

        _INDEX += 1

        bibtask.task_update_progress(
            'Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
Beispiel #52
0
def task_run_core():
    """Perform a search to find records without a texkey.

    Generate a new texkey for each such record and upload the changes in chunks.
    """
    recids = perform_request_search(
        p='-035:spirestex -035:inspiretex', cc='HEP')

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not have already a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct, tag="035",
                                                   ind1="", ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "z")[0]
            except IndexError:
                try:
                    value = field_get_subfield_values(instance, "a")[0]
                except IndexError:
                    value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message(
                    "INFO: Record %s has already texkey %s" % (recid, value))

        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message((
                    "WARNING: Record %s has no first author or "
                    "collaboration") % recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d" %
                          (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    # FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    return True
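
# Hedged illustration only: create_xml() is not shown in this snippet, but judging
# from the duplicate check above (tag 035 with provenance $9 SPIRESTeX/INSPIRETeX
# and the key stored in $z or $a), the appended fragment plausibly looks like the
# following. The record id and texkey value are made up.
TEXKEY_APPEND_EXAMPLE = """<record>
  <controlfield tag="001">1234567</controlfield>
  <datafield tag="035" ind1=" " ind2=" ">
    <subfield code="9">INSPIRETeX</subfield>
    <subfield code="z">Smith:2013abc</subfield>
  </datafield>
</record>"""
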
Beispiel #53
0
def run_bibsort_update(recids=None, method_list=None):
    """Updates bibsort tables for the methods in method_list
    and for the records in recids.

    If recids is None: recids = all records that have been modified
    or inserted since last update

    If method_list is None: method_list = all the methods available
    in bsrMETHOD table"""

    write_message('Initial data for run_bibsort_update method: ' \
                  'number of recids = %s; method_list=%s' \
                  %(recids and len(recids) or 0, method_list), verbose=5)
    write_message('Updating sorting data.')

    bibsort_methods, errors = get_bibsort_methods_details(method_list)
    if errors:
        return False
    method_list = bibsort_methods.keys()
    if not method_list:
        write_message('No methods found in bsrMETHOD table.. exiting.')
        return True

    #we could have 4 types of methods:
    #(i) RNK methods -> they should be rebalanced, not updated
    #(ii) RNK methods to delete -> we should delete their data
    #(iii) non RNK methods to update
    #(iv) non RNK methods that are new -> they should be rebalanced(sorted), not updated
    #check which of the methods are RNK methods (they do not need modified recids)
    rnk_methods = get_rnk_methods(bibsort_methods)
    rnk_methods_updated, rnk_methods_deleted = get_modified_rnk_methods(rnk_methods, bibsort_methods)
    #check which of the methods have no data, so they are actually new,
    #so they need balancing(sorting) instead of updating
    non_rnk_methods = [method for method in bibsort_methods.keys() if method not in rnk_methods]
    non_rnk_methods_updated, non_rnk_methods_inserted = get_modified_non_rnk_methods(non_rnk_methods)

    #(i) + (iv)
    methods_to_balance = rnk_methods_updated + non_rnk_methods_inserted
    if methods_to_balance: # several methods require rebalancing(sorting) and not updating
        return run_bibsort_rebalance(methods_to_balance)

    # (ii)
    # Remove the data for the ranking methods that have been deleted
    for method in rnk_methods_deleted:
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Deleting data for method %s" %method)
        write_message('Starting deleting the data for RNK method %s' %method, verbose=5)
        executed_ok = delete_bibsort_data_for_method(bibsort_methods[method]['id'])
        if not executed_ok:
            write_message('Method %s could not be deleted correctly, aborting..' \
                          %method, sys.stderr)
            return False

    # (iii)
    # Methods to actually update
    if non_rnk_methods_updated:  # we want to update some 'normal' (non-RNK) tables, so we need recids
        update_timestamp = False
        if not recids:
            recids = get_modified_or_inserted_recs(non_rnk_methods_updated)
            if recids == 0:  # error signal
                return False
            if not recids:
                write_message("No records inserted or modified in bibrec table "
                              "since the last update of bsrMETHODDATA.")
                return True
            write_message("These records have been recently modified/inserted: %s"
                          % str(recids), verbose=5)
            update_timestamp = True
        recids_i = intbitset(recids)
        for method in non_rnk_methods_updated:
            task_sleep_now_if_required(can_stop_too=True)
            task_update_progress("Updating method %s" %method)
            write_message('Starting updating method %s' %method, verbose=5)
            executed_ok = update_bibsort_tables(recids_i, method, update_timestamp)
            if not executed_ok:
                write_message('Method %s could not be executed correctly, aborting..' \
                          %method, sys.stderr)
                return False
    return True
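
A hedged sketch of a daemon entry point that could drive run_bibsort_update, assuming task options named 'recids' and 'methods'; the option names and their parsing are illustrative assumptions, not taken from the code above.

def task_run_core():
    """Hypothetical driver for run_bibsort_update."""
    recids = task_get_option('recids')    # may be None: all recently modified records
    methods = task_get_option('methods')  # e.g. "latest first,title", may be None
    method_list = methods.split(',') if methods else None
    # None for either argument falls back to the defaults described in the docstring
    return run_bibsort_update(recids=recids, method_list=method_list)
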
Example #54
0
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    plugins = load_plugins()
    rules = load_rules(plugins)
    task_set_option('plugins', plugins)
    recids_for_rules = get_recids_for_rules(rules)

    all_recids = intbitset([])
    single_rules = set()
    batch_rules = set()
    for rule_name, rule_recids in recids_for_rules.iteritems():
        all_recids.union_update(rule_recids)
        if plugins[rules[rule_name]["check"]]["batch"]:
            batch_rules.add(rule_name)
        else:
            single_rules.add(rule_name)

    records_to_upload_holdingpen = []
    records_to_upload_replace = []
    for batch in iter_batches(all_recids, CFG_BATCH_SIZE):

        for rule_name in batch_rules:
            rule = rules[rule_name]
            rule_recids = recids_for_rules[rule_name]
            task_sleep_now_if_required(can_stop_too=True)
            records = []
            for i, record_id, record in batch:
                if record_id in rule_recids:
                    records.append(record)
            if records:
                check_records(rule, records)

        # Then run them through the normal (single-record) rules
        for i, record_id, record in batch:
            progress_percent = int(float(i) / len(all_recids) * 100)
            task_update_progress("Processing record %s/%s (%i%%)." %
                        (i, len(all_recids), progress_percent))
            write_message("Processing record %s" % record_id)

            for rule_name in single_rules:
                rule = rules[rule_name]
                rule_recids = recids_for_rules[rule_name]
                task_sleep_now_if_required(can_stop_too=True)
                if record_id in rule_recids:
                    check_record(rule, record)

            if record.amended:
                if record.holdingpen:
                    records_to_upload_holdingpen.append(record)
                else:
                    records_to_upload_replace.append(record)

            if not record.valid:
                submit_ticket(record, record_id)

        if len(records_to_upload_holdingpen) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_holdingpen, True)
            records_to_upload_holdingpen = []
        if len(records_to_upload_replace) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_replace, False)
            records_to_upload_replace = []

    ## In case there are still some remaining amended records
    if records_to_upload_holdingpen:
        upload_amendments(records_to_upload_holdingpen, True)
    if records_to_upload_replace:
        upload_amendments(records_to_upload_replace, False)

    # Update the database with the last time each rule was run
    for rule in rules.keys():
        update_rule_last_run(rule)

    return True
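
iter_batches is not defined in this snippet. A minimal sketch under the assumption that it walks the recid set in slices of batch_size and yields lists of (global index, recid, record) triples, which is the shape the loop above expects; the record loader is also an assumption.

def iter_batches(recids, batch_size):
    """Hypothetical generator: yield batches of (index, recid, record) triples."""
    batch = []
    for i, recid in enumerate(recids):
        record = get_record(recid)  # assumed record loader, as used elsewhere in this listing
        batch.append((i, recid, record))
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch
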
Example #55
0
def task_run_core():
    """
    Run daemon
    """
    write_message("Starting...")
    if task_get_option("update-borrowers"):
        write_message("Started update-borrowers")
        list_of_borrowers = db.get_all_borrowers()
        total_borrowers = len(list_of_borrowers)

        for done, borrower in enumerate(list_of_borrowers):
            user_id = borrower[0]
            update_user_info_from_ldap(user_id)
            if done % 10 == 0:
                task_update_progress("Borrower: updated %d out of %d." % (done, total_borrowers))
                task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Borrower: updated %d out of %d." % (done+1, total_borrowers))
        write_message("Updated %d out of %d total borrowers" % (done+1, total_borrowers))

    if task_get_option("update-requests"):
        write_message("Started update-requests")
        list_of_reqs = db.get_loan_request_by_status(CFG_BIBCIRCULATION_REQUEST_STATUS_WAITING)

        for (_request_id, recid, bc, _name, borrower_id, _library, _location,
             _date_from, _date_to, _request_date) in list_of_reqs:
            description = db.get_item_description(bc)
            list_of_barcodes = db.get_barcodes(recid, description)
            for barcode in list_of_barcodes:
                update_requests_statuses(barcode)
                task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Requests due updated from 'waiting' to 'pending'.")
        write_message("Requests due updated from 'waiting' to 'pending'.")

    if task_get_option("overdue-letters"):
        write_message("Started overdue-letters")
        expired_loans = db.get_all_expired_loans()
        total_expired_loans = len(expired_loans)

        for done, (borrower_id, _bor_name, recid, _barcode, _loaned_on,
             _due_date, _number_of_renewals, number_of_letters,
             date_letters, _notes, loan_id) in enumerate(expired_loans):

            number_of_letters=int(number_of_letters)

            content = ''
            if number_of_letters == 0:
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL1'], loan_id)
            elif number_of_letters == 1 and must_send_second_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL2'], loan_id)
            elif number_of_letters == 2 and must_send_third_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)
            elif number_of_letters >= 3 and must_send_third_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)

            if content != '':
                title = book_title_from_MARC(recid)
                subject = "LOAN RECALL: " + title

                update_expired_loan(loan_id)
                send_overdue_letter(borrower_id, CFG_BIBCIRCULATION_LOANS_EMAIL, subject, content)

            if done % 10 == 0:
                task_update_progress("Loan recall: sent %d out of %d." % (done, total_expired_loans))
                task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Loan recall: processed %d out of %d expires loans." % (done+1, total_expired_loans))
        write_message("Processed %d out of %d expired loans." % (done+1, total_expired_loans))

        # Recalls for expired ILLs
        write_message("Started overdue-letters for Inter Library Loans")
        expired_ills = db.get_all_expired_ills()
        total_expired_ills = len(expired_ills)

        for done, (ill_id, borrower_id, item_info, number_of_letters,
             date_letters) in enumerate(expired_ills):

            number_of_letters=int(number_of_letters)

            content = ''
            if number_of_letters == 0:
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['ILL_RECALL1'], ill_id, ill=1)
            elif number_of_letters == 1 and must_send_second_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['ILL_RECALL2'], ill_id, ill=1)
            elif number_of_letters == 2 and must_send_third_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['ILL_RECALL3'], ill_id, ill=1)
            elif number_of_letters >= 3 and must_send_third_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['ILL_RECALL3'], ill_id, ill=1)

            if content != '' and looks_like_dictionary(item_info):
                item_info = eval(item_info)
                if 'title' in item_info:
                    book_title = item_info['title']
                    subject = "ILL RECALL: " + str(book_title)
                    update_expired_loan(loan_id=ill_id, ill=1)
                    send_overdue_letter(borrower_id, CFG_BIBCIRCULATION_ILLS_EMAIL, subject, content)
            if done % 10 == 0:
                task_update_progress("ILL recall: sent %d out of %d." % (done, total_expired_ills))
                task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("ILL recall: processed %d out of %d expired ills." % (done+1, total_expired_ills))
        write_message("Processed %d out of %d expired ills." % (done+1, total_expired_ills))

    return 1
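
must_send_second_recall and must_send_third_recall are only called above. A hedged sketch of the kind of check they could perform, assuming date_letters holds the date of the last recall letter and that a fixed grace period separates letters; both the date format and the number of days are assumptions.

import datetime

CFG_RECALL_GRACE_DAYS = 7  # assumed waiting period between recall letters

def must_send_second_recall(date_letters):
    """Hypothetical check: has the grace period passed since the last letter?"""
    last_sent = datetime.datetime.strptime(str(date_letters)[:10], '%Y-%m-%d').date()
    return (datetime.date.today() - last_sent).days >= CFG_RECALL_GRACE_DAYS

# must_send_third_recall could apply the same rule to the latest letter date.
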
Example #56
0
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    if run_sql(
            "SELECT id FROM schTASK WHERE proc='bibupload:oairepository' AND status='WAITING'"
    ):
        write_message(
            "Previous oairepository bibupload requests are still being "
            "processed. Skipping this execution."
        )
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*',
                                              f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*',
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)),
                  verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec),
                      verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec),
                      verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec),
                      verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
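        # (The CFG_OAI_*_FIELD values are 6-character specs: a 3-digit tag,
        # ind1, ind2 and a subfield code, hence the slicing below.)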
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)),
                      verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)),
            verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)),
                      verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets)
            | (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)),
            verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and the OAI ID does not need to be added, there is nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" %
                          recid,
                          verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" %
                      recid,
                      verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record,
                         tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n',
                                              '-Noairepository', '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename,
                                              '-Noairepository', '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename)
    else:
        os.remove(filename)

    return True
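
The heart of the updater above is plain intbitset arithmetic; a small self-contained illustration with made-up record numbers.

from intbitset import intbitset

should_recids = intbitset([1, 2, 3, 5])     # records that belong to the set per config
current_recids = intbitset([2, 3, 4])       # records currently tagged with the set

to_add = should_recids - current_recids     # intbitset([1, 5])
to_remove = current_recids - should_recids  # intbitset([4])
affected = to_add | to_remove               # intbitset([1, 4, 5]) -> re-tag these records
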
Example #57
0
def get_citation_informations(recid_list,
                              tags,
                              config,
                              fetch_catchup_info=True):
    """Scans the collections searching references (999C5x -fields) and
       citations for items in the recid_list
       returns a 4 list of dictionaries that contains the citation information
       of cds records
       examples: [ {} {} {} {} ]
                 [ {5: 'SUT-DP-92-70-5'},
                   { 93: ['astro-ph/9812088']},
                   { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]
        NB: stuff here is for analysing new or changed records.
        see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'hdl': {},
        'isbn': {},
        'record_id': {},
    }

    references_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'record_id': {},
        'isbn': {},
        'hdl': {},
    }

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:

    for done, recid in enumerate(recid_list):
        if done % 10 == 0:
            task_sleep_now_if_required()

        if done % 50 == 0:
            mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
            write_message(mesg)
            task_update_progress(mesg)

        record = get_record(recid)
        records_info['record_id'][recid] = [unicode(recid)]

        function = config.get("rank_method", "function")
        if config.get(function, 'collections'):
            if recid not in recids_cache(config.get(function, 'collections')):
                # do not treat this record since it is not in the collections
                # we want to process
                continue
        elif recid in deleted_recids_cache():
            # do not treat this record since it was deleted; we
            # skip it like this in case it was only soft-deleted
            # e.g. via bibedit (i.e. when collection tag 980 is
            # DELETED but other tags like report number or journal
            # publication info remained the same, so the calls to
            # get_fieldvalues() below would return old values)
            continue

        if tags['refs_report_number']:
            references_info['report-numbers'][recid] = [
                t.value
                for t in record.find_subfields(tags['refs_report_number'])
            ]
            msg = "references_info['report-numbers'][%s] = %r" \
                        % (recid, references_info['report-numbers'][recid])
            write_message(msg, verbose=9)
        if tags['refs_journal']:
            references_info['journals'][recid] = []
            for ref in record.find_subfields(tags['refs_journal']):
                try:
                    # Inspire specific parsing
                    journal, volume, page = ref.value.split(',')
                except ValueError:
                    pass
                else:
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        alt_ref = ','.join([journal, alt_volume, page])
                        references_info['journals'][recid] += [alt_ref]
                references_info['journals'][recid] += [ref.value]
            msg = "references_info['journals'][%s] = %r" \
                              % (recid, references_info['journals'][recid])
            write_message(msg, verbose=9)
        if tags['refs_doi']:
            references = [
                t.value for t in record.find_subfields(tags['refs_doi'])
            ]
            dois = []
            hdls = []
            for ref in references:
                if ref.startswith("hdl:"):
                    hdls.append(ref[4:])
                elif ref.startswith("doi:"):
                    dois.append(ref[4:])
                else:
                    dois.append(ref)
            references_info['doi'][recid] = dois
            references_info['hdl'][recid] = hdls

            msg = "references_info['doi'][%s] = %r" % (recid, dois)
            write_message(msg, verbose=9)
            msg = "references_info['hdl'][%s] = %r" % (recid, hdls)
            write_message(msg, verbose=9)

        if tags['refs_record_id']:
            references_info['record_id'][recid] = [
                t.value for t in record.find_subfields(tags['refs_record_id'])
            ]
            msg = "references_info['record_id'][%s] = %r" \
                                   % (recid, references_info['record_id'][recid])
            write_message(msg, verbose=9)
        if tags['refs_isbn']:
            references_info['isbn'][recid] = [
                t.value for t in record.find_subfields(tags['refs_isbn'])
            ]
            msg = "references_info['isbn'][%s] = %r" \
                                   % (recid, references_info['isbn'][recid])
            write_message(msg, verbose=9)

        if not fetch_catchup_info:
            # We do not need the extra info
            continue

        if tags['record_pri_number'] or tags['record_add_number']:
            records_info['report-numbers'][recid] = []

            if tags['record_pri_number']:
                records_info['report-numbers'][recid] += [
                    t.value
                    for t in record.find_subfields(tags['record_pri_number'])
                ]

            if tags['record_add_number']:
                records_info['report-numbers'][recid] += [
                    t.value
                    for t in record.find_subfields(tags['record_add_number'])
                ]

            msg = "records_info[%s]['report-numbers'] = %r" \
                        % (recid, records_info['report-numbers'][recid])
            write_message(msg, verbose=9)

        if tags['doi']:
            records_info['doi'][recid] = []
            records_info['hdl'][recid] = []
            for tag in tags['doi']:
                for field in record.find_fields(tag[:5]):
                    if 'DOI' in field.get_subfield_values('2'):
                        dois = field.get_subfield_values('a')
                        records_info['doi'][recid].extend(dois)
                    elif 'HDL' in field.get_subfield_values('2'):
                        hdls = field.get_subfield_values('a')
                        records_info['hdl'][recid].extend(hdls)

            msg = "records_info[%s]['doi'] = %r" \
                                      % (recid, records_info['doi'][recid])
            write_message(msg, verbose=9)
            msg = "records_info[%s]['hdl'] = %r" \
                                      % (recid, records_info['hdl'][recid])
            write_message(msg, verbose=9)

        if tags['isbn']:
            records_info['isbn'][recid] = []
            for tag in tags['isbn']:
                values = [t.value for t in record.find_subfields(tag)]
                records_info['isbn'][recid] += values

            msg = "records_info[%s]['isbn'] = %r" \
                                      % (recid, records_info['isbn'][recid])
            write_message(msg, verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags['publication']:
            records_info['journals'][recid] = get_journal_info(record, tags)
            msg = "records_info[%s]['journals'] = %r" \
                                 % (recid, records_info['journals'][recid])
            write_message(msg, verbose=9)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    end_time = os.times()[4]
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % (end_time - begin_time))

    return records_info, references_info
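
To make the returned structures concrete, a small illustration of what the two dictionaries could contain for a handful of records; the values are invented examples, not real data.

records_info = {
    'report-numbers': {5: ['SUT-DP-92-70-5']},
    'journals': {93: ['Phys. Rev. Lett.,96,081301']},
    'doi': {93: ['10.1103/PhysRevLett.96.081301']},
    'hdl': {},
    'isbn': {},
    'record_id': {5: [u'5'], 93: [u'93']},
}

references_info = {
    'report-numbers': {12: ['CERN-TH-2001-001']},
    'journals': {12: ['Nucl. Phys.,B123,45']},
    'doi': {12: []},
    'hdl': {12: []},
    'record_id': {12: []},
    'isbn': {12: []},
}
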
Example #58
0
def task_run_core():
    """ Performs a search to find records without a texkey, generates a new
    one and uploads the changes in chunks """
    recids = perform_request_search(p='-035:spirestex -035:inspiretex',
                                    cc='HEP')

    write_message("Found %s records to assign texkeys" % len(recids))
    processed_recids = []
    xml_to_process = []
    for count, recid in enumerate(recids):
        write_message("processing recid %s" % recid)

        # Check that the record does not have already a texkey
        has_texkey = False
        recstruct = get_record(recid)
        for instance in record_get_field_instances(recstruct,
                                                   tag="035",
                                                   ind1="",
                                                   ind2=""):
            try:
                provenance = field_get_subfield_values(instance, "9")[0]
            except IndexError:
                provenance = ""
            try:
                value = field_get_subfield_values(instance, "z")[0]
            except IndexError:
                try:
                    value = field_get_subfield_values(instance, "a")[0]
                except IndexError:
                    value = ""
            provenances = ["SPIRESTeX", "INSPIRETeX"]
            if provenance in provenances and value:
                has_texkey = True
                write_message("INFO: Record %s has already texkey %s" %
                              (recid, value))

        if not has_texkey:
            TexKeySeq = TexkeySeq()
            new_texkey = ""
            try:
                new_texkey = TexKeySeq.next_value(recid)
            except TexkeyNoAuthorError:
                write_message(
                    "WARNING: Record %s has no first author or collaboration" %
                    recid)
                continue
            except TexkeyNoYearError:
                write_message("WARNING: Record %s has no year" % recid)
                continue
            write_message("Created texkey %s for record %d" %
                          (new_texkey, recid))
            xml = create_xml(recid, new_texkey)
            processed_recids.append(recid)
            xml_to_process.append(xml)

        task_update_progress("Done %d out of %d." % (count, len(recids)))
        task_sleep_now_if_required()

    # sequence ID to be used in all subsequent tasks
    sequence_id = str(random.randrange(1, 4294967296))
    if xml_to_process:
        process_chunk(xml_to_process, sequence_id)

    # Finally, index all the records processed
    #FIXME: Waiting for sequence id to be fixed
    # if processed_recids:
    #     submit_bibindex_task(processed_recids, sequence_id)

    return True
Example #59
0
def task_run_core():
    """ Walks through all directories where metadata files are located
        and uploads them.
        Files are then moved to the corresponding DONE folders.
    """
    if CFG_BATCHUPLOADER_DAEMON_DIR.startswith('/'):
        daemon_dir = CFG_BATCHUPLOADER_DAEMON_DIR
    else:
        daemon_dir = CFG_PREFIX + '/' + CFG_BATCHUPLOADER_DAEMON_DIR
    # Choose between metadata and document upload depending on the task options
    if not task_get_option('documents'):
        # Metadata upload
        parent_dir = daemon_dir + "/metadata/"
        progress = 0
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        for folder in ["insert/", "append/", "correct/", "replace/"]:
            files_dir = parent_dir + folder
            files_done_dir = files_dir + "DONE/"
            try:
                files = os.listdir(files_dir)
            except OSError as e:
                os.mkdir(files_dir)
                files = []
                write_message(e, sys.stderr)
            # Create directory DONE/ if it doesn't exist
            try:
                os.mkdir(files_done_dir)
            except OSError:
                # Directory exists
                pass
            for metafile in files:
                if os.path.isfile(os.path.join(files_dir, metafile)):
                    # Create temporary file to be uploaded
                    (fd, filename) = tempfile.mkstemp(prefix=metafile + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_", dir=CFG_TMPSHAREDDIR)
                    shutil.copy(os.path.join(files_dir, metafile), filename)
                    # Send bibsched task
                    mode = "-" + folder[0]
                    jobid = str(task_low_level_submission('bibupload', 'batchupload', mode, filename))
                    # Move file to done folder
                    filename = metafile + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_" + jobid
                    os.rename(os.path.join(files_dir, metafile), os.path.join(files_done_dir, filename))
                    task_sleep_now_if_required(can_stop_too=True)
            progress += 1
            task_update_progress("Done %d out of 4." % progress)
    else:
        # Documents upload
        parent_dir = daemon_dir + "/documents/"
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        matching_order = CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY
        for folder in ["append/", "revise/"]:
            try:
                os.mkdir(parent_dir + folder)
            except OSError:
                # Directory already exists
                pass
            for matching in matching_order:
                errors = document_upload(folder=parent_dir + folder, matching=matching, mode=folder[:-1])[0]
                if not errors:
                    break  # All documents succeeded with that matching
                for error in errors:
                    write_message("File: %s - %s with matching %s" % (error[0], error[1], matching), sys.stderr)
            task_sleep_now_if_required(can_stop_too=True)
    return 1
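
For orientation, the directory layout the daemon above works through, reconstructed from the folder names in the code; the root depends on CFG_BATCHUPLOADER_DAEMON_DIR.

# <daemon_dir>/
#     metadata/
#         insert/  append/  correct/  replace/   <- drop MARCXML files here
#             DONE/                               <- processed files are moved here
#     documents/
#         append/  revise/                        <- drop full-text documents here
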
Example #60
0
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given"""
    write_message("Loading knowledgebase file", verbose=9)
    kb_data = {}
    records = []

    write_message("Reading knowledgebase file: %s" % \
                   config.get(config.get("rank_method", "function"), "kb_src"))

    kb_src = config.get(config.get("rank_method", "function"),
                        "kb_src").strip()
    # Find path from configuration registry by knowledge base name.
    kb_src_clean = configuration.get(kb_src)

    with open(kb_src_clean, 'r') as kb_file:
        data = kb_file.readlines()

    for line in data:
        if not line.startswith("#"):
            parts = line.strip().split("---")
            kb_data[parts[0].strip()] = parts[1]
    write_message("Number of lines read from knowledgebase file: %s" %
                  len(kb_data))

    tag = config.get(config.get("rank_method", "function"), "tag")
    tags = config.get(config.get("rank_method", "function"),
                      "check_mandatory_tags").split(", ")
    if tags == ['']:
        tags = ""

    records = []
    for (recids, recide) in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql(
            "SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s"
            % (tag[0:2], tag[0:2]), (tag, recids, recide))
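        # Start from the "universe" set (trailing_bits=1 marks every recid as
        # present) and intersect it down to recids carrying all mandatory tags.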
        valid = intbitset(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = intbitset()
            newset += [
                recid[0] for recid in run_sql(
                    "SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibrec>=%%s AND id_bibrec<=%%s"
                    % (tag[0:2], tag[0:2]), (key, recids, recide))
            ]
            valid.intersection_update(newset)
        if tags:
            recs = filter(lambda x: x[0] in valid, recs)
        records = records + list(recs)
        write_message("Number of records found with the necessary tags: %s" %
                      len(records))

    records = filter(lambda x: x[0] in options["validset"], records)
    rnkset = {}
    for key, value in records:
        if value in kb_data:
            if key not in rnkset:
                rnkset[key] = float(kb_data[value])
            elif float(kb_data[value]) > rnkset[key]:
                # Keep the highest score seen for this record
                rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0

    write_message("Number of records available in rank method: %s" %
                  len(rnkset))
    return rnkset
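
The knowledge-base file read above is a plain text mapping with one 'value---score' pair per line; a tiny made-up example and the structure it produces.

# example kb file content (illustrative):
#
#   # lines starting with '#' are ignored
#   Phys. Rev. Lett.---7.5
#   Nucl. Phys. B---4.1
#
# after parsing, kb_data == {'Phys. Rev. Lett.': '7.5', 'Nucl. Phys. B': '4.1'}
# (the scores stay strings here and are cast to float when building rnkset)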