Example No. 1
def fill_self_cites_tables(config):
    """
    This will fill the self-cites tables with data

    The purpose of this function is to fill these tables on a website that
    never ran the self-cites daemon
    """
    algorithm = config['algorithm']
    tags = get_authors_tags()
    all_ids = [r[0] for r in run_sql('SELECT id FROM bibrec ORDER BY id')]
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)
    if algorithm == 'friends':
        # We only need this table for the friends algorithm and similar ones
        # Fill intermediary tables
        for index, recid in enumerate(all_ids):
            if index % 1000 == 0:
                msg = 'intermediate %d/%d' % (index, len(all_ids))
                task_update_progress(msg)
                write_message(msg)
                task_sleep_now_if_required()
            update_self_cites_tables(recid, config, tags)
    # Fill self-cites table
    for index, recid in enumerate(all_ids):
        if index % 1000 == 0:
            msg = 'final %d/%d' % (index, len(all_ids))
            task_update_progress(msg)
            write_message(msg)
            task_sleep_now_if_required()
        compute_and_store_self_citations(recid, tags, citations_fun)
Example No. 3
def bst_fibonacci(n=30):
    """
    Small tasklet that prints the first n Fibonacci numbers.
    @param n: how many Fibonacci numbers to print.
    @type n: int
    """
    ## Since it's a tasklet, the parameter might be passed as a string.
    ## It should then be converted to an int.
    n = int(n)
    write_message("Printing %d Fibonacci numbers." % n, verbose=9)
    for i in range(0, n):
        if i > 0 and i % 4 == 0:
            write_message("Error: water in the CPU.  Ignoring and continuing.",
                          sys.stderr,
                          verbose=3)
        elif i > 0 and i % 5 == 0:
            write_message(
                "Error: floppy drive dropped on the floor.  Ignoring and continuing.",
                sys.stderr)
        write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)
    task_update_progress("Done %d out of %d." % (n, n))
    return 1
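The tasklet above calls a fib() helper that is not shown in the example. A minimal, self-contained sketch of such a helper (an assumption for illustration, not the original Invenio code) could look like this:

def fib(n):
    """Return the n-th Fibonacci number, with fib(0) == 0 and fib(1) == 1."""
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a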
Example No. 4
def match_remote_ids(remote_ids):
    """ Matches remote IDs to local records, IDs that cannot be matched
    are returned as a list."""
    per_last = -1

    def percent_update(index, percent_last):
        """ Calculates completion percentage, updates task progress """
        per = 100 * float(index)/float(len(remote_ids))
        if per > (percent_last + 0.5):
            percent_last = per
            task_update_progress("Local matching %.1f%% (%d/%d)"
                                 % (per, index, len(remote_ids)))
        return percent_last

    missing = []
    for i, recid in enumerate(remote_ids):
        task_sleep_now_if_required(can_stop_too=True)
        per_last = percent_update(i, per_last)
        term = "035__9:%s and 035__a:%d" % (REMOTE_INSTANCE, recid)
        result = perform_request_search(p=term)
        if not result:
            missing.append(recid)
    _print("Of %d record IDs, %d were matched, %d are missing"
           % (len(remote_ids), (len(remote_ids) - len(missing)), len(missing)))
    return missing
Example No. 5
def task_run_core():
    """Run the indexing task. The row argument is the BibSched task
    queue row, containing if, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    if not task_get_option("run"):
        task_set_option("run", [name[0] for name in run_sql("SELECT name from rnkMETHOD")])

    for key in task_get_option("run"):
        task_sleep_now_if_required(can_stop_too=True)
        write_message("")
        filename = CFG_ETCDIR + "/bibrank/" + key + ".cfg"
        write_message("Getting configuration from file: %s" % filename,
            verbose=9)
        config = ConfigParser.ConfigParser()
        try:
            config.readfp(open(filename))
        except StandardError:
            write_message("Cannot find configuration file: %s. "
                "The rankmethod may also not be registered using "
                "the BibRank Admin Interface." % filename, sys.stderr)
            raise

        #Using the function variable to call the function related to the
        #rank method
        cfg_function = config.get("rank_method", "function")
        func_object = globals().get(cfg_function)
        if func_object:
            func_object(key)
        else:
            write_message("Cannot run method '%s', no function to call"
                % key)

    return True
Example No. 6
def iterate_over_new(list, fmt):
    """
    Iterate over list of IDs

    @param list: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    global total_rec

    formatted_records = ''      # (string-)List of formatted record of an iteration
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
Example No. 7
def solr_commit_if_necessary(next_commit_counter,
                             final_commit=False,
                             recid=None):
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (
            final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)

        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except:
            register_exception(alert_admin=True)
        next_commit_counter = 0

        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
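For context, a hedged sketch of how the returned counter is typically threaded through an indexing loop and flushed once at the end; recids and index_record are illustrative assumptions, and the 'flush' task option is assumed to be set since the function reads it:

next_commit_counter = 0
for recid in recids:                        # recids: assumed iterable of record IDs
    index_record(recid)                     # assumed per-record indexing work
    next_commit_counter = solr_commit_if_necessary(next_commit_counter,
                                                   recid=recid)
# Commit whatever is still pending once the loop has finished
solr_commit_if_necessary(next_commit_counter, final_commit=True)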
Example No. 8
def fetch_xml_files(folder, els, new_files):
    """Recursively gets the downloaded xml files
    converts them to marc xml format and stores them
    in the same directory with the name "upload.xml"."""
    if exists(folder):
        for subfolder in listdir(folder):
            subfolder = join(folder, subfolder).lstrip()
            if isfile(subfolder):
                if not subfolder.endswith('upload.xml'):
                    folders = subfolder.split('/')
                    folders[-1] = 'upload.xml'
                    file_loc = "/".join(folders)
                    if not exists(file_loc):
                        xmlFile = open(subfolder, "r")
                        xmlString = xmlFile.read()
                        xmlFile.close()
                        dom_xml = xml.dom.minidom.parseString(xmlString)
                        doi = els.get_publication_information(dom_xml)[-1]
                        write_message("DOI in record: %s" % (doi,))
                        res = perform_request_search(p="doi:%s" % (doi,),
                                                     of="id")
                        if not res:
                            write_message("DOI not found")
                            doctype = els.get_doctype(dom_xml).lower()
                            #ignore index pages
                            if doctype in INTERESTING_DOCTYPES:
                                marcfile = open(file_loc, 'w')
                                marcfile.write(els.get_record(subfolder))
                                marcfile.close()
                                new_files.append(file_loc)
                                task_sleep_now_if_required(can_stop_too=False)
                        else:
                            write_message("DOI found: %s" % (res,))
            else:
                fetch_xml_files(subfolder, els, new_files)
Example No. 9
def afs_sync(modified_records, time_estimator, tot, now):
    """Sync to AFS."""
    write_message("Appending output to %s" % CFG_OUTPUT_PATH)
    prodsyncname = CFG_OUTPUT_PATH + now.strftime("%Y%m%d%H%M%S") + '.xml.gz'
    r = gzip.open(prodsyncname, "w")
    print >> r, '<collection xmlns="http://www.loc.gov/MARC21/slim">'
    for i, recid in enumerate(modified_records):
        with run_ro_on_slave_db():
            record = format_record(recid, 'xme', user_info=ADMIN_USER_INFO)[0]
        if not record:
            write_message("Error formatting record {0} as 'xme': {1}".format(
                recid, record))
        else:
            print >> r, record
        if shall_sleep(recid, i, tot, time_estimator):
            r.flush()
            task_sleep_now_if_required()
    print >> r, '</collection>'
    r.close()
    prodsync_tarname = CFG_OUTPUT_PATH + '.tar'
    write_message("Adding %s to %s" % (prodsyncname, prodsync_tarname))
    prodsync_tar = tarfile.open(prodsync_tarname, 'a')
    prodsync_tar.add(prodsyncname)
    prodsync_tar.close()
    os.remove(prodsyncname)
Example No. 10
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """
    Generate sitemaps themselves.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap file(s)
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
                                lastmod = lastmod,
                                changefreq = DEFAULT_CHANGEFREQ_RECORDS,
                                priority = DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s" % (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
Example No. 12
def fetch_xml_files(folder, els, new_files):
    """Recursively gets the downloaded xml files
    converts them to marc xml format and stores them
    in the same directory with the name "upload.xml"."""
    for path, folders, files in walk(folder):
        for fl in files:
            if fl != "upload.xml":
                file_loc = join(path, "upload.xml")
                if not exists(file_loc):
                    record_path = join(path, fl)
                    dom_xml = parse(record_path)
                    doi = els.get_publication_information(dom_xml)[-1]
                    res = None
                    if doi:
                        write_message("DOI in record: %s" % (doi,))
                        res = perform_request_search(p="doi:%s" % (doi,), of="id")
                    if not res:
                        write_message("DOI not found in record: \n%s" % (join(path, fl),))
                        doctype = els.get_doctype(dom_xml).lower()
                        # ignore index pages
                        if doctype in INTERESTING_DOCTYPES:
                            marcfile = open(file_loc, "w")
                            marcfile.write(els.get_record(record_path))
                            marcfile.close()
                            new_files.append(file_loc)
                            task_sleep_now_if_required(can_stop_too=False)
                    else:
                        write_message("DOI found: %s" % (res,))
Example No. 13
def iterate_over_new(recIDs, fmt):
    """Iterate over list of IDs.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken
        to insert)
    """
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    tot = len(recIDs)
    reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get(
        fmt.lower(), _update_format)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        reformat_function(recID, fmt)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" %
                          (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
Example No. 14
def iterate_over_new(list, fmt):
    "Iterate over list of IDs"
    global total_rec

    formatted_records = ''      # (string-)List of formatted record of an iteration
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        if run_sql('SELECT id FROM bibfmt WHERE id_bibrec=%s AND format=%s', (recID, fmt)):
            run_sql('UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s', (start_date, formatted_record, recID, fmt))
        else:
            run_sql('INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, %s, %s, %s)', (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
Example No. 15
def iterate_over_new(recIDs, fmt):
    """
    Iterate over list of IDs

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call

    tot = len(recIDs)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        formatted_record, needs_2nd_pass = format_record_1st_pass(recID=recID,
                                                  of=fmt,
                                                  on_the_fly=True,
                                                  save_missing=False)
        save_preformatted_record(recID=recID,
                                 of=fmt,
                                 res=formatted_record,
                                 needs_2nd_pass=needs_2nd_pass,
                                 low_priority=True)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
Example No. 16
def convert_files(xml_files, els, prefix="", threshold_date=None):
    """Convert the list of publisher XML to MARCXML using given instance."""
    results = {}
    for xml_file in xml_files:
        task_sleep_now_if_required()
        full_xml_filepath = join(prefix, xml_file)
        dom_xml = parse(full_xml_filepath)
        date = els.get_publication_information(dom_xml)[-2]
        if threshold_date and date < threshold_date:
            continue
        doctype = els.get_doctype(dom_xml).lower()
        if doctype in INTERESTING_DOCTYPES:
            new_full_xml_filepath = join(dirname(full_xml_filepath),
                                         "upload.xml")
            try:
                converted_xml = els.get_record(full_xml_filepath,
                                               refextract_callback=refextract)
            except Exception as e:
                _errors_detected.append(e)
                error_trace = traceback.format_exc()
                # Some error happened; log it and skip this file gracefully
                results[full_xml_filepath] = (StatusCodes.CONVERSION_ERROR,
                                              error_trace)
                write_message('Error converting:'
                              ' \n {0}'.format(error_trace))
                continue
            with open(new_full_xml_filepath, "w") as marcfile:
                marcfile.write(converted_xml)
            results[full_xml_filepath] = (StatusCodes.OK,
                                          new_full_xml_filepath)
        else:
            results[full_xml_filepath] = (StatusCodes.DOCTYPE_WRONG, doctype)
            write_message("Doctype not interesting: {0}".format(doctype))
    return results
Example No. 17
def process_affiliations(record_ids=None, all_records=False):
    name = 'affiliations'

    if all_records:
        records = intbitset(run_sql("SELECT id FROM bibrec"))
        start_time = datetime.now()
    elif record_ids:
        records = intbitset(record_ids)
        start_time = None
    else:
        dummy_last_recid, last_updated = fetch_last_updated(name)
        start_time = datetime.now()
        sql = """SELECT `id` FROM `bibrec`
                 WHERE `modification_date` >= %s
                 AND `modification_date` <= %s
                 ORDER BY `modification_date`"""
        records = intbitset(run_sql(sql, [last_updated.isoformat(), start_time.isoformat()]))

    records_iter = iter(records)
    processed_records_count = 0
    while True:
        task_sleep_now_if_required()
        chunk = list(islice(records_iter, CHUNK_SIZE))
        if not chunk:
            break
        process_and_store(chunk)
        processed_records_count += len(chunk)
        task_update_progress('processed %s out of %s records' % (processed_records_count, len(records)))
    if start_time:
        store_last_updated(None, start_time, name)
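As a side note, the islice-based chunking used above can be shown in isolation; this sketch uses a chunk size of 3 instead of CHUNK_SIZE purely for illustration:

from itertools import islice

numbers = iter(range(7))
chunks = []
while True:
    chunk = list(islice(numbers, 3))   # take up to 3 items per pass
    if not chunk:                      # the iterator is exhausted
        break
    chunks.append(chunk)
# chunks == [[0, 1, 2], [3, 4, 5], [6]]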
Example No. 18
def convert_files(xml_files, els, prefix="", threshold_date=None):
    """Convert the list of publisher XML to MARCXML using given instance."""
    results = {}
    for xml_file in xml_files:
        task_sleep_now_if_required()
        full_xml_filepath = join(prefix, xml_file)
        dom_xml = parse(full_xml_filepath)
        date = els.get_publication_information(dom_xml)[-2]
        if threshold_date and date < threshold_date:
            continue
        doctype = els.get_doctype(dom_xml).lower()
        if doctype in INTERESTING_DOCTYPES:
            new_full_xml_filepath = join(dirname(full_xml_filepath),
                                         "upload.xml")
            try:
                converted_xml = els.get_record(
                    full_xml_filepath, refextract_callback=refextract)
            except Exception as e:
                _errors_detected.append(e)
                error_trace = traceback.format_exc()
                # Some error happened; log it and skip this file gracefully
                results[full_xml_filepath] = (StatusCodes.CONVERSION_ERROR,
                                              error_trace)
                write_message('Error converting:'
                              ' \n {0}'.format(error_trace))
                continue
            with open(new_full_xml_filepath, "w") as marcfile:
                marcfile.write(converted_xml)
            results[full_xml_filepath] = (StatusCodes.OK,
                                          new_full_xml_filepath)
        else:
            results[full_xml_filepath] = (StatusCodes.DOCTYPE_WRONG,
                                          doctype)
            write_message("Doctype not interesting: {0}".format(doctype))
    return results
Example No. 19
def iterate_over_new(recIDs, fmt):
    """Iterate over list of IDs.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken
        to insert)
    """
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call

    tot = len(recIDs)
    reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get(
        fmt.lower(), _update_format)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        reformat_function(recID, fmt)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" %
                          (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
Example No. 20
def match_remote_ids(remote_ids):
    """ Matches remote IDs to local records, IDs that cannot be matched
    are returned as a list."""
    per_last = -1

    def percent_update(index, percent_last):
        """ Calculates completion percentage, updates task progress """
        per = 100 * float(index) / float(len(remote_ids))
        if per > (percent_last + 0.5):
            percent_last = per
            task_update_progress("Local matching %.1f%% (%d/%d)" %
                                 (per, index, len(remote_ids)))
        return percent_last

    missing = []
    for i, recid in enumerate(remote_ids):
        task_sleep_now_if_required(can_stop_too=True)
        per_last = percent_update(i, per_last)
        term = "035__9:%s and 035__a:%d" % (REMOTE_INSTANCE, recid)
        result = perform_request_search(p=term)
        if not result:
            missing.append(recid)
    _print("Of %d record IDs, %d were matched, %d are missing" %
           (len(remote_ids), (len(remote_ids) - len(missing)), len(missing)))
    return missing
Example No. 21
def process_record_batch(batch):
    """ Splitting the matching remotely job into parts, function does the
    matching of remote records to local IDs """
    _print("Processing batch, recid #%d to #%d" % (batch[0], batch[-1]), 4)
    # Local ID: Remote ID
    appends = {}
    problems = []
    for recid in batch:
        task_sleep_now_if_required(can_stop_too=True)

        # Here we are taking a rest
        time.sleep(0.5)

        _print("Processing recid %d" % recid, 9)
        record = get_remote_record(recid)
        if record is None:
            _print("Error: Could not fetch remote record %s" % (str(recid), ),
                   5)
            continue
        else:
            local_id = extract_035_id(record)
            if not local_record_exists(local_id):
                _print("Local record does not exist", 5)
                problems.append(recid)
                continue
            else:
                _print(
                    "Matching remote id %d to local record %s" %
                    (recid, local_id), 5)
                appends[local_id] = recid
    _print(
        "Batch matching done: %d IDs matched, %d IDs not matched" %
        (len(appends), len(problems)), 4)
    return appends, problems
Example No. 22
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)

        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except:
            register_exception(alert_admin=True)
        next_commit_counter = 0

        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
Example No. 23
def solr_add_range(lower_recid, upper_recid):
    """
    Adds the corresponding field values of all records from the lower recid to the upper one to Solr.
    It preserves the fulltext information.
    """
    for recid in range(lower_recid, upper_recid + 1):
        if record_exists(recid):
            try:
                abstract = unicode(remove_control_characters(get_fieldvalues(recid, CFG_MARC_ABSTRACT)[0]), 'utf-8')
            except:
                abstract = ""
            try:
                first_author = remove_control_characters(get_fieldvalues(recid, CFG_MARC_AUTHOR_NAME)[0])
                additional_authors = remove_control_characters(reduce(lambda x, y: x + " " + y, get_fieldvalues(recid, CFG_MARC_ADDITIONAL_AUTHOR_NAME), ''))
                author = unicode(first_author + " " + additional_authors, 'utf-8')
            except:
                author = ""
            try:
                bibrecdocs = BibRecDocs(recid)
                fulltext = unicode(remove_control_characters(bibrecdocs.get_text()), 'utf-8')
            except:
                fulltext = ""
            try:
                keyword = unicode(remove_control_characters(reduce(lambda x, y: x + " " + y, get_fieldvalues(recid, CFG_MARC_KEYWORD), '')), 'utf-8')
            except:
                keyword = ""
            try:
                title = unicode(remove_control_characters(get_fieldvalues(recid, CFG_MARC_TITLE)[0]), 'utf-8')
            except:
                title = ""
            solr_add(recid, abstract, author, fulltext, keyword, title)

    SOLR_CONNECTION.commit()
    task_sleep_now_if_required(can_stop_too=True)
Example No. 24
def afs_sync(modified_records, time_estimator, tot, now):
    """Sync to AFS."""
    write_message("Appending output to %s" % CFG_OUTPUT_PATH)
    prodsyncname = CFG_OUTPUT_PATH + now.strftime("%Y%m%d%H%M%S") + '.xml.gz'
    r = gzip.open(prodsyncname, "w")
    print >> r, '<collection xmlns="http://www.loc.gov/MARC21/slim">'
    for i, recid in enumerate(modified_records):
        record = format_record(recid, 'xme', user_info=ADMIN_USER_INFO)[0]
        if not record:
            write_message("Error formatting record {0} as 'xme': {1}".format(
                recid, record
            ))
        else:
            print >> r, record
        if shall_sleep(recid, i, tot, time_estimator):
            r.flush()
            task_sleep_now_if_required()
    print >> r, '</collection>'
    r.close()
    prodsync_tarname = CFG_OUTPUT_PATH + '.tar'
    write_message("Adding %s to %s" % (prodsyncname, prodsync_tarname))
    prodsync_tar = tarfile.open(prodsync_tarname, 'a')
    prodsync_tar.add(prodsyncname)
    prodsync_tar.close()
    os.remove(prodsyncname)
Example No. 25
def match_missing_ids(remote_ids, batch_size):
    """ For ID pairings that are missing, this function splits the missing
    IDs into batches. The records are pulled from remote, the 035 field read
    and then the remote ID appended to the local record.

    Parameters:
     remote_ids - a list of missing remote rec-ids
     batch_size - How many records to match at a time
    Returns:
     count_appends - number of records being appended
     count_problems - number of records which could not be matched at all
    """
    count_appends = 0
    count_problems = 0

    batches = [remote_ids[x:x+batch_size] for x in
               xrange(0, len(remote_ids), batch_size)]
    _print("Identified %d records which their remote IDs updating."
           % len(remote_ids))
    _print("Processing %d batches of size %d" % (len(batches), batch_size))
    for i, batch in enumerate(batches, 1):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Batch %d of %d" % (i, len(batches)))
        _print("Batch %d of %d" % (i, len(batches)))
        try:
            appends, problems = process_record_batch(batch)
            count_appends += len(appends)
            count_problems += len(problems)
            write_to_file('missing_ids.txt', problems, append=True)
            _print("Submitting batch #%d to BibUpload for appending..." % i, 4)
            start_bibupload_job(appends)
        except StandardError, e:
            _print("Error occured during match of batch %d: %s\n%s"
                   % (i, e, traceback.format_exc()), 2)
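For reference, the slicing expression that builds the batches behaves as follows (Python 2, since the code relies on xrange); the values are illustrative only:

remote_ids = range(1, 11)          # ten dummy IDs
batch_size = 4
batches = [remote_ids[x:x + batch_size]
           for x in xrange(0, len(remote_ids), batch_size)]
# batches == [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10]]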
Example No. 26
def generate_sitemaps(sitemap_index_writer, records, output_directory,
                      sitemap_name):
    """
    Generate sitemaps themselves.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap file(s)
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                   or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' %
                                 (CFG_SITE_RECORD, recid),
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_RECORDS,
                                 priority=DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress(
                "Google Scholar sitemap '%s' for recid %s/%s" %
                (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
Example No. 27
def bst_inspire_authority_ids_synchronizer(
        url=SYNC_URL_INSPIRE_RECORDS_SRC, tmp_dir=SYNC_LOCAL_TMP_DIR):
    """Synchronize INSPIRE authority ids.

    :param string url: valid URL to the gzip (.gz) file
    :param string tmp_dir: existing directory path for temporary files
    """
    xml_content = get_inspire_dump(
        url, os.path.join(tmp_dir, SYNC_LOCAL_INSPIRE_RECORDS_FILE_NAME))

    task_sleep_now_if_required()

    authority_ids = parse_inspire_xml(xml_content)

    task_sleep_now_if_required()

    if authority_ids:
        record_ids = get_record_ids()
        write_message(
            "Info: {0} record ids have been requested".format(len(record_ids)))
        if record_ids:
            synchronize(
                record_ids,
                authority_ids,
                os.path.join(tmp_dir, SYNC_LOCAL_CDS_RECORDS_UPDATES_FILE_NAME))
Example No. 28
def fetch_xml_files(folder, els, new_files):
    """Recursively gets the downloaded xml files
    converts them to marc xml format and stores them
    in the same directory with the name "upload.xml"."""
    for path, folders, files in walk(folder):
        for fl in files:
            if fl != 'upload.xml':
                file_loc = join(path, 'upload.xml')
                if not exists(file_loc):
                    record_path = join(path, fl)
                    dom_xml = parse(record_path)
                    doi = els.get_publication_information(dom_xml)[-1]
                    res = None
                    if doi:
                        write_message("DOI in record: %s" % (doi, ))
                        res = perform_request_search(p="doi:%s" % (doi, ),
                                                     of="id")
                    if not res:
                        write_message("DOI not found in record: \n%s" %
                                      (join(path, fl), ))
                        doctype = els.get_doctype(dom_xml).lower()
                        #ignore index pages
                        if doctype in INTERESTING_DOCTYPES:
                            marcfile = open(file_loc, 'w')
                            marcfile.write(els.get_record(record_path))
                            marcfile.close()
                            new_files.append(file_loc)
                            task_sleep_now_if_required(can_stop_too=False)
                    else:
                        write_message("DOI found: %s" % (res, ))
Example No. 29
def process_record_batch(batch):
    """ Splitting the matching remotely job into parts, function does the
    matching of remote records to local IDs """
    _print("Processing batch, recid #%d to #%d" % (batch[0], batch[-1]), 4)
    # Local ID: Remote ID
    appends = {}
    problems = []
    for recid in batch:
        task_sleep_now_if_required(can_stop_too=True)

        # Here we are taking a rest
        time.sleep(0.5)

        _print("Processing recid %d" % recid, 9)
        record = get_remote_record(recid)
        if record is None:
            _print("Error: Could not fetch remote record %s" % (str(recid),), 5)
            continue
        else:
            local_id = extract_035_id(record)
            if not local_record_exists(local_id):
                _print("Local record does not exist", 5)
                problems.append(recid)
                continue
            else:
                _print("Matching remote id %d to local record %s"
                       % (recid, local_id), 5)
                appends[local_id] = recid
    _print("Batch matching done: %d IDs matched, %d IDs not matched"
           % (len(appends), len(problems)), 4)
    return appends, problems
Example No. 30
def task_run_core():
    """Run the indexing task. The row argument is the BibSched task
    queue row, containing if, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    if not task_get_option("run"):
        task_set_option(
            "run", [name[0] for name in run_sql("SELECT name from rnkMETHOD")])

    for key in task_get_option("run"):
        task_sleep_now_if_required(can_stop_too=True)
        write_message("")
        filename = CFG_ETCDIR + "/bibrank/" + key + ".cfg"
        write_message("Getting configuration from file: %s" % filename,
                      verbose=9)
        config = ConfigParser.ConfigParser()
        try:
            config.readfp(open(filename))
        except StandardError:
            write_message(
                "Cannot find configuration file: %s. "
                "The rankmethod may also not be registered using "
                "the BibRank Admin Interface." % filename, sys.stderr)
            raise

        #Using the function variable to call the function related to the
        #rank method
        cfg_function = config.get("rank_method", "function")
        func_object = globals().get(cfg_function)
        if func_object:
            func_object(key)
        else:
            write_message("Cannot run method '%s', no function to call" % key)

    return True
Example No. 31
def compute_cache(pids):
    bibtask.write_message("WebAuthorProfile: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    for i, p in enumerate(pids):
        bibtask.write_message("WebAuthorProfile: doing %s out of %s" % (pids.index(p) + 1, len(pids)))
        bibtask.task_update_progress("WebAuthorProfile: doing %s out of %s" % (pids.index(p) + 1, len(pids)))
        _compute_cache_for_person(p)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
Example No. 32
def bst_autoclaim():
    orcid_personid_map = get_orcid_personid_map()
    papers = get_papers_with_orcid()
    for i, recid in enumerate(papers):
        autoclaim_paper(recid, orcid_personid_map)
        if i % 10 == 0:
            task_update_progress("Done %s out of %s records (%s%%)" % (i, len(papers), 100*(i)/len(papers)))
            task_sleep_now_if_required(can_stop_too=True)
Example No. 33
def task_run_core():
    """ Walks through all directories where metadata files are located
        and uploads them.
        Files are then moved to the corresponding DONE folders.
    """
    daemon_dir = CFG_BATCHUPLOADER_DAEMON_DIR[0] == '/' and CFG_BATCHUPLOADER_DAEMON_DIR \
                 or CFG_PREFIX + '/' + CFG_BATCHUPLOADER_DAEMON_DIR
    # Check if directory /batchupload exists
    if not task_get_option('documents'):
        # Metadata upload
        parent_dir = daemon_dir + "/metadata/"
        progress = 0
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        list_of_folders = [
            "insert", "append", "correct", "replace", "holdingpen"
        ]
        for folder in list_of_folders:
            files_dir = os.path.join(parent_dir, folder)
            files_done_dir = os.path.join(files_dir, "DONE")
            try:
                files = os.listdir(files_dir)
            except OSError, e:
                os.mkdir(files_dir)
                files = []
                write_message(e, sys.stderr)
                write_message("Created new folder %s" % (files_dir, ))
            # Create directory DONE/ if doesn't exist
            try:
                os.mkdir(files_done_dir)
            except OSError:
                # Directory exists
                pass
            for metafile in files:
                if os.path.isfile(os.path.join(files_dir, metafile)):
                    # Create temporary file to be uploaded
                    (fd, filename) = tempfile.mkstemp(
                        prefix=metafile + "_" +
                        time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
                        dir=CFG_TMPSHAREDDIR)
                    shutil.copy(os.path.join(files_dir, metafile), filename)
                    # Send bibsched task
                    mode = "--" + folder
                    jobid = str(
                        task_low_level_submission('bibupload', 'batchupload',
                                                  mode, filename))
                    # Move file to done folder
                    filename = metafile + "_" + time.strftime(
                        "%Y%m%d%H%M%S", time.localtime()) + "_" + jobid
                    os.rename(os.path.join(files_dir, metafile),
                              os.path.join(files_done_dir, filename))
                    task_sleep_now_if_required(can_stop_too=True)
            progress += 1
            task_update_progress("Done %d out of %d." %
                                 (progress, len(list_of_folders)))
Example No. 34
def bst_synchronize_recids(search_terms=SEARCH_TERMS,
                           log_dir=None,
                           collection=COLLECTION,
                           batch_size=BATCH_SIZE,
                           debug=False,
                           remote_ids=None):
    """Synchronize record IDs between the CERN Document Server (CDS) and Inspire

This BibTasklet is intended to be a general purpose replacement for
'bst_inspire_cds_synchro' and 'bst_update_cds_inspire_id', it should
be executable on both CDS and Inspire.

Generally there should be no need to modify these parameters, the
script uses CFG_INSPIRE_SITE and CFG_CERN_SITE from invenio.conf
to determine what type of Invenio instance we're running on. These
parameters will be set by default to the correct values to
synchronise all IDs, though you may want to limit records manually.

Parameters:
 search_terms - The term to use to get record IDs
                (Default "035:<LOCAL>)
 log_dir - The directory to store the log file in
           (Defaults to CFG_TMPSHAREDDIR)
 collection - What collection to take from
              (Default is no collection)
 batch_size - How many records to try and amend at once
              (Default 200)
 debug - If True, this script will run against the TEST instances
         (Default false)
 remote_ids - Comma separated values of remote IDs, if this is
              specified, remote IDs will not be searched for.
    """
    configure_globals(search_terms, log_dir, debug)
    _print("All messages will be logged to %s/%s" % (LOG_DIR, LOG_FILE))

    if not remote_ids:
        task_update_progress("Finding remote records on %s with %s IDs" %
                             (REMOTE_INSTANCE, LOCAL_INSTANCE))
        remote_ids = get_remote_ids(search_terms, collection)
    else:
        remote_ids = [int(rid) for rid in remote_ids.split(',')]

    task_sleep_now_if_required(can_stop_too=True)
    task_update_progress("Matching remote IDs to local records")
    missing_ids = match_remote_ids(remote_ids)

    count_appends, count_problems = match_missing_ids(missing_ids, batch_size)

    _print("======================== FINAL SCORE ========================", 1)
    _print(" Records matched: %d" % (len(remote_ids) - len(missing_ids)), 1)
    _print(" Records appended: %d" % count_appends, 1)
    _print(" IDs not matched (broken link!): %d" % count_problems, 1)
    _print("=============================================================", 1)

    _print("Finishing, messages logged to: %s/%s" % (LOG_DIR, LOG_FILE))

    return True
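A hedged invocation sketch for the tasklet above; the batch size and ID values are purely illustrative. Passing remote_ids as a comma-separated string skips the remote search step, as the docstring explains:

bst_synchronize_recids(batch_size=50, remote_ids="1234,5678")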
Example No. 35
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    # Dictionary of "plugin_name" -> func
    tickets_to_apply = task_get_option('tickets')
    write_message("Ticket plugins found: %s" % (str(tickets_to_apply), ),
                  verbose=9)

    task_update_progress("Loading records")
    records_concerned = get_recids_to_load()
    write_message("%i record(s) found" % (len(records_concerned), ))

    records_processed = 0
    for record, last_date in load_records_from_id(records_concerned):
        records_processed += 1
        recid = record_id_from_record(record)
        task_update_progress(
            "Processing records %s/%s (%i%%)" %
            (records_processed, len(records_concerned),
             int(float(records_processed) / len(records_concerned) * 100)))
        task_sleep_now_if_required(can_stop_too=True)
        for ticket_name, plugin in tickets_to_apply.items():
            if plugin:
                write_message("Running template %s for %s" %
                              (ticket_name, recid),
                              verbose=5)
                try:
                    ticket = BibCatalogTicket(recid=int(recid))
                    if plugin['check_record'](ticket, record):
                        ticket = plugin['generate_ticket'](ticket, record)
                        write_message("Ticket to be generated: %s" %
                                      (ticket, ),
                                      verbose=5)
                        res = ticket.submit()
                        if res:
                            write_message("Ticket #%s created for %s" %
                                          (ticket.ticketid, recid))
                        else:
                            write_message("Ticket already exists for %s" %
                                          (recid, ))
                    else:
                        write_message("Skipping record %s", (recid, ))
                except Exception, e:
                    write_message("Error submitting ticket for record %s:" %
                                  (recid, ))
                    write_message(traceback.format_exc())
                    raise e
            else:
                raise BibCatalogPluginException("Plugin not valid in %s" %
                                                (ticket_name, ))

        if last_date:
            store_last_updated(recid, last_date, name="bibcatalog")
Example No. 36
    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)
Example No. 38
def task_run_core():
    """
    run daemon
    """

    if task_get_option("update-borrowers"):
        list_of_borrowers = db.get_all_borrowers()

        total_borrowers = len(list_of_borrowers)
        done  = 0

        for borrower in list_of_borrowers:
            user_id = borrower[0]
            update_user_info_from_ldap(user_id)
            done+=1
            task_update_progress("Done %d out of %d." % (done, total_borrowers))
            task_sleep_now_if_required(can_stop_too=True)

    if task_get_option("overdue-letters"):
        expired_loans = db.get_all_expired_loans()

        total_expired_loans = len(expired_loans)
        done  = 0

        for (borrower_id, _bor_name, recid, _barcode, _loaned_on,
             _due_date, _number_of_renewals, number_of_letters,
             date_letters, _notes, loan_id) in expired_loans:

            number_of_letters=int(number_of_letters)

            content = ''
            if number_of_letters == 0:
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL1'], loan_id)
            elif number_of_letters == 1 and must_send_second_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL2'], loan_id)
            elif number_of_letters == 2 and must_send_third_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)
            elif number_of_letters >= 3 and must_send_third_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)

            if content != '':
                title = book_title_from_MARC(recid)
                subject = "LOAN RECALL: " + title

                update_expired_loan(loan_id)
                send_overdue_letter(borrower_id, subject, content)

            done+=1

            task_update_progress("Done %d out of %d." % (done, total_expired_loans))

            task_sleep_now_if_required(can_stop_too=True)
            time.sleep(1)

    return 1
Example No. 39
def download_feed(feed_url, batch_size, delete_zip, new_sources,
                  directory, feed_location):
    """ Get list of entries from XML document """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
    xml_files = []
    entries = parse_feed(result_path)
    for fileUrl, fileName in entries:
        task_sleep_now_if_required()
        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))

        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
        else:
            fileUrl = fileUrl.replace(' ', '%20')
            try:
                write_message("Downloading %s to %s\n" % (fileUrl,
                                                          outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                _errors_detected.append(err)
                write_message("URL could not be opened: %s" % fileUrl)
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            try:
                xml_files.extend(extractAll(outFilename,
                                            delete_zip,
                                            directory))
            except BadZipfile as err:
                _errors_detected.append(err)
                write_message("Error BadZipfile %s" % (outFilename,))
                task_update_status("CERROR")
                remove(outFilename)
    return xml_files
Example No. 40
def compute_cache(pids):
    bibtask.write_message("WebAuthorProfile: %s persons to go" % len(pids),
                          stream=sys.stdout,
                          verbose=0)
    for i, p in enumerate(pids):
        bibtask.write_message("WebAuthorProfile: doing %s out of %s" %
                              (pids.index(p) + 1, len(pids)))
        bibtask.task_update_progress("WebAuthorProfile: doing %s out of %s" %
                                     (pids.index(p) + 1, len(pids)))
        _compute_cache_for_person(p)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
Example No. 41
def bst_synchronize_recids(search_terms=SEARCH_TERMS, log_dir=None,
                           collection=COLLECTION, batch_size=BATCH_SIZE,
                           debug=False, remote_ids=None):
    """Synchronize record IDs between the CERN Document Server (CDS) and Inspire

This BibTasklet is intended to be a general purpose replacement for
'bst_inspire_cds_synchro' and 'bst_update_cds_inspire_id', it should
be executable on both CDS and Inspire.

Generally there should be no need to modify these parameters, the
script uses CFG_INSPIRE_SITE and CFG_CERN_SITE from invenio.conf
to determine what type of Invenio instance we're running on. These
parameters will be set by default to the correct values to
synchronise all IDs, though you may want to limit records manually.

Parameters:
 search_terms - The term to use to get record IDs
                (Default "035:<LOCAL>)
 log_dir - The directory to store the log file in
           (Defaults to CFG_TMPSHAREDDIR)
 collection - What collection to take from
              (Default is no collection)
 batch_size - How many records to try and amend at once
              (Default 200)
 debug - If True, this script will run against the TEST instances
         (Default false)
 remote_ids - Comma separated values of remote IDs, if this is
              specified, remote IDs will not be searched for.
    """
    configure_globals(search_terms, log_dir, debug)
    _print("All messages will be logged to %s/%s" % (LOG_DIR, LOG_FILE))

    if not remote_ids:
        task_update_progress("Finding remote records on %s with %s IDs"
                             % (REMOTE_INSTANCE, LOCAL_INSTANCE))
        remote_ids = get_remote_ids(search_terms, collection)
    else:
        remote_ids = [int(rid) for rid in remote_ids.split(',')]

    task_sleep_now_if_required(can_stop_too=True)
    task_update_progress("Matching remote IDs to local records")
    missing_ids = match_remote_ids(remote_ids)

    count_appends, count_problems = match_missing_ids(missing_ids, batch_size)

    _print("======================== FINAL SCORE ========================", 1)
    _print(" Records matched: %d" % (len(remote_ids)-len(missing_ids)), 1)
    _print(" Records appended: %d" % count_appends, 1)
    _print(" IDs not matched (broken link!): %d" % count_problems, 1)
    _print("=============================================================", 1)

    _print("Finishing, messages logged to: %s/%s" % (LOG_DIR, LOG_FILE))

    return True
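As a usage sketch (not taken from the original source), the tasklet above can
also be called directly with keyword arguments matching its docstring; the
collection name and remote IDs below are purely illustrative assumptions.

# Illustrative invocation only; "HEP" and the ID list are made-up values.
bst_synchronize_recids(
    collection="HEP",
    batch_size=100,
    debug=True,              # run against the TEST instances
    remote_ids="1234,5678",  # skip the remote search and use these IDs
)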
Example #42
def bst_dump_records():
    try:
        os.makedirs(os.path.join(CFG_WEBDIR, 'dumps'))
    except OSError:
        pass
    html_index = open(os.path.join(CFG_WEBDIR, 'dumps', '.inspire-dump.html'),
                      "w")
    print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
    for collection in CFG_EXPORTED_COLLECTIONS:
        task_update_progress(collection)
        print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
            'prefix': CFG_SITE_URL,
            'collection': collection,
            'date': time.ctime()
        }
        write_message("Preparing %s-records.xml.gz" % collection)
        output_path = os.path.join(CFG_WEBDIR, 'dumps',
                                   '.%s-records.xml.gz' % collection)
        output = gzip.open(output_path, "w")
        print >> output, "<collection>"
        reclist = get_collection_reclist(collection)
        tot = len(reclist)
        time_estimator = get_time_estimator(tot)
        for i, recid in enumerate(reclist):
            with run_ro_on_slave_db():
                print >> output, format_record(recid, 'xme', user_info={})[0]
            time_estimation = time_estimator()[1]
            if (i + 1) % 100 == 0:
                task_update_progress(
                    "%s %s (%s%%) -> %s" %
                    (collection, recid, (i + 1) * 100 / tot,
                     time.strftime("%Y-%m-%d %H:%M:%S",
                                   time.localtime(time_estimation))))
                task_sleep_now_if_required()
        print >> output, "</collection>"
        output.close()
        write_message("Computing checksum")
        print >> open(output_path + '.md5', "w"), calculate_md5(output_path)
        os.rename(
            output_path,
            os.path.join(CFG_WEBDIR, 'dumps',
                         '%s-records.xml.gz' % collection))
        os.rename(
            output_path + '.md5',
            os.path.join(CFG_WEBDIR, 'dumps',
                         '%s-records.xml.gz.md5' % collection))
        write_message("DONE")
    print >> html_index, "</ul></body></html>"
    html_index.close()
    os.rename(os.path.join(CFG_WEBDIR, 'dumps', '.inspire-dump.html'),
              os.path.join(CFG_WEBDIR, 'dumps', 'inspire-dump.html'))
Example #43
def process_records(name, records, func, extra_vars):
    count = 1
    total = len(records)
    for recid, date in records:
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count, total)
        task_update_progress(msg)
        write_message(msg)
        func(recid, **extra_vars)
        if date:
            store_last_updated(recid, date, name)
        count += 1
Example #44
def task_run_core():
    """ Walks through all directories where metadata files are located
        and uploads them.
        Files are then moved to the corresponding DONE folders.
    """
    daemon_dir = (
        CFG_BATCHUPLOADER_DAEMON_DIR[0] == "/"
        and CFG_BATCHUPLOADER_DAEMON_DIR
        or CFG_PREFIX + "/" + CFG_BATCHUPLOADER_DAEMON_DIR
    )
    # Check if directory /batchupload exists
    if not task_get_option("documents"):
        # Metadata upload
        parent_dir = daemon_dir + "/metadata/"
        progress = 0
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        list_of_folders = ["insert", "append", "correct", "replace", "holdingpen"]
        for folder in list_of_folders:
            files_dir = os.path.join(parent_dir, folder)
            files_done_dir = os.path.join(files_dir, "DONE")
            try:
                files = os.listdir(files_dir)
            except OSError, e:
                os.mkdir(files_dir)
                files = []
                write_message(e, sys.stderr)
                write_message("Created new folder %s" % (files_dir,))
            # Create directory DONE/ if doesn't exist
            try:
                os.mkdir(files_done_dir)
            except OSError:
                # Directory exists
                pass
            for metafile in files:
                if os.path.isfile(os.path.join(files_dir, metafile)):
                    # Create temporary file to be uploaded
                    (fd, filename) = tempfile.mkstemp(
                        prefix=metafile + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
                        dir=CFG_TMPSHAREDDIR,
                    )
                    shutil.copy(os.path.join(files_dir, metafile), filename)
                    # Send bibsched task
                    mode = "--" + folder
                    jobid = str(task_low_level_submission("bibupload", "batchupload", mode, filename))
                    # Move file to done folder
                    filename = metafile + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_" + jobid
                    os.rename(os.path.join(files_dir, metafile), os.path.join(files_done_dir, filename))
                    task_sleep_now_if_required(can_stop_too=True)
            progress += 1
            task_update_progress("Done %d out of %d." % (progress, len(list_of_folders)))
Example #45
def compute_cache_mp(pids):
    from multiprocessing import Pool
    p = Pool()
    bibtask.write_message("WebAuthorProfileMP: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    sl = 100
    ss = [pids[i: i + sl] for i in range(0, len(pids), sl)]
    for i, bunch in enumerate(ss):
        bibtask.write_message("WebAuthorProfileMP: doing bunch %s out of %s" % (str(i + 1), len(ss)))
        bibtask.task_update_progress("WebAuthorProfileMP: doing bunch %s out of %s" % (str(i + 1), len(ss)))
        p.map(_compute_cache_for_person, bunch)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
Example #47
def bst_inspire_cds_synchro():
    task_update_progress("Phase 1: extracting IDs for %s" % CFG_OTHER_SITE)
    export_file = open(CFG_EXPORT_FILE + '.part', "w")
    for i, row in enumerate(iter_export_rows()):
        print >> export_file, row
        if i % 100 == 0:
            task_sleep_now_if_required(can_stop_too=True)
    export_file.close()
    shutil.move(CFG_EXPORT_FILE + '.part', CFG_EXPORT_FILE)
    task_sleep_now_if_required(can_stop_too=True)
    if os.path.exists(CFG_IMPORT_FILE):
        task_update_progress("Phase 2: importing IDs from %s" % CFG_OTHER_SITE)
        import_recid_list(open(CFG_IMPORT_FILE))
Example #48
def redis_sync(modified_records, time_estimator, tot):
    """Sync to redis."""
    r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    for i, recid in enumerate(modified_records):
        record = format_record(recid, 'xme', user_info=ADMIN_USER_INFO)[0]
        if not record:
            write_message("Error formatting record {0} as 'xme': {1}".format(
                recid, record
            ))
        else:
            r.rpush('legacy_records', zlib.compress(record))
        if shall_sleep(recid, i, tot, time_estimator):
            task_sleep_now_if_required()
Example #49
def bst_inspire_cds_synchro():
    task_update_progress("Phase 1: extracting IDs for %s" % CFG_OTHER_SITE)
    export_file = open(CFG_EXPORT_FILE + ".part", "w")
    for i, row in enumerate(iter_export_rows()):
        print >> export_file, row
        if i % 100 == 0:
            task_sleep_now_if_required(can_stop_too=True)
    export_file.close()
    shutil.move(CFG_EXPORT_FILE + ".part", CFG_EXPORT_FILE)
    task_sleep_now_if_required(can_stop_too=True)
    if os.path.exists(CFG_IMPORT_FILE):
        task_update_progress("Phase 2: importing IDs from %s" % CFG_OTHER_SITE)
        import_recid_list(open(CFG_IMPORT_FILE))
Example #50
def bst_fix_ffts(debug=0):
    debug = bool(int(debug))
    ffts = {}
    for recid in get_broken_recids():
        task_sleep_now_if_required(can_stop_too=True)
        write_message("Fixing %s" % recid)
        try:
            ffts[recid] = build_fft(get_last_pdf_for_record(BibRecDocs(recid)))
        except:
            register_exception(alert_admin=True)
    write_message("Uploading corrections")
    bibupload_ffts(ffts, append=True, do_debug=debug, interactive=False)
    return True
Example #52
def redis_sync(modified_records, time_estimator, tot):
    """Sync to redis."""
    r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    for i, recid in enumerate(modified_records):
        with run_ro_on_slave_db():
            record = format_record(recid, 'xme', user_info=ADMIN_USER_INFO)[0]
        if not record:
            write_message("Error formatting record {0} as 'xme': {1}".format(
                recid, record))
        else:
            r.rpush('legacy_records', zlib.compress(record))
        if shall_sleep(recid, i, tot, time_estimator):
            task_sleep_now_if_required()
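On the consuming side, a reader pops entries from the same Redis list and
decompresses them. A minimal sketch follows, assuming the consumer connects
to the same Redis URL; the function name is illustrative.

import zlib

import redis


def read_legacy_records(redis_url):
    """Yield decompressed records pushed by redis_sync(); a sketch only."""
    r = redis.StrictRedis.from_url(redis_url)
    while True:
        compressed = r.lpop('legacy_records')
        if compressed is None:
            break
        yield zlib.decompress(compressed)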
Example #53
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    # Dictionary of "plugin_name" -> func
    tickets_to_apply = task_get_option('tickets')
    write_message("Ticket plugins found: %s" %
                  (str(tickets_to_apply),), verbose=9)

    task_update_progress("Loading records")
    records_concerned = get_recids_to_load()
    write_message("%i record(s) found" %
                  (len(records_concerned),))

    records_processed = 0
    for record, last_date in load_records_from_id(records_concerned):
        records_processed += 1
        recid = record_id_from_record(record)
        task_update_progress("Processing records %s/%s (%i%%)"
                             % (records_processed, len(records_concerned),
                                int(float(records_processed) / len(records_concerned) * 100)))
        task_sleep_now_if_required(can_stop_too=True)
        for ticket_name, plugin in tickets_to_apply.items():
            if plugin:
                write_message("Running template %s for %s" % (ticket_name, recid),
                              verbose=5)
                try:
                    ticket = BibCatalogTicket(recid=int(recid))
                    if plugin['check_record'](ticket, record):
                        ticket = plugin['generate_ticket'](ticket, record)
                        write_message("Ticket to be generated: %s" % (ticket,), verbose=5)
                        res = ticket.submit()
                        if res:
                            write_message("Ticket #%s created for %s" %
                                         (ticket.ticketid, recid))
                        else:
                            write_message("Ticket already exists for %s" %
                                          (recid,))
                    else:
                        write_message("Skipping record %s", (recid,))
                except Exception, e:
                    write_message("Error submitting ticket for record %s:" % (recid,))
                    write_message(traceback.format_exc())
                    raise e
            else:
                raise BibCatalogPluginException("Plugin not valid in %s" % (ticket_name,))

        if last_date:
            store_last_updated(recid, last_date, name="bibcatalog")
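Each entry of tickets_to_apply above is expected to expose 'check_record' and
'generate_ticket' callables. A minimal sketch of such a plugin follows; the
subject/body attributes set on the ticket are illustrative assumptions, not
the actual BibCatalogTicket API.

def check_record(ticket, record):
    # Decide whether this record needs a ticket at all (always yes here).
    return True


def generate_ticket(ticket, record):
    # Populate the ticket that task_run_core() will later submit(); the
    # attribute names below are assumed for illustration only.
    ticket.subject = "Curation needed for record %s" % ticket.recid
    ticket.body = "Please review this record."
    return ticket


# The daemon would then receive something like:
# tickets_to_apply = {'my_plugin': {'check_record': check_record,
#                                   'generate_ticket': generate_ticket}}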
Example #54
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given"""
    write_message("Loading knowledgebase file", verbose=9)
    kb_data = {}
    records = []

    write_message("Reading knowledgebase file: %s" % \
                   config.get(config.get("rank_method", "function"), "kb_src"))
    input = open(config.get(config.get("rank_method", "function"), "kb_src"), 'r')
    data = input.readlines()
    for line in data:
        if not line[0:1] == "#":
            kb_data[string.strip((string.split(string.strip(line), "---"))[0])] = (string.split(string.strip(line), "---"))[1]
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(config.get("rank_method", "function"), "tag")
    tags = config.get(config.get("rank_method", "function"), "check_mandatory_tags").split(", ")
    if tags == ['']:
        tags = ""

    records = []
    for (recids, recide) in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (tag, recids, recide))
        valid = HitSet(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = HitSet()
            newset += [recid[0] for recid in (run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (key, recids, recide)))]
            valid.intersection_update(newset)
        if tags:
            recs = filter(lambda x: x[0] in valid, recs)
        records = records + list(recs)
        write_message("Number of records found with the necessary tags: %s" % len(records))

    records = filter(lambda x: x[0] in options["validset"], records)
    rnkset = {}
    for key, value in records:
        if kb_data.has_key(value):
            if not rnkset.has_key(key):
                rnkset[key] = float(kb_data[value])
            else:
                if float(kb_data[value]) > rnkset[key]:
                    rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0

    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset
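The knowledge-base file read above is plain text with one "key---value" pair
per line and '#' marking comment lines; an illustrative snippet (the journal
names and numbers are invented for the example) could be:

    # tag value ---> rank value (illustrative numbers only)
    Phys. Rev. D---2.5
    Phys. Lett. B---3.1
    Nucl. Phys. B---1.8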
Example #55
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given"""
    write_message("Loading knowledgebase file", verbose=9)
    kb_data = {}
    records = []

    write_message("Reading knowledgebase file: %s" %
                   config.get(config.get("rank_method", "function"), "kb_src"))
    with open(config.get(config.get("rank_method", "function"), "kb_src"), 'r') as f:
        for line in f:
            if not line[0:1] == "#":
                key, value = line.strip().split("---")
                kb_data[key.strip()] = value.strip()
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(config.get("rank_method", "function"), "tag")
    tags = config.get(config.get("rank_method", "function"), "check_mandatory_tags").split(",")
    if tags == ['']:
        tags = ""

    records = []
    for recids, recide in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (tag, recids, recide))
        valid = intbitset(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = intbitset(run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (key, recids, recide)))
            valid &= newset
        if tags:
            recs = [(recid, value) for recid, value in recs if recid in valid]
        records += list(recs)
        write_message("Number of records found with the necessary tags: %s" % len(records))

    records = [(recid, value) for recid, value in records if recid in options["validset"]]
    rnkset = {}
    for key, value in records:
        if value in kb_data:
            if key not in rnkset:
                rnkset[key] = float(kb_data[value])
            else:
                if float(kb_data[value]) > rnkset[key]:
                    rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0

    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset