Ejemplo n.º 1
0
def fetch_concerned_records(name):
    """Return ids of records that need processing for task *name*.

    Depending on the CLI options this selects:
    * --new: records created since the last run,
    * --modified: records modified since the last run,
    * otherwise: the explicitly requested record ids / collections.

    :param name: task name used to look up the last-run checkpoint
    :return: sequence of (recid, date) rows from the database
    """
    task_update_progress("Fetching record ids")

    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = "SELECT `id`, `creation_date` FROM `bibrec` " \
            "WHERE `creation_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `creation_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
            "WHERE `modification_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `modification_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        # BUGFIX: the 'recids' option is None when only --collections was
        # given (parse_option creates the set lazily), which used to crash.
        given_recids = task_get_option('recids') or set()
        for collection in task_get_option('collections') or ():
            # BUGFIX: update(), not add() -- add() stored the whole reclist
            # object as a single (unusable) set element instead of merging
            # its record ids into the set.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` "
                "WHERE `id` IN (%s) ORDER BY `id`" % format_strings,
                    list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
Ejemplo n.º 2
0
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # Step 1: gather the dump parameters.
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    dump_dir = task_get_option('output', CFG_LOGDIR)
    dumps_to_keep = task_get_option('number', 5)
    prefix = CFG_DATABASE_NAME + '-dbdump-'
    started = task_get_task_param('task_starting_time')
    dump_name = prefix + started.replace(' ', '_') + '.sql.gz'
    write_message("Reading parameters ended")
    # Step 2: perform the actual database dump.
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(dump_dir, dump_name)
    write_message("Database dump ended")
    # Step 3: retain only the most recent dump files.
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(dump_dir, prefix, dumps_to_keep)
    write_message("Pruning old dump files ended")
    # All done.
    task_update_progress("Done.")
    return True
Ejemplo n.º 3
0
def task_run_core():
    """
    When this function is called, the tool has entered BibSched mode, which means
    that we're going to cache events according to the parameters.
    """
    write_message("Initiating rawdata caching")
    task_update_progress("Initating rawdata caching")

    # Cache key events
    keyevents = task_get_option("keyevents")
    if keyevents:
        for idx, event in enumerate(keyevents):
            write_message("Caching key event 1: %s" % event)
            # NOTE(review): the full keyevents list is passed on every
            # iteration, so all key events are (re)cached each pass --
            # confirm whether per-event caching was intended.
            webstat.cache_keyevent_trend(keyevents)
            task_update_progress("Part 1/2: done %d/%d"
                                 % (idx + 1, len(keyevents)))

    # Cache custom events
    customevents = task_get_option("customevents")
    if len(customevents) > 0:
        for idx, event in enumerate(customevents):
            write_message("Caching custom event 1: %s" % event)
            webstat.cache_customevent_trend(customevents)
            task_update_progress("Part 2/2: done %d/%d"
                                 % (idx + 1, len(customevents)))

    write_message("Finished rawdata caching succesfully")
    task_update_progress("Finished rawdata caching succesfully")

    return True
Ejemplo n.º 4
0
def clean_bibxxx():
    """
    Clean unreferenced bibliographic values from bibXXx tables.
    This is useful to prettify browse results, as it removes
    old, no longer used values.

    WARNING: this function must be run only when no bibupload is
    running and/or sleeping.
    """
    write_message("CLEANING OF UNREFERENCED bibXXx VALUES STARTED")
    for table_index in range(0, 100):
        # Table-name pair for this bibXXx family, e.g. bib03x / bibrec_bib03x.
        names = {'bibxxx': 'bib%02dx' % table_index,
                 'bibrec_bibxxx': 'bibrec_bib%02dx' % table_index}
        verbose = task_get_option('verbose') >= 9
        if verbose:
            # Count first so we can report how many rows the DELETE removes.
            num_unref_values = run_sql("""SELECT COUNT(*) FROM %(bibxxx)s
                     LEFT JOIN %(bibrec_bibxxx)s
                            ON %(bibxxx)s.id=%(bibrec_bibxxx)s.id_bibxxx
                     WHERE %(bibrec_bibxxx)s.id_bibrec IS NULL""" % names)[0][0]
        run_sql("""DELETE %(bibxxx)s FROM %(bibxxx)s
                     LEFT JOIN %(bibrec_bibxxx)s
                            ON %(bibxxx)s.id=%(bibrec_bibxxx)s.id_bibxxx
                     WHERE %(bibrec_bibxxx)s.id_bibrec IS NULL""" % names)
        if verbose:
            write_message(" - %d unreferenced %s values cleaned"
                          % (num_unref_values, names['bibxxx']))
    write_message("CLEANING OF UNREFERENCED bibXXx VALUES FINISHED")
Ejemplo n.º 5
0
def task_run_core():
    """Run the indexing task. The row argument is the BibSched task
    queue row, containing if, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    # Default to every registered rank method when none was requested.
    if not task_get_option("run"):
        task_set_option("run",
                        [row[0] for row in
                         run_sql("SELECT name from rnkMETHOD")])

    for method_name in task_get_option("run"):
        task_sleep_now_if_required(can_stop_too=True)
        write_message("")
        filename = CFG_ETCDIR + "/bibrank/" + method_name + ".cfg"
        write_message("Getting configuration from file: %s" % filename,
                      verbose=9)
        config = ConfigParser.ConfigParser()
        try:
            config.readfp(open(filename))
        except StandardError:
            write_message("Cannot find configuration file: %s. "
                          "The rankmethod may also not be registered using "
                          "the BibRank Admin Interface." % filename,
                          sys.stderr)
            raise

        # The config names the python function implementing this rank
        # method; look it up among the module globals and invoke it.
        cfg_function = config.get("rank_method", "function")
        func_object = globals().get(cfg_function)
        if not func_object:
            write_message("Cannot run method '%s', no function to call"
                          % method_name)
        else:
            func_object(method_name)

    return True
Ejemplo n.º 6
0
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    # Where to put the dump, and how many old dumps to retain.
    out_dir = task_get_option('output', CFG_LOGDIR)
    keep_count = task_get_option('number', 5)
    file_prefix = CFG_DATABASE_NAME + '-dbdump-'
    # Timestamp the file after the task's own start time.
    timestamp = task_get_task_param('task_starting_time').replace(' ', '_')
    out_file = file_prefix + timestamp + '.sql'
    write_message("Reading parameters ended")
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(out_dir, out_file)
    write_message("Database dump ended")
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(out_dir, file_prefix, keep_count)
    write_message("Pruning old dump files ended")
    task_update_progress("Done.")
    return True
Ejemplo n.º 7
0
def fetch_concerned_records(name):
    """Return the ids of records whose fulltext needs (re)processing.

    With --new, selects records whose PDF files were attached since the
    last run of task *name*; otherwise uses the explicitly requested
    record ids and collections.

    :param name: task name used to look up the last-run date
    :return: sequence of (recid, date) rows from the database
    """
    task_update_progress("Fetching record ids")

    dummy, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = """SELECT `id_bibrec`, `cd` FROM `bibdocfsinfo`
                 INNER JOIN `bibrec_bibdoc`
                 ON `bibdocfsinfo`.`id_bibdoc` = `bibrec_bibdoc`.`id_bibdoc`
                 WHERE `cd` > %s
                 AND format IN ('.pdf', '.PDF', '.pdf;pdfa', '.PDF;pdfa')
                 ORDER BY `cd`"""
        records = run_sql(sql, [last_date.isoformat()])
    else:
        # BUGFIX: the 'recids' option is None when only --collections was
        # given (the option set is created lazily), which used to crash.
        given_recids = task_get_option('recids') or set()
        for collection in task_get_option('collections') or ():
            # BUGFIX: update(), not add() -- add() stored the whole reclist
            # object as one set element instead of merging its record ids.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql(
                """SELECT `id`, NULL FROM `bibrec`
                                 WHERE `id` IN (%s)
                                 ORDER BY `id`""" % format_strings,
                list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
Ejemplo n.º 8
0
def parse_option(key, value, dummy, args):
    """Parse command line options"""

    if args:
        # There should be no standalone arguments for any refextract job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    def _option_set(option_name):
        # Lazily create (and register) the set stored under option_name.
        ids = task_get_option(option_name)
        if not ids:
            ids = set()
            task_set_option(option_name, ids)
        return ids

    if key in ('-a', '--new'):
        task_set_option('new', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
    elif key == '--rebuild':
        task_set_option('rebuild', True)
    elif key in ('-c', '--collections'):
        _option_set('collections').update(split_cli_ids_arg(value))
    elif key in ('-r', '--recids'):
        _option_set('recids').update(split_cli_ids_arg(value))

    return True
Ejemplo n.º 9
0
def clean_bibxxx():
    """
    Clean unreferenced bibliographic values from bibXXx tables.
    This is useful to prettify browse results, as it removes
    old, no longer used values.

    WARNING: this function must be run only when no bibupload is
    running and/or sleeping.
    """
    write_message("CLEANING OF UNREFERENCED bibXXx VALUES STARTED")
    chatty = task_get_option('verbose') >= 9
    for idx in range(0, 100):
        # Value table and its record-link table, e.g. bib03x / bibrec_bib03x.
        tables = {'bibxxx': 'bib%02dx' % idx,
                  'bibrec_bibxxx': 'bibrec_bib%02dx' % idx}
        if chatty:
            # Count first so the log can report how many rows get deleted.
            num_unref_values = run_sql("""SELECT COUNT(*) FROM %(bibxxx)s
                     LEFT JOIN %(bibrec_bibxxx)s
                            ON %(bibxxx)s.id=%(bibrec_bibxxx)s.id_bibxxx
                     WHERE %(bibrec_bibxxx)s.id_bibrec IS NULL""" % tables)[0][0]
        run_sql("""DELETE %(bibxxx)s FROM %(bibxxx)s
                     LEFT JOIN %(bibrec_bibxxx)s
                            ON %(bibxxx)s.id=%(bibrec_bibxxx)s.id_bibxxx
                     WHERE %(bibrec_bibxxx)s.id_bibrec IS NULL""" % tables)
        if chatty:
            write_message(" - %d unreferenced %s values cleaned"
                          % (num_unref_values, tables['bibxxx']))
    write_message("CLEANING OF UNREFERENCED bibXXx VALUES FINISHED")
Ejemplo n.º 10
0
def task_run_core():
    """Run the indexing task. The row argument is the BibSched task
    queue row, containing if, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    if not task_get_option("run"):
        # No explicit method list: run every registered rank method.
        all_methods = [row[0]
                       for row in run_sql("SELECT name from rnkMETHOD")]
        task_set_option("run", all_methods)

    for key in task_get_option("run"):
        task_sleep_now_if_required(can_stop_too=True)
        write_message("")
        filename = CFG_ETCDIR + "/bibrank/" + key + ".cfg"
        write_message("Getting configuration from file: %s" % filename,
                      verbose=9)
        config = ConfigParser.ConfigParser()
        try:
            config.readfp(open(filename))
        except StandardError:
            write_message(
                "Cannot find configuration file: %s. "
                "The rankmethod may also not be registered using "
                "the BibRank Admin Interface." % filename, sys.stderr)
            raise

        # Resolve the implementing function named in the config section
        # and dispatch to it.
        cfg_function = config.get("rank_method", "function")
        func_object = globals().get(cfg_function)
        if not func_object:
            write_message("Cannot run method '%s', no function to call" % key)
        else:
            func_object(key)

    return True
Ejemplo n.º 11
0
def upload_amendments(records, holdingpen):
    """Submit amended records to bibupload as a single MARCXML file.

    :param records: amended record structures to upload (no-op if empty)
    :param holdingpen: if true, insert into the holding pen for manual
        review (-o); otherwise replace the records directly (-r)
    """
    # Nothing to do when uploading is disabled or there are no amendments.
    if task_get_option("no_upload", False) or len(records) == 0:
        return

    # Build the MARCXML collection; join() avoids quadratic string
    # concatenation when many records were amended.
    parts = ['<collection xmlns="http://www.loc.gov/MARC21/slim">']
    for record in records:
        parts.append(record_xml_output(record))
    parts.append("</collection>")
    xml = ''.join(parts)

    tmp_file_fd, tmp_file = mkstemp(suffix='.xml',
                                    prefix="bibcheckfile_%s" %
                                    time.strftime("%Y-%m-%d_%H:%M:%S"),
                                    dir=CFG_TMPSHAREDDIR)
    try:
        os.write(tmp_file_fd, xml)
    finally:
        # Close the fd even if the write fails, so it cannot leak.
        os.close(tmp_file_fd)
    # 0o644: world-readable so the bibupload daemon can pick the file up.
    os.chmod(tmp_file, 0o644)
    if holdingpen:
        flag = "-o"
    else:
        flag = "-r"
    if task_get_option("notimechange"):
        task = task_low_level_submission('bibupload', 'bibcheck', flag,
                                         tmp_file, "--notimechange")
    else:
        task = task_low_level_submission('bibupload', 'bibcheck', flag,
                                         tmp_file)
    write_message("Submitted bibupload task %s" % task)
Ejemplo n.º 12
0
def parse_option(key, value, dummy, args):
    """Parse command line options"""

    if args:
        # There should be no standalone arguments for any refextract job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    # Boolean switches map straight onto option names.
    boolean_options = {'-a': 'new', '--new': 'new',
                       '-m': 'modified', '--modified': 'modified',
                       '--rebuild': 'rebuild'}

    if key in boolean_options:
        task_set_option(boolean_options[key], True)
    elif key in ('-c', '--collections'):
        collections = task_get_option('collections')
        if not collections:
            # First -c seen: create the backing set and register it.
            collections = set()
            task_set_option('collections', collections)
        collections.update(split_cli_ids_arg(value))
    elif key in ('-r', '--recids'):
        recids = task_get_option('recids')
        if not recids:
            # First -r seen: create the backing set and register it.
            recids = set()
            task_set_option('recids', recids)
        recids.update(split_cli_ids_arg(value))

    return True
Ejemplo n.º 13
0
def task_run_core():
    """
    When this function is called, the tool has entered BibSched mode, which means
    that we're going to cache events according to the parameters.
    """
    write_message("Initiating rawdata caching")
    task_update_progress("Initating rawdata caching")

    # Cache key events
    keyevents = task_get_option("keyevents")
    if keyevents and len(keyevents) > 0:
        total_key = len(keyevents)
        for done, keyevent in enumerate(keyevents, 1):
            write_message("Caching key event 1: %s" % keyevent)
            webstat.cache_keyevent_trend(keyevents)
            task_update_progress("Part 1/2: done %d/%d" % (done, total_key))

    # Cache custom events
    customevents = task_get_option("customevents")
    if len(customevents) > 0:
        total_custom = len(customevents)
        for done, customevent in enumerate(customevents, 1):
            write_message("Caching custom event 1: %s" % customevent)
            webstat.cache_customevent_trend(customevents)
            task_update_progress("Part 2/2: done %d/%d"
                                 % (done, total_custom))

    write_message("Finished rawdata caching succesfully")
    task_update_progress("Finished rawdata caching succesfully")

    return True
Ejemplo n.º 14
0
def task_run_core(recid, records, bibcatalog_system=None, _arxiv=False):
    setup_loggers(None, use_bibtask=True)

    # ArXiv harvesting always re-extracts; otherwise honour --no-overwrite.
    overwrite = True if _arxiv else not task_get_option('no-overwrite')

    try:
        record = extract_references_from_record(recid)
        msg = "Extracted references for %s" % recid
        if overwrite:
            write_message("%s (overwrite)" % msg)
            safe_to_extract = True
        else:
            write_message(msg)
            safe_to_extract = check_record_for_refextract(recid)
            if not safe_to_extract:
                write_message('Record not safe for re-extraction, skipping')

        if safe_to_extract:
            records.append(record)
            # Create a RT ticket if necessary
            if task_get_option('new') or task_get_option('create-ticket'):
                create_ticket(recid, bibcatalog_system)
    except FullTextNotAvailable:
        write_message("No full text available for %s" % recid)
Ejemplo n.º 15
0
def fetch_concerned_records(name):
    """Return ids of records that need processing for task *name*.

    Depending on the CLI options this selects:
    * --new: records created since the last run,
    * --modified: records modified since the last run,
    * otherwise: the explicitly requested record ids / collections.

    :param name: task name used to look up the last-run checkpoint
    :return: sequence of (recid, date) rows from the database
    """
    task_update_progress("Fetching record ids")

    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = "SELECT `id`, `creation_date` FROM `bibrec` " \
            "WHERE `creation_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `creation_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
            "WHERE `modification_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `modification_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        # BUGFIX: the 'recids' option is None when only --collections was
        # given (parse_option creates the set lazily), which used to crash.
        given_recids = task_get_option('recids') or set()
        for collection in task_get_option('collections') or ():
            # BUGFIX: update(), not add() -- add() stored the whole reclist
            # object as a single (unusable) set element instead of merging
            # its record ids into the set.
            given_recids.update(get_collection_reclist(collection))

        if given_recids:
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` "
                "WHERE `id` IN (%s) ORDER BY `id`" % format_strings,
                    list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
Ejemplo n.º 16
0
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """Given the string key it checks it's meaning, eventually using the
    value. Usually it fills some key in the options dict.
    It must return True if it has elaborated the key, False, if it doesn't
    know that key.

    :param key: CLI option name as received from getopt (e.g. "--recid")
    :param value: the option's raw string argument
    :param opts: full option list (unused here)
    :param args: positional arguments (unused here)
    :return: True if the key was handled, False otherwise
    """
    # Recid option
    if key in ("-i", "--recid"):
        try:
            value = int(value)
        except ValueError:
            bibtask.write_message("The value specified for --recid must be a "
                                  "valid integer, not '%s'." % value,
                                  stream=sys.stderr,
                                  verbose=0)
            # BUGFIX: stop here -- previously execution fell through and
            # passed the unparsed string to _recid_exists(). Returning
            # False matches the other invalid-value branches below.
            return False
        if not _recid_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid record ID." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        recids = bibtask.task_get_option('recids')
        if recids is None:
            recids = []
        recids.append(value)
        bibtask.task_set_option('recids', recids)

    # Collection option
    elif key in ("-c", "--collection"):
        if not _collection_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid collection." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        collections = bibtask.task_get_option("collections")
        collections = collections or []
        collections.append(value)
        bibtask.task_set_option("collections", collections)

    # Taxonomy option
    elif key in ("-k", "--taxonomy"):
        if not _ontology_exists(value):
            bibtask.write_message("ERROR: '%s' is not a valid taxonomy name." %
                                  value,
                                  stream=sys.stderr,
                                  verbose=0)
            return False
        bibtask.task_set_option("taxonomy", value)
    elif key in ("-f", "--force"):
        bibtask.task_set_option("force", True)
    else:
        return False

    return True
Ejemplo n.º 17
0
def task_run_core(recid, records, bibcatalog_system=None, _arxiv=False):
    setup_loggers(None, use_bibtask=True)

    if _arxiv:
        # Records harvested from arXiv are always re-extracted.
        overwrite = True
    else:
        overwrite = not task_get_option('no-overwrite')

    try:
        record = extract_references_from_record(recid)
        msg = "Extracted references for %s" % recid
        safe_to_extract = True
        if not overwrite:
            write_message(msg)
            # Without --overwrite we only touch records deemed safe.
            if not check_record_for_refextract(recid):
                write_message('Record not safe for re-extraction, skipping')
                safe_to_extract = False
        else:
            write_message("%s (overwrite)" % msg)

        if safe_to_extract:
            records.append(record)
            # Open an RT ticket when requested or for new records.
            if task_get_option('new') or task_get_option('create-ticket'):
                create_ticket(recid, bibcatalog_system)
    except FullTextNotAvailable:
        write_message("No full text available for %s" % recid)
Ejemplo n.º 18
0
def cb_parse_option(key, value, opts, args):
    """ Must be defined for bibtask to create a task

    :param key: CLI option name as received from getopt
    :param value: the option's raw string argument (may be empty)
    :param opts: full option list (unused)
    :param args: positional arguments; none are accepted
    :raises StandardError: on positional arguments or removed options
    :return: True (all recognised keys are elaborated here)
    """
    if args and len(args) > 0:
        # There should be no standalone arguments for any refextract job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ('-a', '--new'):
        task_set_option('new', True)
        task_set_option('no-overwrite', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
        task_set_option('no-overwrite', True)
    elif key == '--inspire':
        msg = """The --inspire option does not exist anymore.
Please set the config variable CFG_INSPIRE_SITE instead."""
        raise StandardError(msg)
    elif key in ('--kb-reports', ):
        task_set_option('kb-reports', value)
    elif key in ('--kb-journals', ):
        task_set_option('kb-journals', value)
    elif key in ('--kb-journals-re', ):
        task_set_option('kb-journals-re', value)
    elif key in ('--kb-authors', ):
        task_set_option('kb-authors', value)
    elif key in ('--kb-books', ):
        task_set_option('kb-books', value)
    elif key in ('--kb-conferences', ):
        task_set_option('kb-conferences', value)
    elif key in ('--create-ticket', ):
        task_set_option('create-ticket', True)
    elif key in ('--no-overwrite', ):
        task_set_option('no-overwrite', True)
    elif key == '--arxiv':
        # BUGFIX: was `key in ('--arxiv')` -- a plain string, not a tuple,
        # so this tested substring membership (e.g. '-a' in '--arxiv' is
        # True) instead of comparing against the option name.
        task_set_option('arxiv', True)
    elif key in ('-c', '--collections'):
        collections = task_get_option('collections')
        if not collections:
            collections = set()
            task_set_option('collections', collections)
        for v in value.split(","):
            collections.update(perform_request_search(c=v))
    elif key in ('-i', '--id'):
        recids = task_get_option('recids')
        if not recids:
            recids = set()
            task_set_option('recids', recids)
        recids.update(split_ids(value))
    elif key in ('-r', '--recids'):
        msg = """The --recids has been renamed.
please use --id for specifying recids."""
        raise StandardError(msg)
    elif key == '-f':
        msg = """refextract is now used to run in daemon mode only.
If you would like to run reference extraction on a standalone PDF file,
please use "docextract file.pdf\""""
        raise StandardError(msg)

    return True
Ejemplo n.º 19
0
def cb_parse_option(key, value, opts, args):
    """ Must be defined for bibtask to create a task

    :param key: CLI option name as received from getopt
    :param value: the option's raw string argument (may be empty)
    :param opts: full option list (unused)
    :param args: positional arguments; none are accepted
    :raises StandardError: on positional arguments or removed options
    :return: True (all recognised keys are elaborated here)
    """
    if args and len(args) > 0:
        # There should be no standalone arguments for any refextract job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ('-a', '--new'):
        task_set_option('new', True)
        task_set_option('no-overwrite', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
        task_set_option('no-overwrite', True)
    elif key == '--inspire':
        msg = """The --inspire option does not exist anymore.
Please set the config variable CFG_INSPIRE_SITE instead."""
        raise StandardError(msg)
    elif key in ('--kb-reports', ):
        task_set_option('kb-reports', value)
    elif key in ('--kb-journals', ):
        task_set_option('kb-journals', value)
    elif key in ('--kb-journals-re', ):
        task_set_option('kb-journals-re', value)
    elif key in ('--kb-authors', ):
        task_set_option('kb-authors', value)
    elif key in ('--kb-books', ):
        task_set_option('kb-books', value)
    elif key in ('--kb-conferences', ):
        task_set_option('kb-conferences', value)
    elif key in ('--create-ticket', ):
        task_set_option('create-ticket', True)
    elif key in ('--no-overwrite', ):
        task_set_option('no-overwrite', True)
    elif key == '--arxiv':
        # BUGFIX: was `key in ('--arxiv')` -- a plain string, not a tuple,
        # so this tested substring membership (e.g. '-a' in '--arxiv' is
        # True) instead of comparing against the option name.
        task_set_option('arxiv', True)
    elif key in ('-c', '--collections'):
        collections = task_get_option('collections')
        if not collections:
            collections = set()
            task_set_option('collections', collections)
        for v in value.split(","):
            collections.update(perform_request_search(c=v))
    elif key in ('-i', '--id'):
        recids = task_get_option('recids')
        if not recids:
            recids = set()
            task_set_option('recids', recids)
        recids.update(split_ids(value))
    elif key in ('-r', '--recids'):
        msg = """The --recids has been renamed.
please use --id for specifying recids."""
        raise StandardError(msg)
    elif key == '-f':
        msg = """refextract is now used to run in daemon mode only.
If you would like to run reference extraction on a standalone PDF file,
please use "docextract file.pdf\""""
        raise StandardError(msg)

    return True
Ejemplo n.º 20
0
def get_citation_weight(rank_method_code, config, chunk_size=25000):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    def _preview(recids):
        # Abbreviate very long recid lists for the log output.
        if len(recids) > 10000:
            return str(recids[:10]) + ' ... ' + str(recids[-10:])
        return str(recids)

    quick = task_get_option("quick") != "no"

    # id option forces re-indexing a certain range
    # even if there are no new recs
    if task_get_option("id"):
        # construct a range of records to index
        updated_recids = []
        for first, last in task_get_option("id"):
            updated_recids += range(first, last + 1)
        write_message('Records to process: %s' % _preview(updated_recids))
        index_update_time = None
    else:
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        write_message("%s records to update" % _preview(updated_recids))

    if updated_recids:
        begin_time = time.time()
        try:
            function = config.get("rank_method", "function")
            config.get(function, 'collections')
        except ConfigParser.NoOptionError:
            config.set(function, 'collections', None)
        # Process fully the updated records
        weights = process_and_store(updated_recids, config, chunk_size)
        write_message("Total time of get_citation_weight(): %.2f sec" %
                      (time.time() - begin_time))
        task_update_progress("citation analysis done")
    else:
        weights = None
        write_message("No new records added since last time this "
                      "rank method was executed")

    return weights, index_update_time
def citation_exec(rank_method_code, name, config):
    """Rank method for citation analysis.

    :param rank_method_code: code of the rank method to (re)compute
    :param name: rank method display name (unused here)
    :param config: ConfigParser instance for this rank method
    """
    # first check if this is a specific task
    if task_get_option("cmd") == "print-missing":
        num = task_get_option("num")
        print_missing(num)
    # BUGFIX: get_citation_weight() returns a (weights, update_time) tuple;
    # the old code stored the whole tuple in a variable that also shadowed
    # the builtin `dict`, and then passed the tuple itself to intoDB().
    weights, dummy_update_time = get_citation_weight(rank_method_code, config)
    date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if weights:
        intoDB(weights, date, rank_method_code)
    else:
        write_message("no need to update the indexes for citations")
def get_citation_weight(rank_method_code, config, chunk_size=25000):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    quick = task_get_option("quick") != "no"
    forced_ranges = task_get_option("id")

    if forced_ranges:
        # --id forces re-indexing of explicit ranges, new records or not.
        updated_recids = []
        for first, last in forced_ranges:
            updated_recids.extend(range(first, last + 1))
        if len(updated_recids) > 10000:
            str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:])
        else:
            str_updated_recids = str(updated_recids)
        write_message('Records to process: %s' % str_updated_recids)
        index_update_time = None
    else:
        # Otherwise pick every record modified since the last bibrank run
        # (or since forever, when --quick no was given).
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        # A bibindex timestamp in the future is treated as "never updated".
        if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        if len(updated_recids) > 10000:
            str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:])
        else:
            str_updated_recids = str(updated_recids)
        write_message("%s records to update" % str_updated_recids)

    if not updated_recids:
        weights = None
        write_message("No new records added since last time this "
                      "rank method was executed")
    else:
        begin_time = time.time()
        try:
            function = config.get("rank_method", "function")
            config.get(function, 'collections')
        except ConfigParser.NoOptionError:
            config.set(function, 'collections', None)
        # Process fully the updated records
        weights = process_and_store(updated_recids, config, chunk_size)
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" %
                      (end_time - begin_time))
        task_update_progress("citation analysis done")

    return weights, index_update_time
Ejemplo n.º 23
0
def task_run_core():
    """Run the bibcirculation daemon.

    Depending on the task options this refreshes borrower information
    from LDAP and/or sends overdue-loan recall letters.  Always returns
    1 (success) as expected by bibsched.
    """
    if task_get_option("update-borrowers"):
        # Refresh every borrower's personal data from LDAP.
        list_of_borrowers = db.get_all_borrowers()

        total_borrowers = len(list_of_borrowers)
        done = 0

        for borrower in list_of_borrowers:
            user_id = borrower[0]
            update_user_info_from_ldap(user_id)
            done += 1
            task_update_progress("Done %d out of %d." % (done, total_borrowers))
            task_sleep_now_if_required(can_stop_too=True)

    if task_get_option("overdue-letters"):
        # Send the appropriate recall letter for each expired loan,
        # escalating through the RECALL1/2/3 templates.
        expired_loans = db.get_all_expired_loans()

        total_expired_loans = len(expired_loans)
        done = 0

        for (borrower_id, _bor_name, recid, _barcode, _loaned_on,
             _due_date, _number_of_renewals, number_of_letters,
             date_letters, _notes, loan_id) in expired_loans:

            number_of_letters = int(number_of_letters)

            content = ''
            if number_of_letters == 0:
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL1'], loan_id)
            elif number_of_letters == 1 and must_send_second_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL2'], loan_id)
            elif number_of_letters >= 2 and must_send_third_recall(date_letters):
                # Third and all further recalls reuse the RECALL3 template
                # (original code had two identical branches for == 2 and >= 3).
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)

            if content != '':
                title = book_title_from_MARC(recid)
                subject = "LOAN RECALL: " + title

                update_expired_loan(loan_id)
                send_overdue_letter(borrower_id, subject, content)

            done += 1

            task_update_progress("Done %d out of %d." % (done, total_expired_loans))

            task_sleep_now_if_required(can_stop_too=True)
            # Throttle outgoing mail.
            time.sleep(1)

    return 1
Ejemplo n.º 24
0
def check_options():
    """Validate the task options before submission.

    Returns False (and prints an error to stderr) when none of the
    record-selection options was given, True otherwise.
    """
    selection_given = (task_get_option('new')
                       or task_get_option('recids')
                       or task_get_option('collections'))
    if not selection_given:
        print >>sys.stderr, 'Error: No input file specified, you need' \
            ' to specify which files to run on'
        return False
    return True
Ejemplo n.º 25
0
def check_options():
    """Check command line options"""
    # At least one way of selecting records must have been requested.
    if not any(task_get_option(name)
               for name in ('new', 'modified', 'recids',
                            'collections', 'rebuild')):
        print >>sys.stderr, 'Error: No input file specified, you need' \
            ' to specify which files to run on'
        return False

    return True
Ejemplo n.º 26
0
def check_options():
    """Check command line options"""
    # The task needs at least one record-selection option to act on.
    for option_name in ('new', 'modified', 'recids', 'collections', 'rebuild'):
        if task_get_option(option_name):
            return True

    print >>sys.stderr, 'Error: No input file specified, you need' \
        ' to specify which files to run on'
    return False
Ejemplo n.º 27
0
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """Parse one task-specific command-line parameter.

    Fills the task options dict via task_set_option().  Returns True
    when the key was recognised and handled, False otherwise or when
    the value supplied for a recognised key is invalid.
    """
    # Recid option
    if key in ("-i", "--recid"):
        try:
            value = int(value)
        except ValueError:
            write_message("The value specified for --recid must be a "
                "valid integer, not '%s'." % value, stream=sys.stderr,
                verbose=0)
            # BUGFIX: abort here instead of falling through and calling
            # _recid_exists() with the unconverted string value.
            return False
        if not _recid_exists(value):
            write_message("ERROR: '%s' is not a valid record ID." % value,
                stream=sys.stderr, verbose=0)
            return False
        recids = task_get_option('recids')
        if recids is None:
            recids = []
        recids.append(value)
        task_set_option('recids', recids)

    # Collection option
    elif key in ("-c", "--collection"):
        if not _collection_exists(value):
            write_message("ERROR: '%s' is not a valid collection." % value,
                stream=sys.stderr, verbose=0)
            return False
        collections = task_get_option("collections")
        collections = collections or []
        collections.append(value)
        task_set_option("collections", collections)

    # Taxonomy option
    elif key in ("-k", "--taxonomy"):
        if not _ontology_exists(value):
            write_message("ERROR: '%s' is not a valid taxonomy name." % value,
                stream=sys.stderr, verbose=0)
            return False
        task_set_option("taxonomy", value)
    elif key in ("-f", "--force"):
        task_set_option("force", True)
    else:
        return False

    return True
Ejemplo n.º 28
0
def citation_exec(rank_method_code, name, config):
    """Rank method for citation analysis.

    Either prints missing citations (cmd == "print-missing") or
    recomputes the citation weights and stores them in the database.
    """
    # Remember when we started so the run can be recorded in the DB.
    begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # First check if this is a specific (report-only) task.
    if task_get_option("cmd") == "print-missing":
        num = task_get_option("num")
        print_missing(num)
    else:
        # Renamed from 'dict' to avoid shadowing the builtin.
        citation_weights = get_citation_weight(rank_method_code, config)
        if citation_weights:
            intoDB(citation_weights, begin_date, rank_method_code)
        else:
            write_message("no need to update the indexes for citations")
Ejemplo n.º 29
0
def task_run_core(recid, records, bibcatalog_system=None):
    """Extract references for one record, logging known failure modes."""
    setup_loggers(None, use_bibtask=True)
    # A ticket is wanted either for brand-new records or on explicit request.
    wants_ticket = task_get_option('new') or task_get_option('create-ticket')
    try:
        extract_one(recid=recid,
                    records=records,
                    overwrite=task_get_option('overwrite'),
                    create_a_ticket=wants_ticket,
                    bibcatalog_system=bibcatalog_system)
    except FullTextNotAvailable:
        write_message("No full text available for %s" % recid)
    except NotSafeForExtraction:
        write_message('Record not safe for re-extraction, skipping')
Ejemplo n.º 30
0
def cb_parse_option(key, value, opts, args):
    """Must be defined for bibtask to create a task.

    Parses one refextract CLI option into the task options dict.
    Raises StandardError on unexpected standalone arguments.
    """
    if args and len(args) > 0:
        # There should be no standalone arguments for any refextract job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ('-a', '--new'):
        task_set_option('new', True)
        task_set_option('no-overwrite', True)
    elif key in ('-m', '--modified'):
        task_set_option('modified', True)
        task_set_option('no-overwrite', True)
    elif key in (
            '-i',
            '--inspire',
    ):
        task_set_option('inspire', True)
    elif key in ('--kb-reports', ):
        task_set_option('kb-reports', value)
    elif key in ('--kb-journals', ):
        task_set_option('kb-journals', value)
    elif key in ('--kb-journals-re', ):
        task_set_option('kb-journals-re', value)
    elif key in ('--kb-authors', ):
        task_set_option('kb-authors', value)
    elif key in ('--kb-books', ):
        task_set_option('kb-books', value)
    elif key in ('--kb-conferences', ):
        task_set_option('kb-conferences', value)
    elif key in ('--create-ticket', ):
        task_set_option('create-ticket', True)
    elif key in ('--no-overwrite', ):
        task_set_option('no-overwrite', True)
    # BUGFIX: ('--arxiv') is a plain string, so `key in` performed a
    # substring match; it must be a one-element tuple.
    elif key in ('--arxiv', ):
        task_set_option('arxiv', True)
    elif key in ('-c', '--collections'):
        collections = task_get_option('collections')
        if not collections:
            collections = set()
            task_set_option('collections', collections)
        for v in value.split(","):
            collections.update(perform_request_search(c=v))
    elif key in ('-r', '--recids'):
        recids = task_get_option('recids')
        if not recids:
            recids = set()
            task_set_option('recids', recids)
        recids.update(split_ids(value))

    return True
Ejemplo n.º 31
0
def _task_submit_check_options():
    """ Reimplement this method for having the possibility to check options
    before submitting the task, in order for example to provide default
    values. It must return False if there are errors in the options.
    """
    if not task_get_option('recids') and not task_get_option('collections'):
        # BUGFIX: removed the stray trailing comma that turned this
        # statement into a useless one-element tuple expression.
        write_message('Error: No input file specified', stream=sys.stdout, verbose=0)
        return False
    ## Output to a file in tmp, if the user has not specified an output file
    if not task_get_option('xmlfile', default=False):
        abs_path = _generate_default_xml_out()
        ## Set the output
        task_set_option('xmlfile', abs_path)
    return True
Ejemplo n.º 32
0
def task_submit_elaborate_specific_parameter(key, value, opts, dummy):
    """Elaborate a specific parameter of CLI bibrank."""
    if key in ("-a", "--add"):
        task_set_option("cmd", "add")
        if ("-x", "") in opts or ("--del", "") in opts:
            raise StandardError, "--add incompatible with --del"
    elif key in ("--run", "-w"):
        task_set_option("run", [])
        run = value.split(",")
        for run_key in range(0, len(run)):
            task_get_option('run').append(run[run_key])
    elif key in ("-r", "--repair"):
        task_set_option("cmd", "repair")
    elif key in ("-E", "--print-extcites"):
        try:
            task_set_option("print-extcites", int(value))
        except:
            task_set_option("print-extcites", 10)  # default fallback value
        task_set_option("cmd", "print-missing")
    elif key in ("-A", "--author-citations"):
        task_set_option("author-citations", "1")
    elif key in ("-d", "--del"):
        task_set_option("cmd", "del")
    elif key in ("-k", "--check"):
        task_set_option("cmd", "check")
    elif key in ("-S", "--stat"):
        task_set_option("cmd", "stat")
    elif key in ("-i", "--id"):
        task_set_option("id", task_get_option("id") + split_ranges(value))
        task_set_option("last_updated", "")
    elif key in ("-c", "--collection"):
        task_set_option("collection", value)
    elif key in ("-R", "--rebalance"):
        task_set_option("quick", "no")
    elif key in ("-f", "--flush"):
        task_set_option("flush", int(value))
    elif key in ("-M", "--maxmem"):
        task_set_option("maxmem", int(value))
        if task_get_option("maxmem") < base_process_size + 1000:
            raise StandardError, "Memory usage should be higher than %d kB" % \
                (base_process_size + 1000)
    elif key in ("-m", "--modified"):
        task_set_option("modified",
                        get_date_range(value))  #2002-10-27 13:57:26)
        task_set_option("last_updated", "")
    elif key in ("-l", "--lastupdate"):
        task_set_option("last_updated", "last_updated")
    else:
        return False
    return True
Ejemplo n.º 33
0
def task_run_core():
    """Run the requested bibsort action.

    Dispatches on the 'cmd' option (load/dump/print/sort/rebalance,
    defaulting to 'sort') and returns True on success, False otherwise.
    """
    write_message("bibsort starting..")

    cmd = task_get_option('cmd')
    methods = task_get_option('methods')
    recids = task_get_option('recids')
    write_message("Task parameters: command=%s ; methods=%s ; recids=%s" \
                  % (cmd, methods, recids), verbose=2)

    executed_correctly = False

    # if no command is defined, run sorting
    if not cmd:
        cmd = 'sort'

    if cmd == 'load':
        # BUGFIX: these messages used to contain long runs of embedded
        # whitespace because the literal spanned continuation lines;
        # implicit string concatenation produces a clean message.
        write_message('Starting loading the configuration '
                      'from the cfg file to the db.',
                      verbose=5)
        executed_correctly = load_configuration()
        if executed_correctly:
            write_message('Loading completed.', verbose=5)
    elif cmd == 'dump':
        write_message('Starting dumping the configuration '
                      'from the db into the cfg file.',
                      verbose=5)
        executed_correctly = dump_configuration()
        if executed_correctly:
            write_message('Dumping completed.', verbose=5)
    elif cmd == 'print':
        executed_correctly = print_sorting_methods()
    elif cmd == 'sort':
        write_message('Starting sorting.', verbose=5)
        executed_correctly = update_sorting(methods, recids)
        if executed_correctly:
            write_message('Sorting completed.', verbose=5)
    elif cmd == 'rebalance':
        write_message('Starting rebalancing the sorting buckets.', verbose=5)
        executed_correctly = rebalance(methods)
        if executed_correctly:
            write_message('Rebalancing completed.', verbose=5)
    else:
        write_message("This action is not possible. "
                      "See the --help for available actions.", sys.stderr)

    write_message('bibsort exiting..')
    return executed_correctly
Ejemplo n.º 34
0
def task_submit_elaborate_specific_parameter(key, value, opts, dummy):
    """Elaborate a specific parameter of CLI bibrank."""
    if key in ("-a", "--add"):
        task_set_option("cmd", "add")
        if ("-x","") in opts or ("--del","") in opts:
            raise StandardError, "--add incompatible with --del"
    elif key in ("--run", "-w"):
        task_set_option("run", [])
        run = value.split(",")
        for run_key in range(0, len(run)):
            task_get_option('run').append(run[run_key])
    elif key in ("-r", "--repair"):
        task_set_option("cmd", "repair")
    elif key in ("-E", "--print-extcites"):
        try:
            task_set_option("print-extcites", int(value))
        except:
            task_set_option("print-extcites", 10) # default fallback value
        task_set_option("cmd", "print-missing")
    elif key in ("-A", "--author-citations"):
        task_set_option("author-citations", "1")
    elif key in ("-d", "--del"):
        task_set_option("cmd", "del")
    elif key in ("-k", "--check"):
        task_set_option("cmd", "check")
    elif key in ("-S", "--stat"):
        task_set_option("cmd", "stat")
    elif key in ("-i", "--id"):
        task_set_option("id", task_get_option("id") + split_ranges(value))
        task_set_option("last_updated", "")
    elif key in ("-c", "--collection"):
        task_set_option("collection", value)
    elif key in ("-R", "--rebalance"):
        task_set_option("quick", "no")
    elif key in ("-f", "--flush"):
        task_set_option("flush", int(value))
    elif key in ("-M", "--maxmem"):
        task_set_option("maxmem", int(value))
        if task_get_option("maxmem") < base_process_size + 1000:
            raise StandardError, "Memory usage should be higher than %d kB" % \
                (base_process_size + 1000)
    elif key in ("-m", "--modified"):
        task_set_option("modified", get_date_range(value))#2002-10-27 13:57:26)
        task_set_option("last_updated", "")
    elif key in ("-l", "--lastupdate"):
        task_set_option("last_updated", "last_updated")
    else:
        return False
    return True
Ejemplo n.º 35
0
def check_options():
    """ Reimplement this method for having the possibility to check options
    before submitting the task, in order for example to provide default
    values. It must return False if there are errors in the options.
    """
    # At least one record-selection option must have been supplied.
    if not any(task_get_option(name)
               for name in ('new', 'modified', 'recids',
                            'collections', 'arxiv')):
        print >>sys.stderr, 'Error: No records specified, you need' \
            ' to specify which files to run on'
        return False

    return True
Ejemplo n.º 36
0
def update_rule_last_run(rule_name):
    """
    Set the last time a rule was run to now. This function should be called
    after a rule has been ran.
    """
    # Partial or dry-run invocations must not touch the bookkeeping table.
    dry_run = (task_has_option('record_ids')
               or task_get_option('no_upload', False)
               or task_get_option('no_tickets', False))
    if dry_run:
        return

    started_at = task_get_task_param('task_starting_time')
    updated = run_sql("UPDATE bibcheck_rules SET last_run=%s WHERE name=%s;",
                      (started_at, rule_name,))
    if not updated:
        # Rule not yet present in the database; insert it.
        run_sql("INSERT INTO bibcheck_rules(name, last_run) VALUES (%s, %s)",
                (rule_name, started_at))
Ejemplo n.º 37
0
def cb_parse_option(key, value, opts, args):
    """Must be defined for bibtask to create a task.

    Parses one refextract CLI option into the task options dict.
    Raises StandardError on unexpected standalone arguments.
    """
    if args and len(args) > 0:
        # There should be no standalone arguments for any refextract job
        # This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ("-a", "--new"):
        task_set_option("new", True)
        task_set_option("no-overwrite", True)
    elif key in ("-m", "--modified"):
        task_set_option("modified", True)
        task_set_option("no-overwrite", True)
    elif key in ("-i", "--inspire"):
        task_set_option("inspire", True)
    elif key in ("--kb-reports",):
        task_set_option("kb-reports", value)
    elif key in ("--kb-journals",):
        task_set_option("kb-journals", value)
    elif key in ("--kb-journals-re",):
        task_set_option("kb-journals-re", value)
    elif key in ("--kb-authors",):
        task_set_option("kb-authors", value)
    elif key in ("--kb-books",):
        task_set_option("kb-books", value)
    elif key in ("--kb-conferences",):
        task_set_option("kb-conferences", value)
    elif key in ("--create-ticket",):
        task_set_option("create-ticket", True)
    elif key in ("--no-overwrite",):
        task_set_option("no-overwrite", True)
    # BUGFIX: ("--arxiv") is a plain string, so `key in` performed a
    # substring match; it must be a one-element tuple.
    elif key in ("--arxiv",):
        task_set_option("arxiv", True)
    elif key in ("-c", "--collections"):
        collections = task_get_option("collections")
        if not collections:
            collections = set()
            task_set_option("collections", collections)
        for v in value.split(","):
            collections.update(perform_request_search(c=v))
    elif key in ("-r", "--recids"):
        recids = task_get_option("recids")
        if not recids:
            recids = set()
            task_set_option("recids", recids)
        recids.update(split_ids(value))

    return True
Ejemplo n.º 38
0
def task_submit_check_options():
    """Default to cleaning sessions when no specific target was chosen."""
    targets = ('logs', 'tempfiles', 'guests', 'bibxxx', 'documents',
               'cache', 'tasks', 'check-tables', 'optimise-tables')
    if not any(task_get_option(target) for target in targets):
        task_set_option('sessions', True)
    return True
Ejemplo n.º 39
0
def task_submit_check_options():
    """Fall back to cleaning sessions when no explicit target was requested."""
    any_target_chosen = False
    for target in ('logs', 'tempfiles', 'guests', 'bibxxx', 'documents',
                   'cache', 'tasks', 'check-tables', 'optimise-tables'):
        if task_get_option(target):
            any_target_chosen = True
            break
    if not any_target_chosen:
        task_set_option('sessions', True)
    return True
def task_submit_elaborate_specific_parameter(key, value, opts, args):
    """ Given the string key it checks it's meaning, eventually using the value.
    Usually it fills some key in the options dict.
    It must return True if it has elaborated the key, False, if it doesn't
    know that key.
    eg:
    if key in ['-n', '--number']:
        self.options['number'] = value
        return True
    return False
    """
    if key in ("-c", "--collection"):
        task_set_option("collection", value)
    elif key in ("-r", "--recursive"):
        task_set_option("recursive", 1)
    elif key in ("-f", "--force"):
        task_set_option("force", 1)
    elif key in ("-p", "--part"):
        task_set_option("part", int(value))
    elif key in ("-l", "--language"):
        languages = task_get_option("language", [])
        languages += value.split(',')
        for ln in languages:
            if ln not in CFG_SITE_LANGS:
                print 'ERROR: "%s" is not a recognized language code' % ln
                return False
        task_set_option("language", languages)
    else:
        return False
    return True
Ejemplo n.º 41
0
def solr_commit_if_necessary(next_commit_counter,
                             final_commit=False,
                             recid=None):
    """Commit pending Solr changes when the flush counter is exhausted
    (or on a final commit with work outstanding).

    Returns the updated counter: 0 after a commit, otherwise +1.
    """
    counter_full = next_commit_counter == task_get_option("flush") - 1
    must_commit = counter_full or (final_commit and next_commit_counter > 0)
    if not must_commit:
        return next_commit_counter + 1

    if recid:
        recid_info = ' for recid=%s' % recid
    else:
        recid_info = ''
    status_msg = 'Solr ranking indexer COMMITTING' + recid_info
    write_message(status_msg)
    task_update_progress(status_msg)

    try:
        # Commits might cause an exception, most likely a timeout while
        # hitting a background merge.  Changes will then be committed
        # later by the calling (periodical) task; autocommits can also
        # be configured in solrconfig.
        SOLR_CONNECTION.commit()
    except:
        register_exception(alert_admin=True)

    task_sleep_now_if_required(can_stop_too=True)
    return 0
Ejemplo n.º 42
0
def task_submit_elaborate_specific_parameter(key, value,
        dummy_opts, dummy_args):
    """Handle one tasklet-specific CLI parameter.

    Returns True when the key was recognised and handled, False
    otherwise (including a malformed --argument value).
    """
    if key in ('-T', '--tasklet'):
        task_set_option('tasklet', value)
        return True
    elif key in ('-a', '--argument'):
        arguments = task_get_option('arguments', {})
        try:
            key, value = value.split('=', 1)
        # BUGFIX: a failed 2-tuple unpack raises ValueError, not
        # NameError, so the error path was unreachable.
        except ValueError:
            print >> sys.stderr, 'ERROR: an argument must be in the form ' \
                'param=value, not "%s"' % value
            return False
        arguments[key] = value
        task_set_option('arguments', arguments)
        return True
    elif key in ('-l', '--list-tasklets'):
        cli_list_tasklets()
        return True
    return False
Ejemplo n.º 43
0
def task_run_core():
    """
    Run the specific tasklet.
    """
    tasklet = task_get_option('tasklet')
    arguments = task_get_option('arguments', {})

    write_message('Starting tasklet "%s" (with arguments %s)'
                  % (tasklet, arguments))
    task_update_progress('%s started' % tasklet)

    # Look up the tasklet callable and invoke it with the CLI arguments.
    result = _TASKLETS[tasklet](**arguments)

    task_update_progress('%s finished' % tasklet)
    write_message('Finished tasklet "%s" (with arguments %s)'
                  % (tasklet, arguments))

    # A tasklet returning None counts as success.
    return True if result is None else result
Ejemplo n.º 44
0
def task_run_core():
    """
    Runs the task by fetching arguments from the BibSched task queue.  This is
    what BibSched will be invoking via daemon call.
    """
    errors_encountered_p = False
    jobnames = _detect_jobs_to_run(task_get_option('wjob'))
    for jobname in jobnames:
        jobname_export_method = _detect_export_method(jobname)
        if not jobname_export_method:
            write_message("ERROR: cannot detect export method for job %s." % jobname, sys.stderr)
            errors_encountered_p = True
        else:
            try:
                # every bibexport method must define run_export_job() that will do the job
                exec "from invenio.bibexport_method_%s import run_export_method" % jobname_export_method
                write_message("started export job " + jobname, verbose=3)
                # pylint: disable=E0602
                # The import is done via the exec command 2 lines above.
                run_export_method(jobname)
                # pylint: enable=E0602
                _update_job_lastrun_time(jobname)
                write_message("finished export job " + jobname, verbose=3)
            except Exception, msg:
                write_message("ERROR: cannot run export job %s: %s." % (jobname, msg), sys.stderr)
                errors_encountered_p = True
Ejemplo n.º 45
0
def task_submit_elaborate_specific_parameter(key, value, opts, args):
    """ Given the string key it checks it's meaning, eventually using the value.
    Usually it fills some key in the options dict.
    It must return True if it has elaborated the key, False, if it doesn't
    know that key.
    eg:
    if key in ['-n', '--number']:
        self.options['number'] = value
        return True
    return False
    """
    if key in ("-c", "--collection"):
        task_set_option("collection", value)
    elif key in ("-r", "--recursive"):
        task_set_option("recursive", 1)
    elif key in ("-f", "--force"):
        task_set_option("force", 1)
    elif key in ("-p", "--part"):
        task_set_option("part", int(value))
    elif key in ("-l", "--language"):
        languages = task_get_option("language", [])
        languages += value.split(',')
        for ln in languages:
            if ln not in CFG_SITE_LANGS:
                print 'ERROR: "%s" is not a recognized language code' % ln
                return False
        task_set_option("language", languages)
    else:
        return False
    return True
def task_submit_check_options():
    """Refuse submission when the requested collection does not exist."""
    if not task_has_option('collection'):
        return True
    coll = get_collection(task_get_option("collection"))
    if coll.id is None:
        print 'ERROR: Collection "%s" does not exist' % coll.name
        return False
    return True
Ejemplo n.º 47
0
def fetch_updated_arxiv_records(date):
    """Fetch all the arxiv records modified since the last run"""

    def is_arxiv(recid):
        """True when the record carries an 'arXiv' report-number source."""
        return 'arXiv' in get_fieldvalues(recid, '037__9')

    # All records modified since the given date, oldest first.
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
          "WHERE `modification_date` >= %s " \
          "ORDER BY `modification_date`"
    rows = run_sql(sql, [date.isoformat()])
    records = [(recid, mod_date) for recid, mod_date in rows
               if is_arxiv(recid)]

    # Show all records for debugging purposes
    if task_get_option('verbose') >= 9:
        write_message('recids:', verbose=9)
        for recid, mod_date in records:
            write_message("* %s, %s" % (recid, mod_date), verbose=9)

    task_update_progress("Done fetching %s arxiv record ids" % len(records))
    return records
Ejemplo n.º 48
0
def _task_submit_check_options():
    """ Reimplement this method for having the possibility to check options
    before submitting the task, in order for example to provide default
    values. It must return False if there are errors in the options.
    """
    if not task_get_option('recids') and not task_get_option('collections'):
        # BUGFIX: removed the stray trailing comma that turned this
        # statement into a useless one-element tuple expression.
        write_message('Error: No input file specified',
                      stream=sys.stdout,
                      verbose=0)
        return False
    ## Output to a file in tmp, if the user has not specified an output file
    if not task_get_option('xmlfile', default=False):
        abs_path = _generate_default_xml_out()
        ## Set the output
        task_set_option('xmlfile', abs_path)
    return True
Ejemplo n.º 49
0
def task_submit_elaborate_specific_parameter(key, value,
        dummy_opts, dummy_args):
    """Handle one tasklet-specific CLI parameter.

    Returns True when the key was recognised and handled, False
    otherwise (including a malformed --argument value).
    """
    if key in ('-T', '--tasklet'):
        task_set_option('tasklet', value)
        return True
    elif key in ('-a', '--argument'):
        arguments = task_get_option('arguments', {})
        try:
            key, value = value.split('=', 1)
        # BUGFIX: a failed 2-tuple unpack raises ValueError, not
        # NameError, so the error path was unreachable.
        except ValueError:
            print >> sys.stderr, 'ERROR: an argument must be in the form ' \
                'param=value, not "%s"' % value
            return False
        arguments[key] = value
        task_set_option('arguments', arguments)
        return True
    elif key in ('-l', '--list-tasklets'):
        cli_list_tasklets()
        return True
    return False
Ejemplo n.º 50
0
def word_index(run): # pylint: disable=W0613
    """
    Runs the indexing task.

    Indexes the explicitly requested id ranges when --id was given,
    otherwise indexes everything modified since the last run and
    records the new last-updated timestamp.
    """
    id_option = task_get_option("id")
    # Indexes passed ids and id ranges
    if id_option:  # idiomatic truthiness test instead of len()
        for lower_recid, upper_recid in id_option:
            write_message("Solr ranking indexer called for %s-%s" % (lower_recid, upper_recid))
            solr_add_all(lower_recid, upper_recid)

    # Indexes modified ids since last run
    else:
        # Capture the start time BEFORE indexing so records modified
        # during the run are picked up next time.
        starting_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        id_ranges = get_recIDs_by_date()
        if not id_ranges:
            write_message("No new records. Solr index is up to date")
        else:
            for lower_recid, upper_recid in id_ranges:
                write_message("Solr ranking indexer called for %s-%s" % (lower_recid, upper_recid))
                solr_add_all(lower_recid, upper_recid)
            run_sql('UPDATE rnkMETHOD SET last_updated=%s WHERE name="wrd"', (starting_time, ))

    write_message("Solr ranking indexer completed")
Ejemplo n.º 51
0
def task_submit_check_options():
    """Reject submission when the requested collection is unknown."""
    if task_has_option('collection'):
        coll = get_collection(task_get_option("collection"))
        if coll.id is None:
            print 'ERROR: Collection "%s" does not exist' % coll.name
            return False
    return True
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    """Commit pending Solr changes when the flush counter is full (or on
    a final commit with work outstanding); return the updated counter."""
    counter_full = next_commit_counter == task_get_option("flush") - 1
    if not (counter_full or (final_commit and next_commit_counter > 0)):
        return next_commit_counter + 1

    if recid:
        recid_info = ' for recid=%s' % recid
    else:
        recid_info = ''
    status_msg = 'Solr ranking indexer COMMITTING' + recid_info
    write_message(status_msg)
    task_update_progress(status_msg)

    try:
        # Commits might cause an exception, most likely a timeout while
        # hitting a background merge.  Changes will then be committed
        # later by the calling (periodical) task; autocommits can also
        # be configured in solrconfig.
        SOLR_CONNECTION.commit()
    except:
        register_exception(alert_admin=True)

    task_sleep_now_if_required(can_stop_too=True)
    return 0
Ejemplo n.º 53
0
def check_options():
    """ Reimplement this method for having the possibility to check options
    before submitting the task, in order for example to provide default
    values. It must return False if there are errors in the options.
    """
    # At least one record-selection option must be given, otherwise the
    # task would have no records to operate on.
    if (
        not task_get_option("new")
        and not task_get_option("modified")
        and not task_get_option("recids")
        and not task_get_option("collections")
        and not task_get_option("arxiv")
    ):
        print >>sys.stderr, "Error: No records specified, you need" " to specify which files to run on"
        return False

    return True
Ejemplo n.º 54
0
def upload_amendments(records, holdingpen):
    """Submit amended records to bibupload as one MARCXML collection.

    :param records: sequence of amended record structures
    :param holdingpen: when true, upload to the holding pen ("-o") for
        manual review; otherwise replace the records directly ("-r")

    Does nothing when uploading is disabled via the no_upload option or
    when there are no records to upload.
    """
    if task_get_option("no_upload", False) or not records:
        return

    # Build the collection document in one pass instead of repeated
    # string concatenation.
    xml = ('<collection xmlns="http://www.loc.gov/MARC21/slim">'
           + "".join(record_xml_output(record) for record in records)
           + "</collection>")

    tmp_file_fd, tmp_file = mkstemp(
        suffix='.xml',
        prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"),
        dir=CFG_TMPSHAREDDIR
    )
    os.write(tmp_file_fd, xml)
    os.close(tmp_file_fd)
    # 0o644 (was the py2-only literal 0644): world-readable so the
    # bibupload daemon can read the file.
    os.chmod(tmp_file, 0o644)
    if holdingpen:
        flag = "-o"  # insert into the holding pen
    else:
        flag = "-r"  # replace existing records
    task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file)
    write_message("Submitted bibupload task %s" % task)
Ejemplo n.º 55
0
def _task_submit_check_options():
    """Required by bibtask. Checks the options."""
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    # Selecting records (by id or by collection) only makes sense when a
    # taxonomy has been chosen as well.
    selection_given = recids is not None or collections is not None
    if selection_given and taxonomy is None:
        bibtask.write_message("ERROR: When specifying a record ID or a collection, "
            "you have to precise which\ntaxonomy to use.", stream=sys.stderr,
            verbose=0)
        return False

    return True
Ejemplo n.º 56
0
def task_run_core():
    """
    Runs the task by fetching arguments from the BibSched task queue.  This is
    what BibSched will be invoking via daemon call.
    """
    # Track failures across jobs so all jobs are attempted before reporting.
    errors_encountered_p = False
    jobnames = _detect_jobs_to_run(task_get_option('wjob'))
    for jobname in jobnames:
        jobname_export_method = _detect_export_method(jobname)
        if not jobname_export_method:
            write_message(
                "ERROR: cannot detect export method for job %s." % jobname,
                sys.stderr)
            errors_encountered_p = True
        else:
            try:
                # every bibexport method must define run_export_job() that will do the job
                # Dynamic import: the module name depends on the configured
                # export method, so a static import is not possible here.
                exec "from invenio.bibexport_method_%s import run_export_method" % jobname_export_method
                write_message("started export job " + jobname, verbose=3)
                # pylint: disable=E0602
                # The import is done via the exec command 2 lines above.
                run_export_method(jobname)
                # pylint: enable=E0602
                _update_job_lastrun_time(jobname)
                write_message("finished export job " + jobname, verbose=3)
            except Exception, msg:
                # A failing job must not abort the remaining jobs.
                write_message(
                    "ERROR: cannot run export job %s: %s." % (jobname, msg),
                    sys.stderr)
                errors_encountered_p = True
Ejemplo n.º 57
0
def _task_submit_check_options():
    """Required by bibtask. Checks the options."""
    recids = task_get_option('recids')
    collections = task_get_option('collections')
    taxonomy = task_get_option('taxonomy')

    # A taxonomy is mandatory as soon as any record selection was made.
    needs_taxonomy = recids is not None or collections is not None
    if needs_taxonomy and taxonomy is None:
        write_message("ERROR: When specifying a record ID or a collection, "
            "you have to precise which\ntaxonomy to use.", stream=sys.stderr,
            verbose=0)
        return False

    return True
Ejemplo n.º 58
0
def task_run_core():
    """
    Run the specific tasklet.
    """
    name = task_get_option('tasklet')
    kwargs = task_get_option('arguments', {})
    write_message('Starting tasklet "%s" (with arguments %s)' % (name, kwargs))
    task_update_progress('%s started' % name)
    result = _TASKLETS[name](**kwargs)
    task_update_progress('%s finished' % name)
    write_message('Finished tasklet "%s" (with arguments %s)' % (name, kwargs))
    # A tasklet returning None is treated as a success.
    return True if result is None else result
def solr_add_ranges(id_ranges):
    """Index the given (lower, upper) recid ranges in Solr.

    Each range is split into sub-ranges of at most the --flush option's
    size, and the sub-ranges are processed newest-first.
    """
    chunk = task_get_option("flush")
    sub_ranges = []
    for id_range in id_ranges:
        start, stop = id_range[0], id_range[1]
        while start <= stop:
            end = min(start + chunk - 1, stop)
            sub_ranges.append((start, end))
            start += chunk

    tags_to_index = get_tags()
    # Indexes latest records first by reversing
    # This allows the ranker to return better results during long indexing
    # runs as the ranker cuts the hitset using latest records
    sub_ranges.reverse()
    pending = 0
    for start, end in sub_ranges:
        status_msg = "Solr ranking indexer called for %s-%s" % (start, end)
        write_message(status_msg)
        task_update_progress(status_msg)
        pending = solr_add_range(start, end, tags_to_index, pending)

    solr_commit_if_necessary(pending, final_commit=True)