def task_submit_check_options():
    if task_has_option('collection'):
        coll = get_collection(task_get_option("collection"))
        if coll.id is None:
            print 'ERROR: Collection "%s" does not exist' % coll.name
            return False
    return True
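Hook functions such as the one above are not called directly; BibSched invokes them through the bibtask machinery. A minimal sketch of how they are typically registered, assuming the Invenio v1 bibtask API (the authorization action and option spec are illustrative assumptions; task_run_core refers to the run function shown in the examples below):

from invenio.bibtask import task_init

def main():
    # Wire the submit-time option check and the core run function
    # into BibSched; task_init parses the CLI and queues the task.
    task_init(authorization_action='runbibformat',
              specific_params=('c:', ['collection=']),
              task_submit_check_options_fnc=task_submit_check_options,
              task_run_fnc=task_run_core)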
Example #3
def task_run_core():
    """Runs the task by fetching arguments from the BibSched task queue.  This is what BibSched will be invoking via daemon call."""

    ## initialize parameters
    if task_get_option('format'):
        fmts = task_get_option('format')
    else:
        fmts = 'HB' # default value if no format option given
    for fmt in fmts.split(','):
        sql = {
            "all" : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt,
            "last": "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date" % fmt,
            "q1"  : "select br.id from bibrec as br",
            "q2"  : "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'" % fmt
        }
        sql_queries = []
        cds_query = {}
        if task_has_option("all"):
            sql_queries.append(sql['all'])
        if task_has_option("last"):
            sql_queries.append(sql['last'])
        if task_has_option("collection"):
            cds_query['collection'] = task_get_option('collection')
        else:
            cds_query['collection'] = ""

        if task_has_option("field"):
            cds_query['field']      = task_get_option('field')
        else:
            cds_query['field']      = ""

        if task_has_option("pattern"):
            cds_query['pattern']      = task_get_option('pattern')
        else:
            cds_query['pattern']      = ""

        if task_has_option("matching"):
            cds_query['matching']      = task_get_option('matching')
        else:
            cds_query['matching']      = ""

        recids = intbitset()
        if task_has_option("recids"):
            for recid in task_get_option('recids').split(','):
                if ":" in recid:
                    start = int(recid.split(':')[0])
                    end = int(recid.split(':')[1])
                    recids += range(start, end)
                else:
                    recids.add(int(recid))

    ### sql commands to be executed during the script run
    ###
        bibreformat_task(fmt, sql, sql_queries, cds_query, task_has_option('without'), not task_has_option('noprocess'), recids)
    return True
Example #4
def task_submit_check_options():
    """Last checks and updating on the options..."""
    if not (task_has_option('all') or task_has_option('collection')
            or task_has_option('field') or task_has_option('pattern')
            or task_has_option('matching') or task_has_option('recids')):
        task_set_option('last', 1)
    return True
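In other words, when no explicit record selection is given, the task falls back to the 'last' option and refreshes only the caches that are out of date.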
Example #6
def task_run_core():
    """Run the task by fetching arguments from the BibSched task queue.

    This is what BibSched will be invoking via daemon call.
    """
    fmts = task_get_option('format', 'HB,RECJSON')
    for fmt in fmts.split(','):
        last_updated = fetch_last_updated(fmt)
        write_message("last stored run date is %s" % last_updated)

        recids = intbitset()

        if task_has_option("all"):
            recids += all_records()

        if task_has_option("last"):
            recids += outdated_caches(fmt, last_updated)

        if task_has_option('ignore_without'):
            without_fmt = intbitset()
        else:
            without_fmt = missing_caches(fmt)
            recids += without_fmt

        cli_recids = split_cli_ids_arg(task_get_option('recids', ''))
        recids += cli_recids

        query_params = {'collection': task_get_option('collection', ''),
                        'field': task_get_option('field', ''),
                        'pattern': task_get_option('pattern', ''),
                        'matching': task_get_option('matching', '')}
        recids += query_records(query_params)

        bibreformat_task(fmt,
                         recids,
                         without_fmt,
                         not task_has_option('noprocess'))

    return True
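The split_cli_ids_arg helper used above parses the --recids argument. Its expected behavior can be sketched from the manual parsing shown in Example #3 (a sketch only, not the library implementation; the intbitset import path is an assumption):

from invenio.intbitset import intbitset

def split_cli_ids_arg_sketch(value):
    # Parse a --recids argument such as "1,5:10,12" into an intbitset.
    # Mirrors Example #3, including its end-exclusive "start:end" ranges.
    recids = intbitset()
    for chunk in (value or '').split(','):
        chunk = chunk.strip()
        if not chunk:
            continue
        if ':' in chunk:
            start, end = chunk.split(':', 1)
            recids += range(int(start), int(end))
        else:
            recids.add(int(chunk))
    return recids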
Example #8
def update_rule_last_run(rule_name):
    """
    Set the last time a rule was run to now. This function should be called
    after a rule has been run.
    """

    if task_has_option('record_ids') or task_get_option('no_upload', False) \
            or task_get_option('no_tickets', False):
        return   # We don't want to update the database in this case

    updated = run_sql("UPDATE bibcheck_rules SET last_run=%s WHERE name=%s;",
                      (task_get_task_param('task_starting_time'), rule_name,))
    if not updated: # rule not in the database, insert it
        run_sql("INSERT INTO bibcheck_rules(name, last_run) VALUES (%s, %s)",
                (rule_name, task_get_task_param('task_starting_time')))
Example #9
def task_submit_check_options():
    """Check that options are valid."""
    if task_has_option('wjob'):
        jobnames = task_get_option('wjob')
        if jobnames:
            jobnames = jobnames.split(',')
            for jobname in jobnames:
                res = run_sql("SELECT COUNT(*) FROM expJOB WHERE jobname=%s", (jobname,))
                if res and res[0][0]:
                    # okay, jobname exists
                    pass
                else:
                    write_message("Sorry, job name %s is not known. Exiting." % jobname)
                    return False
    return True
Example #12
def task_run_core():
    """Runs the task by fetching arguments from the BibSched task queue.  This is what BibSched will be invoking via daemon call."""

    ## initialize parameters

    fmt = task_get_option('format')
    sql = {
        "all":
        "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'"
        % fmt,
        "last":
        "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format='%s' and bf.last_updated < br.modification_date"
        % fmt,
        "q1":
        "select br.id from bibrec as br",
        "q2":
        "select br.id from bibrec as br, bibfmt as bf where bf.id_bibrec=br.id and bf.format ='%s'"
        % fmt
    }
    sql_queries = []
    cds_query = {}
    if task_has_option("all"):
        sql_queries.append(sql['all'])
    if task_has_option("last"):
        sql_queries.append(sql['last'])
    if task_has_option("collection"):
        cds_query['collection'] = task_get_option('collection')
    else:
        cds_query['collection'] = ""

    if task_has_option("field"):
        cds_query['field'] = task_get_option('field')
    else:
        cds_query['field'] = ""

    if task_has_option("pattern"):
        cds_query['pattern'] = task_get_option('pattern')
    else:
        cds_query['pattern'] = ""

    if task_has_option("matching"):
        cds_query['matching'] = task_get_option('matching')
    else:
        cds_query['matching'] = ""

    recids = intbitset()
    if task_has_option("recids"):
        for recid in task_get_option('recids').split(','):
            if ":" in recid:
                start = int(recid.split(':')[0])
                end = int(recid.split(':')[1])
                recids += range(start, end)
            else:
                recids.add(int(recid))


### sql commands to be executed during the script run
###

    bibreformat_task(fmt, sql, sql_queries, cds_query,
                     task_has_option('without'),
                     not task_has_option('noprocess'), recids)

    return True
Example #13
def task_run_core():
    """ Reimplement to add the body of the task."""
##
## ------->--->time--->------>
##  (-1)  |   ( 0)    |  ( 1)
##        |     |     |
## [T.db] |  [T.fc]   | [T.db]
##        |     |     |
##        |<-tol|tol->|
##
## the above is the compare_timestamps_with_tolerance result "diagram"
## [T.db] stands for the database timestamp and [T.fc] for the file cache timestamp
## (-1, 0, 1) stand for the returned value
## tol stands for the tolerance in seconds
##
## When a record has been added to or deleted from one of the collections, T.db becomes greater than T.fc,
## so the next webcoll run is a full run. It recalculates the reclists and nbrecs, and since it updates the
## collections db table it also updates T.db. T.fc is set to the moment the task started running, thus
## slightly before T.db (practically the time distance between the start of the task and the last call of
## update_reclist). Therefore when webcoll runs again, even if no database changes have taken place in the
## meanwhile, it runs in full (because compare_timestamps_with_tolerance returns 0). This time though, if
## no database changes have taken place, T.db remains the same while T.fc is updated, and as a result, if
## webcoll runs again it will not run in full.
##
    task_run_start_timestamp = get_current_time_timestamp()
    colls = []
    # decide whether we need to run or not, by comparing last updated timestamps:
    write_message("Database timestamp is %s." % get_database_last_updated_timestamp(), verbose=3)
    write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp(), verbose=3)
    if task_has_option("part"):
        write_message("Running cache update part %s only." % task_get_option("part"), verbose=3)
    if check_nbrecs_for_all_external_collections() or task_has_option("force") or \
    compare_timestamps_with_tolerance(get_database_last_updated_timestamp(),
                                        get_cache_last_updated_timestamp(),
                                        CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE) >= 0:
        ## either forced update was requested or cache is not up to date, so recreate it:
        # firstly, decide which collections to do:
        if task_has_option("collection"):
            coll = get_collection(task_get_option("collection"))
            colls.append(coll)
            if task_has_option("recursive"):
                r_type_descendants = coll.get_descendants(type='r')
                colls += r_type_descendants
                v_type_descendants = coll.get_descendants(type='v')
                colls += v_type_descendants
        else:
            res = run_sql("SELECT name FROM collection ORDER BY id")
            for row in res:
                colls.append(get_collection(row[0]))
        # secondly, update collection reclist cache:
        if task_get_option('part', 1) == 1:
            i = 0
            for coll in colls:
                i += 1
                write_message("%s / reclist cache update" % coll.name)
                if str(coll.dbquery).startswith("hostedcollection:"):
                    coll.set_nbrecs_for_external_collection()
                else:
                    coll.calculate_reclist()
                task_sleep_now_if_required()
                coll.update_reclist()
                task_update_progress("Part 1/2: done %d/%d" % (i, len(colls)))
                task_sleep_now_if_required(can_stop_too=True)
        # thirdly, update collection webpage cache:
        if task_get_option("part", 2) == 2:
            i = 0
            for coll in colls:
                i += 1
                write_message("%s / webpage cache update" % coll.name)
                coll.update_webpage_cache()
                task_update_progress("Part 2/2: done %d/%d" % (i, len(colls)))
                task_sleep_now_if_required(can_stop_too=True)

        # finally update the cache last updated timestamp:
        # (but only when all collections were updated, not when only
        # some of them were forced-updated as per admin's demand)
        if not task_has_option("collection"):
            set_cache_last_updated_timestamp(task_run_start_timestamp)
            write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp(), verbose=3)
    else:
        ## cache up to date, we don't have to run
        write_message("Collection cache is up to date, no need to run.")
    ## we are done:
    return True
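The -1/0/1 contract described in the comment block above can be illustrated with a small sketch (an illustration of the semantics only, not Invenio's actual compare_timestamps_with_tolerance):

from datetime import datetime, timedelta

def compare_with_tolerance_sketch(t_db, t_fc, tolerance_seconds):
    # -1: T.db is older than T.fc by more than the tolerance;
    #  1: T.db is newer by more than the tolerance;
    #  0: the two timestamps fall inside the tolerance window.
    fmt = '%Y-%m-%d %H:%M:%S'
    delta = datetime.strptime(t_db, fmt) - datetime.strptime(t_fc, fmt)
    tol = timedelta(seconds=tolerance_seconds)
    if delta < -tol:
        return -1
    if delta > tol:
        return 1
    return 0

A result >= 0 is what makes task_run_core above recreate the cache.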
Example #14
def task_run_core():
    """Runs the task by fetching arguments from the BibSched task queue.  This is what BibSched will be invoking via daemon call."""

    ## initialize parameters
    if task_get_option('format'):
        fmts = task_get_option('format')
    else:
        fmts = 'HB'  # default value if no format option given
    for fmt in fmts.split(','):
        last_updated = fetch_last_updated(fmt)
        write_message("last stored run date is %s" % last_updated)

        sql = {
            "all" : """SELECT br.id FROM bibrec AS br, bibfmt AS bf
                       WHERE bf.id_bibrec = br.id AND bf.format = '%s'""" % fmt,
            "last": """SELECT br.id FROM bibrec AS br
                       INNER JOIN bibfmt AS bf ON bf.id_bibrec = br.id
                       WHERE br.modification_date >= '%(last_updated)s'
                       AND bf.format='%(format)s'
                       AND bf.last_updated < br.modification_date""" \
                            % {'format': fmt,
                               'last_updated': last_updated.strftime('%Y-%m-%d %H:%M:%S')},
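            # NOTE: the doubled %%s below survives the outer %-interpolation
            # as a literal %s, leaving bind placeholders for the record-id
            # bounds when the query is eventually executed.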
            "missing"  : """SELECT br.id
                            FROM bibrec as br
                            LEFT JOIN bibfmt as bf
                            ON bf.id_bibrec = br.id AND bf.format ='%s'
                            WHERE bf.id_bibrec IS NULL
                            AND br.id BETWEEN %%s AND %%s
                         """ % fmt,
        }
        sql_queries = []
        cds_query = {}
        if task_has_option("all"):
            sql_queries.append(sql['all'])
        if task_has_option("last"):
            sql_queries.append(sql['last'])
        if task_has_option("collection"):
            cds_query['collection'] = task_get_option('collection')
        else:
            cds_query['collection'] = ""

        if task_has_option("field"):
            cds_query['field'] = task_get_option('field')
        else:
            cds_query['field'] = ""

        if task_has_option("pattern"):
            cds_query['pattern'] = task_get_option('pattern')
        else:
            cds_query['pattern'] = ""

        if task_has_option("matching"):
            cds_query['matching'] = task_get_option('matching')
        else:
            cds_query['matching'] = ""

        if task_has_option("recids"):
            recids = list(split_cli_ids_arg(task_get_option('recids')))
        else:
            recids = []

    ### sql commands to be executed during the script run
    ###
        bibreformat_task(fmt, sql, sql_queries, cds_query,
                         task_has_option('without'),
                         not task_has_option('noprocess'), recids)
    return True
Example #15
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """ Must be defined for bibtask to create a task """
    if args and len(args) > 0:
        ## There should be no standalone arguments for any refextract job
        ## This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'.\n" % args[0])

    ## Task name specified
    if key in ('-e', '--extraction-job'):

        ## Make sure that the user is not mixing job name with other defined
        ## Refextract flags on the command line
        if filter(lambda p: task_get_option(p), possible_task_option_keys):
            write_message("Error: cli and extraction-job extraction parameters specified together.")
            write_message("The extraction-job flag cannot be mixed with other cli flags.")
            return False

        ## ---- Get the task file with this name
        task_file_dir = os.path.join(CFG_ETCDIR, 'bibedit')
        ## The job file name
        task_file =  value + '.cfg'
        abs_path = os.path.join(task_file_dir, task_file)
        try:
            ## Open and readlines from file
            file_hdl = open(abs_path, 'r')
            file_params = file_hdl.readlines()
            file_hdl.close()
        except IOError:
            write_message("Error: Unable to read job file '%s'" % \
                            abs_path, stream=sys.stdout, verbose=0)
            return False
        ## ---- Get the database 'last_updated' value for this name
        xtrJOB_row = _task_name_exists(value)
        ## Build the information for this extraction job
        ## These dictionaries will be extended with extra file parameters
        if xtrJOB_row:
            task_info = {'id'           :   xtrJOB_row[0][0],
                         'name'         :   xtrJOB_row[0][1],
                         'last_updated' :   xtrJOB_row[0][2],
                         'collections'  :   [],
                         'recids'       :   [],}
        else:
            ## Save the name as the input argument for this job
            task_info = {'name'         :   value,
                         'last_updated' :   None,
                         'collections'  :   [],
                         'recids'       :   [],}
        ## ---- Save job parameters
        for p in file_params:
            p = p.strip()
            ## Ignore comments and titles, and skip blank lines
            if (not p) or p.startswith('#') or p.startswith("["):
                continue
            ## Split arguments just once
            p_args = map(lambda x: x.strip(), p.split("=", 1))
            ## Check cfg file param against list of valid params
            if not (p_args[0] in CFG_REFEXTRACT_JOB_FILE_PARAMS):
                write_message("Error: Unknown task param '%s' inside '%s'." \
                              % (p_args[0], task_file),
                    stream=sys.stdout, verbose=0)
                return False

            if p_args[0] == 'collection':
                ## Separate and strip collections
                collections = map(lambda c: c.strip(), p_args[1].split(','))
                task_info['collections'].extend([c for c in collections if c.strip()])

#FIXME add author extraction functionality
#            elif p_args[0] == 'extraction-mode':
#                if p_args[0] == 'authors':
#                    task_set_option('authors', p_args[1])

            elif p_args[0] == 'recid':
                recids = p_args[1].split(",")
                task_info['recids'].extend([r for r in recids if r.strip()])
            elif len(p_args) == 2:
                ## All other flags
                task_info[p_args[0]] = p_args[1]
            else:
                ## Standalone flag
                task_info[p_args[0]] = 1

        if not ('xmlfile' in task_info):
            task_info['xmlfile'] = _generate_default_xml_out()

        ## Used to flag the creation of a bibupload task
        task_set_option('extraction-job', task_info)

        ## using the extraction-job options...
        ## set the task options
        for option, value in task_info.items():
            if option == 'collections':
                for collection in value:
                    collection_row = _collection_exists(collection)
                    if not collection_row:
                        write_message("Error: '%s' is not a valid collection." % collection,
                            stream=sys.stdout, verbose=0)
                        return 0
                    ## Use the collection name matched from the database
                    task_get_option(option).append(collection_row[0][0])
            elif option == 'recids':
                for recid in value:
                    if not _recid_exists(recid):
                        write_message("Error: '%s' is not a valid record id." % recid,
                            stream=sys.stdout, verbose=0)
                        return 0
                    ## Add this valid record id to the list of record ids
                    task_get_option(option).append(recid)
            elif option not in ('id', 'name', 'last_updated'):
                ## Usual way of setting options, but this time from the extraction-job file
                task_set_option(option, value)

    else:
        ## Quick check to see if an extraction job has also been specified
        if task_has_option('extraction-job'):
            write_message("Error: cli and extraction-job extraction parameters specified together.")
            write_message("The extraction-job flag cannot be mixed with other cli flags.")
            return False

        # Recid option
        elif key in ("-i", "--recid"):
            split_recids = value.split(":")
            if len(split_recids) == 2:
                first = last = valid_range = None
                try:
                    first = int(split_recids[0])
                    last = int(split_recids[1])
                    valid_range = first < last
                except ValueError:
                    write_message("Error: Range values for --recid must be integers, "
                        "not '%s'." % value, stream=sys.stdout, verbose=0)
                if first is None or last is None:
                    return False
                if not _recid_exists(first) or not _recid_exists(last) or not valid_range:
                    write_message("Error: '%s' is not a valid range of record ID's." % value,
                        stream=sys.stdout, verbose=0)
                    return False
                task_get_option('recids').extend(range(first, last))
            else:
                int_val = None
                try:
                    int_val = int(value)
                except ValueError:
                    write_message("Error: The value specified for --recid must be a "
                        "valid integer, not '%s'." % value, stream=sys.stdout,
                        verbose=0)
                if not _recid_exists(value) or int_val is None:
                    write_message("Error: '%s' is not a valid record ID." % value,
                        stream=sys.stdout, verbose=0)
                    return False
                task_get_option('recids').append(value)
        # Collection option
        elif key in ("-c", "--collection"):
            collection_row = _collection_exists(value)
            if not collection_row:
                write_message("Error: '%s' is not a valid collection." % value,
                    stream=sys.stdout, verbose=0)
                return False
            task_get_option('collections').append(collection_row[0][0])
        elif key in ('-z', '--raw-references'):
            task_set_option('raw-references', True)
        elif key in ('-r', '--output-raw-refs'):
            task_set_option('output-raw-refs', True)
        elif key in ('-x', '--xmlfile'):
            task_set_option('xmlfile', value)
        elif key in ('-d', '--dictfile'):
            task_set_option('dictfile', value)
        elif key in ('-p', '--inspire'):
            task_set_option('inspire', True)
        elif key in ('-j', '--kb-journal'):
            task_set_option('kb-journal', value)
        elif key in ('-n', '--kb-report-number'):
            task_set_option('kb-report-number', value)
    return True
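For reference, the job file parsed above is a plain key=value file looked up under CFG_ETCDIR/bibedit. A hypothetical example (the file name and all values are invented for illustration; only keys listed in CFG_REFEXTRACT_JOB_FILE_PARAMS are accepted):

# arxiv-refs.cfg -- comment lines ('#') and section titles ('[') are skipped
[extraction job]
collection = ARTICLES, PREPRINTS
recid = 12,77,1005
xmlfile = /tmp/refextract_output.xml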
Example #16
def _task_run_core():
    """calls extract_references in refextract"""
    def _append_recid_collection_list(collection, current_recids):
        """Updated list of recids with new recids from collection
        @param collection: (string) collection name to use to obtain record
        ids
        @param current_recids: (list) list of current record ids
        which have already been obtained from previous collection or
        recid flags
        @return: (list) current record ids with newly appended recids
        from input collection
        """
        records = get_collection_reclist(collection)
        for r in records:
            if r not in current_recids:
                current_recids.append(r)
        return current_recids

    daemon_cli_opts = {
        'treat_as_reference_section': 0,
        'fulltext': [],
        'output_raw': 0,
        'verbosity': 0,
        'xmlfile': 0,
        'dictfile': 0,
        'inspire': 0,
        'kb-journal': 0,
        'kb-report-number': 0,
        'extraction-mode': 'ref',
    }

    ## holds the name of the extraction job, and if it's already in the db
    task_info = task_get_option('extraction-job')

    ## Now set the cli options, from the set task options list
    if task_has_option('verbose'):
        v = task_get_option('verbose')
        if not v.isdigit():
            daemon_cli_opts['verbosity'] = 0
        elif int(v) not in xrange(0, 10):
            daemon_cli_opts['verbosity'] = 0
        else:
            daemon_cli_opts['verbosity'] = int(v)
    if task_has_option('raw-references'):
        daemon_cli_opts['treat_as_reference_section'] = 1
    if task_has_option('output-raw-refs'):
        daemon_cli_opts['output_raw'] = 1
    if task_has_option('xmlfile'):
        daemon_cli_opts['xmlfile'] = task_get_option('xmlfile')
    if task_has_option('dictfile'):
        daemon_cli_opts['dictfile'] = task_get_option('dictfile')
    if task_has_option('inspire'):
        daemon_cli_opts['inspire'] = 1
    if task_has_option('kb-journal'):
        daemon_cli_opts['kb-journal'] = task_get_option('kb-journal')
    if task_has_option('kb-report-number'):
        daemon_cli_opts['kb-report-number'] = task_get_option(
            'kb-report-number')
    if task_get_option('recids'):
        ## Construct the fulltext argument equivalent from record id's
        ## (records, and arguments, which have valid files)
        try:
            fulltexts_for_collection = \
                _get_fulltext_args_from_recids(task_get_option('recids'), task_info)
            daemon_cli_opts['fulltext'].extend(fulltexts_for_collection)
        except Exception, err:
            write_message('Error: Unable to obtain fulltexts for recid %s. %s' \
                           % (str(task_get_option('recids')), err), \
                           stream=sys.stdout, verbose=0)
            raise StandardError
Example #17
def bibreformat_task(fmt, recids, without_fmt, process):
    """BibReformat main task.

    @param fmt: output format to use
    @param process:
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    latest_bibrank_run = get_bibrankmethod_lastupdate('citation')

    def related_records(recids, recids_processed):
        if fmt == "HDREF" and recids:
            # HDREF represents the references tab
            # the tab needs to be recomputed not only when the record changes
            # but also when one of the citations changes
            sql = """SELECT id, modification_date FROM bibrec
                     WHERE id in (%s)""" % ','.join(str(r) for r in recids)

            def check_date(mod_date):
                return mod_date.strftime(
                    "%Y-%m-%d %H:%M:%S") < latest_bibrank_run

            rel_recids = intbitset([
                recid for recid, mod_date in run_sql(sql)
                if check_date(mod_date)
            ])
            for r in rel_recids:
                recids |= intbitset(get_cited_by(r))

        # To not process recids twice
        recids -= recids_processed
        # Adds to the set of processed recids
        recids_processed += recids

        return recids

    def recid_chunker(recids):
        recids_processed = intbitset()
        chunk = intbitset()

        for recid in recids:
            if len(chunk) == 5000:
                for r in related_records(chunk, recids_processed):
                    yield r
                recids_processed += chunk
                chunk = intbitset()

            if recid not in recids_processed:
                chunk.add(recid)

        if chunk:
            for r in related_records(chunk, recids_processed):
                yield r

    recIDs = list(recid_chunker(recids))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if without_fmt:
        write_message("Records to be processed: %d" % len(recIDs))
        write_message("Out of it records without existing cache: %d" %
                      len(without_fmt))
    else:
        write_message("Records to be processed: %d" % len(recIDs))

### Initialize main loop

    total_rec = 0  # Total number of records
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        total_rec_1, tbibformat_1, tbibupload_1 = iterate_over_new(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)


### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
Example #18
def bibreformat_task(fmt, recids, without_fmt, process):
    """
    BibReformat main task

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases (for selecting records). Some of these queries will be picked depending on the case
    @param sql_queries: a list of sql queries to be executed to select records to reformat.
    @param cds_query: a search query to be executed to select records to reformat
    @param process_format:
    @param process:
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    latest_bibrank_run = get_bibrankmethod_lastupdate('citation')

    def related_records(recids, recids_processed):
        if fmt == "HDREF" and recids:
            # HDREF represents the references tab
            # the tab needs to be recomputed not only when the record changes
            # but also when one of the citations changes
            sql = """SELECT id, modification_date FROM bibrec
                     WHERE id in (%s)""" % ','.join(str(r) for r in recids)

            def check_date(mod_date):
                return mod_date.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run
            rel_recids = intbitset([recid for recid, mod_date in run_sql(sql)
                                                    if check_date(mod_date)])
            for r in rel_recids:
                recids |= intbitset(get_cited_by(r))

        # To not process recids twice
        recids -= recids_processed
        # Adds to the set of processed recids
        recids_processed += recids

        return recids

    def recid_chunker(recids):
        recids_processed = intbitset()
        chunk = intbitset()

        for recid in recids:
            if len(chunk) == 5000:
                for r in related_records(chunk, recids_processed):
                    yield r
                recids_processed += chunk
                chunk = intbitset()

            if recid not in recids_processed:
                chunk.add(recid)

        if chunk:
            for r in related_records(chunk, recids_processed):
                yield r

    recIDs = list(recid_chunker(recids))

### list of corresponding record IDs was retrieved
### now format the selected records

    if without_fmt:
        write_message("Records to be processed: %d" % len(recIDs))
        write_message("Out of it records without existing cache: %d" %
                                                        len(without_fmt))
    else:
        write_message("Records to be processed: %d" % len(recIDs))


### Initialize main loop

    total_rec   = 0     # Total number of records
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call


### Iterate over all records prepared in lists I (option)
    if process:
        total_rec_1, tbibformat_1, tbibupload_1 = iterate_over_new(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
Example #19
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process,
                     recids):
    """
    BibReformat main task

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases (for selecting records). Some of these queries will be picked depending on the case
    @param sql_queries: a list of sql queries to be executed to select records to reformat.
    @param cds_query: a search query to be executed to select records to reformat
    @param process_format:
    @param process:
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now()

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field']      != "" or  \
       cds_query['collection'] != "" or  \
       cds_query['pattern']    != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(
                perform_request_search(req=None,
                                       of='id',
                                       c=cds_query['collection'],
                                       p=cds_query['pattern'],
                                       f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        start_date = latest_bibrank_run
        sql = """SELECT id, modification_date FROM bibrec
                 WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        def check_date(mod_date):
            return mod_date < latest_bibrank_run
        recIDs = intbitset([recid for recid, mod_date in run_sql(sql) \
                                                    if check_date(mod_date)])
        for r in recIDs:
            recIDs |= intbitset(get_cited_by(r))

### list of corresponding record IDs was retrieved
### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs) \
                                               + len(without_format)))
        write_message("Out of it records without existing cache: %d" %
                      len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

### Initialize main loop

    total_rec = 0  # Total number of records
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:  # FIXME: remove this
            # when migration from php to
            # python bibformat is done
            (total_rec_1, tbibformat_1,
             tbibupload_1) = iterate_over_old(recIDs, fmt)
        else:
            (total_rec_1, tbibformat_1,
             tbibupload_1) = iterate_over_new(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

### Iterate over all records prepared in list II (no_format)
    if process_format and process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:  # FIXME: remove this
            # when migration from php to
            # python bibformat is done
            (total_rec_2, tbibformat_2,
             tbibupload_2) = iterate_over_old(without_format, fmt)
        else:
            (total_rec_2, tbibformat_2,
             tbibupload_2) = iterate_over_new(without_format, fmt)
        total_rec += total_rec_2
        tbibformat += tbibformat_2
        tbibupload += tbibupload_2

### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)


### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
Example #20
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    rules_to_reset = task_get_option("reset_rules")
    if rules_to_reset:
        write_message("Resetting the following rules: %s" % rules_to_reset)
        for rule in rules_to_reset:
            reset_rule_last_run(rule)
    plugins = load_plugins()
    rules = load_rules(plugins)
    write_message("Loaded rules: %s" % rules, verbose=9)
    task_set_option('plugins', plugins)
    recids_for_rules = get_recids_for_rules(rules)
    write_message("recids for rules: %s" % recids_for_rules, verbose=9)

    update_database = not (task_has_option('record_ids') or task_get_option(
        'no_upload', False) or task_get_option('no_tickets', False))

    if update_database:
        next_starting_dates = {}
        for rule_name, rule in rules.iteritems():
            next_starting_dates[rule_name] = get_next_starting_date(rule)

    all_recids = intbitset([])
    single_rules = set()
    batch_rules = set()
    for rule_name, rule_recids in recids_for_rules.iteritems():
        all_recids.union_update(rule_recids)
        if plugins[rules[rule_name]["check"]]["batch"]:
            batch_rules.add(rule_name)
        else:
            single_rules.add(rule_name)

    records_to_upload_holdingpen = []
    records_to_upload_replace = []
    records_to_submit_tickets = []
    for batch in iter_batches(all_recids, CFG_BATCH_SIZE):

        for rule_name in batch_rules:
            rule = rules[rule_name]
            rule_recids = recids_for_rules[rule_name]
            task_sleep_now_if_required(can_stop_too=True)
            records = []
            for i, record_id, record in batch:
                if record_id in rule_recids:
                    records.append(record)
            if len(records):
                check_records(rule, records)

        # Then run them through normal rules
        for i, record_id, record in batch:
            progress_percent = int(float(i) / len(all_recids) * 100)
            task_update_progress("Processing record %s/%s (%i%%)." %
                                 (i, len(all_recids), progress_percent))
            write_message("Processing record %s" % record_id)

            for rule_name in single_rules:
                rule = rules[rule_name]
                rule_recids = recids_for_rules[rule_name]
                task_sleep_now_if_required(can_stop_too=True)
                if record_id in rule_recids:
                    check_record(rule, record)

            if record.amended:
                if record.holdingpen:
                    records_to_upload_holdingpen.append(record)
                else:
                    records_to_upload_replace.append(record)

            if not record.valid:
                records_to_submit_tickets.append(record)

        if len(records_to_submit_tickets) >= CFG_BATCH_SIZE:
            Tickets(records_to_submit_tickets).submit()
            records_to_submit_tickets = []
        if len(records_to_upload_holdingpen) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_holdingpen, True)
            records_to_upload_holdingpen = []
        if len(records_to_upload_replace) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_replace, False)
            records_to_upload_replace = []

    ## In case there are still some remaining amended records
    if records_to_submit_tickets:
        Tickets(records_to_submit_tickets).submit()
    if records_to_upload_holdingpen:
        upload_amendments(records_to_upload_holdingpen, True)
    if records_to_upload_replace:
        upload_amendments(records_to_upload_replace, False)

    # Update the database with the last time each rule was run
    if update_database:
        for rule_name, rule in rules.iteritems():
            update_rule_last_run(rule_name, next_starting_dates[rule_name])

    return True
Example #21
            ## with a timestamp
            perm_file_fd, perm_file_name = \
                mkstemp(suffix='.xml', prefix="refextract_%s_" % \
                            time.strftime("%Y-%m-%d_%H:%M:%S"), \
                            dir=os.path.join(CFG_TMPDIR, "refextract"))
            copyfile(daemon_cli_opts['xmlfile'], perm_file_name)
            os.close(perm_file_fd)
        except IOError, err:
            write_message("Error: Unable to copy content to timestamped XML file, %s" \
                              % err)
            return 0

        ## Now, given the references have been output to option 'xmlfile'
        ## enrich the meta-data of the affected records, via bibupload
        ## Only if a named file was given as input
        if task_has_option('extraction-job'):
            cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, perm_file_name)
            errcode = 0
            try:
                errcode = os.system(cmd)
            except OSError, exc:
                write_message('Error: Command %s failed [%s].' % (cmd, exc),
                              stream=sys.stdout,
                              verbose=0)
            if errcode != 0:
                write_message("Error: %s failed, error code is %d." %
                              (cmd, errcode),
                              stream=sys.stdout,
                              verbose=0)
                return 0
            ## Update the extraction_date for each record id,
Exemple #23
0
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process, recids):
    """
    BibReformat main task

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases (for selecting records). Some of these queries will be picked depending on the case
    @param sql_queries: a list of sql queries to be executed to select records to reformat.
    @param cds_query: a search query to be executed to select records to reformat
    @param process_format:
    @param process:
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now()

### Query the database
###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field']      != "" or  \
       cds_query['collection'] != "" or  \
       cds_query['pattern']    != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None, of='id',
                                         c=cds_query['collection'],
                                         p=cds_query['pattern'],
                                         f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        start_date = latest_bibrank_run
        sql = """SELECT id, modification_date FROM bibrec
                 WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        def check_date(mod_date):
            return mod_date < latest_bibrank_run
        recIDs = intbitset([recid for recid, mod_date in run_sql(sql) \
                                                    if check_date(mod_date)])
        for r in recIDs:
            recIDs |= intbitset(get_cited_by(r))

### list of corresponding record IDs was retrieved
### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs) \
                                               + len(without_format)))
        write_message("Out of it records without existing cache: %d" % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

### Initialize main loop

    total_rec   = 0     # Total number of records
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call


### Iterate over all records prepared in lists I (option)
    if process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT: # FIXME: remove this
                                            # when migration from php to
                                            # python bibformat is done
            (total_rec_1, tbibformat_1, tbibupload_1) = iterate_over_old(recIDs,
                                                                         fmt)
        else:
            (total_rec_1, tbibformat_1, tbibupload_1) = iterate_over_new(recIDs,
                                                                         fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

### Iterate over all records prepared in list II (no_format)
    if process_format and process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT: # FIXME: remove this
                                            # when migration from php to
                                            # python bibformat is done
            (total_rec_2, tbibformat_2, tbibupload_2) = iterate_over_old(without_format,
                                                                         fmt)
        else:
            (total_rec_2, tbibformat_2, tbibupload_2) = iterate_over_new(without_format,
                                                                         fmt)
        total_rec += total_rec_2
        tbibformat += tbibformat_2
        tbibupload += tbibupload_2

### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
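A note on the cited-by expansion higher up: the loop that grows recIDs must iterate over a snapshot rather than the live set, otherwise newly added citing records can themselves be visited. A minimal standalone sketch of the pattern, using a plain set in place of intbitset and a stub in place of the real get_cited_by (both are assumptions for illustration):

def get_cited_by(recid):
    # stub standing in for the real citation lookup (assumption)
    return {1: [4, 5], 2: [5]}.get(recid, [])

recids = set([1, 2, 3])
for r in sorted(recids):      # sorted() takes a snapshot of the current set
    recids |= set(get_cited_by(r))
assert recids == set([1, 2, 3, 4, 5])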
Exemple #24
0
def task_run_core():
    """Runs the task by fetching arguments from the BibSched task queue.  This is what BibSched will be invoking via daemon call."""

    ## initialize parameters
    if task_get_option('format'):
        fmts = task_get_option('format')
    else:
        fmts = 'HB'  # default value if no format option given
    for fmt in fmts.split(','):
        last_updated = fetch_last_updated(fmt)
        write_message("last stored run date is %s" % last_updated)

        sql = {
            "all" : """SELECT br.id FROM bibrec AS br, bibfmt AS bf
                       WHERE bf.id_bibrec = br.id AND bf.format = '%s'""" % fmt,
            "last": """SELECT br.id FROM bibrec AS br
                       INNER JOIN bibfmt AS bf ON bf.id_bibrec = br.id
                       WHERE br.modification_date >= '%(last_updated)s'
                       AND bf.format='%(format)s'
                       AND bf.last_updated < br.modification_date""" \
                            % {'format': fmt,
                               'last_updated': last_updated.strftime('%Y-%m-%d %H:%M:%S')},
            "missing"  : """SELECT br.id
                            FROM bibrec as br
                            LEFT JOIN bibfmt as bf
                            ON bf.id_bibrec = br.id AND bf.format ='%s'
                            WHERE bf.id_bibrec IS NULL
                            AND br.id BETWEEN %%s AND %%s
                         """ % fmt,
        }
        sql_queries = []
        cds_query = {}
        if task_has_option("all"):
            sql_queries.append(sql['all'])
        if task_has_option("last"):
            sql_queries.append(sql['last'])
        if task_has_option("collection"):
            cds_query['collection'] = task_get_option('collection')
        else:
            cds_query['collection'] = ""

        if task_has_option("field"):
            cds_query['field']      = task_get_option('field')
        else:
            cds_query['field']      = ""

        if task_has_option("pattern"):
            cds_query['pattern']      = task_get_option('pattern')
        else:
            cds_query['pattern']      = ""

        if task_has_option("matching"):
            cds_query['matching']      = task_get_option('matching')
        else:
            cds_query['matching']      = ""

        if task_has_option("recids"):
            recids = list(split_cli_ids_arg(task_get_option('recids')))
        else:
            recids = []

    ### sql commands to be executed during the script run
    ###
        bibreformat_task(fmt, sql, sql_queries, cds_query, task_has_option('without'), not task_has_option('noprocess'), recids)
    return True
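One detail worth spelling out in the snippet above: the 'missing' query keeps doubled %%s placeholders, so after the "% fmt" substitution they survive as ordinary %s parameters for run_sql(). A minimal sketch of running that query over id ranges, assuming the usual invenio.dbquery.run_sql API; the chunk size is an arbitrary choice:

from invenio.dbquery import run_sql  # assumed import, as in Invenio daemons

CHUNK = 100000  # arbitrary chunk size (assumption)

def iter_missing_recids(sql_missing, max_id):
    """Yield ids of records lacking a cached format, in id chunks."""
    for low in xrange(1, max_id + 1, CHUNK):
        high = min(low + CHUNK - 1, max_id)
        for (recid,) in run_sql(sql_missing, (low, high)):
            yield recid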
Exemple #25
0
def task_submit_check_options():
    """
    NOTE: Depending on the parameters, either "BibSched mode" or plain
          straightforward execution mode is entered.
    """
    if task_has_option("create_event_with_id"):
        print webstat.create_customevent(
            task_get_option("create_event_with_id"),
            task_get_option("event_name", None),
            task_get_option("column_headers", []))
        sys.exit(0)

    elif task_has_option("destroy_event_with_id"):
        print webstat.destroy_customevent(
            task_get_option("destroy_event_with_id"))
        sys.exit(0)

    elif task_has_option("list_events"):
        events = webstat._get_customevents()
        if len(events) == 0:
            print "There are no custom events available."
        else:
            print "Available custom events are:\n"
            print '\n'.join([
                x[0] + ": " +
                ((x[1] is None) and "No descriptive name" or str(x[1]))
                for x in events
            ])
        sys.exit(0)

    elif task_has_option("cache_events"):
        events = task_get_option("cache_events")

        write_message(str(events), verbose=9)

        ## initialise up front so an unmatched or empty event name cannot
        ## leave these two names unbound at the validity check below
        keyevents_to_cache = []
        customevents_to_cache = []

        if events[0] == 'ALL':
            keyevents_to_cache = webstat.KEYEVENT_REPOSITORY.keys()
            customevents_to_cache = [x[0] for x in webstat._get_customevents()]

        elif events[0] == 'KEYEVENTS':
            keyevents_to_cache = webstat.KEYEVENT_REPOSITORY.keys()
            customevents_to_cache = []

        elif events[0] == 'CUSTOMEVENTS':
            keyevents_to_cache = []
            customevents_to_cache = [x[0] for x in webstat._get_customevents()]

        elif events[0] != '':
            keyevents_to_cache = [
                x for x in webstat.KEYEVENT_REPOSITORY.keys() if x in events
            ]
            customevents_to_cache = [
                x[0] for x in webstat._get_customevents() if x in events
            ]

        # Control so that we have valid event names
        if len(keyevents_to_cache + customevents_to_cache) == 0:
            # Oops, no events. Abort and display help.
            return False
        else:
            task_set_option("keyevents", keyevents_to_cache)
            task_set_option("customevents", customevents_to_cache)

        return True

    elif task_has_option("dump_config"):
        print """\
[general]
visitors_box = True
search_box = True
record_box = True
bibsched_box = True
basket_box = True
apache_box = True
uptime_box = True

[webstat_custom_event_1]
name = baskets
param1 = action
param2 = basket
param3 = user

[apache_log_analyzer]
profile = nil
nb-histogram-items-to-print = 20
exclude-ip-list = ("137.138.249.162")
home-collection = "Atlantis Institute of Fictive Science"
search-interface-url = "/?"
detailed-record-url = "/%s/"
search-engine-url = "/search?"
search-engine-url-old-style = "/search.py?"
basket-url = "/yourbaskets/"
add-to-basket-url = "/yourbaskets/add"
display-basket-url = "/yourbaskets/display"
display-public-basket-url = "/yourbaskets/display_public"
alert-url = "/youralerts/"
display-your-alerts-url = "/youralerts/list"
display-your-searches-url = "/youralerts/display"
""" % CFG_SITE_RECORD
        sys.exit(0)

    elif task_has_option("load_config"):
        from ConfigParser import ConfigParser
        conf = ConfigParser()
        conf.read(CFG_WEBSTAT_CONFIG_PATH)
        for section in conf.sections():
            if section[:21] == "webstat_custom_event_":
                cols = []
                name = ""
                for option, value in conf.items(section):
                    if option == "name":
                        name = value
                    if option[:5] == "param":
                        # add the column name in its position
                        index = int(option[-1]) - 1
                        while len(cols) <= index:
                            cols.append("")
                        cols[index] = value
                if name:
                    res = run_sql(
                        "SELECT COUNT(id) FROM staEVENT WHERE id = %s",
                        (name, ))
                    if res[0][0] == 0:
                        # name does not exist, create customevent
                        webstat.create_customevent(name, name, cols)
                    else:
                        # name already exists, update customevent
                        webstat.modify_customevent(name, cols=cols)

        sys.exit(0)

    else:
        # False means that the --help should be displayed
        return False
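For reference, the --load_config branch above maps each paramK option of a [webstat_custom_event_N] section to column position K-1. A standalone sketch of that parsing, assuming the same config layout; note the slice-based index also copes with param10 and beyond, which the single-character lookup above would not:

from ConfigParser import ConfigParser  # Python 2 stdlib, as used above

def event_columns(conf, section):
    """Return (event name, ordered column list) for one config section."""
    cols, name = [], ""
    for option, value in conf.items(section):
        if option == "name":
            name = value
        elif option.startswith("param"):
            index = int(option[len("param"):]) - 1  # param3 -> position 2
            while len(cols) <= index:
                cols.append("")
            cols[index] = value
    return name, cols

conf = ConfigParser()
conf.read("webstat.cfg")  # hypothetical path (assumption)
for section in conf.sections():
    if section.startswith("webstat_custom_event_"):
        print event_columns(conf, section)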
            ## Copy the reference XML output to a permanent file named
            ## with a timestamp
            perm_file_fd, perm_file_name = \
                mkstemp(suffix='.xml', prefix="refextract_%s_" % \
                            time.strftime("%Y-%m-%d_%H:%M:%S"), \
                            dir=os.path.join(CFG_TMPDIR, "refextract"))
            copyfile(daemon_cli_opts['xmlfile'], perm_file_name)
            os.close(perm_file_fd)
        except IOError, err:
            write_message("Error: Unable to copy content to timestamped XML file, %s" \
                              % err)
            return 0

        ## Now, given the references have been output to option 'xmlfile'
        ## enrich the meta-data of the affected records, via bibupload
        ## Only if a named file was given as input
        if task_has_option('extraction-job'):
            cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, perm_file_name)
            errcode = 0
            try:
                errcode = os.system(cmd)
            except OSError, exc:
                write_message('Error: Command %s failed [%s].' % (cmd, exc),
                    stream=sys.stdout, verbose=0)
            if errcode != 0:
                write_message("Error: %s failed, error code is %d." %
                    (cmd, errcode), stream=sys.stdout, verbose=0)
                return 0
            ## Update the extraction_date for each record id,
            ## (only those which have been given to Refextract)
            if task_info['last_updated']:
                ## If the last updated time exists in the db, update it
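A caveat on the os.system() call in the fragment above: on POSIX it returns the raw wait status of the child, not its exit code, so the nonzero test works as a failure check, but reporting the precise exit code needs decoding first. A minimal sketch, using only stdlib calls:

import os

def run_and_get_exit_code(cmd):
    """Run cmd through a shell and return the child's real exit code."""
    status = os.system(cmd)
    if os.WIFEXITED(status):
        return os.WEXITSTATUS(status)  # decoded exit code of the child
    return -1                          # child was killed by a signal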
def _task_run_core():
    """calls extract_references in refextract"""

    def _append_recid_collection_list(collection, current_recids):
        """Updated list of recids with new recids from collection
        @param collection: (string) collection name to use to obtain record
        ids
        @param current_recids: (list) list of current record ids
        which have already been obtained from previous collection or
        recid flags
        @return: (list) current record ids with newly appended recids
        from input collection
        """
        records = get_collection_reclist(collection)
        for r in records:
            if r not in current_recids:
                current_recids.append(r)
        return current_recids

    daemon_cli_opts = { 'treat_as_reference_section' : 0,
                        'fulltext'                   : [],
                        'output_raw'                 : 0,
                        'verbosity'                  : 0,
                        'xmlfile'                    : 0,
                        'dictfile'                   : 0,
                        'inspire'                    : 0,
                        'kb-journal'                 : 0,
                        'kb-report-number'           : 0,
                        'extraction-mode'            : 'ref',
                        'authors'                    : 0,
                        'affiliations'               : 0,
                        'treat_as_raw_section'       : 0,
                      }

    ## Holds the name of the extraction job, and whether it already exists in the db
    task_info = task_get_option('extraction-job')

    ## Now set the cli options, from the set task options list
    if task_has_option('verbose'):
        v = task_get_option('verbose')
        if not v.isdigit():
            daemon_cli_opts['verbosity'] = 0
        elif int(v) not in xrange(0, 10):
            daemon_cli_opts['verbosity'] = 0
        else:
            daemon_cli_opts['verbosity'] = int(v)
    if task_has_option('raw-references'):
        daemon_cli_opts['treat_as_reference_section'] = 1
    if task_has_option('output-raw-refs'):
        daemon_cli_opts['output_raw'] = 1
    if task_has_option('xmlfile'):
        daemon_cli_opts['xmlfile'] = task_get_option('xmlfile')
    if task_has_option('dictfile'):
        daemon_cli_opts['dictfile'] = task_get_option('dictfile')
    if task_has_option('inspire'):
        daemon_cli_opts['inspire'] = 1
    if task_has_option('kb-journal'):
        daemon_cli_opts['kb-journal'] = task_get_option('kb-journal')
    if task_has_option('kb-report-number'):
        daemon_cli_opts['kb-report-number'] = task_get_option('kb-report-number')
    if task_get_option('recids'):
        ## Construct the fulltext argument equivalent from record ids
        ## (records, and arguments, which have valid files)
        try:
            fulltexts_for_collection = \
                _get_fulltext_args_from_recids(task_get_option('recids'), task_info)
            daemon_cli_opts['fulltext'].extend(fulltexts_for_collection)
        except Exception, err:
            write_message('Error: Unable to obtain fulltexts for recid %s. %s' \
                           % (str(task_get_option('recids')), err), \
                           stream=sys.stdout, verbose=0)
            raise StandardError
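The long if-ladder above that copies bibtask options into daemon_cli_opts could also be written data-driven; a sketch of the idea, assuming the same task_has_option()/task_get_option() accessors from invenio.bibtask (everything else here is illustrative):

OPTION_MAP = {
    # task option      -> (cli opt key,                 value converter)
    'raw-references'   : ('treat_as_reference_section', lambda v: 1),
    'output-raw-refs'  : ('output_raw',                 lambda v: 1),
    'inspire'          : ('inspire',                    lambda v: 1),
    'xmlfile'          : ('xmlfile',                    lambda v: v),
    'dictfile'         : ('dictfile',                   lambda v: v),
    'kb-journal'       : ('kb-journal',                 lambda v: v),
    'kb-report-number' : ('kb-report-number',           lambda v: v),
}

def apply_task_options(cli_opts):
    """Copy every set task option into the cli options dict."""
    for opt, (target, convert) in OPTION_MAP.items():
        if task_has_option(opt):
            cli_opts[target] = convert(task_get_option(opt))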
Exemple #29
0
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """ Must be defined for bibtask to create a task """
    if args:
        ## There should be no standalone arguments for any refextract job
        ## This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'.\n" % args[0])

    ## Task name specified
    if key in ('-e', '--extraction-job'):

        ## Make sure that the user is not mixing job name with other defined
        ## Refextract flags on the command line
        if filter(lambda p: task_get_option(p), possible_task_option_keys):
            write_message(
                "Error: command-line parameters and an extraction-job were "
                "specified together."
            )
            write_message(
                "The extraction-job flag cannot be mixed with other cli flags."
            )
            return False

        ## ---- Get the task file with this name
        task_file_dir = os.path.join(CFG_ETCDIR, 'bibedit')
        ## The job file name
        task_file = value + '.cfg'
        abs_path = os.path.join(task_file_dir, task_file)
        try:
            ## Open and readlines from file
            file_hdl = open(abs_path, 'r')
            file_params = file_hdl.readlines()
            file_hdl.close()
        except IOError:
            write_message("Error: Unable to read job file '%s'" % \
                            abs_path, stream=sys.stdout, verbose=0)
            return False
        ## ---- Get the database 'last_updated' value for this name
        xtrJOB_row = _task_name_exists(value)
        ## Build the information for this extraction job
        ## These dictionaries will be extended with extra file parameters
        if xtrJOB_row:
            task_info = {
                'id': xtrJOB_row[0][0],
                'name': xtrJOB_row[0][1],
                'last_updated': xtrJOB_row[0][2],
                'collections': [],
                'recids': [],
            }
        else:
            ## Save the name as the input argument for this job
            task_info = {
                'name': value,
                'last_updated': None,
                'collections': [],
                'recids': [],
            }
        ## ---- Save job parameters
        for p in file_params:
            p = p.strip()
            ## Ignore comments and titles, and skip blank lines
            if (not p) or p.startswith('#') or p.startswith("["):
                continue
            ## Split arguments just once
            p_args = map(lambda x: x.strip(), p.split("=", 1))
            ## Check cfg file param against list of valid params
            if not (p_args[0] in CFG_REFEXTRACT_JOB_FILE_PARAMS):
                write_message("Error: Unknown task param '%s' inside '%s'." \
                              % (p_args[0], task_file),
                    stream=sys.stdout, verbose=0)
                return False

            if p_args[0] == 'collection':
                ## Separate and strip collections
                collections = map(lambda c: c.strip(), p_args[1].split(','))
                task_info['collections'].extend(
                    [c for c in collections if c.strip()])


#FIXME add author extraction functionality
#            elif p_args[0] == 'extraction-mode':
#                if p_args[0] == 'authors':
#                    task_set_option('authors', p_args[1])

            elif p_args[0] == 'recid':
                recids = p_args[1].split(",")
                task_info['recids'].extend([r for r in recids if r.strip()])
            elif len(p_args) == 2:
                ## All other flags
                task_info[p_args[0]] = p_args[1]
            else:
                ## Standalone flag
                task_info[p_args[0]] = 1

        if 'xmlfile' not in task_info:
            task_info['xmlfile'] = _generate_default_xml_out()

        ## Used to flag the creation of a bibupload task
        task_set_option('extraction-job', task_info)

        ## using the extraction-job options...
        ## set the task options
        for option, value in task_info.items():
            if option == 'collections':
                for collection in value:
                    collection_row = _collection_exists(collection)
                    if not collection_row:
                        write_message(
                            "Error: '%s' is not a valid collection." %
                            collection,
                            stream=sys.stdout,
                            verbose=0)
                        return 0
                    ## Use the collection name matched from the database
                    task_get_option(option).append(collection_row[0][0])
            elif option == 'recids':
                for recid in value:
                    if not _recid_exists(recid):
                        write_message("Error: '%s' is not a valid record id." %
                                      recid,
                                      stream=sys.stdout,
                                      verbose=0)
                        return 0
                    ## Add this valid record id to the list of record ids
                    task_get_option(option).append(recid)
            elif option not in ('id', 'name', 'last_updated'):
                ## Usual way of setting options, but this time from the extraction-job file
                task_set_option(option, value)

    else:
        ## Quick check to see if an extraction job has also been specified
        if task_has_option('extraction-job'):
            write_message(
                "Error: command-line parameters and an extraction-job were "
                "specified together."
            )
            write_message(
                "The extraction-job flag cannot be mixed with other cli flags."
            )
            return False

        # Recid option
        elif key in ("-i", "--recid"):
            split_recids = value.split(":")
            if len(split_recids) == 2:
                first = last = valid_range = None
                try:
                    first = int(split_recids[0])
                    last = int(split_recids[1])
                    valid_range = first < last
                except ValueError:
                    write_message(
                        "Error: Range values for --recid must be integers, "
                        "not '%s'." % value,
                        stream=sys.stdout,
                        verbose=0)
                if first is None or last is None:
                    return False
                if not _recid_exists(first) or not _recid_exists(
                        last) or not valid_range:
                    write_message(
                        "Error: '%s' is not a valid range of record ID's." %
                        value,
                        stream=sys.stdout,
                        verbose=0)
                    return False
                ## inclusive range: 'last' was validated above, so include it
                task_get_option('recids').extend(range(first, last + 1))
            else:
                int_val = None
                try:
                    int_val = int(value)
                except ValueError:
                    write_message(
                        "Error: The value specified for --recid must be a "
                        "valid integer, not '%s'." % value,
                        stream=sys.stdout,
                        verbose=0)
                if int_val is None or not _recid_exists(int_val):
                    write_message("Error: '%s' is not a valid record ID." %
                                  value,
                                  stream=sys.stdout,
                                  verbose=0)
                    return False
                task_get_option('recids').append(int_val)
        # Collection option
        elif key in ("-c", "--collection"):
            collection_row = _collection_exists(value)
            if not collection_row:
                write_message("Error: '%s' is not a valid collection." % value,
                              stream=sys.stdout,
                              verbose=0)
                return False
            task_get_option('collections').append(collection_row[0][0])
        elif key in ('-z', '--raw-references'):
            task_set_option('raw-references', True)
        elif key in ('-r', '--output-raw-refs'):
            task_set_option('output-raw-refs', True)
        elif key in ('-x', '--xmlfile'):
            task_set_option('xmlfile', value)
        elif key in ('-d', '--dictfile'):
            task_set_option('dictfile', value)
        elif key in ('-p', '--inspire'):
            task_set_option('inspire', True)
        elif key in ('-j', '--kb-journal'):
            task_set_option('kb-journal', value)
        elif key in ('-n', '--kb-report-number'):
            task_set_option('kb-report-number', value)
    return True
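Finally, a standalone sketch of the --recid argument grammar handled above, with the N:M form treated as an inclusive range (matching the range(first, last + 1) call in the handler); parse_recid_arg is a hypothetical helper for illustration:

def parse_recid_arg(value):
    """Parse 'N' or 'N:M' (inclusive) into a list of record ids."""
    if ":" in value:
        first, last = [int(part) for part in value.split(":", 1)]
        if first > last:
            raise ValueError("invalid record id range %r" % value)
        return list(range(first, last + 1))
    return [int(value)]

assert parse_recid_arg("7") == [7]
assert parse_recid_arg("3:5") == [3, 4, 5]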