Example #1
def fun():
    try:
        return task_run_core(name, core_func, extra_vars)
    except Exception:
        # Remove extra '\n'
        write_message(traceback.format_exc()[:-1])
        raise
Example #2
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.
    """
    write_message("... writing %s" % dirname + os.sep + filename)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        write_message("ERROR: cannot find %s." % cmd, stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s" % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME))
    dummy1, dummy2, dummy3 = run_shell_command(cmd, None, dirname + os.sep + filename)
    if dummy1:
        write_message("ERROR: mysqldump exit code is %s." % repr(dummy1),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    if dummy2:
        write_message("ERROR: mysqldump stdout is %s." % repr(dummy1),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    if dummy3:
        write_message("ERROR: mysqldump stderr is %s." % repr(dummy1),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
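For comparison, the same dump can be driven without hand-built shell quoting by passing an argument list to the standard subprocess module and redirecting stdout to the target file. This is only a generic sketch; the mysqldump path and credentials below are placeholders, not Invenio's CFG_* configuration.

import subprocess

def dump_database_sketch(outfile, host="localhost", user="invenio",
                         password="secret", dbname="invenio"):
    """Sketch: run mysqldump with an argv list so no shell escaping is needed."""
    cmd = ["mysqldump", "--skip-opt", "--add-drop-table", "--add-locks",
           "--create-options", "--quick", "--extended-insert",
           "--set-charset", "--disable-keys",
           "--host=" + host, "--user=" + user,
           "--password=" + password, dbname]
    with open(outfile, "w") as out:
        # A non-zero return code signals a failed dump, mirroring the
        # exit-code check in _dump_database() above.
        return subprocess.call(cmd, stdout=out)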
def pagerank_ext(conv_threshold, check_point, len_, sparse, semi_sparse):
    """the core function of the PAGERANK_EXT method
    returns an array with the ranks coresponding to each recid"""
    weights_old = array((), float32)
    weights_old = ones((len_), float32)
    weights_new = array((), float32)
    converged = False
    nr_of_check_points = 0
    difference = len_
    while not converged:
        nr_of_check_points += 1
        for step in (range(check_point)):
            weights_new = zeros((len_), float32)
            for (i, j) in sparse.keys():
                weights_new[i] += sparse[(i, j)]*weights_old[j]
            total_sum = 0.0
            for j in semi_sparse:
                total_sum += semi_sparse[j]*weights_old[j]
            weights_new[1:len_] = weights_new[1:len_] + total_sum
            if step == check_point - 1:
                diff = weights_new - weights_old
                difference = sqrt(dot(diff, diff))/len_
                write_message("Finished step: %s, %s " \
                    % (str(check_point*(nr_of_check_points-1) + step), \
                        str(difference)), verbose=5)
            weights_old = weights_new.copy()
            converged = (difference < conv_threshold)
    write_message("PageRank calculated for all recids finnished in %s steps. \
The threshold was %s" % (str(nr_of_check_points), \
            str(difference)), verbose=2)
    #return weights_old[1:len_]/(len_ - weights_old[0])
    return weights_old[1:len_]
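A minimal, self-contained illustration of the same power iteration, using hypothetical toy link weights rather than Invenio's citation matrix: the sparse weights are applied repeatedly until the change between iterations falls below the convergence threshold.

from numpy import zeros, ones, float32, sqrt, dot

def toy_pagerank(sparse, len_, conv_threshold=1e-6, max_steps=1000):
    """Sketch of the plain power iteration used above, without check points."""
    weights_old = ones((len_,), float32) / len_
    for step in range(max_steps):
        weights_new = zeros((len_,), float32)
        for (i, j), weight in sparse.items():
            weights_new[i] += weight * weights_old[j]
        diff = weights_new - weights_old
        weights_old = weights_new
        if sqrt(dot(diff, diff)) / len_ < conv_threshold:
            break
    return weights_old

# Example: a tiny 3-node citation cycle with uniform link weights.
print(toy_pagerank({(0, 1): 1.0, (1, 2): 1.0, (2, 0): 1.0}, 3))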
Example #4
def upload_amendments(records, holdingpen):
    """ Upload a modified record """

    if task_get_option("no_upload", False) or len(records) == 0:
        return

    xml = '<collection xmlns="http://www.loc.gov/MARC21/slim">'
    for record in records:
        xml += record_xml_output(record)
    xml += "</collection>"

    tmp_file_fd, tmp_file = mkstemp(
        suffix='.xml',
        prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"),
        dir=CFG_TMPSHAREDDIR
    )
    os.write(tmp_file_fd, xml)
    os.close(tmp_file_fd)
    os.chmod(tmp_file, 0644)
    if holdingpen:
        flag = "-o"
    else:
        flag = "-r"
    task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file)
    write_message("Submitted bibupload task %s" % task)
def get_config_parameter(jobname, parameter_name, is_parameter_collection = False):
    """Detect export method of JOBNAME.  Basically, parse JOBNAME.cfg
       and return export_method.  Return None if problem found."""
    jobconfig = ConfigParser()
    jobconffile = CFG_ETCDIR + os.sep + 'bibexport' + os.sep + jobname + '.cfg'

    if not os.path.exists(jobconffile):
        write_message("ERROR: cannot find config file %s." % jobconffile)
        return None

    jobconfig.read(jobconffile)

    if is_parameter_collection:
        all_items = jobconfig.items(section='export_job')

        parameters = []

        for item_name, item_value in all_items:
            if item_name.startswith(parameter_name):
                parameters.append(item_value)

        return parameters
    else:
        parameter = jobconfig.get('export_job', parameter_name)
        return parameter
Example #6
def extract_package(package, batch_size, delete_zip, directory):
    try:
        extractAll(package, delete_zip, directory)
    except BadZipfile:
        write_message("Error BadZipfile %s", (package,))
        task_update_status("CERROR")
        remove(package)
def get_external_links_from_db(ref, dict_of_ids, reference_indicator):
    """returns a dictionary containing the number of
    external links for each recid
    external link=citation that is not in our database """
    ext_links = {}
    dict_all_ref = {}
    for recid in dict_of_ids:
        dict_all_ref[recid] = 0
        ext_links[dict_of_ids[recid]] = 0
    reference_db_id = reference_indicator[0:2]
    reference_tag_regex = reference_indicator + "[a-z]"
    tag_list = run_sql("select id from bib" + reference_db_id + \
                         "x where tag RLIKE %s", (reference_tag_regex, ))
    tag_set = set()
    for tag in tag_list:
        tag_set.add(tag[0])
    ref_list = run_sql("select id_bibrec, id_bibxxx, field_number from \
                       bibrec_bib" + reference_db_id + "x group by \
                       id_bibrec, field_number")
    for item in ref_list:
        recid = int(item[0])
        id_bib = int(item[1])
        if recid in dict_of_ids and id_bib in tag_set:
            dict_all_ref[recid] += 1
    for recid in dict_of_ids:
        total_links = dict_all_ref[recid]
        internal_links = ref[dict_of_ids[recid]]
        ext_links[dict_of_ids[recid]] = total_links - internal_links
        if ext_links[dict_of_ids[recid]] < 0:
            ext_links[dict_of_ids[recid]] = 0
    write_message("External link information extracted", verbose=2)
    write_message("External links: %s" % str(ext_links), verbose=9)
    return ext_links
Example #8
def submit_records_via_ftp(filename, location=""):
    """Submits given file to FTP server as defined.

    The FTP server uploaded to is controlled with the config variables:

    CFG_FTP_AUTHENTICATION_FILE (netrc_file)
    CFG_FTP_SERVER

    @param filename: file to upload
    @type filename: str

    @param location: location on FTP server. Defaults to root.
    @type location: str
    """
    from invenio.config import (CFG_FTP_SERVER,
                                CFG_FTP_AUTHENTICATION_FILE,)

    try:
        ftp = FtpHandler(CFG_FTP_SERVER, netrc_file=CFG_FTP_AUTHENTICATION_FILE)
        ftp.upload(filename, location)
        ftp.close()
        write_message("%s successfully uploaded to FTP server" % filename)
    except Exception as e:
        write_message("Failed to upload %s to FTP server: %s\n%s"
                      % (filename, str(e), traceback.format_exc()))
def bst_inspire_authority_ids_synchronizer(
        url=SYNC_URL_INSPIRE_RECORDS_SRC, tmp_dir=SYNC_LOCAL_TMP_DIR):
    """Synchronize INSPIRE authority ids.

    :param string url: valid URL to the gzip (.gz) file
    :param string tmp_dir: existing directory path for temporary files
    """
    xml_content = get_inspire_dump(
        url, os.path.join(tmp_dir, SYNC_LOCAL_INSPIRE_RECORDS_FILE_NAME))

    task_sleep_now_if_required()

    authority_ids = parse_inspire_xml(xml_content)

    task_sleep_now_if_required()

    if authority_ids:
        record_ids = get_record_ids()
        write_message(
            "Info: {0} record ids have been requested".format(len(record_ids)))
        if record_ids:
            synchronize(
                record_ids,
                authority_ids,
                os.path.join(tmp_dir, SYNC_LOCAL_CDS_RECORDS_UPDATES_FILE_NAME))
Example #10
def fetch_concerned_arxiv_records(name):
    task_update_progress("Fetching arxiv record ids")

    dummy, last_date = fetch_last_updated(name)

    # Fetch all records inserted since last run
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
        "WHERE `modification_date` >= %s " \
        "AND `creation_date` > NOW() - INTERVAL 7 DAY " \
        "ORDER BY `modification_date`" \
        "LIMIT 5000"
    records = run_sql(sql, [last_date.isoformat()])

    def check_arxiv(recid):
        record = get_record(recid)

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False

    def check_pdf_date(recid):
        doc = get_pdf_doc(recid)
        if doc:
            return doc.md > last_date
        return False

    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]
    records = [(r, mod_date) for r, mod_date in records if check_pdf_date(r)]
    write_message("recids %s" % repr([(r, mod_date.isoformat()) \
                                               for r, mod_date in records]))
    task_update_progress("Done fetching arxiv record ids")
    return records
    def check_nbrecs_for_external_collection(self):
        """Check if the external collections has changed its total number of records, aka nbrecs.
        Rerurns True if the total number of records has changed and False if it's the same"""

        write_message("*** self.nbrecs = %s / self.cal...ion = %s ***" % (str(self.nbrecs), str(self.calculate_nbrecs_for_external_collection())), verbose=6)
        write_message("*** self.nbrecs != self.cal...ion = %s ***" % (str(self.nbrecs != self.calculate_nbrecs_for_external_collection()),), verbose=6)
        return self.nbrecs != self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS)
Example #12
def get_records_to_harvest(parameters):
    """ Get APSRecord to harvest.

    Using the given parameters dict (from bst_apsharvest), we check how
    to get the list of records to process.

    Returns a tuple of (record_list, harvest_from_date, date_checked) where
    record_list is the list of APSRecord instances, harvest_from_date is the
    decided date to harvest from and date_checked is the datetime when the
    harvest was initiated.
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()
    new_harvest_date = None
    harvest_from_date = None
    harvest_until_date = None

    if parameters.get("input_file"):
        # We get input from file
        with open(parameters.get("input_file")) as fd:
            for line in fd.readlines():
                doi = line.strip()
                if not doi:
                    continue
                final_record_list.append(APSRecord(doi=doi))

    if parameters.get("threshold_date"):
        # Input from user. Validate date
        try:
            validate_date(parameters.get("threshold_date"))
        except ValueError, e:
            write_message("Error parsing from_date, use (YYYY-MM-DD): %s" %
                          (str(e),),
                          stream=sys.stderr)
            raise
def perform_insert_record(data_dict, data_dict_ordered, data_list_sorted, value, recid, spacing=CFG_BIBSORT_WEIGHT_DISTANCE):
    """Inserts a new record into all the data structures"""
    #data_dict
    data_dict[recid] = value
    #data_dict_ordered & data_list_sorted
    #calculate at which index the rec should be inserted in data_list_sorted
    index_for_insert = binary_search(data_list_sorted, value, data_dict)
    #we have to calculate the weight of this record in data_dict_ordered
    #and it will be the middle value between its neighbours in the data_list_sorted
    if index_for_insert == len(data_list_sorted):#insert at the end of the list
        #append at the end of the list
        data_list_sorted.append(recid)
        #weight = highest weight + the distance
        data_dict_ordered[recid] = data_dict_ordered[data_list_sorted[index_for_insert - 1]] + spacing
    else:
        if index_for_insert == 0: #insert at the beginning of the list
            left_neighbor_weight = 0
        else:
            left_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert - 1]]
        right_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert]]
        #the recid's weight will be the middle value between left and right
        weight = (right_neighbor_weight - left_neighbor_weight)/2
        if weight < 1: #there is no more space to insert, we have to create some space
            data_list_sorted.insert(index_for_insert, recid)
            data_dict_ordered[recid] = left_neighbor_weight + spacing
            create_space_for_new_weight(index_for_insert, data_dict_ordered, data_list_sorted, spacing)
        else:
            data_list_sorted.insert(index_for_insert, recid)
            data_dict_ordered[recid] = left_neighbor_weight + weight
    write_message("Record %s done." %recid, verbose=5)
    return index_for_insert
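The weight-spacing idea can be shown in isolation with plain Python structures. The following hypothetical helper is not the bibsort code (binary_search and create_space_for_new_weight are replaced by bisect and an inline renumbering): it keeps a sorted value list and a parallel list of spaced integer weights, halving gaps on insert and renumbering the tail when a gap is exhausted.

from bisect import bisect_left

def insert_with_spacing(sorted_values, weights, value, spacing=8):
    """Sketch: insert value keeping weights strictly increasing with gaps."""
    pos = bisect_left(sorted_values, value)
    if pos == len(sorted_values):
        # Append at the end: highest weight plus the distance.
        weight = (weights[-1] if weights else 0) + spacing
    else:
        left = weights[pos - 1] if pos > 0 else 0
        right = weights[pos]
        gap = (right - left) // 2
        if gap < 1:
            # No room between the neighbours: shift the tail to create space.
            for k in range(pos, len(weights)):
                weights[k] += spacing
            weight = left + spacing
        else:
            # The new weight is the middle value between the neighbours.
            weight = left + gap
    sorted_values.insert(pos, value)
    weights.insert(pos, weight)
    return pos

Repeated inserts keep the weights usable as sort keys without rewriting every existing entry, which is the point of the spacing in perform_insert_record().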
def write_to_buckets_table(id_method, bucket_no, bucket_data, bucket_last_value, update_timestamp=True):
    """Serialize the date and write it to the bsrMEHODDATA_BUCKETS"""
    write_message('Writing the data for bucket number %s for ' \
                  'method_id=%s to the database' \
                  %(bucket_no, id_method), verbose=5)
    write_message('Serializing data for bucket number %s' %bucket_no, verbose=5)
    serialized_bucket_data = bucket_data.fastdump()
    date = strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if not update_timestamp:
        try:
            date = run_sql('SELECT last_update from bsrMETHODDATABUCKET WHERE id_bsrMETHOD = %s and bucket_no = %s', \
                           (id_method, bucket_no))[0][0]
        except IndexError:
            pass # keep the generated date
    try:
        write_message('Deleting old data.', verbose=5)
        run_sql("DELETE FROM bsrMETHODDATABUCKET \
                WHERE id_bsrMETHOD = %s AND bucket_no = %s", \
                (id_method, bucket_no, ))
        write_message('Inserting new data.', verbose=5)
        run_sql("INSERT into bsrMETHODDATABUCKET \
            (id_bsrMETHOD, bucket_no, bucket_data, bucket_last_value, last_updated) \
            VALUES (%s, %s, %s, %s, %s)", \
            (id_method, bucket_no, serialized_bucket_data, bucket_last_value, date, ))
    except Error, err:
        write_message("The error [%s] occured when inserting new bibsort data " \
                      "into bsrMETHODATA_BUCKETS table" %err, sys.stderr)
        return False
Example #15
def update_sorting(methods, recids):
    """Runs the updating of the sorting tables for methods and recids
    Recids is a list of integer numbers(record ids)
    but can also contain intervals"""
    method_list = []
    if methods:
        method_list = methods.strip().split(',')

    recid_list = []
    if recids:
        cli_recid_list = recids.strip().split(',')
        for recid in cli_recid_list:
            if recid.find('-') > 0:
                rec_range = recid.split('-')
                try:
                    recid_min = int(rec_range[0])
                    recid_max = int(rec_range[1])
                    for rec in range(recid_min, recid_max + 1):
                        recid_list.append(rec)
                except ValueError, err:
                    write_message("Error: [%s] occurred while trying \
                          to parse the recids argument." %err, sys.stderr)
                    return False
            else:
                recid_list.append(int(recid))
Example #16
def submit_xml(xml, mode, stamp):
    """
    Write temporary xml file and submit for batchupload.
    Do nothing for empty xml.

    @param xml: body xml
    @param mode: mode for upload ['delete' | 'correct']
    @param stamp: additional string in filename
    """

    if not xml:
        return

    xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'\
          + xml + '\n</collection>\n'
    tmp_file_fd, tmp_file = mkstemp(
                            suffix='.xml',
                            prefix="bst_check4template-%s_%s" % (mode, stamp),
                            dir=CFG_TMPSHAREDDIR
                            )
    os.write(tmp_file_fd, xml)
    os.close(tmp_file_fd)
    os.chmod(tmp_file, 0644)
    if mode == 'delete':
        flag = '-d'
    elif mode == 'correct':
        flag = '-c'
    else:
        write_message("Wrong mode: %s" % mode)
        return

    task = task_low_level_submission('bibupload', 'check4template',
                                     flag, tmp_file)
    write_message("Submitted bibupload task %s" % task)
Example #17
def query_records(params):
    """Prduces record IDs from given query parameters

    By passing the appropriate CLI options, we can query here for additional
    records.
    """
    write_message("Querying database (records query)...")
    res = intbitset()
    if params['field'] or params['collection'] or params['pattern']:

        if not params['collection']:
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=params['pattern'],
                                 f=params['field'],
                                 m=params['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None,
                                                   of='id',
                                                   c=params['collection'],
                                                   p=params['pattern'],
                                                   f=params['field']))
    return res
def iterate_over_new(list, fmt):
    "Iterate over list of IDs"
    global total_rec

    formatted_records = ''      # (string-)List of formatted record of an iteration
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        if run_sql('SELECT id FROM bibfmt WHERE id_bibrec=%s AND format=%s', (recID, fmt)):
            run_sql('UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s', (start_date, formatted_record, recID, fmt))
        else:
            run_sql('INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, %s, %s, %s)', (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
Example #19
def find_records(collection, subfields):
    """
    Find records with VOLATILE content.

    @param collection: collection to be checked
    @type  collection: string
    @param subfields: VOLATILE content in tagiic
    @type  subfields: dict
    @return: dict {recid: array of tagiic}
    """

    sf_keys = subfields.keys()
    sf_keys.sort()

    recs_collection = get_collection_reclist(collection)
    recs_to_change = {}
    for tagiic in sf_keys:
        for value in subfields[tagiic]:
            result = search_pattern(p=value, f=tagiic, m='e') & recs_collection
            if result:
                write_message('Update %i records with %s:"%s" -- %s' \
                              % (len(result), tagiic, value, list(result)))
            for recid in result:
                if recs_to_change.has_key(recid):
                    recs_to_change[recid].append(tagiic)
                else:
                    recs_to_change[recid] = [tagiic, ]
    return recs_to_change
    def get_ancestors(self):
        "Returns list of ancestors of the current collection."
        ancestors = []
        ancestors_ids = intbitset()
        id_son = self.id
        while 1:
            query = (
                "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "
                "WHERE cc.id_son=%d AND c.id=cc.id_dad" % int(id_son)
            )
            res = run_sql(query, None, 1)
            if res:
                col_ancestor = get_collection(res[0][1])
                # looking for loops
                if self.id in ancestors_ids:
                    write_message("Loop found in collection %s" % self.name, stream=sys.stderr)
                    raise OverflowError("Loop found in collection %s" % self.name)
                else:
                    ancestors.append(col_ancestor)
                    ancestors_ids.add(col_ancestor.id)
                    id_son = res[0][0]
            else:
                break
        ancestors.reverse()
        return ancestors
Example #21
def iterate_over_new(recIDs, fmt):
    """
    Iterate over list of IDs

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call

    tot = len(recIDs)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        formatted_record, needs_2nd_pass = format_record_1st_pass(recID=recID,
                                                  of=fmt,
                                                  on_the_fly=True,
                                                  save_missing=False)
        save_preformatted_record(recID=recID,
                                 of=fmt,
                                 res=formatted_record,
                                 needs_2nd_pass=needs_2nd_pass,
                                 low_priority=True)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
def build_bai_knowledge():
    ret = {}
    for personid, tag, data in run_sql("SELECT personid, tag, data FROM aidPERSONIDDATA WHERE tag LIKE 'extid:%' OR tag = 'canonical_name' or tag = 'uid'"):
        if tag == 'canonical_name':
            tag = 'BAI'
        elif tag == 'extid:INSPIREID':
            tag = 'INSPIRE'
        elif tag == 'extid:ORCID':
            tag = 'ORCID'
        elif tag == 'extid:KAKEN':
            tag = 'KAKEN'
        elif tag == 'uid':
            tag = 'UID'
        else:
            continue
        data = data.strip()
        if personid not in ret:
            ret[personid] = {'personid': personid}
        if tag in ret[personid]:
            write_message("ERROR: http://inspirehep.net/author/profile/{personid} has invalid IDs".format(personid=personid), stream=sys.stderr)
            continue
        ret[personid][tag] = data.upper()
        if tag == 'BAI':
            ret[personid]['ORIGINAL_BAI'] = data
    return ret.values()
def filter_out_broken_ids(hepname_kb, bai_kb):
    broken = []
    for id_type in ('BAI', 'INSPIRE', 'ORCID', 'KAKEN'):
        broken.extend(project_ids(hepname_kb, id_type)[1])
        broken.extend(project_ids(bai_kb, id_type)[1])
    write_message("Broken entries: {len}".format(len=len(broken)), stream=sys.stderr)
    broken_ids = {}
    for elem in broken:
        for key, value in elem.iteritems():
            if key not in broken_ids:
                broken_ids[key] = set([value])
            else:
                broken_ids[key].add(value)
    new_hepname_kb = []
    for elem in hepname_kb:
        for key, value in elem.iteritems():
            if value in broken_ids.get(key, set()):
                break
        else:
            new_hepname_kb.append(elem)

    new_bai_kb = []
    for elem in bai_kb:
        for key, value in elem.iteritems():
            if value in broken_ids.get(key, set()):
                break
        else:
            new_bai_kb.append(elem)

    return new_hepname_kb, new_bai_kb
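The filtering above relies on Python's for/else: the else branch runs only when the inner loop completes without a break, i.e. when none of the entry's ids is in the broken set. A minimal standalone illustration with made-up data:

entries = [{'BAI': 'A.One.1', 'ORCID': 'X'}, {'BAI': 'B.Two.1'}]
broken_ids = {'ORCID': set(['X'])}

kept = []
for entry in entries:
    for key, value in entry.items():
        if value in broken_ids.get(key, set()):
            break                      # entry shares an id with a broken record
    else:
        kept.append(entry)             # reached only if no break happened

print(kept)                            # [{'BAI': 'B.Two.1'}]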
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)

        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except:
            register_exception(alert_admin=True)
        next_commit_counter = 0

        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
Example #25
def convert_files(xml_files, els, prefix="", threshold_date=None):
    """Convert the list of publisher XML to MARCXML using given instance."""
    results = {}
    for xml_file in xml_files:
        task_sleep_now_if_required()
        full_xml_filepath = join(prefix, xml_file)
        dom_xml = parse(full_xml_filepath)
        date = els.get_publication_information(dom_xml)[-2]
        if threshold_date and date < threshold_date:
            continue
        doctype = els.get_doctype(dom_xml).lower()
        if doctype in INTERESTING_DOCTYPES:
            new_full_xml_filepath = join(dirname(full_xml_filepath),
                                         "upload.xml")
            try:
                converted_xml = els.get_record(
                    full_xml_filepath, refextract_callback=refextract)
            except Exception as e:
                _errors_detected.append(e)
                error_trace = traceback.format_exc()
                # Some error happened, lets gracefully quit
                results[full_xml_filepath] = (StatusCodes.CONVERSION_ERROR,
                                              error_trace)
                write_message('Error converting:'
                              ' \n {0}'.format(error_trace))
                continue
            with open(new_full_xml_filepath, "w") as marcfile:
                marcfile.write(converted_xml)
            results[full_xml_filepath] = (StatusCodes.OK,
                                          new_full_xml_filepath)
        else:
            results[full_xml_filepath] = (StatusCodes.DOCTYPE_WRONG,
                                          doctype)
            write_message("Doctype not interesting: {0}".format(doctype))
    return results
def record_collect_oai_identifiers(record_xml):
    """
    Collects all OAI identifiers from given MARCXML.

    Returns a list of found values in the tag
    CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG.

    @param record_xml: string containing MARCXML to parse

    @return list of identifiers
    """
    result = None
    (record, status_code, list_of_errors) = create_record(record_xml)
    if not status_code:
        # Error happened
        write_message("Error collecting OAI identifier from record: %s" %
                     ("\n".join(list_of_errors),))
    else:
        # All OK! We can get the IDs
        result = record_get_field_values(record,
                                         CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3],
                                         CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3],
                                         CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4],
                                         CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5])
        if not result:
            # No IDs found..
            write_message("No OAI IDs found in record")
    return result
def solr_add_ranges(id_ranges):
    sub_range_length = task_get_option("flush")
    id_ranges_to_index = []
    for id_range in id_ranges:
        lower_recid = id_range[0]
        upper_recid = id_range[1]
        i_low = lower_recid
        while i_low <= upper_recid:
            i_up = min(i_low + sub_range_length - 1, upper_recid)
            id_ranges_to_index.append((i_low, i_up))
            i_low += sub_range_length

    tags_to_index = get_tags()
    # Indexes latest records first by reversing
    # This allows the ranker to return better results during long indexing
    # runs as the ranker cuts the hitset using latest records
    id_ranges_to_index.reverse()
    next_commit_counter = 0
    for id_range_to_index in id_ranges_to_index:
        lower_recid = id_range_to_index[0]
        upper_recid = id_range_to_index[1]
        status_msg = "Solr ranking indexer called for %s-%s" % (lower_recid, upper_recid)
        write_message(status_msg)
        task_update_progress(status_msg)
        next_commit_counter = solr_add_range(lower_recid, upper_recid, tags_to_index, next_commit_counter)

    solr_commit_if_necessary(next_commit_counter, final_commit=True)
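The chunking performed at the top of solr_add_ranges() can be isolated into a small helper; this sketch reproduces the arithmetic of cutting each (low, high) interval into sub-ranges of at most the flush size.

def split_ranges(id_ranges, sub_range_length):
    """Sketch: cut each (low, high) interval into flush-sized sub-ranges."""
    chunks = []
    for lower_recid, upper_recid in id_ranges:
        i_low = lower_recid
        while i_low <= upper_recid:
            i_up = min(i_low + sub_range_length - 1, upper_recid)
            chunks.append((i_low, i_up))
            i_low += sub_range_length
    return chunks

print(split_ranges([(1, 7)], 3))   # [(1, 3), (4, 6), (7, 7)]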
def replace_cites(recid, new_cites):
    """
    Given a set of citations, replaces the citations of given recid
    in the database.
    The changes are logged into rnkCITATIONLOG.

    See @replace_refs
    """
    old_cites = set(row[0] for row in run_sql("""SELECT citer
                                                FROM rnkCITATIONDICT
                                                WHERE citee = %s""", [recid]))

    cites_to_add = new_cites - old_cites
    cites_to_delete = old_cites - new_cites

    for cite in cites_to_add:
        write_message('adding cite %s %s' % (recid, cite), verbose=1)
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        run_sql("""INSERT INTO rnkCITATIONDICT (citee, citer, last_updated)
                   VALUES (%s, %s, %s)""", (recid, cite, now))
        run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date)
                   VALUES (%s, %s, %s, %s)""", (recid, cite, 'added', now))

    for cite in cites_to_delete:
        write_message('deleting cite %s %s' % (recid, cite), verbose=1)
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        run_sql("""DELETE FROM rnkCITATIONDICT
                   WHERE citee = %s and citer = %s""", (recid, cite))
        run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date)
                   VALUES (%s, %s, %s, %s)""", (recid, cite, 'removed', now))
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """
    Generate sitemaps themselves.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap files(s)
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
                                lastmod = lastmod,
                                changefreq = DEFAULT_CHANGEFREQ_RECORDS,
                                priority = DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s" % (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
def align_entries(hepname_kb, bai_kb):
    hepname_updates = {}
    bai_updates = {}
    for key in ('ORCID', 'BAI', 'INSPIRE', 'KAKEN'):
        projected_hepnames = project_ids(hepname_kb, key)[0]
        projected_bais = project_ids(bai_kb, key)[0]
        for id_value in set(projected_hepnames.iterkeys()) & set(projected_bais.iterkeys()):
            merged_entry = dict(projected_hepnames[id_value].items())
            for key, value in projected_bais[id_value].iteritems():
                if key in merged_entry and merged_entry[key].upper() != value.upper():
                    write_message("ERROR: conflicting entries {entry1} Vs. {entry2}".format(entry1=format_entry(projected_hepnames[id_value]), entry2=format_entry(projected_bais[id_value])))
                    break
            else:
                merged_entry.update(projected_bais[id_value])
                if (set(merged_entry.keys()) ^ set(projected_hepnames[id_value].keys())) & set(['ORCID', 'BAI', 'INSPIRE', 'KAKEN']):
                    write_message("INFO: {hepname} should be updated to {merged_entry}".format(hepname=format_entry(projected_hepnames[id_value]), merged_entry=format_entry(merged_entry)))
                    recid = merged_entry['recid']
                    if recid in hepname_updates:
                        if hepname_updates[recid] != merged_entry:
                            write_message("ERROR: conflict for recid {recid}: {entry1} Vs. {entry2}".format(recid=recid, entry1=format_entry(hepname_updates[recid]), entry2=format_entry(merged_entry)), stream=sys.stderr)
                    else:
                        hepname_updates[recid] = merged_entry

                if (set(merged_entry.keys()) ^ set(projected_bais[id_value].keys())) & set(['ORCID', 'BAI', 'INSPIRE', 'KAKEN']):
                    write_message("INFO: {bai} should be updated to {merged_entry}".format(bai=format_entry(projected_bais[id_value]), merged_entry=format_entry(merged_entry)))
                    personid = merged_entry['personid']
                    if personid in bai_updates:
                        if bai_updates[personid] != merged_entry:
                            write_message("ERROR: conflict for personid {personid}: {entry1} Vs. {entry2}".format(personid=personid, entry1=format_entry(bai_updates[recid]), entry2=format_entry(merged_entry)), stream=sys.stderr)
                    else:
                        bai_updates[personid] = merged_entry
    return hepname_updates, bai_updates
Example #31
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    rules_to_reset = task_get_option("reset_rules")
    if rules_to_reset:
        write_message("Resetting the following rules: %s" % rules_to_reset)
        for rule in rules_to_reset:
            reset_rule_last_run(rule)
    plugins = load_plugins()
    rules = load_rules(plugins)
    write_message("Loaded rules: %s" % rules, verbose=9)
    task_set_option('plugins', plugins)
    recids_for_rules = get_recids_for_rules(rules)
    write_message("recids for rules: %s" % recids_for_rules, verbose=9)

    update_database = not (task_has_option('record_ids') or task_get_option(
        'no_upload', False) or task_get_option('no_tickets', False))

    if update_database:
        next_starting_dates = {}
        for rule_name, rule in rules.iteritems():
            next_starting_dates[rule_name] = get_next_starting_date(rule)

    all_recids = intbitset([])
    single_rules = set()
    batch_rules = set()
    for rule_name, rule_recids in recids_for_rules.iteritems():
        all_recids.union_update(rule_recids)
        if plugins[rules[rule_name]["check"]]["batch"]:
            batch_rules.add(rule_name)
        else:
            single_rules.add(rule_name)

    records_to_upload_holdingpen = []
    records_to_upload_replace = []
    records_to_submit_tickets = []
    for batch in iter_batches(all_recids, CFG_BATCH_SIZE):

        for rule_name in batch_rules:
            rule = rules[rule_name]
            rule_recids = recids_for_rules[rule_name]
            task_sleep_now_if_required(can_stop_too=True)
            records = []
            for i, record_id, record in batch:
                if record_id in rule_recids:
                    records.append(record)
            if len(records):
                check_records(rule, records)

        # Then run them through normal rules
        for i, record_id, record in batch:
            progress_percent = int(float(i) / len(all_recids) * 100)
            task_update_progress("Processing record %s/%s (%i%%)." %
                                 (i, len(all_recids), progress_percent))
            write_message("Processing record %s" % record_id)

            for rule_name in single_rules:
                rule = rules[rule_name]
                rule_recids = recids_for_rules[rule_name]
                task_sleep_now_if_required(can_stop_too=True)
                if record_id in rule_recids:
                    check_record(rule, record)

            if record.amended:
                if record.holdingpen:
                    records_to_upload_holdingpen.append(record)
                else:
                    records_to_upload_replace.append(record)

            if not record.valid:
                records_to_submit_tickets.append(record)

        if len(records_to_submit_tickets) >= CFG_BATCH_SIZE:
            Tickets(records_to_submit_tickets).submit()
            records_to_submit_tickets = []
        if len(records_to_upload_holdingpen) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_holdingpen, True)
            records_to_upload_holdingpen = []
        if len(records_to_upload_replace) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_replace, False)
            records_to_upload_replace = []

    ## In case there are still some remaining amended records
    if records_to_submit_tickets:
        Tickets(records_to_submit_tickets).submit()
    if records_to_upload_holdingpen:
        upload_amendments(records_to_upload_holdingpen, True)
    if records_to_upload_replace:
        upload_amendments(records_to_upload_replace, False)

    # Update the database with the last time each rule was run
    if update_database:
        for rule_name, rule in rules.iteritems():
            update_rule_last_run(rule_name, next_starting_dates[rule_name])

    return True
def bibrank_engine(run):
    """Run the indexing task.
    Return 1 in case of success and 0 in case of failure.
    """
    startCreate = time.time()
    try:
        options["run"] = []
        options["run"].append(run)
        for rank_method_code in options["run"]:
            task_sleep_now_if_required(can_stop_too=True)
            cfg_name = getName(rank_method_code)
            write_message("Running rank method: %s." % cfg_name)

            file = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg"
            config = ConfigParser.ConfigParser()
            try:
                config.readfp(open(file))
            except StandardError, e:
                write_message("Cannot find configurationfile: %s" % file,
                              sys.stderr)
                raise StandardError

            cfg_short = rank_method_code
            cfg_function = config.get("rank_method", "function") + "_exec"
            cfg_repair_function = config.get("rank_method",
                                             "function") + "_repair_exec"
            cfg_name = getName(cfg_short)
            options["validset"] = get_valid_range(rank_method_code)

            if task_get_option("collection"):
                l_of_colls = string.split(task_get_option("collection"), ", ")
                recIDs = perform_request_search(c=l_of_colls)
                recIDs_range = []
                for recID in recIDs:
                    recIDs_range.append([recID, recID])
                options["recid_range"] = recIDs_range
            elif task_get_option("id"):
                options["recid_range"] = task_get_option("id")
            elif task_get_option("modified"):
                options["recid_range"] = add_recIDs_by_date(
                    rank_method_code, task_get_option("modified"))
            elif task_get_option("last_updated"):
                options["recid_range"] = add_recIDs_by_date(rank_method_code)
            else:
                write_message("No records specified, updating all", verbose=2)
                min_id = run_sql("SELECT min(id) from bibrec")[0][0]
                max_id = run_sql("SELECT max(id) from bibrec")[0][0]
                options["recid_range"] = [[min_id, max_id]]

            if task_get_option("quick") == "no":
                write_message(
                    "Recalculate parameter not used, parameter ignored.",
                    verbose=9)

            if task_get_option("cmd") == "del":
                del_recids(cfg_short, options["recid_range"])
            elif task_get_option("cmd") == "add":
                func_object = globals().get(cfg_function)
                func_object(rank_method_code, cfg_name, config)
            elif task_get_option("cmd") == "stat":
                rank_method_code_statistics(rank_method_code)
            elif task_get_option("cmd") == "check":
                check_method(rank_method_code)
            elif task_get_option("cmd") == "print-missing":
                func_object = globals().get(cfg_function)
                func_object(rank_method_code, cfg_name, config)
            elif task_get_option("cmd") == "repair":
                func_object = globals().get(cfg_repair_function)
                func_object()
            else:
                write_message(
                    "Invalid command found processing %s" % rank_method_code,
                    sys.stderr)
                raise StandardError
    except StandardError, e:
        write_message("\nException caught: %s" % e, sys.stderr)
        write_message(traceback.format_exc()[:-1])
        register_exception()
        raise StandardError
def single_tag_rank_method_repair_exec():
    """Repair single tag ranking method"""
    write_message(
        "Repairing for this ranking method is not defined. Skipping.")
    return
def file_similarity_by_times_downloaded_repair_exec():
    """Repair file similarity by times downloaded ranking method"""
    write_message(
        "Repairing for this ranking method is not defined. Skipping.")
    return
def download_weight_total_repair_exec():
    """Repair download weight total ranking method"""
    write_message(
        "Repairing for this ranking method is not defined. Skipping.")
    return
def download_weight_filtering_user_repair_exec():
    """Repair download weight filtering user ranking method"""
    write_message(
        "Repairing for this ranking method is not defined. Skipping.")
    return
def showtime(timeused):
    """Show time used for method"""
    write_message("Time used: %d second(s)." % timeused, verbose=9)
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given"""
    write_message("Loading knowledgebase file", verbose=9)
    kb_data = {}
    records = []

    write_message("Reading knowledgebase file: %s" % \
                   config.get(config.get("rank_method", "function"), "kb_src"))
    input = open(config.get(config.get("rank_method", "function"), "kb_src"),
                 'r')
    data = input.readlines()
    for line in data:
        if not line[0:1] == "#":
            kb_data[string.strip(
                (string.split(string.strip(line), "---"))[0])] = (string.split(
                    string.strip(line), "---"))[1]
    write_message("Number of lines read from knowledgebase file: %s" %
                  len(kb_data))

    tag = config.get(config.get("rank_method", "function"), "tag")
    tags = config.get(config.get("rank_method", "function"),
                      "check_mandatory_tags").split(", ")
    if tags == ['']:
        tags = ""

    records = []
    for (recids, recide) in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql(
            "SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s"
            % (tag[0:2], tag[0:2]), (tag, recids, recide))
        valid = intbitset(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = intbitset()
            newset += [
                recid[0] for recid in (run_sql(
                    "SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s"
                    % (tag[0:2], tag[0:2]), (key, recids, recide)))
            ]
            valid.intersection_update(newset)
        if tags:
            recs = filter(lambda x: x[0] in valid, recs)
        records = records + list(recs)
        write_message("Number of records found with the necessary tags: %s" %
                      len(records))

    records = filter(lambda x: x[0] in options["validset"], records)
    rnkset = {}
    for key, value in records:
        if kb_data.has_key(value):
            if not rnkset.has_key(key):
                rnkset[key] = float(kb_data[value])
            else:
                if kb_data.has_key(
                        rnkset[key]) and float(kb_data[value]) > float(
                            (rnkset[key])[1]):
                    rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0

    write_message("Number of records available in rank method: %s" %
                  len(rnkset))
    return rnkset
Example #39
def guest_user_garbage_collector():
    """Session Garbage Collector

    program flow/tasks:
    1: delete expired sessions
    1b:delete guest users without session
    2: delete queries not attached to any user
    3: delete baskets not attached to any user
    4: delete alerts not attached to any user
    5: delete expired mailcookies
    5b: delete expired not confirmed email address
    6: delete expired roles memberships

    verbose - level of program output.
              0 - nothing
              1 - default
              9 - max, debug"""

    # dictionary used to keep track of number of deleted entries
    delcount = {
        'session': 0,
        'user': 0,
        'user_query': 0,
        'query': 0,
        'bskBASKET': 0,
        'user_bskBASKET': 0,
        'bskREC': 0,
        'bskRECORDCOMMENT': 0,
        'bskEXTREC': 0,
        'bskEXTFMT': 0,
        'user_query_basket': 0,
        'mail_cookie': 0,
        'email_addresses': 0,
        'role_membership': 0
    }

    write_message("CLEANING OF GUEST SESSIONS STARTED")

    # 1 - DELETE EXPIRED SESSIONS
    write_message("- deleting expired sessions")
    timelimit = convert_datestruct_to_datetext(time.gmtime())
    write_message("  DELETE FROM session WHERE"
                  " session_expiry < %s \n" % (timelimit, ),
                  verbose=9)
    delcount['session'] += run_sql(
        "DELETE FROM session WHERE"
        " session_expiry < %s "
        "", (timelimit, ))

    # 1b - DELETE GUEST USERS WITHOUT SESSION
    write_message("- deleting guest users without session")

    # get uids
    write_message(
        """  SELECT u.id\n  FROM user AS u LEFT JOIN session AS s\n  ON u.id = s.uid\n  WHERE s.uid IS NULL AND u.email = ''""",
        verbose=9)

    result = run_sql("""SELECT u.id
    FROM user AS u LEFT JOIN session AS s
    ON u.id = s.uid
    WHERE s.uid IS NULL AND u.email = ''""")
    write_message(result, verbose=9)

    if result:
        # work on slices of result list in case of big result
        for i in range(0, len(result), CFG_MYSQL_ARGUMENTLIST_SIZE):
            # create string of uids
            uidstr = ''
            for (id_user, ) in result[i:i + CFG_MYSQL_ARGUMENTLIST_SIZE]:
                if uidstr: uidstr += ','
                uidstr += "%s" % (id_user, )

            # delete users
            write_message(
                "  DELETE FROM user WHERE"
                " id IN (TRAVERSE LAST RESULT) AND email = '' \n",
                verbose=9)
            delcount['user'] += run_sql("DELETE FROM user WHERE"
                                        " id IN (%s) AND email = ''" %
                                        (uidstr, ))

    # 2 - DELETE QUERIES NOT ATTACHED TO ANY USER

    # first step, delete from user_query
    write_message("- deleting user_queries referencing" " non-existent users")

    # find user_queries referencing non-existent users
    write_message(
        "  SELECT DISTINCT uq.id_user\n"
        "  FROM user_query AS uq LEFT JOIN user AS u\n"
        "  ON uq.id_user = u.id\n  WHERE u.id IS NULL",
        verbose=9)
    result = run_sql("""SELECT DISTINCT uq.id_user
        FROM user_query AS uq LEFT JOIN user AS u
        ON uq.id_user = u.id
        WHERE u.id IS NULL""")
    write_message(result, verbose=9)

    # delete in user_query one by one
    write_message(
        "  DELETE FROM user_query WHERE"
        " id_user = '******' \n",
        verbose=9)
    for (id_user, ) in result:
        delcount['user_query'] += run_sql("""DELETE FROM user_query
            WHERE id_user = %s""" % (id_user, ))

    # delete the actual queries
    write_message("- deleting queries not attached to any user")

    # select queries that must be deleted
    write_message(
        """  SELECT DISTINCT q.id\n  FROM query AS q LEFT JOIN user_query AS uq\n  ON uq.id_query = q.id\n  WHERE uq.id_query IS NULL AND\n  q.type <> 'p' """,
        verbose=9)
    result = run_sql("""SELECT DISTINCT q.id
                        FROM query AS q LEFT JOIN user_query AS uq
                        ON uq.id_query = q.id
                        WHERE uq.id_query IS NULL AND
                              q.type <> 'p'""")
    write_message(result, verbose=9)

    # delete queries one by one
    write_message(
        """  DELETE FROM query WHERE id = 'TRAVERSE LAST RESULT' \n""",
        verbose=9)
    for (id_user, ) in result:
        delcount['query'] += run_sql("""DELETE FROM query WHERE id = %s""",
                                     (id_user, ))

    # 3 - DELETE BASKETS NOT OWNED BY ANY USER
    write_message("- deleting baskets not owned by any user")

    # select basket ids
    write_message(
        """ SELECT ub.id_bskBASKET\n  FROM user_bskBASKET AS ub LEFT JOIN user AS u\n  ON u.id = ub.id_user\n  WHERE u.id IS NULL""",
        verbose=9)
    try:
        result = run_sql("""SELECT ub.id_bskBASKET
                              FROM user_bskBASKET AS ub LEFT JOIN user AS u
                                ON u.id = ub.id_user
                             WHERE u.id IS NULL""")
    except:
        result = []
    write_message(result, verbose=9)

    # delete from user_basket and basket one by one
    write_message(
        """  DELETE FROM user_bskBASKET WHERE id_bskBASKET = 'TRAVERSE LAST RESULT' """,
        verbose=9)
    write_message(
        """  DELETE FROM bskBASKET WHERE id = 'TRAVERSE LAST RESULT' """,
        verbose=9)
    write_message(
        """  DELETE FROM bskREC WHERE id_bskBASKET = 'TRAVERSE LAST RESULT'""",
        verbose=9)
    write_message(
        """  DELETE FROM bskRECORDCOMMENT WHERE id_bskBASKET = 'TRAVERSE LAST RESULT' \n""",
        verbose=9)
    for (id_basket, ) in result:
        delcount['user_bskBASKET'] += run_sql(
            """DELETE FROM user_bskBASKET WHERE id_bskBASKET = %s""",
            (id_basket, ))
        delcount['bskBASKET'] += run_sql(
            """DELETE FROM bskBASKET WHERE id = %s""", (id_basket, ))
        delcount['bskREC'] += run_sql(
            """DELETE FROM bskREC WHERE id_bskBASKET = %s""", (id_basket, ))
        delcount['bskRECORDCOMMENT'] += run_sql(
            """DELETE FROM bskRECORDCOMMENT WHERE id_bskBASKET = %s""",
            (id_basket, ))
    write_message(
        """ SELECT DISTINCT ext.id, rec.id_bibrec_or_bskEXTREC FROM bskEXTREC AS ext \nLEFT JOIN bskREC AS rec ON ext.id=-rec.id_bibrec_or_bskEXTREC WHERE id_bibrec_or_bskEXTREC is NULL""",
        verbose=9)
    try:
        result = run_sql("""SELECT DISTINCT ext.id FROM bskEXTREC AS ext
                            LEFT JOIN bskREC AS rec ON ext.id=-rec.id_bibrec_or_bskEXTREC
                            WHERE id_bibrec_or_bskEXTREC is NULL""")
    except:
        result = []
    write_message(result, verbose=9)
    write_message(
        """  DELETE FROM bskEXTREC WHERE id = 'TRAVERSE LAST RESULT' """,
        verbose=9)
    write_message(
        """  DELETE FROM bskEXTFMT WHERE id_bskEXTREC = 'TRAVERSE LAST RESULT' \n""",
        verbose=9)
    for (id_basket, ) in result:
        delcount['bskEXTREC'] += run_sql(
            """DELETE FROM bskEXTREC WHERE id=%s""", (id_basket, ))
        delcount['bskEXTFMT'] += run_sql(
            """DELETE FROM bskEXTFMT WHERE id_bskEXTREC=%s""", (id_basket, ))

    # 4 - DELETE ALERTS NOT OWNED BY ANY USER
    write_message('- deleting alerts not owned by any user')

    # select user ids in uqb that reference non-existent users
    write_message(
        """SELECT DISTINCT uqb.id_user FROM user_query_basket AS uqb LEFT JOIN user AS u ON uqb.id_user = u.id WHERE u.id IS NULL""",
        verbose=9)
    result = run_sql(
        """SELECT DISTINCT uqb.id_user FROM user_query_basket AS uqb LEFT JOIN user AS u ON uqb.id_user = u.id WHERE u.id IS NULL"""
    )
    write_message(result, verbose=9)

    # delete all these entries
    for (id_user, ) in result:
        write_message(
            """DELETE FROM user_query_basket WHERE id_user = '%s'""" % (id_user, ),
            verbose=9)
        delcount['user_query_basket'] += run_sql(
            """DELETE FROM user_query_basket WHERE id_user = %s """,
            (id_user, ))

    # 5 - delete expired mailcookies
    write_message("""mail_cookie_gc()""", verbose=9)
    delcount['mail_cookie'] = mail_cookie_gc()

    ## 5b - delete expired not confirmed email address
    write_message(
        """DELETE FROM user WHERE note='2' AND NOW()>ADDTIME(last_login, '%s 0:0:0')"""
        % CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS,
        verbose=9)
    delcount['email_addresses'] = run_sql(
        """DELETE FROM user WHERE note='2' AND NOW()>ADDTIME(last_login, '%s 0:0:0')""",
        (CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS, ))

    # 6 - delete expired roles memberships
    write_message("""DELETE FROM user_accROLE WHERE expiration<NOW()""",
                  verbose=9)
    delcount['role_membership'] = run_sql(
        """DELETE FROM user_accROLE WHERE expiration<NOW()""")

    # print STATISTICS
    write_message("""- statistics about deleted data: """)
    write_message("""  %7s sessions.""" % (delcount['session'], ))
    write_message("""  %7s users.""" % (delcount['user'], ))
    write_message("""  %7s user_queries.""" % (delcount['user_query'], ))
    write_message("""  %7s queries.""" % (delcount['query'], ))
    write_message("""  %7s baskets.""" % (delcount['bskBASKET'], ))
    write_message("""  %7s user_baskets.""" % (delcount['user_bskBASKET'], ))
    write_message("""  %7s basket_records.""" % (delcount['bskREC'], ))
    write_message("""  %7s basket_external_records.""" %
                  (delcount['bskEXTREC'], ))
    write_message("""  %7s basket_external_formats.""" %
                  (delcount['bskEXTFMT'], ))
    write_message("""  %7s basket_comments.""" %
                  (delcount['bskRECORDCOMMENT'], ))
    write_message("""  %7s user_query_baskets.""" %
                  (delcount['user_query_basket'], ))
    write_message("""  %7s mail_cookies.""" % (delcount['mail_cookie'], ))
    write_message("""  %7s non confirmed email addresses.""" %
                  delcount['email_addresses'])
    write_message("""  %7s role_memberships.""" %
                  (delcount['role_membership'], ))
    write_message("""CLEANING OF GUEST SESSIONS FINISHED""")
def rank_method_code_statistics(rank_method_code):
    """Print statistics"""

    method = fromDB(rank_method_code)
    # numeric sentinels for the lowest/highest ranking values
    max = -999999
    maxcount = 0
    min = 999999
    mincount = 0

    for (recID, value) in method.iteritems():
        if value < min and value > 0:
            min = value
        if value > max:
            max = value

    for (recID, value) in method.iteritems():
        if value == min:
            mincount += 1
        if value == max:
            maxcount += 1

    write_message("Showing statistic for selected method")
    write_message("Method name: %s" % getName(rank_method_code))
    write_message("Short name: %s" % rank_method_code)
    write_message("Last run: %s" % get_lastupdated(rank_method_code))
    write_message("Number of records: %s" % len(method))
    write_message("Lowest value: %s - Number of records: %s" % (min, mincount))
    write_message("Highest value: %s - Number of records: %s" %
                  (max, maxcount))
    write_message("Divided into 10 sets:")
    for i in range(1, 11):
        setcount = 0
        distinct_values = {}
        lower = -1.0 + ((float(max + 1) / 10)) * (i - 1)
        upper = -1.0 + ((float(max + 1) / 10)) * i
        for (recID, value) in method.iteritems():
            if value >= lower and value <= upper:
                setcount += 1
                distinct_values[value] = 1
        write_message("Set %s (%s-%s) %s Distinct values: %s" %
                      (i, lower, upper, len(distinct_values), setcount))
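
# A minimal, hypothetical helper (not part of the original module) that just
# illustrates the bin boundaries computed by the loop above: with max == 9 the
# ten bins are each (max + 1) / 10 = 1.0 wide, starting from -1.0.
def _rank_bins(max_value, nr_bins=10):
    """Return the (lower, upper) boundaries used by the statistics loop."""
    width = float(max_value + 1) / nr_bins
    return [(-1.0 + width * (i - 1), -1.0 + width * i)
            for i in range(1, nr_bins + 1)]

# _rank_bins(9)[0] == (-1.0, 0.0) and _rank_bins(9)[-1] == (8.0, 9.0)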
Beispiel #41
0
        filenames = os.listdir(rss_cache_dir)
    except OSError:
        filenames = []
    count = 0
    for filename in filenames:
        filename = os.path.join(rss_cache_dir, filename)
        last_update_time = datetime.datetime.fromtimestamp(
            os.stat(os.path.abspath(filename)).st_mtime)
        if not (datetime.datetime.now() < last_update_time +
                datetime.timedelta(minutes=CFG_WEBSEARCH_RSS_TTL)):
            try:
                os.remove(filename)
                count += 1
            except OSError, e:
                write_message("Error: %s" % e)
    write_message("""%s rss cache file pruned out of %s.""" %
                  (count, len(filenames)))
    write_message("""CLEANING OF OLD CACHED RSS REQUEST FINISHED""")

    write_message("""CLEANING OF OLD CACHED WEBJOURNAL FILES STARTED""")
    webjournal_cache_dir = "%s/webjournal/" % CFG_CACHEDIR
    filenames = []
    try:
        for root, dummy, files in os.walk(webjournal_cache_dir):
            filenames.extend(
                os.path.join(root, filename) for filename in files)
    except OSError:
        pass
    count = 0
    for filename in filenames:
        filename = os.path.join(webjournal_cache_dir, filename)
        last_update_time = datetime.datetime.fromtimestamp(
Beispiel #42
0
def gc_exec_command(command):
    """ Exec the command logging in appropriate way its output."""
    write_message('  %s' % command, verbose=9)
    (dummy, output, errors) = os.popen3(command)
    write_messages(errors.read())
    write_messages(output.read())
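
# Hedged alternative sketch, not from the original module: os.popen3 is
# deprecated, so the same "run a shell command and log its output" behaviour
# could be expressed with subprocess; write_message is assumed to be the
# logging helper used throughout these tasks.
import subprocess

def gc_exec_command_subprocess(command):
    """Exec the command, logging its stdout and stderr via write_message."""
    write_message('  %s' % command, verbose=9)
    proc = subprocess.Popen(command, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    output, errors = proc.communicate()
    if errors:
        write_message(errors)
    if output:
        write_message(output)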
def perform_fulltext_harvest(record_list,
                             add_metadata,
                             attach_fulltext,
                             hidden_fulltext,
                             out_folder,
                             threshold_date=None,
                             journal_mappings=None):
    """
    For every record in the given list of APSRecord(record ID, DOI, date last
    updated), yield an APSRecord with an added FFT dictionary containing the
    URL to the fulltext/metadata XML downloaded locally.

    If a download is unsuccessful, an error message is given.

    @return: tuple of (APSRecord, error_message)
    """
    count = 0
    request_end = None
    request_start = None
    for record in record_list:
        task_sleep_now_if_required(can_stop_too=False)
        # Unless this is the first request, let's sleep a bit
        if request_end and request_start:
            request_dt = request_end - request_start
            write_message("Checking request time (%d)" % (request_dt, ),
                          verbose=3)
            if count and request_dt > 0 and request_dt < CFG_APSHARVEST_REQUEST_TIMEOUT:
                write_message("Initiating sleep for %.1f seconds" %
                              (request_dt, ),
                              verbose=3)
                time.sleep(request_dt)

        count += 1
        task_update_progress("Harvesting record (%d/%d)" %
                             (count, len(record_list)))

        if not record.doi:
            msg = "No DOI found for record %d" % (record.recid or "", )
            write_message("Error: %s" % (msg, ), stream=sys.stderr)
            yield record, msg
            continue

        url = CFG_APSHARVEST_FULLTEXT_URL % {'doi': record.doi}
        result_file = os.path.join(out_folder,
                                   "%s.zip" % (record.doi.replace('/', '_')))
        try:
            request_start = time.time()
            if os.path.exists(result_file):
                # File already downloaded recently, let's see if it is the same
                file_last_modified = get_file_modified_date(result_file)
                if not compare_datetime_to_iso8601_date(
                        file_last_modified, record.last_modified):
                    # File is not older than APS version, we should not download.
                    raise APSHarvesterFileExits

            write_message("Trying to save to %s" % (result_file, ), verbose=5)

            result_file = download_url(url=url,
                                       download_to_file=result_file,
                                       content_type="zip",
                                       retry_count=5,
                                       timeout=60.0)
            write_message("Downloaded %s to %s" % (url, result_file),
                          verbose=2)
        except InvenioFileDownloadError, e:
            msg = "URL could not be opened: %s" % (url, )
            write_message("Error: %s" % (msg, ), stream=sys.stderr)
            yield record, msg
            continue

        except APSHarvesterFileExits:
            write_message("File exists at %s" % (result_file, ), verbose=2)
Beispiel #44
0
def clean_tempfiles():
    """ Clean old temporary files. """
    write_message("""CLEANING OF TMP FILES STARTED""")
    write_message("- deleting/gzipping temporary empty/old "
                  "BibReformat xml files")
    vstr = task_get_option('verbose') > 1 and '-v' or ''
    gc_exec_command('find %s %s -name "rec_fmt_*"'
        ' -size 0c -exec rm %s -f {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, vstr))
    gc_exec_command('find %s %s -name "rec_fmt_*"'
        ' -atime +%s -exec rm %s -f {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_RM_FMT, vstr))
    gc_exec_command('find %s %s -name "rec_fmt_*"'
        ' -atime +%s -exec gzip %s -9 {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_ZIP_FMT, vstr))

    write_message("- deleting/gzipping temporary old " "OAIHarvest xml files")
    gc_exec_command('find %s %s -name "oaiharvestadmin.*"'
        ' -exec rm %s -f {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, vstr))
    gc_exec_command('find %s %s -name "bibconvertrun.*"'
        ' -exec rm %s -f {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, vstr))
    # Using mtime and -r here to include directories.
    gc_exec_command('find %s %s -name "oaiharvest*"'
        ' -mtime +%s -exec gzip %s -9 {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_ZIP_OAI, vstr))
    gc_exec_command('find %s %s -name "oaiharvest*"'
        ' -mtime +%s -exec rm %s -rf {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_RM_OAI, vstr))
    gc_exec_command('find %s %s -name "oai_archive*"'
        ' -mtime +%s -exec rm %s -rf {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_RM_OAI, vstr))

    write_message("- deleting/gzipping temporary old " "BibSword files")
    gc_exec_command('find %s %s -name "bibsword_*"'
        ' -atime +%s -exec rm %s -f {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_RM_BIBSWORD, vstr))
    gc_exec_command('find %s %s -name "bibsword_*"'
        ' -atime +%s -exec gzip %s -9 {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_ZIP_BIBSWORD, vstr))

    # DELETE ALL FILES CREATED DURING VIDEO SUBMISSION
    write_message("- deleting old video submissions")
    gc_exec_command('find %s -name %s* -atime +%s -exec rm %s -f {} \;' \
                    % (CFG_TMPSHAREDDIR, CFG_WEBSUBMIT_TMP_VIDEO_PREFIX,
                       CFG_MAX_ATIME_WEBSUBMIT_TMP_VIDEO, vstr))

    write_message("- deleting temporary old " "RefExtract files")
    gc_exec_command('find %s %s -name "refextract*"'
        ' -atime +%s -exec rm %s -f {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR,
               CFG_MAX_ATIME_RM_REFEXTRACT, vstr))

    write_message("- deleting temporary old bibdocfiles")
    gc_exec_command('find %s %s -name "bibdocfile_*"'
        ' -atime +%s -exec rm %s -f {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_RM_BIBDOC, vstr))

    write_message("- deleting old temporary WebSubmit icons")
    gc_exec_command('find %s %s -name "websubmit_icon_creator_*"'
        ' -atime +%s -exec rm %s -f {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_RM_ICON, vstr))

    write_message("- deleting old temporary WebSubmit stamps")
    gc_exec_command('find %s %s -name "websubmit_file_stamper_*"'
        ' -atime +%s -exec rm %s -f {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_RM_STAMP, vstr))

    write_message("- deleting old temporary WebJournal XML files")
    gc_exec_command('find %s %s -name "webjournal_publish_*"'
        ' -atime +%s -exec rm %s -f {} \;' \
            % (CFG_TMPDIR, CFG_TMPSHAREDDIR, \
               CFG_MAX_ATIME_RM_WEBJOURNAL_XML, vstr))

    write_message("- deleting old temporary files attached with CKEditor")
    gc_exec_command('find %s/var/tmp/attachfile/ '
        ' -atime +%s -exec rm %s -f {} \;' \
            % (CFG_PREFIX, CFG_MAX_ATIME_RM_WEBSUBMIT_CKEDITOR_FILE,
               vstr))

    write_message("- deleting old temporary files attached with BibEdit")
    gc_exec_command('find %s -name "bibedit*.tmp"'
        ' -atime +%s -exec rm %s -f {} \;' \
            % (CFG_TMPSHAREDDIR + '/bibedit-cache/', CFG_MAX_ATIME_BIBEDIT_TMP,
               vstr))

    write_message("- deleting old XML files submitted via BibEdit")
    gc_exec_command('find %s -name "bibedit*.xml"'
        ' -atime +%s -exec rm %s -f {} \;' \
            % (CFG_TMPSHAREDDIR + '/bibedit-cache/', CFG_MAX_ATIME_BIBEDIT_XML,
               vstr))

    write_message("""CLEANING OF TMP FILES FINISHED""")
Beispiel #45
0
def download_one(recid, version):
    """Download given version of the PDF from arxiv"""
    write_message('fetching %s' % recid)
    for count, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if count != 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        url_for_pdf = build_arxiv_url(arxiv_id, version)
        filename_arxiv_id = arxiv_id.replace('/', '_')
        temp_file = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                       dir=CFG_TMPSHAREDDIR,
                                       suffix="%s.pdf" % filename_arxiv_id)
        write_message('downloading pdf from %s' % url_for_pdf)
        path = download_external_url(url_for_pdf,
                                     temp_file.name,
                                     content_type='pdf')

        # Check that it is not an HTML "not found" page
        filesize = os.path.getsize(path)
        if filesize < 25000:
            f = open(path)
            try:
                for line in f:
                    if 'PDF unavailable' in line:
                        raise PdfNotAvailable()
            finally:
                f.close()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")
        if not bibdocfiles:
            # Maybe this is one of those INSPIRE-PUBLIC records with
            # an arXiv file still attached to it
            for name, bibdoc in docs.list_bibdocs_by_names().items():
                if name.startswith('arXiv:'):
                    bibdocfiles = bibdoc.list_latest_files()

        bibdocfiles = [
            bibdocfile for bibdocfile in bibdocfiles
            if bibdocfile.get_superformat() == '.pdf'
        ]

        needs_update = False
        try:
            bibdocfile = bibdocfiles[0]
        except IndexError:
            bibdocfile = None
            needs_update = True
        else:
            existing_md5 = calculate_md5(bibdocfile.fullpath)
            new_md5 = calculate_md5(path.encode('utf-8'))
            if new_md5 != existing_md5:
                write_message('md5 differs updating')
                needs_update = True
            else:
                write_message('md5 matches existing pdf, skipping')

        if needs_update:
            if bibdocfiles:
                write_message('adding as new version')
                docs.add_new_version(path, docname=bibdocfile.name)
            else:
                write_message('adding as new file')
                docs.add_new_file(path,
                                  doctype="arXiv",
                                  docname="arXiv:%s" % filename_arxiv_id)
        else:
            raise FoundExistingPdf()
Beispiel #46
0
def create_MARCXML(figures, id_fulltext, code, extracted, write_file=True):
    """
	Function that creates a file MARCXML from the vector of figures
	
	@param figures: the list of all figures
	@param id_fulltext: the id of the fulltext
	@param code: The code for Latex, PDF or both 
	@param extracted: where the file will be generated
	@param write_file: it's True when the user wants to write the data into file
	
	@return: the path to the MARCXML file 
	"""
    both_doc = 0
    no_latex = 1
    no_pdf = 2
    list = []
    list.append('<?xml version="1.0" encoding="UTF-8"?>')
    list.append('<collection>')

    figure_number = 1
    parent_id = -1
    for figure in figures:
        if figure.subfigure != None:
            if 'is subfigure of' in figure.subfigure:
                print 'ok'
                print figure.identifier
        list.append('<record>')
        for i in range(len(figure.files)):
            text_references = ""
            if not figure.files[i].path.endswith("context"):
                list.append('    <datafield tag="FFT" ind1=" " ind2=" ">')
                list.append('      <subfield code="a">' +
                            figure.files[i].path + '</subfield>')
                list.append(
                    '      <subfield code="r">restricted_pict</subfield>')
                list.append('      <subfield code="n">' + figure.identifier +
                            '</subfield>')
                list.append('      <subfield code="d">' + figure.caption +
                            '</subfield>')
                if i == 0:
                    if (figure.subfigure != None):
                        if 'is subfigure of' in figure.subfigure:
                            list.append(
                                '      <subfield code="i">TMP:SUBFIGURE:' +
                                str(id_fulltext) + ':' +
                                str(figure.subfigure_id) + '</subfield>')
                            list.append(
                                '      <subfield code="v">TMP:SUBFIGURE:' +
                                str(id_fulltext) + ':v' +
                                str(figure.subfigure_id) + '</subfield>')
                    else:
                        list.append('      <subfield code="i">TMP:' +
                                    str(id_fulltext) + ':' +
                                    str(figure_number) + '</subfield>')
                        list.append('      <subfield code="v">TMP:' +
                                    str(id_fulltext) + ':v' +
                                    str(figure_number) + '</subfield>')
                        if figure.is_parent != None:
                            if 'is parent' in figure.is_parent:
                                parent_id = figure_number
                list.append('    </datafield>')

            else:
                text_references = figure.text_references
            if i < len(figure.files) - 1:
                list.append('\n')


#			if i == len(figure.files) - 1:
#				list.append('  </record>')
# if we have the fulltext pdf we add the BDR tag after the FFT tag
            if code != no_pdf and i == len(figure.files) - 1:
                list.append('    <datafield tag="BDR" ind1=" " ind2=" ">')
                #				list.append('	<subfield code="i">TMP:OAI:' + str(figure_number) + '</subfield>')
                #				list.append('	<subfield code="v">TMP:OAI:' + str(figure_number) + 'v' + '</subfield>')
                # id, v1, id, v
                if (figure.subfigure != None):
                    if 'is subfigure of' in figure.subfigure:
                        list.append('      <subfield code="i">TMP:SUBFIGURE:' +
                                    str(id_fulltext) + ':' +
                                    str(figure.subfigure_id) + '</subfield>')
                        list.append('      <subfield code="v">TMP:SUBFIGURE:' +
                                    str(id_fulltext) + ':v' +
                                    str(figure.subfigure_id) + '</subfield>')
                        list.append('      <subfield code="j">TMP:' +
                                    str(id_fulltext) + ':' + str(parent_id) +
                                    '</subfield>')
                        list.append('      <subfield code="w">TMP:' +
                                    str(id_fulltext) + ':v' + str(parent_id) +
                                    '</subfield>')
                        list.append(
                            '      <subfield code="t">is_subfigure_of</subfield>'
                        )
                        figure_number = figure_number - 1
                else:

                    list.append('      <subfield code="i">TMP:' +
                                str(id_fulltext) + ':' + str(figure_number) +
                                '</subfield>')
                    list.append('      <subfield code="v">TMP:' +
                                str(id_fulltext) + ':v' + str(figure_number) +
                                '</subfield>')
                    list.append('      <subfield code="j">TMP:' +
                                str(id_fulltext) + '</subfield>')
                    list.append('      <subfield code="w">TMP:' +
                                str(id_fulltext) + ':v' + '</subfield>')
                    list.append(
                        '      <subfield code="t">is_extracted_from</subfield>'
                    )
                dict = {}
                dict["figures"] = {}
                dict["figures"]["caption"] = figure.caption
                write_message("adding field figure.caption")
                v = ["location", "caption_location"]
                for i, item in enumerate(v):
                    if (figure.get_location(i) != None):
                        dict["figures"][item] = {}
                        dict["figures"][item][
                            "page_num"] = figure.get_location(i).page_num
                        write_message("adding figure.get_location.page_num")
                        if (figure.get_location(i).page_resolution != None):
                            dict["figures"][item]["page_resolution"] = {}
                            dict["figures"][item]["page_resolution"][
                                "width"] = figure.get_location(
                                    i).page_resolution.width
                            write_message(
                                "adding figure.get_location.page_resolution.page_num_width"
                            )
                            dict["figures"][item]["page_resolution"][
                                "height"] = figure.get_location(
                                    i).page_resolution.height
                            write_message(
                                "adding figure.get_location.page_resolution.height"
                            )
                        if (figure.get_location(i).boundary != None):
                            dict["figures"][item]["boundary"] = {}
                            dict["figures"][item]["boundary"][
                                "width"] = figure.get_location(
                                    i).boundary.width
                            write_message(
                                "adding figure.get_location.boundary.width")
                            dict["figures"][item]["boundary"][
                                "height"] = figure.get_location(
                                    i).boundary.height
                            write_message(
                                "adding figure.get_location.boundary.height")
                            dict["figures"][item]["boundary"][
                                "x"] = figure.get_location(i).boundary.x
                            write_message(
                                "adding figure.get_location.boundary.x")
                            dict["figures"][item]["boundary"][
                                "y"] = figure.get_location(i).boundary.y
                            write_message(
                                "adding figure.get_location.boundary.y")
                        if i == 0:
                            dict["figures"][item][
                                "page_scale"] = figure.get_location(
                                    i).page_scale
                            write_message(
                                "adding figure.get_location.page_scale")
                dict["figures"]["text_references"] = text_references
                write_message("adding figure.text_references")

                d = cPickle.dumps(dict)
                info = base64.encodestring(d)
                list.append('      <subfield code="m">' + info + '</subfield>')
                list.append('    </datafield>')
        figure_number = figure_number + 1
        list.append('</record>')
    list.append('</collection>')
    marc = '\n'.join(list)
    marc_path = None
    if write_file:
        marc_path = str(extracted) + "/extracted.xml"
        f = codecs.open(marc_path, encoding="utf-8", mode="a")
        #f = open(marc_path, 'a')
        f.write(marc)
        f.close()
    return marc_path
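
# Hedged note, not in the original function: captions and paths are
# concatenated into the MARCXML above as-is, so characters such as '&' or '<'
# would produce invalid XML; a small helper like this (hypothetical) could be
# used to escape the values before appending them.
from xml.sax.saxutils import escape

def _subfield(code, value):
    """Return a subfield element with an XML-escaped value."""
    return '      <subfield code="%s">%s</subfield>' % (code, escape(value))

# list.append(_subfield('d', figure.caption))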
def bst_apsharvest(dois="",
                   recids="",
                   query="",
                   records="",
                   new_mode="email",
                   update_mode="email",
                   from_date="",
                   until_date=None,
                   metadata="yes",
                   fulltext="yes",
                   hidden="yes",
                   match="no",
                   reportonly="no",
                   threshold_date=None,
                   devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           New records are looked for on the APS servers. This mode is active
           when from_date and until_date are given, and also when a DOI not
           already in the system is given.

           If the value "last" is given to from_date, the harvester will fetch
           any new records added since the last run.

           If match is set to "yes" the records harvested will be matched against
           the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exist in the system, the
           task only harvests the fulltexts themselves and attaches them
           to the records.


    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")


    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type new_mode: string


    @param update_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit? "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()

    task_update_progress("Parsing input parameters")

    # Validate modes
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o", "replace", "r",
                        "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid." %
                            (mode, ))

    # We hide fulltext by default
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # We do not use devmode by default
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not reportonly by default
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    if threshold_date:
        # Input from user. Validate date
        try:
            harvest_from_date = validate_date(threshold_date)
        except ValueError, e:
            write_message("Error parsing from_date, use (YYYY-MM-DD): %s" %
                          (str(e), ),
                          stream=sys.stderr)
            return 1
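
# Hedged sketch, not part of the original task: the repeated "yes"/"no"
# string-to-boolean blocks above could be collapsed into one helper; the
# default mirrors each block's behaviour for unexpected values.
def _flag(value, default=False):
    """Interpret a "yes"/"no" task parameter as a boolean."""
    value = value.lower()
    if value == "yes":
        return True
    if value == "no":
        return False
    return default

# hidden = _flag(hidden, default=True)      # anything but "no" hides fulltext
# match = _flag(match, default=False)       # only "yes" enables matching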
def submit_records(records_filename,
                   records_list,
                   mode,
                   directory,
                   taskid=0,
                   silent=False,
                   devmode=False,
                   subject=None):
    """
    Submits the given file of records (filepath) either by e-mail or via
    BibUpload using the given mode.

    Taskid is given to indicate if the task submission should wait for any
    previously submitted tasks.

    The submission can also be made "silent" in the sense of not
    updating the modification date of the records.

    @param records_filename: filepath to XML file containing records.
    @type records_filename: string

    @param records_list: list of APSRecord objects for records
    @type records_list: list

    @param mode: which submission mode is it?
    @type mode: string

    @param taskid: bibsched taskid, wait for task to complete before submission
    @type taskid: int

    @param silent: do not update the modification date of the records
    @type silent: bool

    @return: returns the given taskid upon submission, or True/False from email.
    """
    if devmode:
        return None
    now = datetime.datetime.now()
    if not subject:
        subject = "APS harvest results: %s" % (
            now.strftime("%Y-%m-%d %H:%M:%S"), )

    # Check if we should create bibupload or e-mail
    if mode == "email":
        # Let's parse the records and find our IDs.
        list_of_dois = []
        for record in records_list:
            # We strip away the first part of the DOI for readability.
            list_of_dois.append('/'.join(record.doi.split('/')[1:]))
        # We send an e-mail to CFG_APSHARVEST_EMAIL and put file on AFS.
        body = "Harvested new records: %s" % (records_filename, )
        try:
            try:
                shutil.move(records_filename, directory)
                records_filename = os.path.join(
                    directory, os.path.basename(records_filename))
                body = "Harvested new records on %s. They are located here:\n %s" % \
                       (now.strftime("%Y-%m-%d %H:%M:%S"), records_filename)
            except IOError, e:
                # Some IOError
                body = "Error while harvesting records: \nError saving %s - %s" % \
                       (records_filename, str(e))
                raise e
        finally:
            body = "%s\nRecords harvested (%s total):\n%s\n" % (
                body, str(len(list_of_dois)), "\n".join(list_of_dois))
            res = submit_records_via_mail(subject, body)
            write_message("Sent e-mail to %s with path to %s" %
                          (CFG_APSHARVEST_EMAIL, records_filename))
            return res
    else:
        # We submit a BibUpload task and wait for it to finish
        task_update_progress("Waiting for task to finish")

        if taskid != 0:
            write_message("Going to wait for %d to finish" % (taskid, ))

        while not can_launch_bibupload(taskid):
            # Let's wait until the previously launched task exits.
            task_sleep_now_if_required(can_stop_too=False)
            time.sleep(5.0)

        taskid = submit_bibupload_for_records(mode, records_filename, silent)
        write_message("Submitted BibUpload task #%s with mode %s" %
                      (str(taskid), mode))
        return taskid
Beispiel #49
0
def _task_submit_check_options():
    """
    Required by bibtask. Checks the options.
    """
    update_personid = bibtask.task_get_option("update_personid")
    disambiguate = bibtask.task_get_option("disambiguate")
    merge = bibtask.task_get_option("merge")

    record_ids = bibtask.task_get_option("record_ids")
    all_records = bibtask.task_get_option("all_records")
    from_scratch = bibtask.task_get_option("from_scratch")

    commands = bool(update_personid) + bool(disambiguate) + bool(merge)

    if commands == 0:
        bibtask.write_message(
            "ERROR: At least one command should be specified!",
            stream=sys.stdout,
            verbose=0)
        return False

    if commands > 1:
        bibtask.write_message(
            "ERROR: The options --update-personid, --disambiguate "
            "and --merge are mutually exclusive.",
            stream=sys.stdout,
            verbose=0)
        return False

    assert commands == 1

    if update_personid:
        if any((from_scratch, )):
            bibtask.write_message(
                "ERROR: The only options which can be specified "
                "with --update-personid are --record-ids and "
                "--all-records",
                stream=sys.stdout,
                verbose=0)
            return False

        options = bool(record_ids) + bool(all_records)
        if options > 1:
            bibtask.write_message(
                "ERROR: conflicting options: --record-ids and "
                "--all-records are mutually exclusive.",
                stream=sys.stdout,
                verbose=0)
            return False

        if record_ids:
            for iden in record_ids:
                if not iden.isdigit():
                    bibtask.write_message("ERROR: Record_ids expects numbers. "
                                          "Provided: %s." % iden)
                    return False

    if disambiguate:
        if any((record_ids, all_records)):
            bibtask.write_message(
                "ERROR: The only option which can be specified "
                "with --disambiguate is from-scratch",
                stream=sys.stdout,
                verbose=0)
            return False

    if merge:
        if any((record_ids, all_records, from_scratch)):
            bibtask.write_message(
                "ERROR: There are no options which can be "
                "specified along with --merge",
                stream=sys.stdout,
                verbose=0)
            return False

    return True
        perpage = 100

        # Are we harvesting from last time or a specific date?
        if from_date == "last":
            dummy, harvest_from_date = fetch_last_updated(
                name="apsharvest_api_download")

            # Keeping current time until completed harvest.
            new_harvest_date = datetime.datetime.now()
        else:
            # Input from user. Validate date
            try:
                harvest_from_date = validate_date(from_date)
            except ValueError, e:
                write_message("Error parsing from_date, use (YYYY-MM-DD): %s" %
                              (str(e), ),
                              stream=sys.stderr)
                return 1

        # Turn harvest_from_date back into a string (away from datetime object)
        harvest_from_date = harvest_from_date.strftime("%Y-%m-%d")

        status_message = "Checking for new records from APS from %s" % \
                         (harvest_from_date,)
        if until_date:
            # Input from user. Validate date
            try:
                validate_date(until_date)
            except ValueError, e:
                write_message(
                    "Error parsing until_date, use (YYYY-MM-DD): %s" %
Beispiel #51
0
def task_submit_check_options():
    """
    NOTE: Depending on the parameters, either "BibSched mode" or plain
          straightforward execution mode is entered.
    """
    if task_has_option("create_event_with_id"):
        print webstat.create_customevent(
            task_get_option("create_event_with_id"),
            task_get_option("event_name", None),
            task_get_option("column_headers", []))
        sys.exit(0)

    elif task_has_option("destroy_event_with_id"):
        print webstat.destroy_customevent(
            task_get_option("destroy_event_with_id"))
        sys.exit(0)

    elif task_has_option("list_events"):
        events = webstat._get_customevents()
        if len(events) == 0:
            print "There are no custom events available."
        else:
            print "Available custom events are:\n"
            print '\n'.join([
                x[0] + ": " +
                ((x[1] == None) and "No descriptive name" or str(x[1]))
                for x in events
            ])
        sys.exit(0)

    elif task_has_option("cache_events"):
        events = task_get_option("cache_events")

        write_message(str(events), verbose=9)

        if events[0] == 'ALL':
            keyevents_to_cache = webstat.KEYEVENT_REPOSITORY.keys()
            customevents_to_cache = [x[0] for x in webstat._get_customevents()]

        elif events[0] == 'KEYEVENTS':
            keyevents_to_cache = webstat.KEYEVENT_REPOSITORY.keys()
            customevents_to_cache = []

        elif events[0] == 'CUSTOMEVENTS':
            keyevents_to_cache = []
            customevents_to_cache = [x[0] for x in webstat._get_customevents()]

        elif events[0] != '':
            keyevents_to_cache = [
                x for x in webstat.KEYEVENT_REPOSITORY.keys() if x in events
            ]
            customevents_to_cache = [
                x[0] for x in webstat._get_customevents() if x in events
            ]

        # Control so that we have valid event names
        if len(keyevents_to_cache + customevents_to_cache) == 0:
            # Oops, no events. Abort and display help.
            return False
        else:
            task_set_option("keyevents", keyevents_to_cache)
            task_set_option("customevents", customevents_to_cache)

        return True

    elif task_has_option("dump_config"):
        print """\
[general]
visitors_box = True
search_box = True
record_box = True
bibsched_box = True
basket_box = True
apache_box = True
uptime_box = True

[webstat_custom_event_1]
name = baskets
param1 = action
param2 = basket
param3 = user

[apache_log_analyzer]
profile = nil
nb-histogram-items-to-print = 20
exclude-ip-list = ("137.138.249.162")
home-collection = "Atlantis Institute of Fictive Science"
search-interface-url = "/?"
detailed-record-url = "/record/"
search-engine-url = "/search?"
search-engine-url-old-style = "/search.py?"
basket-url = "/yourbaskets/"
add-to-basket-url = "/yourbaskets/add"
display-basket-url = "/yourbaskets/display"
display-public-basket-url = "/yourbaskets/display_public"
alert-url = "/youralerts/"
display-your-alerts-url = "/youralerts/list"
display-your-searches-url = "/youralerts/display"
"""
        sys.exit(0)

    elif task_has_option("load_config"):
        from ConfigParser import ConfigParser
        conf = ConfigParser()
        conf.read(CFG_WEBSTAT_CONFIG_PATH)
        for section in conf.sections():
            if section[:21] == "webstat_custom_event_":
                cols = []
                name = ""
                for option, value in conf.items(section):
                    if option == "name":
                        name = value
                    if option[:5] == "param":
                        # add the column name in its position
                        index = int(option[-1]) - 1
                        while len(cols) <= index:
                            cols.append("")
                        cols[index] = value
                if name:
                    res = run_sql(
                        "SELECT COUNT(id) FROM staEVENT WHERE id = %s",
                        (name, ))
                    if res[0][0] == 0:
                        # name does not exist, create customevent
                        webstat.create_customevent(name, name, cols)
                    else:
                        # name already exists, update customevent
                        webstat.modify_customevent(name, cols=cols)

        sys.exit(0)

    else:
        # False means that the --help should be displayed
        return False
Beispiel #52
0
def bibreformat_task(fmt, recids, without_fmt, process):
    """
    BibReformat main task

    @param fmt: output format to use
    @param recids: a list of record IDs to reformat
    @param without_fmt: the subset of records that have no existing cache
                        for the given format
    @param process: whether to actually reformat the selected records
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    latest_bibrank_run = get_bibrankmethod_lastupdate('citation')

    def related_records(recids, recids_processed):
        if fmt == "HDREF" and recids:
            # HDREF represents the references tab
            # the tab needs to be recomputed not only when the record changes
            # but also when one of the citations changes
            sql = """SELECT id, modification_date FROM bibrec
                     WHERE id in (%s)""" % ','.join(str(r) for r in recids)

            def check_date(mod_date):
                return mod_date.strftime(
                    "%Y-%m-%d %H:%M:%S") < latest_bibrank_run

            rel_recids = intbitset([
                recid for recid, mod_date in run_sql(sql)
                if check_date(mod_date)
            ])
            for r in rel_recids:
                recids |= intbitset(get_cited_by(r))

        # To not process recids twice
        recids -= recids_processed
        # Adds to the set of processed recids
        recids_processed += recids

        return recids

    def recid_chunker(recids):
        recids_processed = intbitset()
        chunk = intbitset()

        for recid in recids:
            if len(chunk) == 5000:
                for r in related_records(chunk, recids_processed):
                    yield r
                recids_processed += chunk
                chunk = intbitset()

            if recid not in recids_processed:
                chunk.add(recid)

        if chunk:
            for r in related_records(chunk, recids_processed):
                yield r

    recIDs = list(recid_chunker(recids))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if without_fmt:
        write_message("Records to be processed: %d" % len(recIDs))
        write_message("Out of it records without existing cache: %d" %
                      len(without_fmt))
    else:
        write_message("Records to be processed: %d" % len(recIDs))

### Initialize main loop

    total_rec = 0  # Total number of records
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        total_rec_1, tbibformat_1, tbibupload_1 = iterate_over_new(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)


### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
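
# Hedged sketch (a simplified, plain-Python analogue of recid_chunker above,
# without the HDREF citation expansion done by related_records): process
# record IDs in fixed-size chunks while skipping anything already handled.
def chunked_recids(recids, chunk_size=5000):
    """Yield lists of at most chunk_size not-yet-processed record IDs."""
    processed = set()
    chunk = []
    for recid in recids:
        if recid in processed:
            continue
        chunk.append(recid)
        processed.add(recid)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk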
Beispiel #53
0
def add_other_id(other_id=None,
                 doi="",
                 eprint="",
                 recid=None,
                 system_number=None,
                 reportnumbers=None,
                 all_recids=None):
    """Search and match using given identifiers."""
    query = ""
    if all_recids is None:
        all_recids = get_all_recids()
    if reportnumbers is None:
        reportnumbers = []
    if recid is not None:
        query = "existing recid"
        try:
            recid = int(recid)
        except ValueError:
            recid = None
        if recid and recid not in all_recids:
            write_message(
                "WARNING: %s thought that their record %s had recid %s in %s but this seems wrong"
                % (CFG_OTHER_SITE, other_id, recid, CFG_THIS_SITE),
                stream=sys.stderr)
            recid = None
    if recid is None and eprint:
        query = 'oai:arXiv.org:%s' % (eprint, )
        arxiv_ids = search_pattern(p=query, f='035__a', m='e') & all_recids
        if len(arxiv_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, arxiv_ids),
                stream=sys.stderr)
            return [other_id] + list(arxiv_ids)
        elif len(arxiv_ids) == 1:
            recid = arxiv_ids[0]
    if recid is None and doi:
        query = 'doi:"%s"' % doi
        doi_ids = search_pattern(p=query) & all_recids
        if len(doi_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, doi_ids),
                stream=sys.stderr)
            return [other_id] + list(doi_ids)
        elif len(doi_ids) == 1:
            recid = doi_ids[0]
    if recid is None and reportnumbers:
        query = "037__a:" + " OR 037__a:".join(reportnumbers)
        reportnumbers_ids = intbitset()
        for rn in reportnumbers:
            reportnumbers_ids |= search_pattern(p=rn, f='037__a', m='e')
        reportnumbers_ids &= all_recids
        if len(reportnumbers_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query,
                   reportnumbers_ids),
                stream=sys.stderr)
            return [other_id] + list(reportnumbers_ids)
        elif len(reportnumbers_ids) == 1:
            recid = reportnumbers_ids[0]
    if recid is None and system_number and CFG_CERN_SITE:
        query = "035:%s 035:SPIRES" % (system_number, )
        system_number_ids = search_pattern(p=query)
        system_number_ids &= all_recids
        if len(system_number_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query,
                   system_number_ids),
                stream=sys.stderr)
            return [other_id] + list(system_number_ids)
        elif len(system_number_ids) == 1:
            recid = system_number_ids[0]

    if recid:
        recid = int(recid)
        record = get_record(recid)
        fields = record_get_field_instances(record, '035')
        for field in fields:
            subfields = dict(field_get_subfield_instances(field))
            if CFG_OTHER_SITE.upper() == subfields.get('9', '').upper():
                stored_recid = subfields.get('a', 0)
                try:
                    stored_recid = int(stored_recid)
                except ValueError:
                    # Not an integer, we move on and add the new ID.
                    continue
                if stored_recid and int(stored_recid) != int(other_id):
                    write_message(
                        "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s"
                        % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid,
                           stored_recid, CFG_OTHER_SITE),
                        stream=sys.stderr)
                if CFG_INSPIRE_SITE and int(other_id) not in CERN_IDS:
                    write_message(
                        "INFO: ID was found in 035 but the record is not core CERN hence it should be moved into 595"
                    )
                else:
                    return

        if CFG_INSPIRE_SITE:
            fields = record_get_field_instances(record, '595')
            for field in fields:
                subfields = dict(field_get_subfield_instances(field))
                if "CDS" in subfields.get('a', '').upper():
                    stored_recid = subfields.get('a', 0).split("-")[-1]
                    try:
                        stored_recid = int(stored_recid)
                    except ValueError:
                        # Not an integer, we move on and add the new ID.
                        continue
                    if stored_recid and int(stored_recid) != int(other_id):
                        write_message(
                            "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s"
                            % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid,
                               stored_recid, CFG_OTHER_SITE),
                            stream=sys.stderr)
                    if int(other_id) in CERN_IDS:
                        write_message(
                            "INFO: ID was found in 595 but the record is core CERN hence it should be moved into 035"
                        )
                    else:
                        return

        write_message("Matched {1}/{0} to {3}/{2} with {4}".format(
            other_id, CFG_OTHER_URL, recid, CFG_THIS_URL, query))
        rec = {}
        record_add_field(rec, '001', controlfield_value='%s' % recid)

        # Let's filter out previous values in 035/595
        for field in record_get_field_instances(record, '035'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') != str(other_id) or subfields_dict.get(
                    '9') != CFG_OTHER_SITE:
                record_add_field(rec, '035', subfields=subfields)
        for field in record_get_field_instances(record, '595'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') != "CDS-{0}".format(
                    other_id) or subfields_dict.get('9') != 'CERN':
                record_add_field(rec, '595', subfields=subfields)

        if CFG_INSPIRE_SITE:
            if int(other_id) in CERN_IDS:
                write_message("CERN relevant paper: adding 035")
                record_add_field(rec,
                                 '035',
                                 ind1=' ',
                                 ind2=' ',
                                 subfields=(('9', CFG_OTHER_SITE), ('a',
                                                                    other_id)))
            else:
                write_message("Non-CERN relevant paper: adding 595")
                record_add_field(rec,
                                 '595',
                                 ind1=' ',
                                 ind2=' ',
                                 subfields=(('9', "CERN"),
                                            ('a', "CDS-{0}".format(other_id))))
        else:
            record_add_field(rec,
                             '035',
                             ind1=' ',
                             ind2=' ',
                             subfields=(('9', CFG_OTHER_SITE), ('a',
                                                                other_id)))
        return record_xml_output(rec)
Beispiel #54
0
def import_recid_list(input_stream=sys.stdin,
                      batch_limit=500,
                      automatic_upload=False):
    """Import identifiers from file, match and generate output files."""
    all_recids = get_all_recids()
    output_files = []
    current_batch = []
    current_dupes = []
    i = 0
    for row in input_stream:
        if row.endswith('\n'):
            row = row[:-1]
        row = row.split('|')
        if row:
            try:
                other_id, doi, eprint, recid, system_number = row[0], row[
                    1], row[2], row[3], row[4]
            except IndexError:
                # Something is up
                write_message("WARNING: {0} is invalid".format(row),
                              stream=sys.stderr)
                continue
            if len(row) > 5:
                reportnumbers = row[5:]
            else:
                reportnumbers = None
            if not other_id:
                other_id = None
            if not recid:
                recid = None
            result = add_other_id(other_id, doi, eprint, recid, system_number,
                                  reportnumbers, all_recids)
            if result:
                if isinstance(result, list):
                    # Duplications found
                    current_dupes.append(result)
                    continue
                current_batch.append(result)
                i += 1
                if i % batch_limit == 0:
                    output_file = write_results(current_batch)
                    output_files.append(output_file)
                    if automatic_upload:
                        task_low_level_submission('bibupload',
                                                  'bst_inspire_cds_synchro',
                                                  '-c', output_file, '-n')
                        write_message("Scheduled bibupload --correct %s" %
                                      output_file)
                    task_sleep_now_if_required()
                    current_batch = []
    if len(current_batch) > 0:
        output_file = write_results(current_batch)
        output_files.append(output_file)
        if automatic_upload:
            task_low_level_submission('bibupload', 'bst_inspire_cds_synchro',
                                      '-c', output_file, '-n')
            write_message("Scheduled bibupload --correct %s" % output_file)
    write_message("Matched in total {0} records.".format(i))

    if len(current_dupes) > 0:
        # We have duplications
        dupes_output_file = get_temporary_file("cds_duplicates_", ".txt")
        with open(dupes_output_file, "w") as fd:
            fd.write("\n".join([
                "{0}: {1}".format(dupe[0], dupe[1:]) for dupe in current_dupes
            ]))
        write_message(
            "Found {0} possible duplicates which are available here: {1}".
            format(len(current_dupes), dupes_output_file))
    return output_files
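For context, import_recid_list() expects one pipe-delimited row per input line in the form other_id|doi|eprint|recid|system_number, optionally followed by report numbers; empty fields count as missing. A minimal sketch of feeding it from an in-memory stream follows. The identifiers below are made up, and the sketch assumes the surrounding Invenio synchronisation module (with get_all_recids, add_other_id, write_results) is importable:

from StringIO import StringIO  # Python 2, matching the surrounding code

# Hypothetical rows: other_id|doi|eprint|recid|system_number[|reportnumber ...]
SAMPLE_ROWS = (
    "1234567|10.1000/xyz123|arXiv:1001.00001|987654|SPIRES-0001|CERN-TH-2010-001\n"
    "7654321||arXiv:1001.00002|123456|\n"
)

output_files = import_recid_list(StringIO(SAMPLE_ROWS),
                                 batch_limit=500,
                                 automatic_upload=False)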
Example #55
0
def _dbdump_run_task_core():
    """
    Run the core of the database dump task.

    Note: do not use task_can_sleep() here because we do not want other
    tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    output_dir = task_get_option('output', CFG_LOGDIR)
    output_num = task_get_option('number', 5)
    output_fil_prefix = CFG_DATABASE_NAME + '-dbdump-'
    output_fil_suffix = task_get_task_param('task_starting_time').replace(
        ' ', '_') + '.sql'
    output_fil = output_fil_prefix + output_fil_suffix
    write_message("Reading parameters ended")
    # make dump:
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(output_dir, output_fil)
    write_message("Database dump ended")
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_fil_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
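_delete_old_dumps() is invoked above but is not part of this listing. A minimal sketch of what such a pruning helper could look like, assuming the dump filenames embed the task starting time so that lexical order matches chronological order (illustration only; the actual Invenio implementation may differ):

import os

def _delete_old_dumps_sketch(dirname, filename_prefix, number_to_keep):
    """Keep only the newest NUMBER_TO_KEEP dump files starting with
    FILENAME_PREFIX in DIRNAME and delete the rest (sketch only)."""
    dumps = sorted(f for f in os.listdir(dirname)
                   if f.startswith(filename_prefix))
    # Timestamps in the filenames make lexical order chronological.
    to_delete = dumps[:-number_to_keep] if number_to_keep > 0 else dumps
    for old_dump in to_delete:
        write_message("... deleting %s" % (dirname + os.sep + old_dump))
        os.remove(dirname + os.sep + old_dump)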
Example #56
0
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task"""

    # First gather recids to process
    recids = task_get_option('recids')
    if recids:
        start_date = None
    else:
        start_date = datetime.now()
        dummy, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)
        if task_get_option('missing'):
            recids |= fetch_records_missing_arxiv_fulltext()
        else:
            recids |= fetch_records_missing_arxiv_fulltext() & \
                fetch_records_modified_since(last_date)

    updated_recids = set()

    try:

        for count, recid in enumerate(recids):
            if count % 50 == 0:
                msg = 'Done %s of %s' % (count, len(recids))
                write_message(msg)
                task_update_progress(msg)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message('processing %s' % recid, verbose=9)
            try:
                if process_one(recid):
                    updated_recids.add(recid)
                time.sleep(6)
            except AlreadyHarvested:
                write_message('already harvested successfully')
                time.sleep(6)
            except FoundExistingPdf:
                write_message('pdf already attached (matching md5)')
                time.sleep(6)
            except PdfNotAvailable:
                write_message("no pdf available")
                time.sleep(20)
            except InvenioFileDownloadError as e:
                write_message("failed to download: %s" % e)
                time.sleep(20)

    finally:
        # We want to process updated records even in case we are interrupted
        msg = 'Updated %s records' % len(updated_recids)
        write_message(msg)
        task_update_progress(msg)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True
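The loop above relies on AlreadyHarvested, FoundExistingPdf and PdfNotAvailable being raised by process_one() purely for control flow. Their declarations are not part of this listing; a plausible minimal shape, assuming they carry no extra state, would be:

class AlreadyHarvested(Exception):
    """Raised when the record was already harvested successfully."""

class FoundExistingPdf(Exception):
    """Raised when a PDF with a matching md5 is already attached."""

class PdfNotAvailable(Exception):
    """Raised when arXiv provides no PDF for the record."""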
Example #57
0
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given"""
    write_message("Loading knowledgebase file", verbose=9)
    kb_data = {}

    write_message("Reading knowledgebase file: %s" %
                   config.get(config.get("rank_method", "function"), "kb_src"))
    with open(config.get(config.get("rank_method", "function"), "kb_src"), 'r') as f:
        for line in f:
            if line.strip() and not line.startswith("#"):
                key, value = line.strip().split("---")
                kb_data[key.strip()] = value.strip()
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(config.get("rank_method", "function"), "tag")
    tags = config.get(config.get("rank_method", "function"), "check_mandatory_tags").split(",")
    if tags == ['']:
        tags = ""

    records = []
    for recids, recide in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (tag, recids, recide))
        valid = intbitset(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = intbitset(run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (key, recids, recide)))
            valid &= newset
        if tags:
            recs = [(recid, value) for recid, value in recs if recid in valid]
        records += list(recs)
        write_message("Number of records found with the necessary tags: %s" % len(records))

    records = [(recid, value) for recid, value in records if recid in options["validset"]]
    rnkset = {}
    for key, value in records:
        if value in kb_data:
            if key not in rnkset:
                rnkset[key] = float(kb_data[value])
            elif float(kb_data[value]) > rnkset[key]:
                # Keep the highest weight seen for this record
                rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0

    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset
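The knowledgebase file parsed at the top of single_tag_rank() holds one "key---value" pair per line, with "#" starting a comment line. A self-contained sketch of that format and of the parsing it goes through (the journal names and weights below are made up):

SAMPLE_KB = """\
# made-up journal weights
Phys. Rev. Lett.---9.2
Nucl. Phys. B---4.1
JHEP---6.0
"""

kb_data = {}
for line in SAMPLE_KB.splitlines():
    if line.strip() and not line.startswith("#"):
        key, value = line.strip().split("---")
        kb_data[key.strip()] = value.strip()
# kb_data == {'Phys. Rev. Lett.': '9.2', 'Nucl. Phys. B': '4.1', 'JHEP': '6.0'}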
Example #58
0
 def warn(self, msg):
     """ Add a warning to the record """
     self.issues.append(Issue('warning', self.rule['name'], msg))
     write_message("[WARN] record %s by rule %s: %s" %
                   (self.record_id, self.rule["name"], msg))
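The Issue objects appended to self.issues are not defined in this listing. A minimal sketch of their shape, inferred from the call above (the real checker class may carry more fields):

from collections import namedtuple

# Inferred from Issue('warning', self.rule['name'], msg) above.
Issue = namedtuple('Issue', ['kind', 'rule', 'msg'])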
Example #59
0
def process_batch_job(batch_job_file):
    """ Processes a batch job description dictionary

    @param batch_job_file: a fullpath to a batch job file
    @type batch_job_file: string
    @return: 1 if the process was successful, 0 if not
    @rtype: int
    """
    def upload_marcxml_file(marcxml):
        """ Creates a temporary marcxml file and sends it to bibupload
        """
        xml_filename = 'bibencode_' + str(batch_job['recid']) + '_' + str(
            uuid.uuid4()) + '.xml'
        xml_filename = os.path.join(invenio.config.CFG_TMPSHAREDDIR,
                                    xml_filename)
        with open(xml_filename, 'w') as xml_file:
            xml_file.write(marcxml)
        targs = ['-c', xml_filename]
        task_low_level_submission('bibupload', 'bibencode', *targs)

    #---------#
    # GENERAL #
    #---------#

    _task_write_message("----------- Handling Master -----------")

    ## Check the validity of the batch file here
    batch_job = json_decode_file(batch_job_file)

    ## Sanitise the batch description and raise errors
    batch_job = sanitise_batch_job(batch_job)

    ## Check if the record exists
    if record_exists(batch_job['recid']) < 1:
        raise Exception("Record not found")

    recdoc = BibRecDocs(batch_job['recid'])

    #--------------------#
    # UPDATE FROM MASTER #
    #--------------------#

    ## We want to add new stuff to the video's record, using the master as input
    if getval(batch_job, 'update_from_master'):
        found_master = False
        bibdocs = recdoc.list_bibdocs()
        for bibdoc in bibdocs:
            bibdocfiles = bibdoc.list_all_files()
            for bibdocfile in bibdocfiles:
                comment = bibdocfile.get_comment()
                description = bibdocfile.get_description()
                subformat = bibdocfile.get_subformat()
                m_comment = getval(batch_job, 'bibdoc_master_comment', comment)
                m_description = getval(batch_job, 'bibdoc_master_description',
                                       description)
                m_subformat = getval(batch_job, 'bibdoc_master_subformat',
                                     subformat)
                if (comment == m_comment and description == m_description
                        and subformat == m_subformat):
                    found_master = True
                    batch_job['input'] = bibdocfile.get_full_path()
                    ## Get the aspect ratio of the video from the record
                    try:
                        ## Assumes pbcore metadata mapping
                        batch_job['aspect'] = get_fieldvalues(
                            batch_job['recid'],
                            CFG_BIBENCODE_ASPECT_RATIO_MARC_FIELD)[0]
                    except IndexError:
                        pass
                    break
            if found_master:
                break
        if not found_master:
            _task_write_message("Video master for record %d not found" %
                                batch_job['recid'])
            task_update_progress("Video master for record %d not found" %
                                 batch_job['recid'])
            ## Maybe send an email?
            return 1

    ## Clean the job to do no upscaling etc
    if getval(batch_job, 'assure_quality'):
        batch_job = clean_job_for_quality(batch_job)

    global _BATCH_STEPS
    _BATCH_STEPS = len(batch_job['jobs'])

    ## Generate the docname from the input filename's name or given name
    bibdoc_video_docname, bibdoc_video_extension = decompose_file(
        batch_job['input'])[1:]
    if not bibdoc_video_extension or getval(batch_job,
                                            'bibdoc_master_extension'):
        bibdoc_video_extension = getval(batch_job, 'bibdoc_master_extension')
    if getval(batch_job, 'bibdoc_master_docname'):
        bibdoc_video_docname = getval(batch_job, 'bibdoc_master_docname')

    write_message("Creating BibDoc for %s" % bibdoc_video_docname)
    ## If the bibdoc exists, receive it
    if bibdoc_video_docname in recdoc.get_bibdoc_names():
        bibdoc_video = recdoc.get_bibdoc(bibdoc_video_docname)
    ## Create a new bibdoc if it does not exist
    else:
        bibdoc_video = recdoc.add_bibdoc(docname=bibdoc_video_docname)

    ## Get the directory of the newly created bibdoc to copy stuff there
    bibdoc_video_directory = bibdoc_video.get_base_dir()

    #--------#
    # MASTER #
    #--------#
    if not getval(batch_job, 'update_from_master'):
        if getval(batch_job, 'add_master'):
            ## Generate the right name for the master
            ## The master should be hidden first and then renamed
            ## when it is really available
            ## !!! FIX !!!
            _task_write_message("Adding %s master to the BibDoc" %
                                bibdoc_video_docname)
            master_format = compose_format(
                bibdoc_video_extension,
                getval(batch_job, 'bibdoc_master_subformat', 'master'))
            ## If a file of the same format is there, something is wrong, remove it!
            ## it might be caused by a previous corrupted submission etc.
            if bibdoc_video.format_already_exists_p(master_format):
                bibdoc_video.delete_file(master_format, 1)
            bibdoc_video.add_file_new_format(
                batch_job['input'],
                version=1,
                description=getval(batch_job, 'bibdoc_master_description'),
                comment=getval(batch_job, 'bibdoc_master_comment'),
                docformat=master_format)

    #-----------#
    # JOBS LOOP #
    #-----------#

    return_code = 1
    global _BATCH_STEP

    for job in batch_job['jobs']:

        _task_write_message("----------- Job %s of %s -----------" %
                            (_BATCH_STEP, _BATCH_STEPS))

        ## Try to substitute docname with master docname
        if getval(job, 'bibdoc_docname'):
            job['bibdoc_docname'] = Template(
                job['bibdoc_docname']).safe_substitute(
                    {'bibdoc_master_docname': bibdoc_video_docname})

        #-------------#
        # TRANSCODING #
        #-------------#

        if job['mode'] == 'encode':

            ## Skip the job if assure_quality is not set and marked as fallback
            if not getval(batch_job, 'assure_quality') and getval(
                    job, 'fallback'):
                continue

            if getval(job, 'profile'):
                profile = get_encoding_profile(job['profile'])
            else:
                profile = None
            ## We need an extension defined for the video container
            bibdoc_video_extension = getval(job, 'extension',
                                            getval(profile, 'extension'))
            if not bibdoc_video_extension:
                raise Exception("No container/extension defined")
            ## Get the docname and subformat
            bibdoc_video_subformat = getval(job, 'bibdoc_subformat')
            bibdoc_slave_video_docname = getval(job, 'bibdoc_docname',
                                                bibdoc_video_docname)
            ## The subformat is incompatible with ffmpeg's naming convention
            ## We do the encoding without and rename it afterwards
            bibdoc_video_fullpath = compose_file(bibdoc_video_directory,
                                                 bibdoc_slave_video_docname,
                                                 bibdoc_video_extension)
            _task_write_message(
                "Transcoding %s to %s;%s" %
                (bibdoc_slave_video_docname, bibdoc_video_extension,
                 bibdoc_video_subformat))
            ## We encode now directly into the bibdocs directory
            encoding_result = encode_video(
                input_file=batch_job['input'],
                output_file=bibdoc_video_fullpath,
                acodec=getval(job, 'audiocodec'),
                vcodec=getval(job, 'videocodec'),
                abitrate=getval(job, 'audiobitrate'),
                vbitrate=getval(job, 'videobitrate'),
                resolution=getval(job, 'resolution'),
                passes=getval(job, 'passes', 1),
                special=getval(job, 'special'),
                specialfirst=getval(job, 'specialfirst'),
                specialsecond=getval(job, 'specialsecond'),
                metadata=getval(job, 'metadata'),
                width=getval(job, 'width'),
                height=getval(job, 'height'),
                aspect=getval(batch_job, 'aspect'),  # Aspect for every job
                profile=getval(job, 'profile'),
                update_fnc=_task_update_overall_status,
                message_fnc=_task_write_message)
            return_code &= encoding_result
            ## only on success
            if encoding_result:
                ## Rename it, adding the subformat
                os.rename(
                    bibdoc_video_fullpath,
                    compose_file(bibdoc_video_directory,
                                 bibdoc_video_extension,
                                 bibdoc_video_subformat, 1,
                                 bibdoc_slave_video_docname))
                bibdoc_video._build_file_list()
                bibdoc_video_format = compose_format(bibdoc_video_extension,
                                                     bibdoc_video_subformat)
                if getval(job, 'bibdoc_comment'):
                    bibdoc_video.set_comment(getval(job, 'bibdoc_comment'),
                                             bibdoc_video_format)
                if getval(job, 'bibdoc_description'):
                    bibdoc_video.set_description(
                        getval(job, 'bibdoc_description'), bibdoc_video_format)

        #------------#
        # EXTRACTION #
        #------------#

        # if there are multiple extraction jobs, all the produced files
        # with the same name will be in the same bibdoc! Make sure that
        # you use different subformats or docname templates to avoid
        # conflicts.

        if job['mode'] == 'extract':
            if getval(job, 'profile'):
                profile = get_extract_profile(job['profile'])
            else:
                profile = {}
            bibdoc_frame_subformat = getval(job, 'bibdoc_subformat')
            _task_write_message("Extracting frames to temporary directory")
            tmpdir = invenio.config.CFG_TMPDIR + "/" + str(uuid.uuid4())
            os.mkdir(tmpdir)
            # Move this to the batch description
            bibdoc_frame_docname = getval(job, 'bibdoc_docname',
                                          bibdoc_video_docname)
            tmpfname = (
                tmpdir + "/" + bibdoc_frame_docname + '.' +
                getval(profile, 'extension', getval(job, 'extension', 'jpg')))
            extraction_result = extract_frames(
                input_file=batch_job['input'],
                output_file=tmpfname,
                size=getval(job, 'size'),
                positions=getval(job, 'positions'),
                numberof=getval(job, 'numberof'),
                width=getval(job, 'width'),
                height=getval(job, 'height'),
                aspect=getval(batch_job, 'aspect'),
                profile=getval(job, 'profile'),
                update_fnc=_task_update_overall_status,
            )
            return_code &= extraction_result

            ## only on success:
            if extraction_result:
                ## For every filename in the temporary directory, create a
                ## bibdoc that contains all sizes of the extracted frame
                files = os.listdir(tmpdir)
                for filename in files:
                    ## The docname was altered by BibEncode extract through substitution
                    ## Retrieve it from the filename again
                    bibdoc_frame_docname, bibdoc_frame_extension = os.path.splitext(
                        filename)
                    _task_write_message("Creating new bibdoc for %s" %
                                        bibdoc_frame_docname)
                    ## If the bibdoc exists, receive it
                    if bibdoc_frame_docname in recdoc.get_bibdoc_names():
                        bibdoc_frame = recdoc.get_bibdoc(bibdoc_frame_docname)
                    ## Create a new bibdoc if it does not exist
                    else:
                        bibdoc_frame = recdoc.add_bibdoc(
                            docname=bibdoc_frame_docname)

                    ## The filename including path from tmpdir
                    fname = os.path.join(tmpdir, filename)

                    bibdoc_frame_format = compose_format(
                        bibdoc_frame_extension, bibdoc_frame_subformat)
                    ## Same as with the master: if the format already exists,
                    ## override it, because something went wrong before
                    if bibdoc_frame.format_already_exists_p(
                            bibdoc_frame_format):
                        bibdoc_frame.delete_file(bibdoc_frame_format, 1)
                    _task_write_message("Adding %s jpg;%s to BibDoc" %
                                        (bibdoc_frame_docname,
                                         getval(job, 'bibdoc_subformat')))
                    bibdoc_frame.add_file_new_format(
                        fname,
                        version=1,
                        description=getval(job, 'bibdoc_description'),
                        comment=getval(job, 'bibdoc_comment'),
                        docformat=bibdoc_frame_format)
            ## Remove the temporary folders
            _task_write_message("Removing temporary directory")
            shutil.rmtree(tmpdir)

        _BATCH_STEP = _BATCH_STEP + 1

    #-----------------#
    # FIX BIBDOC/MARC #
    #-----------------#

    _task_write_message("----------- Handling MARCXML -----------")

    ## Fix the BibDoc for all the videos previously created
    _task_write_message("Updating BibDoc of %s" % bibdoc_video_docname)
    bibdoc_video._build_file_list()

    ## Fix the MARC
    _task_write_message("Fixing MARC")
    cli_fix_marc({}, [batch_job['recid']], False)

    if getval(batch_job, 'collection'):
        ## Make the record visible by moving in from the collection
        marcxml = ("<record><controlfield tag=\"001\">%d</controlfield>"
                   "<datafield tag=\"980\" ind1=\" \" ind2=\" \">"
                   "<subfield code=\"a\">%s</subfield></datafield></record>"
                   ) % (batch_job['recid'], batch_job['collection'])
        upload_marcxml_file(marcxml)

    #---------------------#
    # ADD MASTER METADATA #
    #---------------------#

    if getval(batch_job, 'add_master_metadata'):
        _task_write_message("Adding master metadata")
        pbcore = pbcore_metadata(input_file=getval(batch_job, 'input'),
                                 pbcoreIdentifier=batch_job['recid'],
                                 aspect_override=getval(batch_job, 'aspect'))
        marcxml = format(pbcore, CFG_BIBENCODE_PBCORE_MARC_XSLT)
        upload_marcxml_file(marcxml)

    #------------------#
    # ADD MARC SNIPPET #
    #------------------#

    if getval(batch_job, 'marc_snippet'):
        with open(getval(batch_job, 'marc_snippet')) as marc_snippet:
            marcxml = marc_snippet.read()
        upload_marcxml_file(marcxml)

    #--------------#
    # DELETE INPUT #
    #--------------#

    if getval(batch_job, 'delete_input'):
        _task_write_message("Deleting input file")
        # only if successful
        if not return_code:
            # only if input matches pattern
            if getval(batch_job, 'delete_input_pattern',
                      '') in getval(batch_job, 'input'):
                try:
                    os.remove(getval(batch_job, 'input'))
                except OSError:
                    pass

    #--------------#
    # NOTIFICATION #
    #--------------#

    ## Send Notification emails on errors
    if not return_code:
        if getval(batch_job, 'notify_user'):
            _notify_error_user(
                getval(batch_job, 'notify_user'),
                getval(batch_job, 'submission_filename', batch_job['input']),
                getval(batch_job, 'recid'),
                getval(batch_job, 'submission_title', ""))
            _task_write_message("Notify user because of an error")
        if getval(batch_job, 'notify_admin'):
            _task_write_message("Notify admin because of an error")
            if isinstance(getval(batch_job, 'notify_admin'), str):
                _notify_error_admin(batch_job, getval(batch_job,
                                                      'notify_admin'))

            else:
                _notify_error_admin(batch_job)
    else:
        if getval(batch_job, 'notify_user'):
            _task_write_message("Notify user because of success")
            _notify_success_user(
                getval(batch_job, 'notify_user'),
                getval(batch_job, 'submission_filename', batch_job['input']),
                getval(batch_job, 'recid'),
                getval(batch_job, 'submission_title', ""))
    return 1
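For orientation, here is a minimal, made-up batch job description covering the keys that process_batch_job() reads via getval(). The real batch files are JSON documents (see json_decode_file); the profile name, subformat labels and paths below are purely illustrative:

EXAMPLE_BATCH_JOB = {
    "recid": 123456,                      # record the videos are attached to
    "input": "/tmp/master_video.mov",     # full path of the master file
    "add_master": True,
    "bibdoc_master_subformat": "master",
    "assure_quality": True,
    "jobs": [
        {   # transcode the master into a web-friendly slave
            "mode": "encode",
            "profile": "mp4_480p",        # hypothetical encoding profile name
            "bibdoc_subformat": "480p",
        },
        {   # extract preview frames from the master
            "mode": "extract",
            "numberof": 5,
            "bibdoc_subformat": "frame",
        },
    ],
    "collection": "VIDEOS",               # moves the record into a collection
    "notify_user": "uploader@example.org",
    "delete_input": False,
}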
Example #60
0
def bibrank_engine(run):
    """Run the indexing task.
    Return 1 in case of success and 0 in case of failure.
    """
    startCreate = time.time()

    options["run"] = []
    options["run"].append(run)
    for rank_method_code in options["run"]:
        task_sleep_now_if_required(can_stop_too=True)
        cfg_name = getName(rank_method_code)
        write_message("Running rank method: %s." % cfg_name)
        config = load_config(rank_method_code)
        cfg_short = rank_method_code
        cfg_function = "%s_exec" % config.get("rank_method", "function")
        cfg_repair_function = "%s_repair_exec" % config.get("rank_method", "function")
        cfg_name = getName(cfg_short)
        options["validset"] = get_valid_range(rank_method_code)

        if task_get_option("query") is not None:
            params = {"of": "id"}
            if task_get_option("collection"):
                params["c"] = task_get_option("collection").split(",")
            params["p"] = task_get_option("query")
            recIDs = perform_request_search(**params)
            ranges = [(recID, recID) for recID in recIDs]
            task_set_option("id", task_get_option("id", []) + ranges)
            options["recid_range"] = ranges
        elif task_get_option("collection"):
            l_of_colls = task_get_option("collection").split(",")
            recIDs = perform_request_search(c=l_of_colls)
            options["recid_range"] = [(recID, recID) for recID in recIDs]
        elif task_get_option("id"):
            options["recid_range"] = task_get_option("id")
        elif task_get_option("modified"):
            options["recid_range"] = add_recIDs_by_date(rank_method_code, task_get_option("modified"))
        elif task_get_option("last_updated"):
            options["recid_range"] = add_recIDs_by_date(rank_method_code)
        else:
            write_message("No records specified, updating all", verbose=2)
            min_id = run_sql("SELECT min(id) from bibrec")[0][0]
            max_id = run_sql("SELECT max(id) from bibrec")[0][0]
            options["recid_range"] = [(min_id, max_id)]

        if task_get_option("quick") == "yes":
            write_message("Recalculate parameter used", verbose=9)

        if task_get_option("cmd") == "del":
            del_recids(cfg_short, options["recid_range"])
        elif task_get_option("cmd") == "add":
            func_object = globals().get(cfg_function)
            func_object(rank_method_code, cfg_name, config)
        elif task_get_option("cmd") == "stat":
            rank_method_code_statistics(rank_method_code)
        elif task_get_option("cmd") == "check":
            check_method(rank_method_code)
        elif task_get_option("cmd") == "print-missing":
            func_object = globals().get(cfg_function)
            func_object(rank_method_code, cfg_name, config)
        elif task_get_option("cmd") == "repair":
            func_object = globals().get(cfg_repair_function)
            func_object()
        else:
            msg = "Invalid command found processing %s" % rank_method_code
            write_message(msg, sys.stderr)
            raise StandardError(msg)

    if task_get_option("verbose"):
        showtime((time.time() - startCreate))

    return 1
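Both bibrank_engine() and single_tag_rank() read their settings through ConfigParser-style lookups: first config.get("rank_method", "function"), then kb_src, tag and check_mandatory_tags from the section named after that function. A sketch of what such a rank method configuration could look like and how the lookups above resolve against it; the method name, path and MARC tags are illustrative, not taken from a real installation:

from ConfigParser import ConfigParser  # Python 2, matching the listing
from StringIO import StringIO

EXAMPLE_RANK_CFG = """\
[rank_method]
function = single_tag_rank

[single_tag_rank]
kb_src = /opt/invenio/etc/bibrank/jif.kb
tag = 909C4p
check_mandatory_tags = 909C4p,909C4v
"""

config = ConfigParser()
config.readfp(StringIO(EXAMPLE_RANK_CFG))
function = config.get("rank_method", "function")       # 'single_tag_rank'
kb_src = config.get(function, "kb_src")                 # knowledgebase file path
tag = config.get(function, "tag")                       # MARC tag to rank on
mandatory = config.get(function, "check_mandatory_tags").split(",")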