def download_files(from_date, to_date):
    """Downloads the new files from the EDP Sciences
    FTP server."""
    download_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, 'packages')
    old_files = listdir(download_folder)
    ftp = FtpHandler(CFG_EDPSCIENCE_SERVER,
                     CFG_EDPSCIENCE_USERNAME,
                     CFG_EDPSCIENCE_PASSWORD)
    ftp.cd('incoming')
    new_files = ftp.ls()[0]
    new_files = filter(lambda a: is_younger(a,
                                            from_date,
                                            ftp),
                       new_files)
    files_to_download = filter(lambda a: a not in old_files,
                               new_files)
    counter = 1
    for filename in files_to_download:
        task_update_progress('Downloading files 1/3 \t%s of %s'
                             % (counter, len(new_files)))
        write_message('Downloading file %s' % (filename,))
        ftp.download(filename, download_folder)
        filename = join(download_folder, filename)
        counter += 1
    ftp.close()
    return map(lambda a: join(download_folder, a), new_files)
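
A minimal, stand-alone sketch of the bookkeeping that download_files performs around the FTP calls: filter a remote listing against files already present locally, then map the names to full local paths. The function and variable names below are illustrative, not part of the Invenio FtpHandler API.

import os

def select_new_files(remote_listing, download_folder):
    """Return the remote names that are not yet present in download_folder."""
    already_here = set(os.listdir(download_folder))
    return [name for name in remote_listing if name not in already_here]

def local_paths(names, download_folder):
    """Map plain filenames to their full paths inside download_folder."""
    return [os.path.join(download_folder, name) for name in names]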
Example #2
def iterate_over_new(list, fmt):
    "Iterate over list of IDs"
    global total_rec

    formatted_records = ''      # accumulator for the formatted records of this iteration
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # task start time; overwritten per record below

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        if run_sql('SELECT id FROM bibfmt WHERE id_bibrec=%s AND format=%s', (recID, fmt)):
            run_sql('UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s', (start_date, formatted_record, recID, fmt))
        else:
            run_sql('INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, %s, %s, %s)', (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
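
Example #2 stores each cached format with a SELECT-then-UPDATE-or-INSERT step, while the next example collapses the same logic into MySQL's REPLACE. Below is a hedged, self-contained sketch of the first variant against an in-memory SQLite table (table and column names copied from the snippet, everything else illustrative):

import sqlite3
import time
import zlib

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE bibfmt (id_bibrec INTEGER, format TEXT, '
             'last_updated TEXT, value BLOB)')

def store_formatted_record(recid, fmt, formatted):
    """Insert the cached format, or update it if a row already exists."""
    now = time.strftime('%Y-%m-%d %H:%M:%S')
    blob = sqlite3.Binary(zlib.compress(formatted.encode('utf-8')))
    row = conn.execute('SELECT id_bibrec FROM bibfmt WHERE id_bibrec=? AND format=?',
                       (recid, fmt)).fetchone()
    if row:
        conn.execute('UPDATE bibfmt SET last_updated=?, value=? '
                     'WHERE id_bibrec=? AND format=?', (now, blob, recid, fmt))
    else:
        conn.execute('INSERT INTO bibfmt (id_bibrec, format, last_updated, value) '
                     'VALUES (?, ?, ?, ?)', (recid, fmt, now, blob))
    conn.commit()

store_formatted_record(1, 'HB', '<formatted record>')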
Example #3
def iterate_over_new(list, fmt):
    """
    Iterate over list of IDs

    @param list: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    global total_rec

    formatted_records = ''      # accumulator for the formatted records of this iteration
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # task start time; overwritten per record below

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
Example #4
def iterate_over_new(recIDs, fmt):
    """Iterate over list of IDs.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken
        to insert)
    """
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call

    tot = len(recIDs)
    reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get(
        fmt.lower(), _update_format)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        reformat_function(recID, fmt)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" %
                          (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
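
Example #4 picks a per-format reformat function through a dictionary lookup with a generic fallback. Here is a small, runnable sketch of that dispatch-with-default pattern (all handler names are made up for illustration):

def _update_format_generic(recid, fmt):
    return 'generic reformat of record %s as %s' % (recid, fmt)

def _update_format_hb(recid, fmt):
    return 'brief HTML reformat of record %s' % recid

_UPDATE_FORMAT_FUNCTIONS = {
    'hb': _update_format_hb,
}

def reformat(recid, fmt):
    # unknown formats fall back to the generic handler
    handler = _UPDATE_FORMAT_FUNCTIONS.get(fmt.lower(), _update_format_generic)
    return handler(recid, fmt)

print(reformat(42, 'HB'))   # dispatched to the HB-specific handler
print(reformat(42, 'HX'))   # falls back to the generic handler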
Example #5
def watch_directory(new_job_dir=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS,
                    old_job_dir=CFG_BIBENCODE_DAEMON_DIR_OLDJOBS):
    """ Checks a folder job files, parses and executes them
    @param new_job_dir: path to the directory with new jobs
    @type new_job_dir: string
    @param old_job_dir: path to the directory where the old jobs are moved
    @type old_job_dir: string
    """
    global _NUMBER, _TASKID
    write_message('Checking directory %s for new jobs' % new_job_dir)
    task_update_progress('Checking for new jobs')
    _TASKID = task_get_task_param('task_id')
    files = os.listdir(new_job_dir)
    for file in files:
        file_fullpath = os.path.join(new_job_dir, file)
        if has_signature(file_fullpath):
            write_message('New Job found: %s' % file)
            job = json_decode_file(file_fullpath)
            if not getval(job, 'isbatch'):
                args = job_to_args(job)
                if not launch_task(args):
                    write_message('Error submitting task')
            else:
                ## We need the job description for the batch engine
                ## So we need to use the new path inside the oldjobs dir
                process_batch(os.path.join(old_job_dir, file))
            ## Move the file to the done dir
            shutil.move(file_fullpath, os.path.join(old_job_dir, file))
            ## Update number for next job
            _NUMBER += 1
    return 1
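
A self-contained sketch of the watch-directory pattern used above: scan a folder for job files, process each one, then move it to a "done" folder so it is not picked up again. The temporary directories and the job file are created only for the demonstration.

import os
import shutil
import tempfile

new_job_dir = tempfile.mkdtemp(prefix='newjobs-')
old_job_dir = tempfile.mkdtemp(prefix='oldjobs-')

# drop a fake job file into the watched folder
with open(os.path.join(new_job_dir, 'job-0001.job'), 'w') as fh:
    fh.write('{"isbatch": false}')

def process_job(path):
    print('processing %s' % path)

for name in os.listdir(new_job_dir):
    full_path = os.path.join(new_job_dir, name)
    process_job(full_path)
    # move the file to the done dir so the next run skips it
    shutil.move(full_path, os.path.join(old_job_dir, name))

print(os.listdir(old_job_dir))   # -> ['job-0001.job']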
def fill_self_cites_tables(config):
    """
    This will fill the self-cites tables with data

    The purpose of this function is to fill these tables on a website that
    never ran the self-cites daemon
    """
    algorithm = config['algorithm']
    tags = get_authors_tags()
    all_ids = [r[0] for r in run_sql('SELECT id FROM bibrec ORDER BY id')]
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)
    if algorithm == 'friends':
        # We only need this table for the friends algorithm and assimilated ones
        # Fill intermediary tables
        for index, recid in enumerate(all_ids):
            if index % 1000 == 0:
                msg = 'intermediate %d/%d' % (index, len(all_ids))
                task_update_progress(msg)
                write_message(msg)
                task_sleep_now_if_required()
            update_self_cites_tables(recid, config, tags)
    # Fill self-cites table
    for index, recid in enumerate(all_ids):
        if index % 1000 == 0:
            msg = 'final %d/%d' % (index, len(all_ids))
            task_update_progress(msg)
            write_message(msg)
            task_sleep_now_if_required()
        compute_and_store_self_citations(recid, tags, citations_fun)
Example #7
def bst_doi_timestamp(reset=0):
    prepate_doi_table()
    now = datetime.now()
    last_run = ((run_sql("SELECT max(creation_date) FROM doi")[0][0] or datetime(2014, 1, 1)) - timedelta(days=4)).strftime("%Y-%m-%d")
    if int(reset):
        last_run = (datetime(2014, 1, 1) - timedelta(days=4)).strftime("%Y-%m-%d")
    write_message("Retrieving DOIs modified since %s" % last_run)
    restart_on_error = True
    while restart_on_error:
        restart_on_error = False
        for publisher, re_match in CFG_SCOAP3_DOIS.items():
            task_update_progress("Retrieving DOIs for %s" % publisher)
            write_message("Retriving DOIs for %s" % publisher)
            try:
                res = get_all_modified_dois(publisher, last_run, re_match, debug=True)
                for doi in res:
                    if run_sql("SELECT doi FROM doi WHERE doi=%s", (doi, )):
                        continue
                    write_message("New DOI discovered for publisher %s: %s" % (publisher, doi))
                    run_sql("INSERT INTO doi(doi, creation_date) VALUES(%s, %s)", (doi, now))
            except URLError as e:
                write_message("Problem with connection! %s" % (e,))
                restart_on_error = True
            except socket.timeout as e:
                write_message("Timeout error %s" % (e,))
                write_message("Finishing and rescheduling")
                restart_on_error = True
            except ValueError as e:
                write_message("Value error in JSON string! %s" % (e,))
                restart_on_error = True
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    # Commit when the counter is full, or on the final call if work is still pending
    if next_commit_counter == task_get_option("flush") - 1 or (final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)

        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except:
            register_exception(alert_admin=True)
        next_commit_counter = 0

        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
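
The flush-counter logic in solr_commit_if_necessary reduces to a small, reusable pattern: commit once every `flush` operations, or unconditionally on the final call if work is still pending. A hedged sketch with a stand-in commit callable:

def commit_if_necessary(counter, flush, commit, final_commit=False):
    """Return the updated counter; call commit() when the batch is full,
    or when final_commit is requested and work is pending."""
    if counter == flush - 1 or (final_commit and counter > 0):
        commit()
        return 0
    return counter + 1

def commit():
    print('COMMIT')

pending = 0
for _ in range(10):
    pending = commit_if_necessary(pending, 4, commit)
commit_if_necessary(pending, 4, commit, final_commit=True)
# with flush=4 and 10 operations this commits three times in total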
def solr_add_ranges(id_ranges):
    sub_range_length = task_get_option("flush")
    id_ranges_to_index = []
    for id_range in id_ranges:
        lower_recid = id_range[0]
        upper_recid = id_range[1]
        i_low = lower_recid
        while i_low <= upper_recid:
            i_up = min(i_low + sub_range_length - 1, upper_recid)
            id_ranges_to_index.append((i_low, i_up))
            i_low += sub_range_length

    tags_to_index = get_tags()
    # Indexes latest records first by reversing
    # This allows the ranker to return better results during long indexing
    # runs as the ranker cuts the hitset using latest records
    id_ranges_to_index.reverse()
    next_commit_counter = 0
    for id_range_to_index in id_ranges_to_index:
        lower_recid = id_range_to_index[0]
        upper_recid = id_range_to_index[1]
        status_msg = "Solr ranking indexer called for %s-%s" % (lower_recid, upper_recid)
        write_message(status_msg)
        task_update_progress(status_msg)
        next_commit_counter = solr_add_range(lower_recid, upper_recid, tags_to_index, next_commit_counter)

    solr_commit_if_necessary(next_commit_counter, final_commit=True)
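
The range-splitting step in solr_add_ranges is a generic computation: cut each inclusive (low, high) record-id range into sub-ranges of at most `flush` ids, then reverse so the newest records are indexed first. A stand-alone sketch:

def split_ranges(id_ranges, flush):
    sub_ranges = []
    for low, high in id_ranges:
        i_low = low
        while i_low <= high:
            i_up = min(i_low + flush - 1, high)
            sub_ranges.append((i_low, i_up))
            i_low += flush
    sub_ranges.reverse()    # index the latest records first
    return sub_ranges

print(split_ranges([(1, 7), (10, 12)], 3))
# -> [(10, 12), (7, 7), (4, 6), (1, 3)]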
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """
    Generate sitemaps themselves.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap file(s)
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
                                lastmod = lastmod,
                                changefreq = DEFAULT_CHANGEFREQ_RECORDS,
                                priority = DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s" % (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
Example #11
def bst_consyn_harvest(CONSYNATOMURL="https://consyn.elsevier.com/batch/atom?key=QUhvbHRrYW1wOzM0Mjc%253d"):
    """
    Task to download metadata given an ATOM feed from consyn.elsevier.com
    and a folder to store the files.

    @param CONSYNATOMURL: The URL of the atom feed to download.
    """
    if not os.path.exists(CFG_CONSYN_OUT_DIRECTORY):
        folders = CFG_CONSYN_OUT_DIRECTORY.split("/")
        folder = "/"
        for i in range(1, len(folders)):
            folder = os.path.join(folder, folders[i]).strip()
            if not os.path.exists(folder):
                os.mkdir(folder)
    try:
        run_sql("SELECT filename FROM CONSYNHARVEST")
    except:
        run_sql("CREATE TABLE CONSYNHARVEST ("
                "filename VARCHAR(100) NOT NULL PRIMARY KEY,"
                "date VARCHAR(50),"
                "size VARCHAR(30) );")
    # Get list of entries from XML document
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_file = download_url(url=CONSYNATOMURL,
                                   retry_count=5,
                                   timeout=60.0)
        xmlString = open(result_file, 'r').read()
    except InvenioFileDownloadError as err:
        write_message("URL could not be opened: %s" % (CONSYNATOMURL,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
Example #12
def iterate_over_new(recIDs, fmt):
    """
    Iterate over list of IDs

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call

    tot = len(recIDs)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        formatted_record, needs_2nd_pass = format_record_1st_pass(recID=recID,
                                                  of=fmt,
                                                  on_the_fly=True,
                                                  save_missing=False)
        save_preformatted_record(recID=recID,
                                 of=fmt,
                                 res=formatted_record,
                                 needs_2nd_pass=needs_2nd_pass,
                                 low_priority=True)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
Example #13
def bst_move_dbdump(sourcedir, destdir, number_to_keep):
    """
    Will move a MySQL dump from local machine to a remote space.

    @param sourcedir: directory where the local dump is stored.
    @type sourcedir: string

    @param destdir: directory where the dump should live.
    @type destdir: string
    """
    output_file_prefix = CFG_DATABASE_NAME + '-dbdump-'
    files = [x for x in os.listdir(sourcedir)
             if x.startswith(output_file_prefix)]
    task_update_progress("Starting moving of database-dump")
    if len(files) != 1:
        write_message("... none or too many files found. Exiting.")
        return
    filename = files[0]
    full_path_source = sourcedir + os.sep + filename
    write_message("... moving %s" % (full_path_source,))
    full_path_destination = destdir + os.sep + filename
    try:
        shutil.copy(full_path_source, full_path_destination)
    except Exception as e:
        write_message("... could not move %s to %s: %s" %
                      (full_path_source, full_path_destination, str(e)))
        return
Example #14
def fetch_concerned_records(name):
    task_update_progress("Fetching record ids")

    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = "SELECT `id`, `creation_date` FROM `bibrec` " \
            "WHERE `creation_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `creation_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
            "WHERE `modification_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `modification_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        given_recids = task_get_option('recids')
        for collection in task_get_option('collections'):
            given_recids.add(get_collection_reclist(collection))

        if given_recids:
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` " \
                "WHERE `id` IN (%s) ORDER BY `id`" % format_strings,
                    list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
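
The last branch above expands one SQL placeholder per record id so the whole list can be bound as parameters. A hedged sketch of the same trick against an in-memory SQLite table (SQLite uses '?' placeholders where MySQL uses '%s'):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE bibrec (id INTEGER PRIMARY KEY)')
conn.executemany('INSERT INTO bibrec (id) VALUES (?)',
                 [(i,) for i in range(1, 6)])

given_recids = [2, 3, 5]
placeholders = ','.join(['?'] * len(given_recids))
rows = conn.execute('SELECT id FROM bibrec WHERE id IN (%s) ORDER BY id'
                    % placeholders, given_recids).fetchall()
print(rows)   # -> [(2,), (3,), (5,)]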
Example #15
def fetch_concerned_arxiv_records(name):
    task_update_progress("Fetching arxiv record ids")

    dummy, last_date = fetch_last_updated(name)

    # Fetch all records inserted since last run
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
        "WHERE `modification_date` >= %s " \
        "AND `creation_date` > NOW() - INTERVAL 7 DAY " \
        "ORDER BY `modification_date`" \
        "LIMIT 5000"
    records = run_sql(sql, [last_date.isoformat()])

    def check_arxiv(recid):
        record = get_record(recid)

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False

    def check_pdf_date(recid):
        doc = get_pdf_doc(recid)
        if doc:
            return doc.md > last_date
        return False

    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]
    records = [(r, mod_date) for r, mod_date in records if check_pdf_date(r)]
    write_message("recids %s" % repr([(r, mod_date.isoformat()) \
                                               for r, mod_date in records]))
    task_update_progress("Done fetching arxiv record ids")
    return records
Example #16
def fetch_updated_arxiv_records(date):
    """Fetch all the arxiv records modified since the last run"""

    def check_arxiv(recid):
        """Returns True for arxiv papers"""
        for report_number in get_fieldvalues(recid, '037__9'):
            if report_number == 'arXiv':
                return True
        return False

    # Fetch all records inserted since last run
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
          "WHERE `modification_date` >= %s " \
          "ORDER BY `modification_date`"
    records = run_sql(sql, [date.isoformat()])
    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]

    # Show all records for debugging purposes
    if task_get_option('verbose') >= 9:
        write_message('recids:', verbose=9)
        for recid, mod_date in records:
            write_message("* %s, %s" % (recid, mod_date), verbose=9)

    task_update_progress("Done fetching %s arxiv record ids" % len(records))
    return records
Example #17
def shall_sleep(recid, i, tot, time_estimator):
    """Check if we shall sleep"""
    time_estimation = time_estimator()[1]
    if (i + 1) % 100 == 0:
        task_update_progress("%s (%s%%) -> %s" % (recid, (i + 1) * 100 / tot, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_estimation))))
        return True
    return False
def match_missing_ids(remote_ids, batch_size):
    """ For ID pairings that are missing, this function splits the missing
    IDs into batches. The records are pulled from remote, the 035 field read
    and then the remote ID appended to the local record.

    Parameters:
     remote_ids - a list of missing remote rec-ids
     batch_size - How many records to match at a time
    Returns:
     count_appends - number of records being appended
     count_problems - number of records which could not be matched at all
    """
    count_appends = 0
    count_problems = 0

    batches = [remote_ids[x:x+batch_size] for x in
               xrange(0, len(remote_ids), batch_size)]
    _print("Identified %d records which their remote IDs updating."
           % len(remote_ids))
    _print("Processing %d batches of size %d" % (len(batches), batch_size))
    for i, batch in enumerate(batches, 1):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Batch %d of %d" % (i, len(batches)))
        _print("Batch %d of %d" % (i, len(batches)))
        try:
            appends, problems = process_record_batch(batch)
            count_appends += len(appends)
            count_problems += len(problems)
            write_to_file('missing_ids.txt', problems, append=True)
            _print("Submitting batch #%d to BibUpload for appending..." % i, 4)
            start_bibupload_job(appends)
        except StandardError as e:
            _print("Error occurred during match of batch %d: %s\n%s"
                   % (i, e, traceback.format_exc()), 2)
def percent_update(index, percent_last):
    """ Calculates completion percentage, updates task progress """
    per = 100 * float(index)/float(len(remote_ids))
    if per > (percent_last + 0.5):
        percent_last = per
        task_update_progress("Local matching %.1f%% (%d/%d)"
                             % (per, index, len(remote_ids)))
    return percent_last
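
percent_update is a throttle: it only reports progress once the completed percentage has advanced by more than half a point, so long matching runs do not flood the task log. A stand-alone version of the same idea:

def percent_done(index, total, percent_last):
    """Print progress only when it has moved by more than 0.5 percentage
    points; return the last reported percentage."""
    per = 100.0 * index / total
    if per > percent_last + 0.5:
        print('Local matching %.1f%% (%d/%d)' % (per, index, total))
        return per
    return percent_last

last = 0.0
for i in range(1, 1001):
    last = percent_done(i, 1000, last)   # prints roughly every 6 items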
Example #20
def bst_autoclaim():
    orcid_personid_map = get_orcid_personid_map()
    papers = get_papers_with_orcid()
    for i, recid in enumerate(papers):
        autoclaim_paper(recid, orcid_personid_map)
        if i % 10 == 0:
            task_update_progress("Done %s out of %s records (%s%%)" % (i, len(papers), 100*(i)/len(papers)))
            task_sleep_now_if_required(can_stop_too=True)
def compute_cache(pids):
    bibtask.write_message("WebAuthorProfile: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    for i, p in enumerate(pids):
        bibtask.write_message("WebAuthorProfile: doing %s out of %s" % (pids.index(p) + 1, len(pids)))
        bibtask.task_update_progress("WebAuthorProfile: doing %s out of %s" % (pids.index(p) + 1, len(pids)))
        _compute_cache_for_person(p)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
Example #22
def download_feed(feed, batch_size, delete_zip, new_sources,
                  directory):
    """ Get list of entries from XML document """
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed,
                                   retry_count=5,
                                   timeout=60.0)
        try:
            result_file = open(result_path, 'r')
            xmlString = result_file.read()
        finally:
            result_file.close()
            remove(result_path)
    except InvenioFileDownloadError as err:
        write_message("URL could not be opened: %s" % (feed,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return

    dom = xml.dom.minidom.parseString(xmlString)
    entries = dom.getElementsByTagName("entry")

    # Loop through entries
    for entry in entries:
        # Get URL and filename
        fileUrl = entry.getElementsByTagName("link")[0].getAttribute("href")
        fileName = entry.getElementsByTagName("title")[0].firstChild.data

        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))

        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
        else:
            try:
                write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                write_message("URL could not be opened: %s" % (fileUrl,))
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            try:
                extractAll(outFilename, delete_zip, directory)
            except BadZipfile:
                write_message("Error BadZipfile %s", (outFilename,))
                task_update_status("CERROR")
                remove(outFilename)
def bst_refresh_author_profiles():
    """Deletes all the WebAuthorProfile cache."""
    task_update_progress("Deleting images...")
    for name in os.listdir(os.path.join(CFG_WEBDIR, 'img', 'tmp')):
        name = os.path.join(CFG_WEBDIR, 'img', 'tmp', name)
        if os.path.isdir(name):
            rmtree(name, ignore_errors=True)
    task_update_progress("Truncating DB cache...")
    run_sql("TRUNCATE wapCACHE")
def _task_update_overall_status(message):
    """ Generates an overall update message for the BibEncode task.
        Stores the messages in a global list for notifications
        @param message: the message that should be printed as task status
        @type message: string
    """
    message = "[%d/%d]%s" % (_BATCH_STEP, _BATCH_STEPS, message)
    task_update_progress(message)
    global _UPD_HISTORY
    _UPD_HISTORY.append(message)
def step(msg_prefix, recid, done, total):
    if done % 30 == 0:
        task_sleep_now_if_required()

    if done % 1000 == 0:
        mesg = "%s done %s of %s" % (msg_prefix, done, total)
        write_message(mesg)
        task_update_progress(mesg)

    write_message("Processing: %s" % recid, verbose=9)
def rebuild_tables(rank_method_code, config):
    """Rebuild the tables from scratch

    Called by bibrank -w selfcites -R
    """
    task_update_progress('emptying tables')
    empty_self_cites_tables()
    task_update_progress('filling tables')
    fill_self_cites_tables(rank_method_code, config)
    return True
def task_run_core():
    """
    run daemon
    """

    if task_get_option("update-borrowers"):
        list_of_borrowers = db.get_all_borrowers()

        total_borrowers = len(list_of_borrowers)
        done  = 0

        for borrower in list_of_borrowers:
            user_id = borrower[0]
            update_user_info_from_ldap(user_id)
            done+=1
            task_update_progress("Done %d out of %d." % (done, total_borrowers))
            task_sleep_now_if_required(can_stop_too=True)

    if task_get_option("overdue-letters"):
        expired_loans = db.get_all_expired_loans()

        total_expired_loans = len(expired_loans)
        done  = 0

        for (borrower_id, _bor_name, recid, _barcode, _loaned_on,
             _due_date, _number_of_renewals, number_of_letters,
             date_letters, _notes, loan_id) in expired_loans:

            number_of_letters=int(number_of_letters)

            content = ''
            if number_of_letters == 0:
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL1'], loan_id)
            elif number_of_letters == 1 and must_send_second_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL2'], loan_id)
            elif number_of_letters == 2 and must_send_third_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)
            elif number_of_letters >= 3 and must_send_third_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)

            if content != '':
                title = book_title_from_MARC(recid)
                subject = "LOAN RECALL: " + title

                update_expired_loan(loan_id)
                send_overdue_letter(borrower_id, subject, content)

            done+=1

            task_update_progress("Done %d out of %d." % (done, total_expired_loans))

            task_sleep_now_if_required(can_stop_too=True)
            time.sleep(1)

    return 1
def get_citation_weight(rank_method_code, config, chunk_size=25000):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    quick = task_get_option("quick") != "no"

    # id option forces re-indexing a certain range
    # even if there are no new recs
    if task_get_option("id"):
        # construct a range of records to index
        updated_recids = []
        for first, last in task_get_option("id"):
            updated_recids += range(first, last+1)
        if len(updated_recids) > 10000:
            str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:])
        else:
            str_updated_recids = str(updated_recids)
        write_message('Records to process: %s' % str_updated_recids)
        index_update_time = None
    else:
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        if len(updated_recids) > 10000:
            str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:])
        else:
            str_updated_recids = str(updated_recids)
        write_message("%s records to update" % str_updated_recids)

    if updated_recids:
        begin_time = time.time()
        try:
            function = config.get("rank_method", "function")
            config.get(function, 'collections')
        except ConfigParser.NoOptionError:
            config.set(function, 'collections', None)
        # Process fully the updated records
        weights = process_and_store(updated_recids, config, chunk_size)
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" %
                                                      (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        weights = None
        write_message("No new records added since last time this "
                      "rank method was executed")

    return weights, index_update_time
Example #29
def download_feed(feed_url, batch_size, delete_zip, new_sources,
                  directory, feed_location):
    """ Get list of entries from XML document """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
    xml_files = []
    entries = parse_feed(result_path)
    for fileUrl, fileName in entries:
        task_sleep_now_if_required()
        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))

        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
        else:
            fileUrl = fileUrl.replace(' ', '%20')
            try:
                write_message("Downloading %s to %s\n" % (fileUrl,
                                                          outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                _errors_detected.append(err)
                write_message("URL could not be opened: %s" % fileUrl)
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            try:
                xml_files.extend(extractAll(outFilename,
                                            delete_zip,
                                            directory))
            except BadZipfile:
                _errors_detected.append(err)
                write_message("Error BadZipfile %s", (outFilename,))
                task_update_status("CERROR")
                remove(outFilename)
    return xml_files
def bst_synchronize_recids(search_terms=SEARCH_TERMS, log_dir=None,
                           collection=COLLECTION, batch_size=BATCH_SIZE,
                           debug=False, remote_ids=None):
    """Synchronize record IDs between the CERN Document Server (CDS) and Inspire

This BibTasklet is intended to be a general purpose replacement for
'bst_inspire_cds_synchro' and 'bst_update_cds_inspire_id', it should
be executable on both CDS and Inspire.

Generally there should be no need to modify these parameters, the
script uses CFG_INSPIRE_SITE and CFG_CERN_SITE from invenio.conf
to determine what type of Invenio instance we're running on. These
parameters will be set by default to the correct values to
synchronise all IDs, though you may want to limit records manually.

Parameters:
 search_terms - The term to use to get record IDs
                (Default "035:<LOCAL>)
 log_dir - The directory to store the log file in
           (Defaults to CFG_TMPSHAREDDIR)
 collection - What collection to take from
              (Default is no collection)
 batch_size - How many records to try and amend at once
              (Default 200)
 debug - If True, this script will run against the TEST instances
         (Default false)
 remote_ids - Comma-separated values of remote IDs; if this is
              specified, remote IDs will not be searched for.
    """
    configure_globals(search_terms, log_dir, debug)
    _print("All messages will be logged to %s/%s" % (LOG_DIR, LOG_FILE))

    if not remote_ids:
        task_update_progress("Finding remote records on %s with %s IDs"
                             % (REMOTE_INSTANCE, LOCAL_INSTANCE))
        remote_ids = get_remote_ids(search_terms, collection)
    else:
        remote_ids = [int(rid) for rid in remote_ids.split(',')]

    task_sleep_now_if_required(can_stop_too=True)
    task_update_progress("Matching remote IDs to local records")
    missing_ids = match_remote_ids(remote_ids)

    count_appends, count_problems = match_missing_ids(missing_ids, batch_size)

    _print("======================== FINAL SCORE ========================", 1)
    _print(" Records matched: %d" % (len(remote_ids)-len(missing_ids)), 1)
    _print(" Records appended: %d" % count_appends, 1)
    _print(" IDs not matched (broken link!): %d" % count_problems, 1)
    _print("=============================================================", 1)

    _print("Finishing, messages logged to: %s/%s" % (LOG_DIR, LOG_FILE))

    return True
def submit_records(records_filename,
                   records_list,
                   mode,
                   directory,
                   taskid=0,
                   silent=False,
                   devmode=False,
                   subject=None):
    """
    Performs the logic to submit given file (filepath) of records
    either by e-mail or using BibUpload with given mode.

    Taskid is given to indicate if the task submission should wait for any
    previously submitted tasks.

    The submission can also be made "silent" in the sense of not
    updating the modification date of the records.

    @param records_filename: filepath to XML file containing records.
    @type records_filename: string

    @param records_list: list of APSRecord objects for records
    @type records_list: list

    @param mode: the submission mode to use ("email" or a BibUpload mode)
    @type mode: string

    @param taskid: bibsched taskid, wait for task to complete before submission
    @type taskid: int

    @param silent: do not update the modification date of the records
    @type silent: bool

    @return: returns the given taskid upon submission, or True/False from email.
    """
    if devmode:
        return None
    now = datetime.datetime.now()
    if not subject:
        subject = "APS harvest results: %s" % (
            now.strftime("%Y-%m-%d %H:%M:%S"), )

    # Check if we should create bibupload or e-mail
    if mode == "email":
        # Lets parse the records and find our IDs.
        list_of_dois = []
        for record in records_list:
            # We strip away the first part of the DOI for readability.
            list_of_dois.append('/'.join(record.doi.split('/')[1:]))
        # We send an e-mail to CFG_APSHARVEST_EMAIL and put file on AFS.
        body = "Harvested new records: %s" % (records_filename, )
        try:
            try:
                shutil.move(records_filename, directory)
                records_filename = os.path.join(
                    directory, os.path.basename(records_filename))
                body = "Harvested new records on %s. They are located here:\n %s" % \
                       (now.strftime("%Y-%m-%d %H:%M:%S"), records_filename)
            except IOError as e:
                # Some IOError
                body = "Error while harvesting records: \nError saving %s - %s" % \
                       (records_filename, str(e))
                raise e
        finally:
            body = "%s\nRecords harvested (%s total):\n%s\n" % (
                body, str(len(list_of_dois)), "\n".join(list_of_dois))
            res = submit_records_via_mail(subject, body)
            write_message("Sent e-mail to %s with path to %s" %
                          (CFG_APSHARVEST_EMAIL, records_filename))
            return res
    else:
        # We submit a BibUpload task and wait for it to finish
        task_update_progress("Waiting for task to finish")

        if taskid != 0:
            write_message("Going to wait for %d to finish" % (taskid, ))

        while not can_launch_bibupload(taskid):
            # Lets wait until the previously launched task exits.
            task_sleep_now_if_required(can_stop_too=False)
            time.sleep(5.0)

        taskid = submit_bibupload_for_records(mode, records_filename, silent)
        write_message("Submitted BibUpload task #%s with mode %s" %
                      (str(taskid), mode))
        return taskid
Example #32
def run_bibsort_update(recids=None, method_list=None):
    """Updates bibsort tables for the methods in method_list
    and for the records in recids.

    If recids is None: recids = all records that have been modified
    or inserted since last update

    If method_list is None: method_list = all the methods available
    in bsrMETHOD table"""

    write_message('Initial data for run_bibsort_update method: ' \
                  'number of recids = %s; method_list=%s' \
                  %(str(len(recids)), method_list), verbose=5)
    write_message('Updating sorting data.')

    bibsort_methods, errors = get_bibsort_methods_details(method_list)
    if errors:
        return False
    method_list = bibsort_methods.keys()
    if not method_list:
        write_message('No methods found in bsrMETHOD table.. exiting.')
        return True

    #we could have 4 types of methods:
    #(i) RNK methods -> they should be rebalanced, not updated
    #(ii) RNK methods to delete -> we should delete their data
    #(iii) non RNK methods to update
    #(iv) non RNK methods that are new -> they should be rebalanced(sorted), not updated
    #check which of the methods are RNK methods (they do not need modified recids)
    rnk_methods = get_rnk_methods(bibsort_methods)
    rnk_methods_updated, rnk_methods_deleted = get_modified_rnk_methods(
        rnk_methods, bibsort_methods)
    #check which of the methods have no data, so they are actually new,
    #so they need balancing(sorting) instead of updating
    non_rnk_methods = [
        method for method in bibsort_methods.keys()
        if method not in rnk_methods
    ]
    non_rnk_methods_updated, non_rnk_methods_inserted = get_modified_non_rnk_methods(
        non_rnk_methods)

    #(i) + (iv)
    methods_to_balance = rnk_methods_updated + non_rnk_methods_inserted
    if methods_to_balance:  # several methods require rebalancing(sorting) and not updating
        return run_bibsort_rebalance(methods_to_balance)

    #(ii)
    #remove the data for the ranking methods that have been deleted
    for method in rnk_methods_deleted:
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Deleting data for method %s" % method)
        write_message('Starting deleting the data for RNK method %s' % method,
                      verbose=5)
        executed_ok = delete_bibsort_data_for_method(
            bibsort_methods[method]['id'])
        if not executed_ok:
            write_message('Method %s could not be deleted correctly, aborting..' \
                          %method, sys.stderr)
            return False

    #(iii)
    #methods to actually update
    if non_rnk_methods_updated:  # we want to update some 'normal'(not RNK) tables, so we need recids
        update_timestamp = False
        if not recids:
            recids = get_modified_or_inserted_recs(non_rnk_methods_updated)
            if recids == 0:  #error signal
                return False
            if not recids:
                write_message("No records inserted or modified in bibrec table " \
                          "since the last update of bsrMETHODDATA.")
                return True
            write_message("These records have been recently modified/inserted: %s" \
                  %str(recids), verbose=5)
            update_timestamp = True
        recids_i = intbitset(recids)
        for method in non_rnk_methods_updated:
            task_sleep_now_if_required(can_stop_too=True)
            task_update_progress("Updating method %s" % method)
            write_message('Starting updating method %s' % method, verbose=5)
            executed_ok = update_bibsort_tables(recids_i, method,
                                                update_timestamp)
            if not executed_ok:
                write_message('Method %s could not be executed correctly, aborting..' \
                          %method, sys.stderr)
                return False
    return True
def task_run_core():
    """
    Run daemon
    """
    write_message("Starting...")
    if task_get_option("update-borrowers"):
        write_message("Started update-borrowers")
        list_of_borrowers = db.get_all_borrowers()
        total_borrowers = len(list_of_borrowers)

        for done, borrower in enumerate(list_of_borrowers):
            user_id = borrower[0]
            update_user_info_from_ldap(user_id)
            if done % 10 == 0:
                task_update_progress("Borrower: updated %d out of %d." %
                                     (done, total_borrowers))
                task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Borrower: updated %d out of %d." %
                             (done + 1, total_borrowers))
        write_message("Updated %d out of %d total borrowers" %
                      (done + 1, total_borrowers))

    if task_get_option("update-requests"):
        write_message("Started update-requests")
        list_of_reqs = db.get_loan_request_by_status(
            CFG_BIBCIRCULATION_REQUEST_STATUS_WAITING)

        for (_request_id, recid, bc, _name, borrower_id, _library, _location,
             _date_from, _date_to, _request_date) in list_of_reqs:
            description = db.get_item_description(bc)
            list_of_barcodes = db.get_barcodes(recid, description)
            for barcode in list_of_barcodes:
                update_requests_statuses(barcode)
                task_sleep_now_if_required(can_stop_too=True)
        task_update_progress(
            "Requests due updated from 'waiting' to 'pending'.")
        write_message("Requests due updated from 'waiting' to 'pending'.")

    if task_get_option("overdue-letters"):
        write_message("Started overdue-letters")
        expired_loans = db.get_all_expired_loans()
        total_expired_loans = len(expired_loans)

        for done, (borrower_id, _bor_name, recid, _barcode, _loaned_on,
                   _due_date, _number_of_renewals, number_of_letters,
                   date_letters, _notes, loan_id) in enumerate(expired_loans):

            number_of_letters = int(number_of_letters)

            content = ''
            if number_of_letters == 0:
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES['RECALL1'], loan_id)
            elif number_of_letters == 1 and must_send_second_recall(
                    date_letters):
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES['RECALL2'], loan_id)
            elif number_of_letters == 2 and must_send_third_recall(
                    date_letters):
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)
            elif number_of_letters >= 3 and must_send_third_recall(
                    date_letters):
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)

            if content != '':
                title = book_title_from_MARC(recid)
                subject = "LOAN RECALL: " + title

                update_expired_loan(loan_id)
                send_overdue_letter(borrower_id,
                                    CFG_BIBCIRCULATION_LOANS_EMAIL, subject,
                                    content)

            if done % 10 == 0:
                task_update_progress("Loan recall: sent %d out of %d." %
                                     (done, total_expired_loans))
                task_sleep_now_if_required(can_stop_too=True)
        task_update_progress(
            "Loan recall: processed %d out of %d expires loans." %
            (done + 1, total_expired_loans))
        write_message("Processed %d out of %d expired loans." %
                      (done + 1, total_expired_loans))

        # Recalls for expired ILLs
        write_message("Started overdue-letters for Inter Library Loans")
        expired_ills = db.get_all_expired_ills()
        total_expired_ills = len(expired_ills)

        for done, (ill_id, borrower_id, item_info, number_of_letters,
                   date_letters) in enumerate(expired_ills):

            number_of_letters = int(number_of_letters)

            content = ''
            if number_of_letters == 0:
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES['ILL_RECALL1'], ill_id, ill=1)
            elif number_of_letters == 1 and must_send_second_recall(
                    date_letters):
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES['ILL_RECALL2'], ill_id, ill=1)
            elif number_of_letters == 2 and must_send_third_recall(
                    date_letters):
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES['ILL_RECALL3'], ill_id, ill=1)
            elif number_of_letters >= 3 and must_send_third_recall(
                    date_letters):
                content = generate_email_body(
                    CFG_BIBCIRCULATION_TEMPLATES['ILL_RECALL3'], ill_id, ill=1)

            if content != '' and looks_like_dictionary(item_info):
                item_info = eval(item_info)
                if item_info.has_key('title'):
                    book_title = item_info['title']
                    subject = "ILL RECALL: " + str(book_title)
                    update_expired_loan(loan_id=ill_id, ill=1)
                    send_overdue_letter(borrower_id,
                                        CFG_BIBCIRCULATION_ILLS_EMAIL, subject,
                                        content)
            if done % 10 == 0:
                task_update_progress("ILL recall: sent %d out of %d." %
                                     (done, total_expired_ills))
                task_sleep_now_if_required(can_stop_too=True)
        task_update_progress(
            "ILL recall: processed %d out of %d expired ills." %
            (done + 1, total_expired_ills))
        write_message("Processed %d out of %d expired ills." %
                      (done + 1, total_expired_ills))

    return 1
Example #34
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process,
                     recids):
    """
    BibReformat main task

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases (for selecting records). Some of these queries will be picked depending on the case
    @param sql_queries: a list of sql queries to be executed to select records to reformat.
    @param cds_query: a search query to be executed to select records to reformat
    @param process_format: whether to also process records that have no
        existing format cache (the '-without' option)
    @param process: whether to actually reformat the selected records
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now()

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field']      != "" or  \
       cds_query['collection'] != "" or  \
       cds_query['pattern']    != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(
                perform_request_search(req=None,
                                       of='id',
                                       c=cds_query['collection'],
                                       p=cds_query['pattern'],
                                       f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        start_date = latest_bibrank_run
        sql = """SELECT id, modification_date FROM bibrec
                 WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        def check_date(mod_date):
            return mod_date < latest_bibrank_run
        recIDs = intbitset([recid for recid, mod_date in run_sql(sql) \
                                                    if check_date(mod_date)])
        for r in recIDs:
            recIDs |= intbitset(get_cited_by(r))

### list of corresponding record IDs was retrieved
### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs) \
                                               + len(without_format)))
        write_message("Out of it records without existing cache: %d" %
                      len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

### Initialize main loop

    total_rec = 0  # Total number of records
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:  # FIXME: remove this
            # when migration from php to
            # python bibformat is done
            (total_rec_1, tbibformat_1,
             tbibupload_1) = iterate_over_old(recIDs, fmt)
        else:
            (total_rec_1, tbibformat_1,
             tbibupload_1) = iterate_over_new(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

### Iterate over all records prepared in list II (no_format)
    if process_format and process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:  # FIXME: remove this
            # when migration from php to
            # python bibformat is done
            (total_rec_2, tbibformat_2,
             tbibupload_2) = iterate_over_old(without_format, fmt)
        else:
            (total_rec_2, tbibformat_2,
             tbibupload_2) = iterate_over_new(without_format, fmt)
        total_rec += total_rec_2
        tbibformat += tbibformat_2
        tbibupload += tbibupload_2

### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)


### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
Example #35
def generate_sitemaps(sitemap_index_writer,
                      collection_names,
                      export_fulltext=True):
    """
    Generate sitemaps themselves. Return the list of generated sitemap files.
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    for lang in CFG_SITE_LANGS:
        writer.add_url(CFG_SITE_URL + '/?ln=%s' % lang,
                       lastmod=datetime.today(),
                       changefreq=DEFAULT_CHANGEFREQ_HOME,
                       priority=DEFAULT_PRIORITY_HOME,
                       alternate=True)
        nb_urls += 1
    write_message("... Getting all public records...")
    recids = get_all_public_records(collection_names)
    write_message("... Generating urls for %s records..." % len(recids))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                   or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' %
                                 (CFG_SITE_RECORD, recid),
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_RECORDS,
                                 priority=DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Sitemap for recid %s/%s" %
                                 (i + 1, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)
    write_message("... Generating urls for collections...")
    collections = get_all_public_collections(collection_names)
    for i, (collection, lastmod) in enumerate(collections):
        for lang in CFG_SITE_LANGS:
            if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                       or nb_urls >= MAX_RECORDS):
                sitemap_id += 1
                writer = SitemapWriter(sitemap_id)
                sitemap_index_writer.add_url(writer.get_sitemap_url())
            nb_urls = writer.add_url('%s/collection/%s?ln=%s' %
                                     (CFG_SITE_URL, quote(collection), lang),
                                     lastmod=lastmod,
                                     changefreq=DEFAULT_CHANGEFREQ_COLLECTIONS,
                                     priority=DEFAULT_PRIORITY_COLLECTIONS,
                                     alternate=True)
        if i % 100 == 0:
            task_update_progress("Sitemap for collection %s/%s" %
                                 (i + 1, len(collections)))
            task_sleep_now_if_required(can_stop_too=True)
    if export_fulltext:
        write_message("... Generating urls for fulltexts...")
        recids = filter_fulltexts(recids)
        for i, (recid, lastmod) in enumerate(recids):
            if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                       or nb_urls >= MAX_RECORDS):
                sitemap_id += 1
                writer = SitemapWriter(sitemap_id)
                sitemap_index_writer.add_url(writer.get_sitemap_url())
            nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s/files' %
                                     (CFG_SITE_RECORD, recid),
                                     lastmod=lastmod,
                                     changefreq=DEFAULT_CHANGEFREQ_FULLTEXTS,
                                     priority=DEFAULT_PRIORITY_FULLTEXTS)
            if i % 100 == 0:
                task_update_progress("Sitemap for files page %s/%s" %
                                     (i, len(recids)))
                task_sleep_now_if_required(can_stop_too=True)

    write_message("... Generating urls for comments...")
    recids = filter_comments(recids)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                   or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s/comments' %
                                 (CFG_SITE_RECORD, recid),
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_COMMENTS,
                                 priority=DEFAULT_PRIORITY_COMMENTS)
        if i % 100 == 0:
            task_update_progress("Sitemap for comments page %s/%s" %
                                 (i, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)
    write_message("... Generating urls for reviews")
    recids = filter_reviews(recids)
    for i, (recid, lastmod) in enumerate(recids):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE
                                   or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            write_message("")
            writer = SitemapWriter(sitemap_id)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s/reviews' %
                                 (CFG_SITE_RECORD, recid),
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_REVIEWS,
                                 priority=DEFAULT_PRIORITY_REVIEWS)
        if i % 100 == 0:
            task_update_progress("Sitemap for reviews page %s/%s" %
                                 (i, len(recids)))
            task_sleep_now_if_required(can_stop_too=True)
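
The rotation pattern above is repeated four times: once the current sitemap file has grown past MAX_SIZE or collected MAX_RECORDS URLs, a fresh SitemapWriter is opened and registered in the sitemap index. A minimal sketch of that idea, using a hypothetical make_writer factory in place of the SitemapWriter constructor:

def rotate_sitemap_if_needed(writer, sitemap_index_writer, sitemap_id,
                             nb_urls, make_writer,
                             max_size=MAX_SIZE, max_records=MAX_RECORDS):
    """Return (writer, sitemap_id), opening a new sitemap file and adding
    it to the index when the current one has grown past the limits.
    As in the loop above, the limits are only checked every 100 URLs."""
    if nb_urls % 100 == 0 and (writer.get_size() >= max_size
                               or nb_urls >= max_records):
        sitemap_id += 1
        writer = make_writer(sitemap_id)
        sitemap_index_writer.add_url(writer.get_sitemap_url())
    return writer, sitemap_id
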
def task_run_core():
    """ Reimplement to add the body of the task."""
##
## ------->--->time--->------>
##  (-1)  |   ( 0)    |  ( 1)
##        |     |     |
## [T.db] |  [T.fc]   | [T.db]
##        |     |     |
##        |<-tol|tol->|
##
## the above is the compare_timestamps_with_tolerance result "diagram"
## [T.db] stands for the database timestamp and [T.fc] for the file cache timestamp
## ( -1, 0, 1) stand for the returned value
## tol stands for the tolerance in seconds
##
## When a record has been added to or deleted from one of the collections, T.db becomes greater than T.fc,
## and when webcoll runs it does a full run: it recalculates the reclists and nbrecs, and since it updates the
## collections db table it also updates T.db. T.fc is set to the moment the task started running, thus
## slightly before T.db (practically the time distance between the start of the task and the last call of
## update_reclist). Therefore when webcoll runs again, even if no database changes have taken place in the
## meanwhile, it does a full run (because compare_timestamps_with_tolerance returns 0). This time, though, if
## no database changes have taken place, T.db stays the same while T.fc is updated, so if webcoll runs
## once more it will not do a full run. (A minimal sketch of the comparison helper follows this function.)
##
    task_run_start_timestamp = get_current_time_timestamp()
    colls = []
    # decide whether we need to run or not, by comparing last updated timestamps:
    write_message("Database timestamp is %s." % get_database_last_updated_timestamp(), verbose=3)
    write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp(), verbose=3)
    if task_has_option("part"):
        write_message("Running cache update part %s only." % task_get_option("part"), verbose=3)
    if check_nbrecs_for_all_external_collections() or task_has_option("force") or \
    compare_timestamps_with_tolerance(get_database_last_updated_timestamp(),
                                        get_cache_last_updated_timestamp(),
                                        CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE) >= 0:
        ## either forced update was requested or cache is not up to date, so recreate it:
        # firstly, decide which collections to do:
        if task_has_option("collection"):
            coll = get_collection(task_get_option("collection"))
            colls.append(coll)
            if task_has_option("recursive"):
                r_type_descendants = coll.get_descendants(type='r')
                colls += r_type_descendants
                v_type_descendants = coll.get_descendants(type='v')
                colls += v_type_descendants
        else:
            res = run_sql("SELECT name FROM collection ORDER BY id")
            for row in res:
                colls.append(get_collection(row[0]))
        # secondly, update collection reclist cache:
        if task_get_option('part', 1) == 1:
            i = 0
            for coll in colls:
                i += 1
                write_message("%s / reclist cache update" % coll.name)
                if str(coll.dbquery).startswith("hostedcollection:"):
                    coll.set_nbrecs_for_external_collection()
                else:
                    coll.calculate_reclist()
                task_sleep_now_if_required()
                coll.update_reclist()
                task_update_progress("Part 1/2: done %d/%d" % (i, len(colls)))
                task_sleep_now_if_required(can_stop_too=True)
        # thirdly, update collection webpage cache:
        if task_get_option("part", 2) == 2:
            i = 0
            for coll in colls:
                i += 1
                write_message("%s / webpage cache update" % coll.name)
                coll.update_webpage_cache()
                task_update_progress("Part 2/2: done %d/%d" % (i, len(colls)))
                task_sleep_now_if_required(can_stop_too=True)

        # finally update the cache last updated timestamp:
        # (but only when all collections were updated, not when only
        # some of them were forced-updated as per admin's demand)
        if not task_has_option("collection"):
            set_cache_last_updated_timestamp(task_run_start_timestamp)
            write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp(), verbose=3)
    else:
        ## cache up to date, we don't have to run
        write_message("Collection cache is up to date, no need to run.")
    ## we are done:
    return True
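
A minimal illustration of how a helper with the semantics described in the diagram above could be written; this is an assumption-based sketch (timestamps as 'YYYY-MM-DD HH:MM:SS' strings, tolerance in seconds), not the Invenio implementation of compare_timestamps_with_tolerance:

import time

def compare_timestamps_with_tolerance_sketch(t1, t2, tolerance=0):
    """Return -1 if t1 is older than t2 by more than the tolerance,
    0 if the two differ by at most the tolerance, and 1 if t1 is newer."""
    fmt = '%Y-%m-%d %H:%M:%S'
    s1 = time.mktime(time.strptime(t1, fmt))
    s2 = time.mktime(time.strptime(t2, fmt))
    if s1 < s2 - tolerance:
        return -1
    elif s1 > s2 + tolerance:
        return 1
    return 0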
Example #37
def process_batch_job(batch_job_file):
    """ Processes a batch job description dictionary

    @param batch_job_file: a fullpath to a batch job file
    @type batch_job_file: string
    @return: 1 if the process was successful, 0 if not
    @rtype: int
    """
    def upload_marcxml_file(marcxml):
        """ Creates a temporary marcxml file and sends it to bibupload
        """
        xml_filename = 'bibencode_' + str(batch_job['recid']) + '_' + str(
            uuid.uuid4()) + '.xml'
        xml_filename = os.path.join(invenio.config.CFG_TMPSHAREDDIR,
                                    xml_filename)
        xml_file = file(xml_filename, 'w')
        xml_file.write(marcxml)
        xml_file.close()
        targs = ['-c', xml_filename]
        task_low_level_submission('bibupload', 'bibencode', *targs)

    #---------#
    # GENERAL #
    #---------#

    _task_write_message("----------- Handling Master -----------")

    ## Check the validity of the batch file here
    batch_job = json_decode_file(batch_job_file)

    ## Sanitise batch description and raise errors
    batch_job = sanitise_batch_job(batch_job)

    ## Check if the record exists
    if record_exists(batch_job['recid']) < 1:
        raise Exception("Record not found")

    recdoc = BibRecDocs(batch_job['recid'])

    #--------------------#
    # UPDATE FROM MASTER #
    #--------------------#

    ## We want to add new stuff to the video's record, using the master as input
    if getval(batch_job, 'update_from_master'):
        found_master = False
        bibdocs = recdoc.list_bibdocs()
        for bibdoc in bibdocs:
            bibdocfiles = bibdoc.list_all_files()
            for bibdocfile in bibdocfiles:
                comment = bibdocfile.get_comment()
                description = bibdocfile.get_description()
                subformat = bibdocfile.get_subformat()
                m_comment = getval(batch_job, 'bibdoc_master_comment', comment)
                m_description = getval(batch_job, 'bibdoc_master_description',
                                       description)
                m_subformat = getval(batch_job, 'bibdoc_master_subformat',
                                     subformat)
                if (comment == m_comment and description == m_description
                        and subformat == m_subformat):
                    found_master = True
                    batch_job['input'] = bibdocfile.get_full_path()
                    ## Get the aspect ratio from the record
                    try:
                        ## Assumes pbcore metadata mapping
                        batch_job['aspect'] = get_fieldvalues(
                            batch_job['recid'],
                            CFG_BIBENCODE_ASPECT_RATIO_MARC_FIELD)[0]
                    except IndexError:
                        pass
                    break
            if found_master:
                break
        if not found_master:
            _task_write_message("Video master for record %d not found" %
                                batch_job['recid'])
            task_update_progress("Video master for record %d not found" %
                                 batch_job['recid'])
            ## Maybe send an email?
            return 1

    ## Clean the job to do no upscaling etc
    if getval(batch_job, 'assure_quality'):
        batch_job = clean_job_for_quality(batch_job)

    global _BATCH_STEPS
    _BATCH_STEPS = len(batch_job['jobs'])

    ## Generate the docname from the input filename's name or given name
    bibdoc_video_docname, bibdoc_video_extension = decompose_file(
        batch_job['input'])[1:]
    if not bibdoc_video_extension or getval(batch_job,
                                            'bibdoc_master_extension'):
        bibdoc_video_extension = getval(batch_job, 'bibdoc_master_extension')
    if getval(batch_job, 'bibdoc_master_docname'):
        bibdoc_video_docname = getval(batch_job, 'bibdoc_master_docname')

    write_message("Creating BibDoc for %s" % bibdoc_video_docname)
    ## If the bibdoc exists, receive it
    if bibdoc_video_docname in recdoc.get_bibdoc_names():
        bibdoc_video = recdoc.get_bibdoc(bibdoc_video_docname)
    ## Create a new bibdoc if it does not exist
    else:
        bibdoc_video = recdoc.add_bibdoc(docname=bibdoc_video_docname)

    ## Get the directory of the newly created bibdoc to copy stuff there
    bibdoc_video_directory = bibdoc_video.get_base_dir()

    #--------#
    # MASTER #
    #--------#
    if not getval(batch_job, 'update_from_master'):
        if getval(batch_job, 'add_master'):
            ## Generate the right name for the master
            ## The master should be hidden first and then renamed
            ## when it is really available
            ## !!! FIX !!!
            _task_write_message("Adding %s master to the BibDoc" %
                                bibdoc_video_docname)
            master_format = compose_format(
                bibdoc_video_extension,
                getval(batch_job, 'bibdoc_master_subformat', 'master'))
            ## If a file of the same format is there, something is wrong, remove it!
            ## it might be caused by a previous corrupted submission etc.
            if bibdoc_video.format_already_exists_p(master_format):
                bibdoc_video.delete_file(master_format, 1)
            bibdoc_video.add_file_new_format(
                batch_job['input'],
                version=1,
                description=getval(batch_job, 'bibdoc_master_description'),
                comment=getval(batch_job, 'bibdoc_master_comment'),
                docformat=master_format)

    #-----------#
    # JOBS LOOP #
    #-----------#

    return_code = 1
    global _BATCH_STEP

    for job in batch_job['jobs']:

        _task_write_message("----------- Job %s of %s -----------" %
                            (_BATCH_STEP, _BATCH_STEPS))

        ## Try to substitute docname with master docname
        if getval(job, 'bibdoc_docname'):
            job['bibdoc_docname'] = Template(
                job['bibdoc_docname']).safe_substitute(
                    {'bibdoc_master_docname': bibdoc_video_docname})

        #-------------#
        # TRANSCODING #
        #-------------#

        if job['mode'] == 'encode':

            ## Skip the job if assure_quality is not set and marked as fallback
            if not getval(batch_job, 'assure_quality') and getval(
                    job, 'fallback'):
                continue

            if getval(job, 'profile'):
                profile = get_encoding_profile(job['profile'])
            else:
                profile = None
            ## We need an extension defined for the video container
            bibdoc_video_extension = getval(job, 'extension',
                                            getval(profile, 'extension'))
            if not bibdoc_video_extension:
                raise Exception("No container/extension defined")
            ## Get the docname and subformat
            bibdoc_video_subformat = getval(job, 'bibdoc_subformat')
            bibdoc_slave_video_docname = getval(job, 'bibdoc_docname',
                                                bibdoc_video_docname)
            ## The subformat is incompatible with ffmpeg's naming convention,
            ## so we encode without it and rename the file afterwards
            bibdoc_video_fullpath = compose_file(bibdoc_video_directory,
                                                 bibdoc_slave_video_docname,
                                                 bibdoc_video_extension)
            _task_write_message(
                "Transcoding %s to %s;%s" %
                (bibdoc_slave_video_docname, bibdoc_video_extension,
                 bibdoc_video_subformat))
            ## We encode now directly into the bibdocs directory
            encoding_result = encode_video(
                input_file=batch_job['input'],
                output_file=bibdoc_video_fullpath,
                acodec=getval(job, 'audiocodec'),
                vcodec=getval(job, 'videocodec'),
                abitrate=getval(job, 'audiobitrate'),
                vbitrate=getval(job, 'videobitrate'),
                resolution=getval(job, 'resolution'),
                passes=getval(job, 'passes', 1),
                special=getval(job, 'special'),
                specialfirst=getval(job, 'specialfirst'),
                specialsecond=getval(job, 'specialsecond'),
                metadata=getval(job, 'metadata'),
                width=getval(job, 'width'),
                height=getval(job, 'height'),
                aspect=getval(batch_job, 'aspect'),  # Aspect for every job
                profile=getval(job, 'profile'),
                update_fnc=_task_update_overall_status,
                message_fnc=_task_write_message)
            return_code &= encoding_result
            ## only on success
            if encoding_result:
                ## Rename it, adding the subformat
                os.rename(
                    bibdoc_video_fullpath,
                    compose_file(bibdoc_video_directory,
                                 bibdoc_slave_video_docname,
                                 bibdoc_video_extension,
                                 bibdoc_video_subformat, 1))
                bibdoc_video._build_file_list()
                bibdoc_video_format = compose_format(bibdoc_video_extension,
                                                     bibdoc_video_subformat)
                if getval(job, 'bibdoc_comment'):
                    bibdoc_video.set_comment(getval(job, 'bibdoc_comment'),
                                             bibdoc_video_format)
                if getval(job, 'bibdoc_description'):
                    bibdoc_video.set_description(
                        getval(job, 'bibdoc_description'), bibdoc_video_format)

        #------------#
        # EXTRACTION #
        #------------#

        # if there are multiple extraction jobs, all the produced files
        # with the same name will be in the same bibdoc! Make sure that
        # you use different subformats or docname templates to avoid
        # conflicts.

        if job['mode'] == 'extract':
            if getval(job, 'profile'):
                profile = get_extract_profile(job['profile'])
            else:
                profile = {}
            bibdoc_frame_subformat = getval(job, 'bibdoc_subformat')
            _task_write_message("Extracting frames to temporary directory")
            tmpdir = invenio.config.CFG_TMPDIR + "/" + str(uuid.uuid4())
            os.mkdir(tmpdir)
            #Move this to the batch description
            bibdoc_frame_docname = getval(job, 'bibdoc_docname',
                                          bibdoc_video_docname)
            tmpfname = (
                tmpdir + "/" + bibdoc_frame_docname + '.' +
                getval(profile, 'extension', getval(job, 'extension', 'jpg')))
            extraction_result = extract_frames(
                input_file=batch_job['input'],
                output_file=tmpfname,
                size=getval(job, 'size'),
                positions=getval(job, 'positions'),
                numberof=getval(job, 'numberof'),
                width=getval(job, 'width'),
                height=getval(job, 'height'),
                aspect=getval(batch_job, 'aspect'),
                profile=getval(job, 'profile'),
                update_fnc=_task_update_overall_status,
            )
            return_code &= extraction_result

            ## only on success:
            if extraction_result:
                ## For every filename in the temporary directory, create a bibdoc
                ## that contains all sizes of the extracted frame
                files = os.listdir(tmpdir)
                for filename in files:
                    ## The docname was altered by BibEncode extract through substitution
                    ## Retrieve it from the filename again
                    bibdoc_frame_docname, bibdoc_frame_extension = os.path.splitext(
                        filename)
                    _task_write_message("Creating new bibdoc for %s" %
                                        bibdoc_frame_docname)
                    ## If the bibdoc exists, receive it
                    if bibdoc_frame_docname in recdoc.get_bibdoc_names():
                        bibdoc_frame = recdoc.get_bibdoc(bibdoc_frame_docname)
                    ## Create a new bibdoc if it does not exist
                    else:
                        bibdoc_frame = recdoc.add_bibdoc(
                            docname=bibdoc_frame_docname)

                    ## The filename including path from tmpdir
                    fname = os.path.join(tmpdir, filename)

                    bibdoc_frame_format = compose_format(
                        bibdoc_frame_extension, bibdoc_frame_subformat)
                    ## Same as with the master: if the format already exists,
                    ## overwrite it, because something went wrong before
                    if bibdoc_frame.format_already_exists_p(
                            bibdoc_frame_format):
                        bibdoc_frame.delete_file(bibdoc_frame_format, 1)
                    _task_write_message("Adding %s jpg;%s to BibDoc" %
                                        (bibdoc_frame_docname,
                                         getval(job, 'bibdoc_subformat')))
                    bibdoc_frame.add_file_new_format(
                        fname,
                        version=1,
                        description=getval(job, 'bibdoc_description'),
                        comment=getval(job, 'bibdoc_comment'),
                        docformat=bibdoc_frame_format)
            ## Remove the temporary folders
            _task_write_message("Removing temporary directory")
            shutil.rmtree(tmpdir)

        _BATCH_STEP = _BATCH_STEP + 1

    #-----------------#
    # FIX BIBDOC/MARC #
    #-----------------#

    _task_write_message("----------- Handling MARCXML -----------")

    ## Fix the BibDoc for all the videos previously created
    _task_write_message("Updating BibDoc of %s" % bibdoc_video_docname)
    bibdoc_video._build_file_list()

    ## Fix the MARC
    _task_write_message("Fixing MARC")
    cli_fix_marc({}, [batch_job['recid']], False)

    if getval(batch_job, 'collection'):
        ## Make the record visible by moving in from the collection
        marcxml = ("<record><controlfield tag=\"001\">%d</controlfield>"
                   "<datafield tag=\"980\" ind1=\" \" ind2=\" \">"
                   "<subfield code=\"a\">%s</subfield></datafield></record>"
                   ) % (batch_job['recid'], batch_job['collection'])
        upload_marcxml_file(marcxml)

    #---------------------#
    # ADD MASTER METADATA #
    #---------------------#

    if getval(batch_job, 'add_master_metadata'):
        _task_write_message("Adding master metadata")
        pbcore = pbcore_metadata(input_file=getval(batch_job, 'input'),
                                 pbcoreIdentifier=batch_job['recid'],
                                 aspect_override=getval(batch_job, 'aspect'))
        marcxml = format(pbcore, CFG_BIBENCODE_PBCORE_MARC_XSLT)
        upload_marcxml_file(marcxml)

    #------------------#
    # ADD MARC SNIPPET #
    #------------------#

    if getval(batch_job, 'marc_snippet'):
        marc_snippet = open(getval(batch_job, 'marc_snippet'))
        marcxml = marc_snippet.read()
        marc_snippet.close()
        upload_marcxml_file(marcxml)

    #--------------#
    # DELETE INPUT #
    #--------------#

    if getval(batch_job, 'delete_input'):
        _task_write_message("Deleting input file")
        # only if successful
        if not return_code:
            # only if input matches pattern
            if getval(batch_job, 'delete_input_pattern',
                      '') in getval(batch_job, 'input'):
                try:
                    os.remove(getval(batch_job, 'input'))
                except OSError:
                    pass

    #--------------#
    # NOTIFICATION #
    #--------------#

    ## Send Notification emails on errors
    if not return_code:
        if getval(batch_job, 'notify_user'):
            _notify_error_user(
                getval(batch_job, 'notify_user'),
                getval(batch_job, 'submission_filename', batch_job['input']),
                getval(batch_job, 'recid'),
                getval(batch_job, 'submission_title', ""))
            _task_write_message("Notify user because of an error")
        if getval(batch_job, 'notify_admin'):
            _task_write_message("Notify admin because of an error")
            if isinstance(getval(batch_job, 'notify_admin'), str):
                _notify_error_admin(batch_job, getval(batch_job,
                                                      'notify_admin'))

            else:
                _notify_error_admin(batch_job)
    else:
        if getval(batch_job, 'notify_user'):
            _task_write_message("Notify user because of success")
            _notify_success_user(
                getval(batch_job, 'notify_user'),
                getval(batch_job, 'submission_filename', batch_job['input']),
                getval(batch_job, 'recid'),
                getval(batch_job, 'submission_title', ""))
    return 1
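
For orientation, a hedged example of the kind of batch job description this function consumes; the key names are taken from the getval() calls above, but the exact schema and accepted values are defined by BibEncode's batch submission, so treat the values as illustrative placeholders only:

example_batch_job = {
    'recid': 123,                       # record to attach the video material to
    'input': '/path/to/master.mov',     # master file (unless update_from_master is used)
    'bibdoc_master_docname': 'talk',    # optional docname override
    'bibdoc_master_subformat': 'master',
    'add_master': True,                 # attach the master file itself
    'assure_quality': True,             # drop fallback jobs / avoid upscaling
    'jobs': [
        {'mode': 'encode',
         'profile': 'mp4_720p',         # hypothetical encoding profile name
         'bibdoc_subformat': '720p'},
        {'mode': 'extract',
         'numberof': 5,                 # number of frames to extract
         'bibdoc_subformat': 'posterframe'},
    ],
    'notify_user': 'uploader@example.org',
}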
Example #38
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    rules_to_reset = task_get_option("reset_rules")
    if rules_to_reset:
        write_message("Resetting the following rules: %s" % rules_to_reset)
        for rule in rules_to_reset:
            reset_rule_last_run(rule)
    plugins = load_plugins()
    rules = load_rules(plugins)
    write_message("Loaded rules: %s" % rules, verbose=9)
    task_set_option('plugins', plugins)
    recids_for_rules = get_recids_for_rules(rules)
    write_message("recids for rules: %s" % recids_for_rules, verbose=9)

    update_database = not (task_has_option('record_ids') or task_get_option(
        'no_upload', False) or task_get_option('no_tickets', False))

    if update_database:
        next_starting_dates = {}
        for rule_name, rule in rules.iteritems():
            next_starting_dates[rule_name] = get_next_starting_date(rule)

    all_recids = intbitset([])
    single_rules = set()
    batch_rules = set()
    for rule_name, rule_recids in recids_for_rules.iteritems():
        all_recids.union_update(rule_recids)
        if plugins[rules[rule_name]["check"]]["batch"]:
            batch_rules.add(rule_name)
        else:
            single_rules.add(rule_name)

    records_to_upload_holdingpen = []
    records_to_upload_replace = []
    records_to_submit_tickets = []
    for batch in iter_batches(all_recids, CFG_BATCH_SIZE):

        for rule_name in batch_rules:
            rule = rules[rule_name]
            rule_recids = recids_for_rules[rule_name]
            task_sleep_now_if_required(can_stop_too=True)
            records = []
            for i, record_id, record in batch:
                if record_id in rule_recids:
                    records.append(record)
            if len(records):
                check_records(rule, records)

        # Then run them through normal rules
        for i, record_id, record in batch:
            progress_percent = int(float(i) / len(all_recids) * 100)
            task_update_progress("Processing record %s/%s (%i%%)." %
                                 (i, len(all_recids), progress_percent))
            write_message("Processing record %s" % record_id)

            for rule_name in single_rules:
                rule = rules[rule_name]
                rule_recids = recids_for_rules[rule_name]
                task_sleep_now_if_required(can_stop_too=True)
                if record_id in rule_recids:
                    check_record(rule, record)

            if record.amended:
                if record.holdingpen:
                    records_to_upload_holdingpen.append(record)
                else:
                    records_to_upload_replace.append(record)

            if not record.valid:
                records_to_submit_tickets.append(record)

        if len(records_to_submit_tickets) >= CFG_BATCH_SIZE:
            Tickets(records_to_submit_tickets).submit()
            records_to_submit_tickets = []
        if len(records_to_upload_holdingpen) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_holdingpen, True)
            records_to_upload_holdingpen = []
        if len(records_to_upload_replace) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_replace, False)
            records_to_upload_replace = []

    ## In case there are still some remaining amended records
    if records_to_submit_tickets:
        Tickets(records_to_submit_tickets).submit()
    if records_to_upload_holdingpen:
        upload_amendments(records_to_upload_holdingpen, True)
    if records_to_upload_replace:
        upload_amendments(records_to_upload_replace, False)

    # Update the database with the last time each rule was run
    if update_database:
        for rule_name, rule in rules.iteritems():
            update_rule_last_run(rule_name, next_starting_dates[rule_name])

    return True
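
iter_batches() itself is not shown on this page; the loop above only relies on each batch being an iterable of (global_index, record_id, record) tuples with at most CFG_BATCH_SIZE items. A plausible sketch of that contract, using a hypothetical get_record loader, offered as an assumption rather than BibCheck's actual helper:

def iter_batches_sketch(recids, batch_size, get_record):
    """Yield lists of (global_index, recid, record) tuples,
    at most batch_size items per list. get_record is a hypothetical
    loader returning a record object for a given recid."""
    batch = []
    for i, recid in enumerate(sorted(recids)):
        batch.append((i, recid, get_record(recid)))
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch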
Example #39
def bst_apsharvest(dois="",
                   recids="",
                   query="",
                   records="",
                   new_mode="email",
                   update_mode="email",
                   from_date="",
                   until_date=None,
                   metadata="yes",
                   fulltext="yes",
                   hidden="yes",
                   match="no",
                   reportonly="no",
                   threshold_date=None,
                   devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           This means that new records are being looked for at APS servers.
           Active when from_date and until_date are given, and also when
           a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will harvest
           any new records since last run.

           If match is set to "yes" the records harvested will be matched against
           the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exists in the system, the
           task only harvests the fulltexts themselves and attaches them
           to the records.


    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")


    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type new_mode: string


    @param update_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit? "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string
    """
    task_update_progress("Parsing input parameters")

    # Validate modes
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o", "replace", "r",
                        "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid." %
                            (mode, ))

    # We hide fulltext by default
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # We do not use devmode by default
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not reportonly by default
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    # Unify all parameters into a dict using locals
    parameters = locals()

    # 1: We analyze parameters and fetch all requested records from APS
    final_record_list, new_harvest_date = get_records_to_harvest(parameters)
    write_message("Found %d record(s) to download." %
                  (len(final_record_list), ))

    if reportonly:
        write_message("'Report-only' mode. We exit now.")
        return

    if not final_record_list:
        # No records to harvest, quit.
        write_message("Nothing to harvest.")
        return

    # 2: Extract fulltext/metadata XML and upload bunches of
    #    records as configured
    job = APSHarvestJob(CFG_APSHARVEST_DIR)
    count = process_records(job, parameters, final_record_list)

    if parameters.get("from_date") == "last":
        # Harvest of new records from APS successful
        # we update last harvested date
        store_last_updated(None,
                           new_harvest_date,
                           name="apsharvest_api_download")
    # We are done
    write_message("Harvested %d records. (%d failed)" %
                  (count, len(job.records_failed)))
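
The string flags above ('yes'/'no') are normalised to booleans one by one; the same pattern could be captured by a small helper like the following, shown purely as a refactoring sketch (bst_apsharvest itself keeps the explicit blocks):

def parse_yes_no(value, default=True):
    """Map a 'yes'/'no' string flag to a boolean, falling back to default."""
    value = (value or "").strip().lower()
    if value == "yes":
        return True
    if value == "no":
        return False
    return default

# e.g. hidden = parse_yes_no(hidden, default=True)
#      match = parse_yes_no(match, default=False)
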
def perform_fulltext_harvest(record_list,
                             add_metadata,
                             attach_fulltext,
                             hidden_fulltext,
                             out_folder,
                             threshold_date=None,
                             journal_mappings=None):
    """
    For every record in given list APSRecord(record ID, DOI, date last
    updated), yield a APSRecord with added FFT dictionary containing URL to
    fulltext/metadata XML downloaded locally.

    If a download is unsuccessful, an error message is given.

    @return: tuple of (APSRecord, error_message)
    """
    count = 0
    request_end = None
    request_start = None
    for record in record_list:
        task_sleep_now_if_required(can_stop_too=False)
        # Unless this is the first request, lets sleep a bit
        if request_end and request_start:
            request_dt = request_end - request_start
            write_message("Checking request time (%d)" % (request_dt, ),
                          verbose=3)
            if count and request_dt > 0 and request_dt < CFG_APSHARVEST_REQUEST_TIMEOUT:
                write_message("Initiating sleep for %.1f seconds" %
                              (request_dt, ),
                              verbose=3)
                time.sleep(request_dt)

        count += 1
        task_update_progress("Harvesting record (%d/%d)" %
                             (count, len(record_list)))

        if not record.doi:
            msg = "No DOI found for record %d" % (record.recid or "", )
            write_message("Error: %s" % (msg, ), stream=sys.stderr)
            yield record, msg
            continue

        url = CFG_APSHARVEST_FULLTEXT_URL % {'doi': record.doi}
        result_file = os.path.join(out_folder,
                                   "%s.zip" % (record.doi.replace('/', '_')))
        try:
            request_start = time.time()
            if os.path.exists(result_file):
                # File already downloaded recently, lets see if it is the same
                file_last_modified = get_file_modified_date(result_file)
                if not compare_datetime_to_iso8601_date(
                        file_last_modified, record.last_modified):
                    # File is not older than APS version, we should not download.
                    raise APSHarvesterFileExits

            write_message("Trying to save to %s" % (result_file, ), verbose=5)

            result_file = download_url(url=url,
                                       download_to_file=result_file,
                                       content_type="zip",
                                       retry_count=5,
                                       timeout=60.0)
            write_message("Downloaded %s to %s" % (url, result_file),
                          verbose=2)
        except InvenioFileDownloadError, e:
            msg = "URL could not be opened: %s" % (url, )
            write_message("Error: %s" % (msg, ), stream=sys.stderr)
            yield record, msg
            continue

        except APSHarvesterFileExits:
            write_message("File exists at %s" % (result_file, ), verbose=2)
Example #41
def _task_run_core():
    """
    Runs the requested task in the bibsched environment.
    """
    if bibtask.task_get_option('update_personid'):
        record_ids = bibtask.task_get_option('record_ids')
        if record_ids:
            record_ids = map(int, record_ids)
        all_records = bibtask.task_get_option('all_records')

        bibtask.task_update_progress('Updating personid...')
        run_rabbit(record_ids, all_records)
        bibtask.task_update_progress('PersonID update finished!')

    if bibtask.task_get_option("disambiguate"):
        bibtask.task_update_progress('Performing full disambiguation...')
        run_tortoise(bool(bibtask.task_get_option("from_scratch")))
        bibtask.task_update_progress('Full disambiguation finished!')

    if bibtask.task_get_option("merge"):
        bibtask.task_update_progress('Merging results...')
        run_merge()
        bibtask.task_update_progress('Merging finished!')

    return 1
Example #42
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR,
                         logging=True):
    """
    bst_arxiv_doi_update
    Updates DOIs on documents harvested from ArXiv.

    Parameters:
        * input_uri - Link to new URI data
            DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
            NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
        * log_dir - Directory to store log files in
        * logging - True or False, default True
    """

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI...")

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    bibindex = ChunkedBibIndex(indexes='year,global,journal', user=SCRIPT_NAME)
    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retreiving DOI data")
        task_update_status("FAILED")
        return False
    root = tree.getroot()

    doi_count = 0
    new_count = 0
    missing_count = 0

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            record_xml = append_to_record(rec_id, doi, published_date)
            if record_xml:
                new_count += 1
                _print(
                    "* Now we will run the bibupload and bibindex for " +
                    str(rec_id) + " record", 5)
                _print(
                    "** We will upload the following xml code " +
                    repr(record_xml), 9)
                bibupload.add(record_xml)
                bibindex.add(rec_id)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
        else:
            missing_count += 1
            _print('No record found matching arxiv ID: ' + arxiv, 9)

    _print("======================== FINAL SCORE ========================", 1)
    _print("DOIs found and processed: " + str(doi_count), 1)
    _print("Arxiv IDs without corresponding records: " + str(missing_count), 1)
    _print("Records requiring appends: " + str(new_count), 1)

    if logging:
        task_update_progress("Logging...")
        write_list_to_file(log_dir, 'errors', ERRORS)
        write_list_to_file(log_dir, 'messages', MESSAGES)

    task_update_progress(SCRIPT_NAME +
                         " finished. %s DOIs processed, %s to add" %
                         (str(doi_count), str(new_count)))
    task_update_status("DONE")

    bibupload.__del__()
    bibindex.__del__()

    return True
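
The loop above only reads the 'doi', 'preprint_id' and 'published' attributes of <article> elements; a hedged, minimal example of such a feed document and of walking it the same way (the real feed at URI_DEFAULT may carry more fields):

import xml.etree.ElementTree as ET

SAMPLE_DOI_FEED = """<articles>
  <article doi="10.0000/placeholder.doi"
           preprint_id="arXiv:1234.5678"
           published="2013-06-03"/>
</articles>"""

sample_root = ET.fromstring(SAMPLE_DOI_FEED)
for sample_item in sample_root.iter('article'):
    # prints: 10.0000/placeholder.doi arXiv:1234.5678 2013-06-03
    print sample_item.get('doi'), sample_item.get('preprint_id'), \
        sample_item.get('published')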
Example #43
def bst_doi_timestamp(reset=0):
    prepate_doi_table()
    now = datetime.now()
    last_run = ((run_sql("SELECT max(creation_date) FROM doi")[0][0]
                 or datetime(2014, 1, 1)) -
                timedelta(days=4)).strftime("%Y-%m-%d")
    if int(reset):
        last_run = (datetime(2014, 1, 1) -
                    timedelta(days=4)).strftime("%Y-%m-%d")
    write_message("Retrieving DOIs modified since %s" % last_run)
    restart_on_error = True
    while restart_on_error:
        restart_on_error = False
        for publisher, re_match in CFG_SCOAP3_DOIS.items():
            task_update_progress("Retrieving DOIs for %s" % publisher)
            write_message("Retriving DOIs for %s" % publisher)
            try:
                res = get_all_modified_dois(publisher,
                                            last_run,
                                            re_match,
                                            debug=True)
                for doi in res:
                    if publisher == "10.1093":
                        db_entry = run_sql(
                            "SELECT doi, publication_date FROM doi WHERE doi=%s",
                            (doi, ))
                        pub_date = None
                        if 'published-online' in res[doi]:
                            if len(res[doi]['published-online']['date-parts']
                                   [0]) == 3:
                                pub_date = datetime.strptime(
                                    '-'.join(
                                        map(
                                            str, res[doi]['published-online']
                                            ['date-parts'][0])), "%Y-%m-%d")

                        write_message(db_entry)
                        if db_entry:
                            if db_entry[0][1]:
                                # publication date is in the system
                                continue
                            else:
                                if pub_date:
                                    run_sql(
                                        "UPDATE doi SET publication_date = %s WHERE doi=%s",
                                        (pub_date, doi))
                                else:
                                    continue
                        else:
                            write_message(
                                "New DOI discovered for publisher %s: %s, publication: %s"
                                % (publisher, doi, pub_date))
                            if pub_date:
                                run_sql(
                                    "INSERT INTO doi(doi, creation_date, publication_date) VALUES(%s, %s, %s)",
                                    (doi, now, pub_date))
                            else:
                                run_sql(
                                    "INSERT INTO doi(doi, creation_date) VALUES(%s, %s)",
                                    (doi, now))
                    else:
                        if run_sql("SELECT doi FROM doi WHERE doi=%s",
                                   (doi, )):
                            continue
                        write_message(
                            "New DOI discovered for publisher %s: %s" %
                            (publisher, doi))
                        run_sql(
                            "INSERT INTO doi(doi, creation_date) VALUES(%s, %s)",
                            (doi, now))
            except URLError as e:
                write_message("%s %s %s" % (publisher, last_run, re_match))
                write_message("Problem with connection! %s" % (e, ))
                #restart_on_error = True
            except socket.timeout as e:
                write_message("Timeout error %s" % (e, ))
                write_message("Finishing and rescheduling")
                #restart_on_error = True
            except ValueError as e:
                write_message("Value error in JSON string! %s" % (e, ))
Example #44
def download_feed(feed_url, delete_zip, new_sources, directory, feed_location):
    """ Get list of entries from XML document """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url, ))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
    xml_files = []
    entries = parse_feed(result_path)

    if not entries:
        return xml_files

    # look what files already exist
    # there are currently O(10^5) files in the directory tree rooted
    # at CFG_CONSYN_OUT_DIRECTORY and it is on AFS and takes upwards
    # of 5 minutes to walk.
    # might make sense to have a db table with already harvested files
    task_sleep_now_if_required()
    allfilenames = find_names_of_existing_files(CFG_CONSYN_OUT_DIRECTORY)
    task_sleep_now_if_required()

    for fileUrl, fileName in entries:
        if fileName in allfilenames:
            write_message(
                "Not downloading %s, found file with same name in %s" % (
                    fileName,
                    CFG_CONSYN_OUT_DIRECTORY,
                ))
            continue
        task_sleep_now_if_required()

        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        fileUrl = fileUrl.replace(' ', '%20')
        try:
            write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
            download_url(fileUrl, "zip", outFilename, 5, 60.0)
            new_sources.append(outFilename)
        except InvenioFileDownloadError as err:
            _errors_detected.append(err)
            write_message("URL could not be opened: %s" % fileUrl)
            write_message(str(err))
            write_message(traceback.format_exc()[:-1])
            task_update_status("CERROR")
            continue
        try:
            xml_files.extend(extractAll(outFilename, delete_zip, directory))
        except BadZipfile as err:
            _errors_detected.append(err)
            write_message("Error BadZipfile %s" % (outFilename, ))
            task_update_status("CERROR")
            remove(outFilename)

    return xml_files
Example #45
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    host = CFG_DATABASE_HOST
    port = CFG_DATABASE_PORT
    connection = None
    try:
        if task_get_option('slave') and not task_get_option('dump_on_slave_helper_mode'):
            connection = get_connection_for_dump_on_slave()
            write_message("Dump on slave requested")
            write_message("... checking if slave is well up...")
            check_slave_is_up(connection)
            write_message("... checking if slave is in consistent state...")
            check_slave_is_in_consistent_state(connection)
            write_message("... detaching slave database...")
            detach_slave(connection)
            write_message("... scheduling dump on slave helper...")
            helper_arguments = []
            if task_get_option("number"):
                helper_arguments += ["--number", str(task_get_option("number"))]
            if task_get_option("output"):
                helper_arguments += ["--output", str(task_get_option("output"))]
            if task_get_option("params"):
                helper_arguments += ["--params", str(task_get_option("params"))]
            if task_get_option("ignore_tables"):
                helper_arguments += ["--ignore-tables", str(task_get_option("ignore_tables"))]
            if task_get_option("compress"):
                helper_arguments += ["--compress"]
            if task_get_option("slave"):
                helper_arguments += ["--slave", str(task_get_option("slave"))]
            helper_arguments += ['-N', 'slavehelper', '--dump-on-slave-helper']
            task_id = task_low_level_submission('dbdump', task_get_task_param('user'), '-P4', *helper_arguments)
            write_message("Slave scheduled with ID %s" % task_id)
            task_update_progress("DONE")
            return True
        elif task_get_option('dump_on_slave_helper_mode'):
            write_message("Dumping on slave mode")
            connection = get_connection_for_dump_on_slave()
            write_message("... checking if slave is well down...")
            check_slave_is_down(connection)
            host = CFG_DATABASE_SLAVE

        task_update_progress("Reading parameters")
        write_message("Reading parameters started")
        output_dir = task_get_option('output', CFG_LOGDIR)
        output_num = task_get_option('number', 5)
        params = task_get_option('params', None)
        compress = task_get_option('compress', False)
        slave = task_get_option('slave', False)
        ignore_tables = task_get_option('ignore_tables', None)
        if ignore_tables:
            ignore_tables = get_table_names(ignore_tables)
        else:
            ignore_tables = None

        output_file_suffix = task_get_task_param('task_starting_time')
        output_file_suffix = output_file_suffix.replace(' ', '_') + '.sql'
        if compress:
            output_file_suffix = "%s.gz" % (output_file_suffix,)
        write_message("Reading parameters ended")

        # make dump:
        task_update_progress("Dumping database")
        write_message("Database dump started")

        if slave:
            output_file_prefix = 'slave-%s-dbdump-' % (CFG_DATABASE_NAME,)
        else:
            output_file_prefix = '%s-dbdump-' % (CFG_DATABASE_NAME,)
        output_file = output_file_prefix + output_file_suffix
        dump_path = output_dir + os.sep + output_file
        dump_database(dump_path,
                      host=host,
                      port=port,
                      params=params,
                      compress=compress,
                      ignore_tables=ignore_tables)
        write_message("Database dump ended")
    finally:
        if connection and task_get_option('dump_on_slave_helper_mode'):
            write_message("Reattaching slave")
            attach_slave(connection)
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_file_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
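The dump file name above embeds the task starting time, so pruning old dumps can rely on name order. A minimal sketch of what a pruning helper in the spirit of _delete_old_dumps might do (the helper name and signature here are illustrative, not the Invenio API):

import os

def prune_old_dumps(output_dir, prefix, keep):
    """Keep only the `keep` most recent dump files starting with `prefix`."""
    # Dump names embed 'task_starting_time', so lexicographic order is
    # also chronological order.
    candidates = sorted(f for f in os.listdir(output_dir)
                        if f.startswith(prefix))
    to_delete = candidates[:-keep] if keep > 0 else candidates
    for old_file in to_delete:
        os.remove(os.path.join(output_dir, old_file))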
Example #46
0
def bst_consyn_harvest(feed_url=None,
                       package=None,
                       feed_file=None,
                       package_list_file=None,
                       batch_size='500',
                       delete_zip='False',
                       submit='False',
                       threshold_date=None):
    """ Task to convert xml files from consyn.elsevier.com to Marc xml files.
    There are four execution modes:
    1. Download from an atom feed url.
    2. Extract and convert a zip package.
    3. Download from an atom feed file.
    4. Extract and convert a list of zip packages.

    The feed is stored on the file system under the folder "feeds".
    If no errors occur during the execution of the tasklet, the feed
    is deleted. Records may be recovered by running the tasklet again
    with modes 2, 3 or 4.

    :param feed_url: A URL to the atom feed.
    :type feed_url: string.

    :param package: A path to a zip package.
    :type package: string.

    :param feed_file: A path to an atom feed file.
    :type feed_file: string.

    :param package_list_file: A path to a file with a list of paths
                              to zip packages. The file must contain
                              the path to each package in a different
                              line.
    :type package_list_file: string.

    :param batch_size: The number of records contained in each output file.
    :type batch_size: string representation of an integer.

    :param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not.
    :type delete_zip: string representation of a boolean.

    :param submit: Flag to indicate whether the result files
                   should be submitted by email and uploaded
                   to the FTP server.
    :type submit: string representation of a boolean.

    :param threshold_date: Threshold date; only records published after
                           this date are converted.
    :type threshold_date: string in the format YYYY-MM-DD
    """
    if not feed_url:
        feed_url = "https://consyn.elsevier.com/batch/atom?key=%s" % \
                   (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    feed_location = ''

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning: batch_size parameter is not a valid integer; '
                      'the default value \'500\' has been used!\n')
    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning: delete_zip parameter is not '
                      'a valid Boolean (True/False); '
                      'the default value \'False\' has been used!\n')
    if submit.lower() == 'true':
        submit = True
    elif submit.lower() == 'false':
        submit = False
    else:
        submit = False
        write_message('Warning: submit parameter is not '
                      'a valid Boolean (True/False); '
                      'the default value \'False\' has been used!\n')
    if threshold_date:
        import time
        date_format = "%Y-%m-%d"
        try:
            date = datetime(*(time.strptime(threshold_date, date_format)[0:6]))
            threshold_date = date.strftime('%Y-%m-%d')
        except ValueError:
            write_message('Error: threshold_date parameter is not '
                          'in the right format; it should be of the '
                          'form "YYYY-MM-DD".')
            task_update_status("ERROR")
            return

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        makedirs(CFG_CONSYN_OUT_DIRECTORY)
    out_folder = CFG_CONSYN_OUT_DIRECTORY
    journal_mappings = get_kbs()['journals'][1]
    els = ElsevierPackage(CONSYN=True, journal_mappings=journal_mappings)

    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if package:
        xml_files = extract_package(package, delete_zip, out_folder,
                                    new_sources)
    elif package_list_file:
        package_list = []
        with open(package_list_file, 'r') as package_file:
            for line in package_file:
                line = line.strip()
                if line:
                    package_list.append(line)
        xml_files = extract_multiple_packages(package_list, delete_zip,
                                              new_sources, out_folder)
    elif feed_file:
        entries = parse_feed(feed_file)
        links = [a[0] for a in entries]
        package_list = [a[1] for a in entries]
        package_list = [
            join(CFG_CONSYN_OUT_DIRECTORY, a) for a in package_list
        ]
        for package in package_list:
            task_sleep_now_if_required()
            if not exists(package):
                index = package_list.index(package)
                link = links[index]
                link = link.replace(' ', '%20')
                try:
                    message = ("Downloading %s to %s\n" % (link, package))
                    write_message(message)
                    download_url(link, "zip", package, 5, 60.0)
                    package_list.append(package)
                except InvenioFileDownloadError as err:
                    message = "URL could not be opened: " + link
                    write_message(message)
                    write_message(str(err))
                    write_message(traceback.format_exc()[:-1])
                    task_update_status("CERROR")
                    continue
        xml_files = extract_multiple_packages(package_list, delete_zip,
                                              new_sources, out_folder)
    else:
        feeds_folder = join(CFG_CONSYN_OUT_DIRECTORY, 'feeds')
        if not exists(feeds_folder):
            makedirs(feeds_folder)
        date = datetime.now().strftime("%Y.%m.%d")
        feed_location = "feed-%s.xml" % date
        feed_location = join(feeds_folder, feed_location)
        xml_files = download_feed(feed_url, delete_zip, new_sources,
                                  out_folder, feed_location)
    task_update_progress("Converting files 2/3...")
    task_sleep_now_if_required()
    results = convert_files(xml_files,
                            els,
                            prefix=consyn_files,
                            threshold_date=threshold_date)
    for dummy, (status_code, result) in results.iteritems():
        if status_code == StatusCodes.OK:
            new_files.append(result)
    task_update_progress("Compiling output 3/3...")
    task_sleep_now_if_required()
    create_collection(batch_size, new_files, new_sources, out_folder, submit)
    if feed_location and not _errors_detected:
        remove(feed_location)
    for error in _errors_detected:
        write_message(str(error))
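The delete_zip and submit flags above are parsed with the same 'True'/'False' fallback pattern. A hypothetical helper could centralise it; parse_bool_flag is not part of the tasklet API, and write_message is assumed to come from the surrounding module:

def parse_bool_flag(value, name, default=False):
    """Interpret a 'True'/'False' string; fall back to `default` with a warning."""
    if value.lower() == 'true':
        return True
    if value.lower() == 'false':
        return False
    write_message('Warning: %s parameter is not a valid Boolean (True/False); '
                  'the default value %r has been used!' % (name, default))
    return default

# Possible usage inside bst_consyn_harvest:
#     delete_zip = parse_bool_flag(delete_zip, 'delete_zip')
#     submit = parse_bool_flag(submit, 'submit')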
def task_run_core():
    """Run the harvesting task.  The row argument is the oaiharvest task
    queue row, containing if, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    reposlist = []
    datelist = []
    dateflag = 0
    possible_postmodes = [code for code, dummy in CFG_OAI_POSSIBLE_POSTMODES]
    filepath_prefix = tmpHARVESTpath + "_" + str(
        task_get_task_param("task_id"))
    ### go ahead: build up the reposlist
    if task_get_option("repository") is not None:
        ### user requests harvesting from selected repositories
        write_message("harvesting from selected repositories")
        for reposname in task_get_option("repository"):
            row = get_row_from_reposname(reposname)
            if row == []:
                write_message("source name " + reposname + " is not valid")
                continue
            else:
                reposlist.append(row)
    else:
        ### user requests harvesting from all repositories
        write_message("harvesting from all repositories in the database")
        reposlist = get_all_rows_from_db()

    ### go ahead: check if user requested from-until harvesting
    if task_get_option("dates"):
        ### for each repos simply perform a from-until date harvesting...
        ### no need to update anything
        dateflag = 1
        for element in task_get_option("dates"):
            datelist.append(element)

    error_happened_p = False
    j = 0
    for repos in reposlist:
        j += 1
        task_sleep_now_if_required()
        reponame = str(repos[0][6])
        postmode = str(repos[0][9])
        setspecs = str(repos[0][10])
        harvested_files_list = []
        if postmode in possible_postmodes:
            # Harvest phase
            harvestpath = filepath_prefix + "_" + str(j) + "_" + \
                         time.strftime("%Y%m%d%H%M%S") + "_harvested"
            if dateflag == 1:
                task_update_progress("Harvesting %s from %s to %s (%i/%i)" % \
                                     (reponame, \
                                      str(datelist[0]),
                                      str(datelist[1]),
                                      j, \
                                      len(reposlist)))
                exit_code, file_list = oai_harvest_get(prefix=repos[0][2],
                                                       baseurl=repos[0][1],
                                                       harvestpath=harvestpath,
                                                       fro=str(datelist[0]),
                                                       until=str(datelist[1]),
                                                       setspecs=setspecs)
                if exit_code == 1:
                    write_message("source " + reponame + \
                                  " was harvested from " + str(datelist[0]) \
                                  + " to " + str(datelist[1]))
                    harvested_files_list = file_list
                else:
                    write_message("an error occurred while harvesting "
                                  "from source " + reponame +
                                  " for the dates chosen")
                    error_happened_p = True
                    continue

            elif dateflag != 1 and repos[0][7] is None and repos[0][8] != 0:
                write_message("source " + reponame + \
                              " was never harvested before - harvesting whole "
                              "repository")
                task_update_progress("Harvesting %s (%i/%i)" % \
                                     (reponame,
                                      j, \
                                      len(reposlist)))
                exit_code, file_list = oai_harvest_get(prefix=repos[0][2],
                                                       baseurl=repos[0][1],
                                                       harvestpath=harvestpath,
                                                       setspecs=setspecs)
                if exit_code == 1:
                    update_lastrun(repos[0][0])
                    harvested_files_list = file_list
                else:
                    write_message("an error occurred while harvesting from "
                                  "source " + reponame)
                    error_happened_p = True
                    continue

            elif dateflag != 1 and repos[0][8] != 0:
                ### check that update is actually needed,
                ### i.e. lastrun+frequency>today
                timenow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                lastrundate = re.sub(r'\.[0-9]+$', '',
                                     str(repos[0][7]))  # remove trailing .00
                timeinsec = int(repos[0][8]) * 60 * 60
                updatedue = add_timestamp_and_timelag(lastrundate, timeinsec)
                proceed = compare_timestamps_with_tolerance(updatedue, timenow)
                if proceed == 0 or proceed == -1:  #update needed!
                    write_message("source " + reponame +
                                  " is going to be updated")
                    fromdate = str(repos[0][7])
                    fromdate = fromdate.split()[0]  # get rid of time
                    # of the day for the moment
                    task_update_progress("Harvesting %s (%i/%i)" % \
                                         (reponame,
                                         j, \
                                         len(reposlist)))
                    exit_code, file_list = oai_harvest_get(
                        prefix=repos[0][2],
                        baseurl=repos[0][1],
                        harvestpath=harvestpath,
                        fro=fromdate,
                        setspecs=setspecs)
                    if exit_code == 1:
                        update_lastrun(repos[0][0])
                        harvested_files_list = file_list
                    else:
                        write_message("an error occurred while harvesting "
                                      "from source " + reponame)
                        error_happened_p = True
                        continue
                else:
                    write_message("source " + reponame +
                                  " does not need updating")
                    continue

            elif dateflag != 1 and repos[0][8] == 0:
                write_message("source " + reponame + \
                    " has frequency set to 'Never' so it will not be updated")
                continue

            # Harvesting done, now convert/extract/filter/upload as requested
            if len(harvested_files_list) < 1:
                write_message("No records harvested for %s" % (reponame, ))
                continue
            active_files_list = harvested_files_list
            # Convert phase
            if 'c' in postmode:
                converted_files_list = []
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Converting material harvested from %s (%i/%i)" % \
                                         (reponame, \
                                          i, \
                                          len(active_files_list)))
                    converted_file = filepath_prefix + "_" + str(i) + "_" + \
                        time.strftime("%Y%m%d%H%M%S") + "_converted"
                    converted_files_list.append(converted_file)
                    (exitcode,
                     err_msg) = call_bibconvert(config=str(repos[0][5]),
                                                harvestpath=active_file,
                                                convertpath=converted_file)
                    if exitcode == 0:
                        write_message("material harvested from source " +
                                      reponame + " was successfully converted")
                    else:
                        write_message(
                            "an error occurred while converting from " +
                            reponame + ': \n' + err_msg)
                        error_happened_p = True
                        continue
                # print stats:
                for converted_file in converted_files_list:
                    write_message("File %s contains %i records." % \
                                  (converted_file,
                                   get_nb_records_in_file(converted_file)))
                active_files_list = converted_files_list

            if 'e' in postmode:
                # Download tarball for each harvested/converted record, then run plotextractor.
                # Update converted xml files with generated xml or add it for upload
                extracted_files_list = []
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Extracting material harvested from %s (%i/%i)" % \
                                         (reponame, i, len(active_files_list)))
                    extracted_file = filepath_prefix + "_" + str(i) + "_" + \
                        time.strftime("%Y%m%d%H%M%S") + "_extracted"
                    extracted_files_list.append(extracted_file)
                    (exitcode,
                     err_msg) = call_plotextractor(active_file, extracted_file)
                    if exitcode == 0:
                        write_message("material harvested from source " +
                                      reponame + " was successfully extracted")
                    else:
                        write_message(
                            "an error occurred while extracting from " +
                            reponame + ': \n' + err_msg)
                        error_happened_p = True
                        continue
                # print stats:
                for extracted_file in extracted_files_list:
                    write_message("File %s contains %i records." % \
                                  (extracted_file,
                                   get_nb_records_in_file(extracted_file)))
                active_files_list = extracted_files_list

            # Filter-phase
            if 'f' in postmode:
                # first call bibfilter:
                res = 0
                uploaded = False
                i = 0
                for active_file in active_files_list:
                    i += 1
                    task_sleep_now_if_required()
                    task_update_progress("Filtering material harvested from %s (%i/%i)" % \
                                         (reponame, \
                                          i, \
                                          len(active_files_list)))
                    res += call_bibfilter(str(repos[0][11]), active_file)
                if len(active_files_list) > 0:
                    if res == 0:
                        write_message("material harvested from source " +
                                      reponame +
                                      " was successfully bibfiltered")
                    else:
                        write_message("an error occurred while bibfiltering "
                                      "harvest from " + reponame)
                        error_happened_p = True
                        continue
                # print stats:
                for active_file in active_files_list:
                    write_message("File %s contains %i records." % \
                        (active_file + ".insert.xml",
                        get_nb_records_in_file(active_file + ".insert.xml")))
                    write_message("File %s contains %i records." % \
                        (active_file + ".correct.xml",
                        get_nb_records_in_file(active_file + ".correct.xml")))
                    write_message("File %s contains %i records." % \
                        (active_file + ".append.xml",
                        get_nb_records_in_file(active_file + ".append.xml")))
                    write_message("File %s contains %i records." % \
                        (active_file + ".holdingpen.xml",
                        get_nb_records_in_file(active_file + ".holdingpen.xml")))

            # Upload files
            if "u" in postmode:
                if 'f' in postmode:
                    # upload filtered files
                    i = 0
                    for active_file in active_files_list:
                        task_sleep_now_if_required()
                        i += 1
                        if get_nb_records_in_file(active_file +
                                                  ".insert.xml") > 0:
                            task_update_progress("Uploading new records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".insert.xml", \
                                                  ["-i"], oai_src_id = repos[0][0])
                            uploaded = True
                        task_sleep_now_if_required()
                        if get_nb_records_in_file(active_file +
                                                  ".correct.xml") > 0:
                            task_update_progress("Uploading corrections for records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".correct.xml", \
                                                  ["-c"], oai_src_id = repos[0][0])
                            uploaded = True
                        if get_nb_records_in_file(active_file +
                                                  ".append.xml") > 0:
                            task_update_progress("Uploading additions for records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".append.xml", \
                                                  ["-a"], oai_src_id = repos[0][0])
                            uploaded = True
                        if get_nb_records_in_file(active_file +
                                                  ".holdingpen.xml") > 0:
                            task_update_progress("Uploading records harvested from %s to holding pen (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file + ".holdingpen.xml", \
                                                  ["-o"], oai_src_id = repos[0][0])
                            uploaded = True
                    if len(active_files_list) > 0:
                        if res == 0:
                            if uploaded:
                                write_message(
                                    "material harvested from source " +
                                    reponame + " was successfully uploaded")
                            else:
                                write_message("nothing to upload")
                        else:
                            write_message("an error occurred while uploading "
                                          "harvest from " + reponame)
                            error_happened_p = True
                            continue
                else:
                    # upload files normally
                    res = 0
                    i = 0
                    uploaded = False
                    for active_file in active_files_list:
                        i += 1
                        task_sleep_now_if_required()
                        if get_nb_records_in_file(active_file) > 0:
                            task_update_progress("Uploading records harvested from %s (%i/%i)" % \
                                                 (reponame, \
                                                  i, \
                                                  len(active_files_list)))
                            res += call_bibupload(active_file,
                                                  oai_src_id=repos[0][0])
                            uploaded = True
                    if res == 0:
                        if uploaded:
                            write_message(
                                "material harvested from source " +
                                reponame + " was successfully uploaded")
                        else:
                            write_message("nothing to upload")
                    else:
                        write_message("an error occurred while uploading "
                                      "harvest from " + reponame)
                        error_happened_p = True
                        continue

        else:  ### this should not happen
            write_message("invalid postprocess mode: " + postmode +
                          " skipping repository")
            error_happened_p = True
            continue

    if error_happened_p:
        return False
    else:
        return True
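The lastrun/frequency logic above decides whether a repository is due for an incremental harvest. A self-contained sketch of the same check, assuming the frequency is stored in hours and lastrun as 'YYYY-MM-DD HH:MM:SS' (the helper name is illustrative, not the Invenio API):

from datetime import datetime, timedelta

def harvest_is_due(lastrun, frequency_hours):
    """Return True when lastrun + frequency is already in the past."""
    if not frequency_hours:
        # Frequency 0 means 'Never': the repository is never re-harvested.
        return False
    last = datetime.strptime(lastrun, "%Y-%m-%d %H:%M:%S")
    return last + timedelta(hours=frequency_hours) <= datetime.now()

# e.g. harvest_is_due("2013-06-03 10:00:00", 24) becomes True once a day has passed.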
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR,
                         logging=True,
                         asana_key=CFG_ASANA_API_KEY,
                         asana_parent_id=ASANA_PARENT_TASK_ID,
                         skip_result_types='missing'):
    """Update DOIs on documents harvested from ArXiv.

    Parameters:
    :param input_uri: Link to new URI data
        DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
        NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
    :param log_dir: Directory to store log files in
    :param logging: True or False, default True
    :param asana_key: The Asana API key; by default uses the value of CFG_ASANA_API_KEY
        NOTE: Passing the value of None for this parameter will skip writing
        to Asana and instead email the instance admin
    :param asana_parent_id: The taskID of the task in Asana to log subtasks to
    :param skip_result_types: Error messages to not bother with during
        reporting, input as comma-separated values (CSV)
        Possible values: missing, ambiguous, incorrect
    """
    skip_results = verify_skip_results(skip_result_types)

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI: %s" % (input_uri, ))

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a',
                                 user=SCRIPT_NAME,
                                 notimechange=False)

    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retrieving DOI data")
        return False
    except ExpatError:
        _print("FATAL ERROR: Could not parse XML from: " + input_uri, 1)
        task_update_progress("Failed parsing DOI data")
        return False

    root = tree.getroot()

    try:
        date_el = root.find('date')
        date_str = '%s-%s-%s' % (date_el.get('year'), date_el.get('month'),
                                 date_el.get('day'))
        _print("Processing DOIs last updated on date %s" % date_str)
    except AttributeError:
        _print("Warning: Couldn't get last published date of Arxiv DOI feed.")

    doi_count = 0
    new_count = 0

    # Stores any DOIs we have issues with, in the structure:
    # Missing: (doi, arxiv preprint_id, published date)
    # Ambiguous: (doi, arxiv preprint_id, rec_ids)
    # Incorrect: (rec_id, old-doi, new-doi)
    problem_dois = {'missing': [], 'ambiguous': [], 'incorrect': []}

    task_update_progress("Processing records...")
    # Element.getiterator() is deprecated since Python 2.7; use Element.iter().
    for item in root.iter('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            try:
                record_xml = append_to_record(rec_id, doi, published_date)
            except DOIError as ex:
                problem_dois['incorrect'].append((rec_id, ex.message, doi))
                continue
            if record_xml:
                new_count += 1
                _print(
                    "* Now we will run the bibupload for " +
                    "%s record" % rec_id, 5)
                _print(
                    "** We will upload the following xml code %s" %
                    repr(record_xml), 9)
                bibupload.add(record_xml)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
            problem_dois['ambiguous'].append((doi, arxiv, repr(rec_id)))
        else:
            _print('No record found matching arxiv ID: %s' % arxiv, 9)
            problem_dois['missing'].append((doi, arxiv, published_date))

    _print("========================| FINAL SCORE |=======================", 1)
    _print("DOIs found and processed: %d" % doi_count, 1)
    _print(
        "Arxiv IDs without corresponding records: %d" %
        len(problem_dois['missing']), 1)
    _print(
        "Arxiv IDs corresponding to multiple records (duplicates): %d" %
        len(problem_dois['ambiguous']), 1)
    _print(
        "Inspire records with an incorrect DOI: %d" %
        len(problem_dois['incorrect']), 1)
    _print("Records without DOIs requiring appends: %d" % new_count, 1)
    _print("==============================================================", 1)

    bibupload.cleanup()

    notify_on_errors(problem_dois, log_dir, doi_count, new_count, asana_key,
                     asana_parent_id, skip_results)

    return True
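A minimal, standalone sketch of reading the APS DOI feed consumed above; the layout (a date element plus article entries with doi, preprint_id and published attributes) is inferred from the parsing code, not from a published schema, and parse_doi_feed is an illustrative name:

import xml.etree.ElementTree as ET

def parse_doi_feed(source):
    """Yield (doi, arxiv_preprint_id, published_date) tuples from a DOI feed."""
    root = ET.parse(source).getroot()
    # Element.iter() is the non-deprecated replacement for getiterator().
    for item in root.iter('article'):
        yield (item.get('doi'),
               item.get('preprint_id'),
               item.get('published'))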
Example #49
0
def generate_sitemaps(collection_names, fulltext_filter=''):
    """
    Generate the sitemaps themselves. Return the list of generated sitemap files.
    """
    sitemap_id = 1
    writer = SitemapWriter(CFG_WEBDIR + '/sitemap-%s.xml' % sitemap_id)
    sitemaps = [writer.get_name()]
    nb_urls = 0
    for [lang, lang_name] in language_list_long():
        writer.add_url(CFG_SITE_URL + '/?ln=%s' % lang,
                       lastmod=datetime.today(),
                       changefreq=DEFAULT_CHANGEFREQ_HOME,
                       priority=DEFAULT_PRIORITY_HOME)
        nb_urls += 1

    recids = get_all_public_records(collection_names)
    task_update_progress("Generating urls for %s records" % len(recids))
    #task_sleep_now_if_required(can_stop_too=True)
    for (recid, lastmod) in recids:
        if nb_urls <= MAX_RECORDS and nb_urls % 100 == 0:
            #print nb_urls
            #print writer.get_size()
            if writer.get_size() > MAX_SIZE or nb_urls == MAX_RECORDS:
                writer.close()
                sitemap_id += 1
                writer = SitemapWriter(CFG_WEBDIR +
                                       '/sitemap-%s.xml' % sitemap_id)
                sitemaps.append(writer.get_name())
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_RECORDS,
                                 priority=DEFAULT_PRIORITY_RECORDS)
        #task_sleep_now_if_required(can_stop_too=False)
    task_update_progress("Generating urls for collections")
    for (collection, lastmod) in get_all_public_collections(collection_names):
        for [lang, lang_name] in language_list_long():
            if nb_urls <= MAX_RECORDS and nb_urls % 100 == 0:
                #print nb_urls
                #print writer.get_size()
                if writer.get_size() > MAX_SIZE or nb_urls == MAX_RECORDS:
                    writer.close()
                    sitemap_id += 1
                    writer = SitemapWriter('%s/sitemap-%s.xml' %
                                           (CFG_WEBDIR, sitemap_id))
                    sitemaps.append(writer.get_name())
            nb_urls = writer.add_url('%s/collection/%s?ln=%s' %
                                     (CFG_SITE_URL, quote(collection), lang),
                                     lastmod=lastmod,
                                     changefreq=DEFAULT_CHANGEFREQ_COLLECTIONS,
                                     priority=DEFAULT_PRIORITY_COLLECTIONS)
            #task_sleep_now_if_required(can_stop_too=False)
    task_update_progress("Generating urls for fulltexts")
    for (recid, lastmod) in filter_fulltexts(recids, fulltext_filter):
        if nb_urls <= MAX_RECORDS and nb_urls % 100 == 0:
            #print nb_urls
            #print writer.get_size()
            if writer.get_size() > MAX_SIZE or nb_urls == MAX_RECORDS:
                writer.close()
                sitemap_id += 1
                writer = SitemapWriter(CFG_WEBDIR +
                                       '/sitemap-%s.xml' % sitemap_id)
                sitemaps.append(writer.get_name())
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s/files' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_FULLTEXTS,
                                 priority=DEFAULT_PRIORITY_FULLTEXTS)
        #task_sleep_now_if_required(can_stop_too=False)

    task_update_progress("Generating urls for comments")
    for (recid, lastmod) in filter_comments(recids):
        if nb_urls <= MAX_RECORDS and nb_urls % 100 == 0:
            #print nb_urls
            #print writer.get_size()
            if writer.get_size() > MAX_SIZE or nb_urls == MAX_RECORDS:
                writer.close()
                sitemap_id += 1
                writer = SitemapWriter(CFG_WEBDIR +
                                       '/sitemap-%s.xml' % sitemap_id)
                sitemaps.append(writer.get_name())
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s/comments' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_COMMENTS,
                                 priority=DEFAULT_PRIORITY_COMMENTS)
        #task_sleep_now_if_required(can_stop_too=False)
    task_update_progress("Generating urls for reviews")
    for (recid, lastmod) in filter_reviews(recids):
        if nb_urls <= MAX_RECORDS and nb_urls % 100 == 0:
            #print nb_urls
            #print writer.get_size()
            if writer.get_size() > MAX_SIZE or nb_urls == MAX_RECORDS:
                writer.close()
                sitemap_id += 1
                writer = SitemapWriter(CFG_WEBDIR +
                                       '/sitemap-%s.xml' % sitemap_id)
                sitemaps.append(writer.get_name())
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s/reviews' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_REVIEWS,
                                 priority=DEFAULT_PRIORITY_REVIEWS)
        #task_sleep_now_if_required(can_stop_too=False)
    try:
        writer.close()
    except:
        pass
    return sitemaps
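The same "rotate when the current sitemap is full" block is repeated above for records, collections, fulltexts, comments and reviews. A hedged refactoring sketch: rotate_writer is hypothetical, while SitemapWriter, CFG_WEBDIR, MAX_SIZE and MAX_RECORDS are assumed to come from the surrounding module:

def rotate_writer(writer, sitemaps, sitemap_id, nb_urls):
    """Close the current sitemap and open a fresh one when the limits are hit."""
    if nb_urls <= MAX_RECORDS and nb_urls % 100 == 0:
        if writer.get_size() > MAX_SIZE or nb_urls == MAX_RECORDS:
            writer.close()
            sitemap_id += 1
            writer = SitemapWriter('%s/sitemap-%s.xml' % (CFG_WEBDIR, sitemap_id))
            sitemaps.append(writer.get_name())
    return writer, sitemap_id

# Each loop body above could then start with:
#     writer, sitemap_id = rotate_writer(writer, sitemaps, sitemap_id, nb_urls)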
def bst_apsharvest(dois="",
                   recids="",
                   query="",
                   records="",
                   new_mode="email",
                   update_mode="email",
                   from_date="",
                   until_date=None,
                   metadata="yes",
                   fulltext="yes",
                   hidden="yes",
                   match="no",
                   reportonly="no",
                   threshold_date=None,
                   devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           This means that new records are being looked for at APS servers.
           Active when from_date and until_date are given, and also when
           a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will harvest
           any new records since last run.

           If match is set to "yes" the records harvested will be matched against
           the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exist in the system, the
           task only harvests the fulltexts themselves and attaches them
           to the records.


    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")


    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type new_mode: string

    @param update_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit? "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()

    task_update_progress("Parsing input parameters")

    # Validate modes
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o", "replace", "r",
                        "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid." %
                            (mode, ))

    # We hide fulltext by default
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # We do not use devmode by default
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not reportonly by default
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    if threshold_date:
        # Input from user. Validate date
        try:
            threshold_date = validate_date(threshold_date)
        except ValueError as e:
            write_message("Error parsing threshold_date, use (YYYY-MM-DD): %s" %
                          (str(e), ),
                          stream=sys.stderr)
            return 1
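validate_date is imported from elsewhere in the harvesting module; a minimal sketch of what such a validator might look like (illustrative, not the actual Invenio helper):

from datetime import datetime

def validate_date(value, date_format="%Y-%m-%d"):
    """Return the date normalised to YYYY-MM-DD, or raise ValueError."""
    return datetime.strptime(value, date_format).strftime(date_format)

# e.g. validate_date("2013-06-03") -> "2013-06-03"; validate_date("03/06/2013") raises ValueError.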
Example #51
0
def rabbit(bibrecs,
           check_invalid_papers=False,
           personids_to_update_extids=None,
           verbose=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) >
            bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):

        logwrite("\nConsidering %s" % str(rec), False)

        if idx % 200 == 0:
            task_sleep_now_if_required(True)

            update_status(
                float(idx) / len(bibrecs),
                "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" %
                                 (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(
                " - Record was deleted, removing from pid and continuing with next record",
                True)
            remove_papers([rec])
            continue

        markrefs = frozenset(
            chain(
                izip(cycle([100]),
                     imap(itemgetter(0), get_author_refs_of_paper(rec))),
                izip(cycle([700]),
                     imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_of_paper(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new,
             create_normalized_name(split_name_parts(get_name_by_bibref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures),
                 bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures),
                 bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(
                " - - Moving signature: %s on %s to %s as %s" %
                (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        pids_having_rec = set(
            [int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = list()
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id_of_signature(sig + (rec, ))
                    if inspire_id:
                        matched_pids = list(
                            get_author_by_external_id(inspire_id[0]))
                        if matched_pids and int(
                                matched_pids[0][0]) in pids_having_rec:
                            matched_pids = list()
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [
                p for p in matched_pids if int(p[0]) not in used_pids
            ]

            if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])
                pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
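The matching step in rabbit builds a |new| x |old| name-similarity matrix and keeps assignments scoring above the 0.80 threshold via maximized_mapping, an optimal assignment. A self-contained greedy toy that only illustrates the same idea, not the actual algorithm:

def greedy_best_match(matrix, threshold=0.80):
    """Return [(new_idx, old_idx)] pairs, best scores first, no reuse, above threshold."""
    scored = sorted(((score, i, j)
                     for i, row in enumerate(matrix)
                     for j, score in enumerate(row)),
                    reverse=True)
    used_new, used_old, pairs = set(), set(), []
    for score, i, j in scored:
        if score > threshold and i not in used_new and j not in used_old:
            pairs.append((i, j))
            used_new.add(i)
            used_old.add(j)
    return pairs

# e.g. greedy_best_match([[0.95, 0.40], [0.30, 0.85]]) -> [(0, 0), (1, 1)]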
def ref_analyzer(citation_informations, dicts,
                 updated_recids, tags, do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from " \
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
                                        len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" % \
                                   (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)
                try:
                    raise ValueError(msg)
                except ValueError:
                    register_exception(alert_admin=True)

            for recid in list(recids)[:1]: # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
                                              len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                try:
                    raise ValueError(msg)
                except ValueError:
                    register_exception(alert_admin=True)
                continue # skip this ill-formed value

            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)
                try:
                    raise ValueError(msg)
                except ValueError:
                    register_exception(alert_admin=True)

            for recid in list(recids)[:1]: # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)
                try:
                    raise ValueError(msg)
                except ValueError:
                    register_exception(alert_admin=True)

            for recid in list(recids)[:1]: # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
                                           len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                                re.escape(std_reportcode)
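                # Illustrative: for a standardized number like "arXiv:1305.1234"
                # (hypothetical value) this pattern also tolerates an optional
                # trailing category suffix such as " [hep-ph]".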
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
                                                 len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                    % (journal, tags['refs_journal'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5a
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                            % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists from the citation and reference dicts
    keys = citations.keys()
    for k in keys:
        if not citations[k]:
            del citations[k]

    keys = references.keys()
    for k in keys:
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only the first 10 entries of each dict to avoid flooding the log
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the " \
                      "authors of x same as y's):")
        write_message(dict(islice(selfcites.iteritems(), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x " \
                      "same as y's):")
        write_message(dict(islice(selfrefs.iteritems(), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(authorcites.iteritems(), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information " \
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
    write_message("... checking rec journals: %.2f sec" % (t6-t5))
    write_message("... checking rec DOI: %.2f sec" % (t7-t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

    return citations_weight, citations, references, selfcites, \
                                                        selfrefs, authorcites
    try:
        bibarchive = BibRecDocs(recid)
    except Exception, e:
        write_message("Could not instantiate record #%s: %s" % (recid, e))
        return 0

    write_message("Going to create related file formats for record #%s" %
                  recid)

    i = 0
    for docname in docnames:
        i += 1
        task_sleep_now_if_required()
        msg = "Processing %s (%i/%i)" % (docname, i, len(docnames))
        write_message(msg)
        task_update_progress(msg)
        try:
            bibdoc = bibarchive.get_bibdoc(docname)
        except Exception, e:
            write_message("Could not process docname %s: %s" % (docname, e))
            continue

        (prev_desc, prev_comment) = \
                    get_description_and_comment(bibarchive.get_bibdoc(docname).list_latest_files())

        # List all files that are not icons or subformats
        current_files = [bibdocfile.get_path() for bibdocfile in bibdoc.list_latest_files() if \
                         not bibdocfile.get_subformat() and not bibdocfile.is_icon()]

        ## current_files = []
        ## if not force:
Example #54
0
def rabbit(bibrecs=None,
           check_invalid_papers=False,
           personids_to_update_extids=None,
           verbose=False):

    logger = Logger("Rabbit")

    if verbose:
        logger.verbose = True

    if not bibrecs:
        logger.log("Running on all records")
    else:
        logger.log("Running on %s " % (str(bibrecs)))

    populate_mnames_pids_cache()

    global M_NAME_PIDS_CACHE

    memoized_compare_names = memoized(comp_names)
    compare_names = lambda x, y: memoized_compare_names(*sorted((x, y)))
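    # Note: the pair is sorted above so that compare_names(a, b) and
    # compare_names(b, a) hit the same memoized cache entry.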

    def find_pids_by_matchable_name_with_cache(matchable_name):
        try:
            matched_pids = [M_NAME_PIDS_CACHE[matchable_name]]
        except KeyError:
            matched_pids = get_authors_by_name(matchable_name,
                                               use_matchable_name=True)
            if matched_pids:
                M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0]
        return matched_pids

    if USE_EXT_IDS:

        def get_matched_pids_by_external_ids(sig, rec, pids_having_rec):
            '''
            This function returns all the matched pids after iterating
            through all available external IDs of the system.
            '''
            for get_external_id_of_signature in external_id_getters:
                external_id = get_external_id_of_signature(sig + (rec, ))
                if external_id:
                    matched_pids = list(
                        get_author_by_external_id(external_id[0]))
                    if matched_pids and int(
                            matched_pids[0][0]) in pids_having_rec:
                        matched_pids = list()
                    return matched_pids

    threshold = 0.8

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    bibrecs = list(bibrecs)
    for idx, rec in enumerate(bibrecs):

        logger.log("Considering %s" % str(rec))

        if idx % 100 == 0:
            task_update_progress("%d/%d current: %d" %
                                 (idx, len(bibrecs), rec))

        if idx % 1000 == 0:
            destroy_partial_marc_caches()
            populate_partial_marc_caches(bibrecs[idx:idx + 1000])

            logger.log(
                float(idx) / len(bibrecs), "%d/%d" % (idx, len(bibrecs)))

        if rec in deleted:
            remove_papers([rec])
            continue

        author_refs = get_author_refs_of_paper(rec)
        coauthor_refs = get_coauthor_refs_of_paper(rec)

        markrefs = frozenset(
            chain(izip(cycle([100]), imap(itemgetter(0), author_refs)),
                  izip(cycle([700]), imap(itemgetter(0), coauthor_refs))))
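        # markrefs holds the (MARC tag, bibref) pairs present in the paper:
        # tag 100 for the first-author field, 700 for coauthor fields.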

        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_of_paper(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new, get_name_by_bibref(new)) for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        logger.log(" - Deleted signatures: %s" % str(old_signatures))
        logger.log(" - Added signatures: %s" % str(new_signatures))
        logger.log(" - Matrix: %s" % str(matrix))

        # Pairs (new_signature, old_signature) whose name similarity exceeds the threshold
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logger.log(" - Best match: %s " % str(best_match))

        for new, old in best_match:
            logger.log("  -  -  Moving signature: %s on %s to %s as %s" %
                       (old, rec, new, new_signatures_names[new]))
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)
        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        remaining_personid_rows = [
            x for x in personid_rows if tuple(x[1:3]) in old_signatures
        ]

        pids_having_rec = set([int(row[0]) for row in remaining_personid_rows])
        logger.log(" - Not matched: %s" % str(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matchable_name = create_matchable_name(name)
            matched_pids = list()
            if USE_EXT_IDS:
                matched_pids = get_matched_pids_by_external_ids(
                    sig, rec, pids_having_rec)

                if matched_pids:
                    add_signature(list(sig) + [rec],
                                  name,
                                  matched_pids[0][0],
                                  m_name=matchable_name)
                    M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0][0]
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_matchable_name_with_cache(
                matchable_name)
            if not matched_pids:
                for matching_function in M_NAME_FUNCTIONS[1:]:
                    matchable_name = matching_function(name)
                    matched_pids = find_pids_by_matchable_name_with_cache(
                        matchable_name)
                    if matched_pids:
                        break

            matched_pids = [p for p in matched_pids if int(p) not in used_pids]

            best_matched_pid = None
            for matched_pid in matched_pids:
                # Because of wrongly labelled data in the db, all of the
                # possible choices have to be checked. If one of the
                # coauthors, whose signature was already considered,
                # claimed in the past one of the signatures of the author
                # currently being considered, the algorithm would think
                # that two signatures belong to the same person and would
                # create an unnecessary new profile.
                if not int(matched_pid) in pids_having_rec:
                    best_matched_pid = matched_pid
                    break

            if not best_matched_pid:
                new_pid = new_person_from_signature(
                    list(sig) + [rec], name, matchable_name)
                M_NAME_PIDS_CACHE[matchable_name] = new_pid
                used_pids.add(new_pid)
                updated_pids.add(new_pid)
            else:
                add_signature(list(sig) + [rec],
                              name,
                              best_matched_pid,
                              m_name=matchable_name)
                M_NAME_PIDS_CACHE[matchable_name] = best_matched_pid
                used_pids.add(best_matched_pid)
                updated_pids.add(best_matched_pid)
                pids_having_rec.add(best_matched_pid)

        logger.log('Finished with %s' % str(rec))

    logger.update_status_final()

    destroy_partial_marc_caches()

    if personids_to_update_extids:
        updated_pids |= set(personids_to_update_extids)
    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS,
            force_cache_tables=True)

    destroy_partial_marc_caches()
    destroy_mnames_pids_cache()

    remove_empty_authors()

    task_update_progress("Done!")
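
# Hedged illustration (not the real maximized_mapping used above): the
# signature-matching step is an assignment problem -- pair each new signature
# with at most one old signature so that the total name similarity is high,
# keeping only pairs whose score exceeds the threshold.  A minimal greedy
# sketch over a toy similarity matrix:
def greedy_best_match_sketch(matrix, threshold=0.8):
    """Return (new_index, old_index) pairs, picking the highest scores first."""
    scored = sorted(((score, new, old)
                     for new, row in enumerate(matrix)
                     for old, score in enumerate(row)),
                    reverse=True)
    used_new, used_old, pairs = set(), set(), []
    for score, new, old in scored:
        if score > threshold and new not in used_new and old not in used_old:
            pairs.append((new, old))
            used_new.add(new)
            used_old.add(old)
    return pairs

# Toy usage: two new signatures vs. two old ones.
# greedy_best_match_sketch([[0.95, 0.10], [0.20, 0.85]])  # -> [(0, 0), (1, 1)]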
Example #55
0
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    plugins = load_plugins()
    rules = load_rules(plugins)
    task_set_option('plugins', plugins)
    recids_for_rules = get_recids_for_rules(rules)

    all_recids = intbitset([])
    single_rules = set()
    batch_rules = set()
    for rule_name, rule_recids in recids_for_rules.iteritems():
        all_recids.union_update(rule_recids)
        if plugins[rules[rule_name]["check"]]["batch"]:
            batch_rules.add(rule_name)
        else:
            single_rules.add(rule_name)

    records_to_upload_holdingpen = []
    records_to_upload_replace = []
    for batch in iter_batches(all_recids, CFG_BATCH_SIZE):

        for rule_name in batch_rules:
            rule = rules[rule_name]
            rule_recids = recids_for_rules[rule_name]
            task_sleep_now_if_required(can_stop_too=True)
            records = []
            for i, record_id, record in batch:
                if record_id in rule_recids:
                    records.append(record)
            if len(records):
                check_records(rule, records)

        # Then run them through the normal (single-record) rules
        for i, record_id, record in batch:
            progress_percent = int(float(i) / len(all_recids) * 100)
            task_update_progress("Processing record %s/%s (%i%%)." %
                                 (i, len(all_recids), progress_percent))
            write_message("Processing record %s" % record_id)

            for rule_name in single_rules:
                rule = rules[rule_name]
                rule_recids = recids_for_rules[rule_name]
                task_sleep_now_if_required(can_stop_too=True)
                if record_id in rule_recids:
                    check_record(rule, record)

            if record.amended:
                if record.holdingpen:
                    records_to_upload_holdingpen.append(record)
                else:
                    records_to_upload_replace.append(record)

            if not record.valid:
                submit_ticket(record, record_id)

        if len(records_to_upload_holdingpen) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_holdingpen, True)
            records_to_upload_holdingpen = []
        if len(records_to_upload_replace) >= CFG_BATCH_SIZE:
            upload_amendments(records_to_upload_replace, False)
            records_to_upload_replace = []

    ## In case there are still some remaining amended records
    if records_to_upload_holdingpen:
        upload_amendments(records_to_upload_holdingpen, True)
    if records_to_upload_replace:
        upload_amendments(records_to_upload_replace, False)

    # Update the database with the last time the rules were run
    for rule in rules.keys():
        update_rule_last_run(rule)

    return True
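
# Hedged sketch of what iter_batches() above is assumed to do (not the real
# helper): group the records into batches of CFG_BATCH_SIZE while keeping each
# record's global position, so the progress line can report i/len(all_recids).
# get_record is a placeholder loader, not an actual API of this module.
def iter_batches_sketch(recids, batch_size, get_record):
    """Yield lists of (global_index, recid, record) tuples, batch_size at a time."""
    batch = []
    for i, recid in enumerate(sorted(recids), 1):
        batch.append((i, recid, get_record(recid)))
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch
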
def get_citation_informations(recid_list, tags, fetch_catchup_info=True):
    """scans the collections searching references (999C5x -fields) and
       citations for items in the recid_list
       returns a 4 list of dictionaries that contains the citation information
       of cds records
       examples: [ {} {} {} {} ]
                 [ {5: 'SUT-DP-92-70-5'},
                   { 93: ['astro-ph/9812088']},
                   { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]
        NB: stuff here is for analysing new or changed records.
        see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
    }

    references_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
    }

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    if run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % tags['refs_journal'][0:2],
               (tags['refs_journal'], )) or \
       run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1" % tags['refs_report_number'][0:2],
               (tags['refs_report_number'], )):
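        # e.g. with refs_journal = '999C5s' this builds
        # "SELECT value FROM bib99x WHERE tag=%s LIMIT 1"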

        done = 0  # for status reporting
        for recid in recid_list:
            if done % 10 == 0:
                task_sleep_now_if_required()
                # in fact we can sleep any time here

            if done % 1000 == 0:
                mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
                write_message(mesg)
                task_update_progress(mesg)

            done += 1

            if recid in INTBITSET_OF_DELETED_RECORDS:
                # do not treat this record since it was deleted; we
                # skip it like this in case it was only soft-deleted
                # e.g. via bibedit (i.e. when collection tag 980 is
                # DELETED but other tags like report number or journal
                # publication info remained the same, so the calls to
                # get_fieldvalues() below would return old values)
                continue

            if tags['refs_report_number']:
                references_info['report-numbers'][recid] \
                        = get_fieldvalues(recid,
                                          tags['refs_report_number'],
                                          sort=False)
                msg = "references_info['report-numbers'][%s] = %r" \
                            % (recid, references_info['report-numbers'][recid])
                write_message(msg, verbose=9)
            if tags['refs_journal']:
                references_info['journals'][recid] = []
                for ref in get_fieldvalues(recid,
                                           tags['refs_journal'],
                                           sort=False):
                    try:
                        # Inspire specific parsing
                        journal, volume, page = ref.split(',')
                    except ValueError:
                        pass
                    else:
                        alt_volume = get_alt_volume(volume)
                        if alt_volume:
                            alt_ref = ','.join([journal, alt_volume, page])
                            references_info['journals'][recid] += [alt_ref]
                    references_info['journals'][recid] += [ref]
                msg = "references_info['journals'][%s] = %r" \
                                  % (recid, references_info['journals'][recid])
                write_message(msg, verbose=9)
            if tags['refs_doi']:
                references_info['doi'][recid] \
                        = get_fieldvalues(recid, tags['refs_doi'], sort=False)
                msg = "references_info['doi'][%s] = %r" \
                                       % (recid, references_info['doi'][recid])
                write_message(msg, verbose=9)

            if not fetch_catchup_info:
                # We do not need the extra info
                continue

            if tags['record_pri_number'] or tags['record_add_number']:
                records_info['report-numbers'][recid] = []

                if tags['record_pri_number']:
                    records_info['report-numbers'][recid] \
                        += get_fieldvalues(recid,
                                           tags['record_pri_number'],
                                           sort=False)
                if tags['record_add_number']:
                    records_info['report-numbers'][recid] \
                        += get_fieldvalues(recid,
                                           tags['record_add_number'],
                                           sort=False)

                msg = "records_info[%s]['report-numbers'] = %r" \
                            % (recid, records_info['report-numbers'][recid])
                write_message(msg, verbose=9)

            if tags['doi']:
                records_info['doi'][recid] = []
                for tag in tags['doi']:
                    records_info['doi'][recid] += get_fieldvalues(recid,
                                                                  tag,
                                                                  sort=False)
                msg = "records_info[%s]['doi'] = %r" \
                                          % (recid, records_info['doi'][recid])
                write_message(msg, verbose=9)

            # get a combination of
            # journal vol (year) pages
            if tags['publication']:
                records_info['journals'][recid] = get_journal_info(recid, tags)
                msg = "records_info[%s]['journals'] = %r" \
                                     % (recid, records_info['journals'][recid])
                write_message(msg, verbose=9)

    else:
        mesg = "Warning: there are no records with tag values for " \
               "%s or %s. Nothing to do." % \
                            (tags['refs_journal'], tags['refs_report_number'])
        write_message(mesg)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    end_time = os.times()[4]
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % (end_time - begin_time))

    return records_info, references_info
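
# Illustrative only (toy values; the DOI is made up): the two dictionaries
# returned above are keyed by recid, for example:
#
#     records_info = {'report-numbers': {5: ['SUT-DP-92-70-5']},
#                     'journals': {93: ['Phys. Rev. Lett. 96 (2006) 081301']},
#                     'doi': {93: ['10.1103/PhysRevLett.96.081301']}}
#
# references_info has the same three keys and maps each recid to the values
# harvested from its 999C5x reference subfields (e.g. 'astro-ph/9812088').
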
def get_citation_weight(rank_method_code, config, chunk_size=20000):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    begin_time = time.time()

    quick = task_get_option("quick") != "no"

    # id option forces re-indexing a certain range
    # even if there are no new recs
    if task_get_option("id"):
        # construct a range of records to index
        updated_recids = []
        for first, last in task_get_option("id"):
            updated_recids += range(first, last + 1)
        if len(updated_recids) > 10000:
            str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(
                updated_recids[-10:])
        else:
            str_updated_recids = str(updated_recids)
        write_message('Records to process: %s' % str_updated_recids)
        index_update_time = None
    else:
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        if len(updated_recids) > 10000:
            str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(
                updated_recids[-10:])
        else:
            str_updated_recids = str(updated_recids)
        write_message("%s records to update" % str_updated_recids)

    if updated_recids:
        # result_intermediate should be guaranteed to exist!
        # But if the user entered the "-R" (do all) option, we need to
        # start from an empty set.
        if quick:
            dicts = {
                'cites_weight': last_updated_result(rank_method_code),
                'cites': get_cit_dict("citationdict"),
                'refs': get_cit_dict("reversedict"),
                'selfcites': get_cit_dict("selfcitdict"),
                'selfrefs': get_cit_dict("selfcitedbydict"),
                'authorcites': get_initial_author_dict(),
            }
        else:
            dicts = {
                'cites_weight': {},
                'cites': {},
                'refs': {},
                'selfcites': {},
                'selfrefs': {},
                'authorcites': {},
            }

        # Process fully the updated records
        process_and_store(updated_recids, config, dicts, chunk_size, quick)

        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" % \
                                                      (end_time - begin_time))
        task_update_progress("citation analysis done")
        cites_weight = dicts['cites_weight']
    else:
        cites_weight = {}
        write_message("No new records added since last time this " \
                      "rank method was executed")

    return cites_weight, index_update_time
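
# Hedged sketch (process_and_store itself is not shown here): the updated
# recids are handled in fixed-size chunks, which bounds memory use and lets
# the task report progress and sleep between chunks.  A minimal chunker:
def iter_chunks_sketch(recids, chunk_size):
    """Yield successive lists of at most chunk_size recids."""
    recids = list(recids)
    for start in range(0, len(recids), chunk_size):
        yield recids[start:start + chunk_size]

# for chunk in iter_chunks_sketch(updated_recids, chunk_size):
#     ...  # compute and store the citation dictionaries for this chunk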
Example #58
0
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*',
                                              f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*',
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)),
                  verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec),
                      verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec),
                      verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec),
                      verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with a missing OAI ID or that should no longer be exported
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)),
                      verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s, per its current metadata, no longer belongs to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)),
            verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in recids_for_set.iteritems()
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)),
                      verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets)
            | (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s should no longer belong to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)),
            verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and the OAI ID does not need to be added, then great, nothing to
        # change. Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" %
                          recid,
                          verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" %
                      recid,
                      verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record,
                         tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename)
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        if tot > 0:
            task_low_level_submission('bibupload', 'oairepository', '-c',
                                      filename, '-n')
        else:
            os.remove(filename)

    return True
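
# Illustrative note (the value is hypothetical): the CFG_OAI_*_FIELD settings
# used above are 6-character MARC field specifiers that the code slices into
# tag, indicators and subfield code:
#
#     field = "909COo"    # assumed example value
#     tag, ind1, ind2, code = field[:3], field[3], field[4], field[5]
#     # -> ("909", "C", "O", "o")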
Example #59
0
def _analyze_documents(records, taxonomy_name, collection,
                       output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @param records: list of recids to process
    @param taxonomy_name: str, name of the taxonomy, e.g. HEP
    @param collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract
        (default: bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER)
    @return: str, marcxml output of the results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message("WARNING: No records were found in collection %s." %
            collection, stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files() # TODO: why doesn't this call list_all_files()?
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None


        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if bibclassify_text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message('INFO: Generating keywords for record %d.' %
                    record, stream=sys.stderr, verbose=3)
                fulltext = doc.get_path()

                single_keywords, composite_keywords, author_keywords, acronyms = \
                    bibclassify_engine.get_keywords_from_local_file(fulltext,
                    taxonomy_name, with_author_keywords=True, output_mode="raw",
                    output_limit=output_limit, match_mode='partial')
            else:
                bibtask.write_message('WARNING: BibClassify does not know how to '
                                      'process doc: %s (type: %s) -- ignoring it.' %
                                      (doc.fullpath, doc.doctype),
                                      stream=sys.stderr, verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = bibclassify_engine.clean_before_output(single_keywords)
                cleaned_composite = bibclassify_engine.clean_before_output(composite_keywords)
                # merge the groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            if acronyms:  # may still be None if no PDF was processed yet
                acro.update(acronyms)
            if author_keywords:
                akws.update(author_keywords)

        if len(keywords):
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(bibclassify_engine._output_marc(keywords.items(), (), akws, acro,
                                                      spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                    record, stream=sys.stderr, verbose=0)

        _INDEX += 1

        bibtask.task_update_progress('Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
Example #60
0
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task"""

    # First gather recids to process
    recids = task_get_option('recids')
    if recids:
        start_date = None
    else:
        start_date = datetime.now()
        dummy, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)
        if task_get_option('missing'):
            recids |= fetch_records_missing_arxiv_fulltext()
        else:
            recids |= fetch_records_missing_arxiv_fulltext() & \
                fetch_records_modified_since(last_date)

    updated_recids = set()

    try:

        for count, recid in enumerate(recids):
            if count % 50 == 0:
                msg = 'Done %s of %s' % (count, len(recids))
                write_message(msg)
                task_update_progress(msg)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message('processing %s' % recid, verbose=9)
            try:
                if process_one(recid):
                    updated_recids.add(recid)
                time.sleep(6)
            except AlreadyHarvested:
                write_message('already harvested successfully')
                time.sleep(6)
            except FoundExistingPdf:
                write_message('pdf already attached (matching md5)')
                time.sleep(6)
            except PdfNotAvailable:
                write_message("no pdf available")
                time.sleep(20)
            except InvenioFileDownloadError, e:
                write_message("failed to download: %s" % e)
                time.sleep(20)

    finally:
        # We want to process updated records even in case we are interrupted
        msg = 'Updated %s records' % len(updated_recids)
        write_message(msg)
        task_update_progress(msg)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store the last run date of the daemon, but only if it ran on the
    # modified records, not on specific recids given on the command line
    # with --id
    if start_date:
        store_last_updated(0, start_date, name)

    return True
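
# Hedged note on the pattern above: the per-record loop runs inside
# try/finally so the follow-up tasks are always submitted for records that
# did get updated, even if the daemon is interrupted, and the time.sleep()
# calls (6 s after a download attempt, 20 s when no PDF is available or the
# download failed) act as a crude politeness/backoff delay towards arXiv.
# Roughly:
#
#     try:
#         for recid in recids:
#             process_one(recid)              # may raise or be interrupted
#             time.sleep(6)
#     finally:
#         submit_followup_tasks(updated_recids)   # always runs
#         # (submit_followup_tasks is illustrative shorthand for the
#         #  fixmarc/refextract submissions above)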