Example #1
def bst_twitter_fetcher(query):
    """
    Fetch the tweets matching the given query and upload them into Invenio.
    @param query: the Twitter search query
    """
    ## We prepare a temporary MARCXML file to upload.
    fd, name = tempfile.mkstemp(suffix='.xml', prefix='tweets', dir=CFG_TMPDIR)
    tweets = get_tweets(query)
    if tweets:
        os.write(fd, """<collection>\n""")
        for i, tweet in enumerate(tweets):
            ## For every tweet we transform it to MARCXML and we dump it in the file.
            task_update_progress('DONE: tweet %s out of %s' % (i, len(tweets)))
            os.write(fd, tweet_to_record(tweet, query))

        os.write(fd, """</collection\n>""")
        os.close(fd)

        ## Invenio magic: we schedule an upload of the created MARCXML to be inserted
        ## ASAP in the system.
        task_low_level_submission('bibupload', 'admin', '-i', '-r', name,
                                  '-P5')
        write_message("Uploaded file %s with %s new tweets about %s" %
                      (name, len(tweets), query))
    else:
        ## Nothing to upload: close and remove the temporary file.
        os.close(fd)
        os.remove(name)
        write_message("No new tweets about %s" % query)
Example #2
def watch_directory(new_job_dir=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS,
                    old_job_dir=CFG_BIBENCODE_DAEMON_DIR_OLDJOBS):
    """ Checks a folder job files, parses and executes them
    @param new_job_dir: path to the directory with new jobs
    @type new_job_dir: string
    @param old_job_dir: path to the directory where the old jobs are moved
    @type old_job_dir: string
    """
    global _NUMBER, _TASKID
    write_message('Checking directory %s for new jobs' % new_job_dir)
    task_update_progress('Checking for new jobs')
    _TASKID = task_get_task_param('task_id')
    files = os.listdir(new_job_dir)
    for file in files:
        file_fullpath = os.path.join(new_job_dir, file)
        if has_signature(file_fullpath):
            write_message('New Job found: %s' % file)
            job = json_decode_file(file_fullpath)
            if not getval(job, 'isbatch'):
                args = job_to_args(job)
                if not launch_task(args):
                    write_message('Error submitting task')
            else:
                ## The batch engine needs the job description, so pass the
                ## path the file will have inside the old-jobs dir after the move below
                process_batch(os.path.join(old_job_dir, file))
            ## Move the file to the done dir
            shutil.move(file_fullpath, os.path.join(old_job_dir, file))
            ## Update number for next job
            _NUMBER += 1
    return 1
Example #3
def task_run_core():
    """Runs the task by fetching arguments from the BibSched task queue.  This is
    what BibSched will be invoking via daemon call.
    The task prints the first NUM Fibonacci numbers on stdout, and some
    messages on stderr.
    Return 1 in case of success and 0 in case of failure."""
    n = int(task_get_option('number'))
    write_message("Printing %d Fibonacci numbers." % n, verbose=9)
    for i in range(0, n):
        if i > 0 and i % 4 == 0:
            write_message("Error: water in the CPU.  Ignoring and continuing.",
                          sys.stderr,
                          verbose=3)
        elif i > 0 and i % 5 == 0:
            write_message(
                "Error: floppy drive dropped on the floor.  Ignoring and continuing.",
                sys.stderr)
            if task_get_option('error'):
                ## Deliberately raise ZeroDivisionError to simulate a failure.
                1 / 0
        write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)
    task_update_progress("Done %d out of %d." % (n, n))
    return 1
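The demo task above relies on a fib() helper that is defined elsewhere in the same module. A minimal sketch consistent with the messages it prints (fib(0) == 0, fib(1) == 1) could be:

def fib(n):
    """Return the n-th Fibonacci number (a sketch, not the module's own helper)."""
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a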
Example #4
def iterate_over_new(recIDs, fmt):
    """
    Iterate over list of IDs

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """
    global total_rec

    formatted_records = ''      # (string) list of formatted records in this iteration
    tbibformat  = 0     # time taken up by external call
    tbibupload  = 0     # time taken up by external call
    start_date = task_get_task_param('task_starting_time') # default; overwritten per record below

    tot = len(recIDs)
    count = 0
    for recID in recIDs:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message("   ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message("   ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
Example #5
def fetch_concerned_arxiv_records(name):
    task_update_progress("Fetching arxiv record ids")

    dummy, last_date = fetch_last_updated(name)

    # Fetch recently-created records modified since last run
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
        "WHERE `modification_date` >= %s " \
        "AND `creation_date` > NOW() - INTERVAL 7 DAY " \
        "ORDER BY `modification_date`" \
        "LIMIT 5000"
    records = run_sql(sql, [last_date.isoformat()])

    def check_arxiv(recid):
        record = get_record(recid)

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False

    def check_pdf_date(recid):
        doc = get_pdf_doc(recid)
        if doc:
            return doc.md > last_date
        return False

    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]
    records = [(r, mod_date) for r, mod_date in records if check_pdf_date(r)]
    write_message("recids %s" % repr([(r, mod_date.isoformat()) \
                                               for r, mod_date in records]))
    task_update_progress("Done fetching arxiv record ids")
    return records
Example #6
def solr_commit_if_necessary(next_commit_counter,
                             final_commit=False,
                             recid=None):
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (
            final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)

        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except:
            register_exception(alert_admin=True)
        next_commit_counter = 0

        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
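solr_commit_if_necessary() implements a flush counter: the caller threads the counter through every addition, a commit happens once every task_get_option("flush") additions, and one final commit flushes the remainder. A hedged driver sketch (recids_to_index and add_record_to_solr are hypothetical names):

next_commit_counter = 0
for recid in recids_to_index:
    add_record_to_solr(recid)  # hypothetical indexing call
    next_commit_counter = solr_commit_if_necessary(next_commit_counter,
                                                   recid=recid)
# Flush whatever was added since the last commit.
solr_commit_if_necessary(next_commit_counter, final_commit=True)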
Example #7
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """
    Generate sitemaps themselves.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap file(s)
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
                                lastmod = lastmod,
                                changefreq = DEFAULT_CHANGEFREQ_RECORDS,
                                priority = DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s" % (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
Example #8
def bst_fibonacci(n=30):
    """
    Small tasklets that prints the the Fibonacci sequence for n.
    @param n: how many Fibonacci numbers to print.
    @type n: int
    """
    ## Since it's tasklet, the parameter might be passed as a string.
    ## it should then be converted to an int.
    n = int(n)
    write_message("Printing %d Fibonacci numbers." % n, verbose=9)
    for i in range(0, n):
        if i > 0 and i % 4 == 0:
            write_message("Error: water in the CPU.  Ignoring and continuing.",
                          sys.stderr,
                          verbose=3)
        elif i > 0 and i % 5 == 0:
            write_message(
                "Error: floppy drive dropped on the floor.  Ignoring and continuing.",
                sys.stderr)
        write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)
    task_update_progress("Done %d out of %d." % (n, n))
    return 1
Example #9
def solr_add_ranges(id_ranges):
    sub_range_length = task_get_option("flush")
    id_ranges_to_index = []
    for id_range in id_ranges:
        lower_recid = id_range[0]
        upper_recid = id_range[1]
        i_low = lower_recid
        while i_low <= upper_recid:
            i_up = min(i_low + sub_range_length - 1, upper_recid)
            id_ranges_to_index.append((i_low, i_up))
            i_low += sub_range_length

    tags_to_index = get_tags()
    # Indexes latest records first by reversing
    # This allows the ranker to return better results during long indexing
    # runs as the ranker cuts the hitset using latest records
    id_ranges_to_index.reverse()
    next_commit_counter = 0
    for id_range_to_index in id_ranges_to_index:
        lower_recid = id_range_to_index[0]
        upper_recid = id_range_to_index[1]
        status_msg = "Solr ranking indexer called for %s-%s" % (lower_recid, upper_recid)
        write_message(status_msg)
        task_update_progress(status_msg)
        next_commit_counter = solr_add_range(lower_recid, upper_recid, tags_to_index, next_commit_counter)

    solr_commit_if_necessary(next_commit_counter, final_commit=True)
Example #10
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)

        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except:
            register_exception(alert_admin=True)
        next_commit_counter = 0

        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
Example #11
def iterate_over_new(recIDs, fmt):
    """Iterate over list of IDs.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken
        to insert)
    """
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    tot = len(recIDs)
    reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get(
        fmt.lower(), _update_format)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        reformat_function(recID, fmt)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" %
                          (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
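The lookup on _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS above is a plain dispatch table: a per-format reformatter is used when one is registered for the lowercased format, otherwise the generic _update_format. A minimal sketch of the pattern, with stub functions instead of the real Invenio ones:

def _update_format(recid, fmt):
    pass  # generic fallback reformatter (stub)

_CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS = {}  # per-format overrides, keyed by lowercase format name

reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get('hx', _update_format)
assert reformat_function is _update_format  # no override registered for 'hx'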
Example #12
def watch_directory(new_job_dir=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS,
                    old_job_dir=CFG_BIBENCODE_DAEMON_DIR_OLDJOBS):
    """ Checks a folder job files, parses and executes them
    @param new_job_dir: path to the directory with new jobs
    @type new_job_dir: string
    @param old_job_dir: path to the directory where the old jobs are moved
    @type old_job_dir: string
    """
    global _NUMBER, _TASKID
    write_message('Checking directory %s for new jobs' % new_job_dir)
    task_update_progress('Checking for new jobs')
    _TASKID = task_get_task_param('task_id')
    files = os.listdir(new_job_dir)
    for file in files:
        file_fullpath = os.path.join(new_job_dir, file)
        if has_signature(file_fullpath):
            write_message('New Job found: %s' % file)
            job = json_decode_file(file_fullpath)
            if not getval(job, 'isbatch'):
                args = job_to_args(job)
                if not launch_task(args):
                    write_message('Error submitting task')
            else:
                ## The batch engine needs the job description, so pass the
                ## path the file will have inside the old-jobs dir after the move below
                process_batch(os.path.join(old_job_dir, file))
            ## Move the file to the done dir
            shutil.move(file_fullpath, os.path.join(old_job_dir, file))
            ## Update number for next job
            _NUMBER += 1
    return 1
Example #13
def fetch_concerned_records(name):
    task_update_progress("Fetching record ids")

    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = "SELECT `id`, `creation_date` FROM `bibrec` " \
            "WHERE `creation_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `creation_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
            "WHERE `modification_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `modification_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        given_recids = task_get_option('recids')
        for collection in task_get_option('collections'):
            given_recids.add(get_collection_reclist(collection))

        if given_recids:
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` " \
                "WHERE `id` IN (%s) ORDER BY `id`" % format_strings,
                    list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
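Note how the IN (...) clause above is built: format_strings expands to one %s placeholder per recid, while the values themselves are still passed separately so run_sql() can escape them. For example:

given_recids = [1, 2, 3]
format_strings = ','.join(['%s'] * len(given_recids))
print(format_strings)  # '%s,%s,%s', yielding: WHERE `id` IN (%s,%s,%s)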
Example #14
def solr_add_ranges(id_ranges):
    sub_range_length = task_get_option("flush")
    id_ranges_to_index = []
    for id_range in id_ranges:
        lower_recid = id_range[0]
        upper_recid = id_range[1]
        i_low = lower_recid
        while i_low <= upper_recid:
            i_up = min(i_low + sub_range_length - 1, upper_recid)
            id_ranges_to_index.append((i_low, i_up))
            i_low += sub_range_length

    tags_to_index = get_tags()
    # Indexes latest records first by reversing
    # This allows the ranker to return better results during long indexing
    # runs as the ranker cuts the hitset using latest records
    id_ranges_to_index.reverse()
    next_commit_counter = 0
    for id_range_to_index in id_ranges_to_index:
        lower_recid = id_range_to_index[0]
        upper_recid = id_range_to_index[1]
        status_msg = "Solr ranking indexer called for %s-%s" % (lower_recid,
                                                                upper_recid)
        write_message(status_msg)
        task_update_progress(status_msg)
        next_commit_counter = solr_add_range(lower_recid, upper_recid,
                                             tags_to_index,
                                             next_commit_counter)

    solr_commit_if_necessary(next_commit_counter, final_commit=True)
Example #15
def fetch_updated_arxiv_records(date):
    """Fetch all the arxiv records modified since the last run"""

    def check_arxiv(recid):
        """Returns True for arxiv papers"""
        for report_number in get_fieldvalues(recid, "037__9"):
            if report_number == "arXiv":
                return True
        return False

    # Fetch all records modified since last run
    sql = (
        "SELECT `id`, `modification_date` FROM `bibrec` "
        "WHERE `modification_date` >= %s "
        "ORDER BY `modification_date`"
    )
    records = run_sql(sql, [date.isoformat()])
    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]

    # Show all records for debugging purposes
    if task_get_option("verbose") >= 9:
        write_message("recids:", verbose=9)
        for recid, mod_date in records:
            write_message("* %s, %s" % (recid, mod_date), verbose=9)

    task_update_progress("Done fetching %s arxiv record ids" % len(records))
    return records
Example #16
def fetch_concerned_records(name):
    task_update_progress("Fetching record ids")

    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = "SELECT `id`, `creation_date` FROM `bibrec` " \
            "WHERE `creation_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `creation_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
            "WHERE `modification_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `modification_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        given_recids = task_get_option('recids')
        for collection in task_get_option('collections'):
            given_recids.add(get_collection_reclist(collection))

        if given_recids:
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` " \
                "WHERE `id` IN (%s) ORDER BY `id`" % format_strings,
                    list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
Example #17
def iterate_over_new(recIDs, fmt):
    """Iterate over list of IDs.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken
        to insert)
    """
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call

    tot = len(recIDs)
    reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get(
        fmt.lower(), _update_format)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        reformat_function(recID, fmt)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message("   ... formatted %s records out of %s" %
                          (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message("   ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
Example #18
def fill_self_cites_tables(config):
    """
    This will fill the self-cites tables with data

    The purpose of this function is to fill these tables on a website that
    never ran the self-cites daemon
    """
    algorithm = config['algorithm']
    tags = get_authors_tags()
    all_ids = [r[0] for r in run_sql('SELECT id FROM bibrec ORDER BY id')]
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)
    if algorithm == 'friends':
        # We only need this table for the friends algorithm or assimilated ones
        # Fill intermediary tables
        for index, recid in enumerate(all_ids):
            if index % 1000 == 0:
                msg = 'intermediate %d/%d' % (index, len(all_ids))
                task_update_progress(msg)
                write_message(msg)
                task_sleep_now_if_required()
            update_self_cites_tables(recid, config, tags)
    # Fill self-cites table
    for index, recid in enumerate(all_ids):
        if index % 1000 == 0:
            msg = 'final %d/%d' % (index, len(all_ids))
            task_update_progress(msg)
            write_message(msg)
            task_sleep_now_if_required()
        compute_and_store_self_citations(recid, tags, citations_fun)
Example #19
def fetch_concerned_arxiv_records(name):
    task_update_progress("Fetching arxiv record ids")

    dummy, last_date = fetch_last_updated(name)

    # Fetch recently-created records modified since last run
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
        "WHERE `modification_date` >= %s " \
        "AND `creation_date` > NOW() - INTERVAL 7 DAY " \
        "ORDER BY `modification_date`" \
        "LIMIT 5000"
    records = run_sql(sql, [last_date.isoformat()])

    def check_arxiv(recid):
        record = get_record(recid)

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False

    def check_pdf_date(recid):
        doc = get_pdf_doc(recid)
        if doc:
            return doc.md > last_date
        return False

    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]
    records = [(r, mod_date) for r, mod_date in records if check_pdf_date(r)]
    write_message("recids %s" % repr([(r, mod_date.isoformat()) \
                                               for r, mod_date in records]))
    task_update_progress("Done fetching arxiv record ids")
    return records
Example #20
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """
    Generate sitemaps themselves.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap file(s)
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
        if nb_urls % 100 == 0 and (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
                                lastmod = lastmod,
                                changefreq = DEFAULT_CHANGEFREQ_RECORDS,
                                priority = DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s" % (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
Example #21
def get_citation_weight(rank_method_code, config, chunk_size=25000):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    quick = task_get_option("quick") != "no"

    # id option forces re-indexing a certain range
    # even if there are no new recs
    if task_get_option("id"):
        # construct a range of records to index
        updated_recids = []
        for first, last in task_get_option("id"):
            updated_recids += range(first, last + 1)
        if len(updated_recids) > 10000:
            str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(
                updated_recids[-10:])
        else:
            str_updated_recids = str(updated_recids)
        write_message('Records to process: %s' % str_updated_recids)
        index_update_time = None
    else:
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        if len(updated_recids) > 10000:
            str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(
                updated_recids[-10:])
        else:
            str_updated_recids = str(updated_recids)
        write_message("%s records to update" % str_updated_recids)

    if updated_recids:
        begin_time = time.time()
        try:
            function = config.get("rank_method", "function")
            config.get(function, 'collections')
        except ConfigParser.NoOptionError:
            config.set(function, 'collections', None)
        # Process fully the updated records
        weights = process_and_store(updated_recids, config, chunk_size)
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" %
                      (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        weights = None
        write_message("No new records added since last time this "
                      "rank method was executed")

    return weights, index_update_time
Example #22
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    # Dictionary of "plugin_name" -> func
    tickets_to_apply = task_get_option('tickets')
    write_message("Ticket plugins found: %s" % (str(tickets_to_apply), ),
                  verbose=9)

    task_update_progress("Loading records")
    records_concerned = get_recids_to_load()
    write_message("%i record(s) found" % (len(records_concerned), ))

    records_processed = 0
    for record, last_date in load_records_from_id(records_concerned):
        records_processed += 1
        recid = record_id_from_record(record)
        task_update_progress(
            "Processing records %s/%s (%i%%)" %
            (records_processed, len(records_concerned),
             int(float(records_processed) / len(records_concerned) * 100)))
        task_sleep_now_if_required(can_stop_too=True)
        for ticket_name, plugin in tickets_to_apply.items():
            if plugin:
                write_message("Running template %s for %s" %
                              (ticket_name, recid),
                              verbose=5)
                try:
                    ticket = BibCatalogTicket(recid=int(recid))
                    if plugin['check_record'](ticket, record):
                        ticket = plugin['generate_ticket'](ticket, record)
                        write_message("Ticket to be generated: %s" %
                                      (ticket, ),
                                      verbose=5)
                        res = ticket.submit()
                        if res:
                            write_message("Ticket #%s created for %s" %
                                          (ticket.ticketid, recid))
                        else:
                            write_message("Ticket already exists for %s" %
                                          (recid, ))
                    else:
                        write_message("Skipping record %s", (recid, ))
                except Exception, e:
                    write_message("Error submitting ticket for record %s:" %
                                  (recid, ))
                    write_message(traceback.format_exc())
                    raise e
            else:
                raise BibCatalogPluginException("Plugin not valid in %s" %
                                                (ticket_name, ))

        if last_date:
            store_last_updated(recid, last_date, name="bibcatalog")
Example #23
def step(msg_prefix, recid, done, total):
    if done % 30 == 0:
        task_sleep_now_if_required()

    if done % 1000 == 0:
        mesg = "%s done %s of %s" % (msg_prefix, done, total)
        write_message(mesg)
        task_update_progress(mesg)

    write_message("Processing: %s" % recid, verbose=9)
Example #24
def _task_update_overall_status(message):
    """ Generates an overall update message for the BibEncode task.
        Stores the messages in a global list for notifications
        @param message: the message that should be printed as task status
        @type message: string
    """
    message = "[%d/%d]%s" % (_BATCH_STEP, _BATCH_STEPS, message)
    task_update_progress(message)
    global _UPD_HISTORY
    _UPD_HISTORY.append(message)
Example #25
def step(msg_prefix, recid, done, total):
    if done % 30 == 0:
        task_sleep_now_if_required()

    if done % 1000 == 0:
        mesg = "%s done %s of %s" % (msg_prefix, done, total)
        write_message(mesg)
        task_update_progress(mesg)

    write_message("Processing: %s" % recid, verbose=9)
Example #26
def _task_update_overall_status(message):
    """ Generates an overall update message for the BibEncode task.
        Stores the messages in a global list for notifications
        @param message: the message that should be printed as task status
        @type message: string
    """
    message = "[%d/%d]%s" % (_BATCH_STEP, _BATCH_STEPS, message)
    task_update_progress(message)
    global _UPD_HISTORY
    _UPD_HISTORY.append(message)
Example #27
def rebuild_tables(rank_method_code, config):
    """Rebuild the tables from scratch

    Called by bibrank -w selfcites -R
    """
    task_update_progress('emptying tables')
    empty_self_cites_tables()
    task_update_progress('filling tables')
    fill_self_cites_tables(rank_method_code, config)
    return True
Example #28
def rebuild_tables(rank_method_code, config):
    """Rebuild the tables from scratch

    Called by bibrank -w selfcites -R
    """
    task_update_progress('emptying tables')
    empty_self_cites_tables()
    task_update_progress('filling tables')
    fill_self_cites_tables(rank_method_code, config)
    return True
Example #29
def get_citation_weight(rank_method_code, config, chunk_size=25000):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    quick = task_get_option("quick") != "no"

    # id option forces re-indexing a certain range
    # even if there are no new recs
    if task_get_option("id"):
        # construct a range of records to index
        updated_recids = []
        for first, last in task_get_option("id"):
            updated_recids += range(first, last+1)
        if len(updated_recids) > 10000:
            str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:])
        else:
            str_updated_recids = str(updated_recids)
        write_message('Records to process: %s' % str_updated_recids)
        index_update_time = None
    else:
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        if len(updated_recids) > 10000:
            str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:])
        else:
            str_updated_recids = str(updated_recids)
        write_message("%s records to update" % str_updated_recids)

    if updated_recids:
        begin_time = time.time()
        try:
            function = config.get("rank_method", "function")
            config.get(function, 'collections')
        except ConfigParser.NoOptionError:
            config.set(function, 'collections', None)
        # Process fully the updated records
        weights = process_and_store(updated_recids, config, chunk_size)
        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec" %
                                                      (end_time - begin_time))
        task_update_progress("citation analysis done")
    else:
        weights = None
        write_message("No new records added since last time this "
                      "rank method was executed")

    return weights, index_update_time
Example #30
def process_records(name, records, func, extra_vars):
    count = 1
    total = len(records)
    for recid, date in records:
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count, total)
        task_update_progress(msg)
        write_message(msg)
        func(recid, **extra_vars)
        if date:
            store_last_updated(recid, date, name)
        count += 1
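process_records() is a generic driver: it walks (recid, date) pairs, reports progress, applies func with extra_vars splatted as keyword arguments, and checkpoints via store_last_updated(). A hedged invocation sketch (extract_references is a hypothetical worker):

def extract_references(recid, overwrite=False):
    pass  # hypothetical per-record worker

process_records('refextract', records, extract_references,
                {'overwrite': True})  # extra_vars arrive as keyword arguments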
Example #31
def process_records(name, records, func, extra_vars):
    count = 1
    total = len(records)
    for recid, date in records:
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count, total)
        task_update_progress(msg)
        write_message(msg)
        func(recid, **extra_vars)
        if date:
            store_last_updated(recid, date, name)
        count += 1
Example #32
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    # Dictionary of "plugin_name" -> func
    tickets_to_apply = task_get_option('tickets')
    write_message("Ticket plugins found: %s" %
                  (str(tickets_to_apply),), verbose=9)

    task_update_progress("Loading records")
    records_concerned = get_recids_to_load()
    write_message("%i record(s) found" %
                  (len(records_concerned),))

    records_processed = 0
    for record, last_date in load_records_from_id(records_concerned):
        records_processed += 1
        recid = record_id_from_record(record)
        task_update_progress("Processing records %s/%s (%i%%)"
                             % (records_processed, len(records_concerned),
                                int(float(records_processed) / len(records_concerned) * 100)))
        task_sleep_now_if_required(can_stop_too=True)
        for ticket_name, plugin in tickets_to_apply.items():
            if plugin:
                write_message("Running template %s for %s" % (ticket_name, recid),
                              verbose=5)
                try:
                    ticket = BibCatalogTicket(recid=int(recid))
                    if plugin['check_record'](ticket, record):
                        ticket = plugin['generate_ticket'](ticket, record)
                        write_message("Ticket to be generated: %s" % (ticket,), verbose=5)
                        res = ticket.submit()
                        if res:
                            write_message("Ticket #%s created for %s" %
                                         (ticket.ticketid, recid))
                        else:
                            write_message("Ticket already exists for %s" %
                                          (recid,))
                    else:
                        write_message("Skipping record %s", (recid,))
                except Exception, e:
                    write_message("Error submitting ticket for record %s:" % (recid,))
                    write_message(traceback.format_exc())
                    raise e
            else:
                raise BibCatalogPluginException("Plugin not valid in %s" % (ticket_name,))

        if last_date:
            store_last_updated(recid, last_date, name="bibcatalog")
Example #33
def task_run_core():
    """
    When this function is called, the tool has entered BibSched mode, which means
    that we're going to cache events according to the parameters.
    """
    write_message("Initiating rawdata caching")
    task_update_progress("Initating rawdata caching")

    # Cache key events
    keyevents = task_get_option("keyevents")
    if keyevents and len(keyevents) > 0:
        for i in range(len(keyevents)):
            write_message("Caching key event 1: %s" % keyevents[i])
            webstat.cache_keyevent_trend(keyevents)
            task_update_progress("Part 1/2: done %d/%d" %
                                 (i + 1, len(keyevents)))

    # Cache custom events
    customevents = task_get_option("customevents")
    if len(customevents) > 0:
        for i in range(len(customevents)):
            write_message("Caching custom event 1: %s" % customevents[i])
            webstat.cache_customevent_trend(customevents)
            task_update_progress("Part 2/2: done %d/%d" %
                                 (i + 1, len(customevents)))

    write_message("Finished rawdata caching succesfully")
    task_update_progress("Finished rawdata caching succesfully")

    return True
Example #34
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    output_dir = task_get_option('output', CFG_LOGDIR)
    output_num = task_get_option('number', 5)
    output_fil_prefix = CFG_DATABASE_NAME + '-dbdump-'
    output_fil_suffix = task_get_task_param('task_starting_time').replace(' ', '_') + '.sql.gz'
    output_fil = output_fil_prefix + output_fil_suffix
    write_message("Reading parameters ended")
    # make dump:
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(output_dir, output_fil)
    write_message("Database dump ended")
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_fil_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
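The _dump_database() helper is defined elsewhere in the module. A plausible sketch, assuming mysqldump piped through gzip with credentials coming from the server configuration:

import os

def _dump_database(output_dir, output_filename):
    """Dump CFG_DATABASE_NAME into output_dir/output_filename as gzipped SQL (sketch)."""
    path = os.path.join(output_dir, output_filename)
    cmd = "mysqldump %s | gzip -c > %s" % (CFG_DATABASE_NAME, path)
    if os.system(cmd) != 0:
        raise StandardError("mysqldump failed for %s" % path)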
Example #35
def task_run_core():
    """Run the specific tasklet."""
    tasklet = task_get_option('tasklet')
    arguments = task_get_option('arguments', {})
    write_message('Starting tasklet "%s" (with arguments %s)' % (
        tasklet, arguments))
    task_update_progress('%s started' % tasklet)
    ret = _TASKLETS[tasklet](**arguments)
    task_update_progress('%s finished' % tasklet)
    write_message('Finished tasklet "%s" (with arguments %s)' % (
        tasklet, arguments))
    if ret is not None:
        return ret
    return True
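The _TASKLETS mapping used above is a plain name-to-callable registry; tasklets such as the bst_fibonacci example elsewhere on this page would be registered in it. A minimal sketch (the registration itself is an assumption):

_TASKLETS = {
    'bst_fibonacci': bst_fibonacci,  # hypothetical registration of the tasklet shown above
}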
Example #36
def task_run_core():
    """
    When this function is called, the tool has entered BibSched mode, which means
    that we're going to cache events according to the parameters.
    """
    write_message("Initiating rawdata caching")
    task_update_progress("Initating rawdata caching")

    # Cache key events
    keyevents = task_get_option("keyevents")
    if keyevents and len(keyevents) > 0:
        for i in range(len(keyevents)):
            write_message("Caching key event 1: %s" % keyevents[i])
            webstat.cache_keyevent_trend(keyevents)
            task_update_progress("Part 1/2: done %d/%d" % (i + 1, len(keyevents)))

    # Cache custom events
    customevents = task_get_option("customevents")
    if len(customevents) > 0:
        for i in range(len(customevents)):
            write_message("Caching custom event 1: %s" % customevents[i])
            webstat.cache_customevent_trend(customevents)
            task_update_progress("Part 2/2: done %d/%d" % (i + 1, len(customevents)))

    write_message("Finished rawdata caching succesfully")
    task_update_progress("Finished rawdata caching succesfully")

    return True
Example #37
def create_update_jobs_by_recids(recids, batch_template_file, job_directory=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS):
    """ Creates the job description files to update all given recids
    @param recids: Iterable set of recids
    @type recids: iterable
    @param batch_template_file: fullpath to the template for the update
    @type batch_template_file: string
    @param job_directory: fullpath to the directory storing the job files
    @type job_directory: string
    """
    batch_template = json_decode_file(batch_template_file)
    for recid in recids:
        task_update_progress("Creating Update Job for %d" % recid)
        write_message("Creating Update Job for %d" % recid)
        job = dict(batch_template)
        job["recid"] = recid
        timestamp = generate_timestamp()
        job_filename = "update_%d_%s.job" % (recid, timestamp)
        create_job_from_dictionary(job, job_filename, job_directory)
    return 1
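generate_timestamp() above only has to make each job filename unique. A plausible sketch:

import time

def generate_timestamp():
    """Return a sortable timestamp string for job filenames (a sketch)."""
    return time.strftime("%Y%m%d%H%M%S")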
Example #38
def run_bibsort_rebalance(method_list=None):
    """Rebalances all buckets for the methods in method_list"""
    bibsort_methods, errors = get_bibsort_methods_details(method_list)
    if errors:
        return False
    if not bibsort_methods:
        write_message('No methods found... exiting rebalancing.')
        return True
    # check if there are only ranking methods -> no need for recids
    rnk_methods = get_rnk_methods(bibsort_methods)
    non_rnk_method = [
        method for method in bibsort_methods.keys()
        if method not in rnk_methods
    ]

    write_message('Running rebalancing for methods: %s' %
                  bibsort_methods.keys())

    if non_rnk_method:  # we have also 'normal' (no RNK) methods, so we need the recids
        recids = get_all_recids(including_deleted=False)
        write_message('Rebalancing will run for %s records.' \
                      % str(len(recids)), verbose=5)
        task_sleep_now_if_required(can_stop_too=True)
    else:
        recids = intbitset([])
        write_message('Rebalancing will run only for RNK methods')
    for name in bibsort_methods:
        task_update_progress('Rebalancing %s method.' % name)
        write_message('Starting sorting the data for %s method ... ' \
                          % name.upper())
        executed_ok = run_sorting_method(recids, name,
                                         bibsort_methods[name]['id'],
                                         bibsort_methods[name]['definition'],
                                         bibsort_methods[name]['washer'])
        if not executed_ok:
            write_message('Method %s could not be executed correctly.' \
                          % name, sys.stderr)
            return False
        write_message('Done.')
        task_sleep_now_if_required(can_stop_too=True)
    task_update_progress('Rebalancing done.')
    return True
Example #39
def process_updates(rank_method_code):
    """
    This is what gets executed first when the task is started.
    It handles the --rebuild option. If that option is not specified
    we fall back to process_one().
    """
    write_message("Running rank method: %s" % rank_method_code, verbose=0)

    selfcites_config = read_configuration(rank_method_code)
    config = {
        'algorithm':
        selfcites_config.get(rank_method_code, "algorithm"),
        'friends_threshold':
        selfcites_config.get(rank_method_code, "friends_threshold")
    }
    quick = task_get_option("quick") != "no"
    if not quick:
        return rebuild_tables(rank_method_code, config)

    tags = get_authors_tags()
    recids, end_date = fetch_concerned_records(rank_method_code,
                                               task_get_option("id"))
    citations_fun = get_citations_fun(config['algorithm'])
    weights = fromDB(rank_method_code)

    write_message("recids %s" % str(recids))

    total = len(recids)
    for count, recid in enumerate(recids):
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count + 1, total)
        task_update_progress(msg)
        write_message(msg)

        process_one(recid, tags, citations_fun, weights)

    intoDB(weights, end_date, rank_method_code)
    store_weights_cache(weights)

    write_message("Complete")
    return True
Example #40
def task_run_core():
    """Runs the task by fetching arguments from the BibSched task queue.  This is
    what BibSched will be invoking via daemon call.
    The task prints the first NUM Fibonacci numbers on stdout, and some
    messages on stderr.
    Return 1 in case of success and 0 in case of failure."""
    n = int(task_get_option('number'))
    write_message("Printing %d Fibonacci numbers." % n, verbose=9)
    for i in range(0, n):
        if i > 0 and i % 4 == 0:
            write_message("Error: water in the CPU.  Ignoring and continuing.", sys.stderr, verbose=3)
        elif i > 0 and i % 5 == 0:
            write_message("Error: floppy drive dropped on the floor.  Ignoring and continuing.", sys.stderr)
            if task_get_option('error'):
                ## Deliberately raise ZeroDivisionError to simulate a failure.
                1 / 0
        write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)
    task_update_progress("Done %d out of %d." % (n, n))
    return 1
Example #41
def bst_fibonacci(n=30):
    """
    Small tasklets that prints the the Fibonacci sequence for n.
    @param n: how many Fibonacci numbers to print.
    @type n: int
    """
    ## Since it's tasklet, the parameter might be passed as a string.
    ## it should then be converted to an int.
    n = int(n)
    write_message("Printing %d Fibonacci numbers." % n, verbose=9)
    for i in range(0, n):
        if i > 0 and i % 4 == 0:
            write_message("Error: water in the CPU.  Ignoring and continuing.", sys.stderr, verbose=3)
        elif i > 0 and i % 5 == 0:
            write_message("Error: floppy drive dropped on the floor.  Ignoring and continuing.", sys.stderr)
        write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)
    task_update_progress("Done %d out of %d." % (n, n))
    return 1
Example #42
def halt(err=StandardError, msg=None, exit_code=1):
    """ Stop extraction, and deal with the error in the appropriate
    manner, based on whether Refextract is running in standalone or
    bibsched mode.
    @param err: (exception) The exception raised from an error, if any
    @param msg: (string) The brief error message, either displayed
    on the bibsched interface, or written to stderr.
    @param exit_code: (integer) Either 0 or 1, depending on the cause
    of the halting. This is only used when running standalone."""
    # If refextract is running independently, exit.
    # 'RUNNING_INDEPENDENTLY' is a global variable
    if RUNNING_INDEPENDENTLY:
        if msg:
            write_message(msg, stream=sys.stderr, verbose=0)
        sys.exit(exit_code)
    # Else, raise an exception so Bibsched will flag this task.
    else:
        if msg:
            # Update the status of refextract inside the Bibsched UI
            task_update_progress(msg.strip())
        raise err(msg)
Example #43
def halt(err=StandardError, msg=None, exit_code=1):
    """ Stop extraction, and deal with the error in the appropriate
    manner, based on whether Refextract is running in standalone or
    bibsched mode.
    @param err: (exception) The exception raised from an error, if any
    @param msg: (string) The brief error message, either displayed
    on the bibsched interface, or written to stderr.
    @param exit_code: (integer) Either 0 or 1, depending on the cause
    of the halting. This is only used when running standalone."""
    # If refextract is running independently, exit.
    # 'RUNNING_INDEPENDENTLY' is a global variable
    if RUNNING_INDEPENDENTLY:
        if msg:
            write_message(msg, stream=sys.stderr, verbose=0)
        sys.exit(exit_code)
    # Else, raise an exception so Bibsched will flag this task.
    else:
        if msg:
            # Update the status of refextract inside the Bibsched UI
            task_update_progress(msg.strip())
        raise err(msg)
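Callers use halt() the same way in both modes and let it choose between sys.exit() and a BibSched-visible exception. A hedged example (pdf_path is a hypothetical variable):

if not os.path.exists(pdf_path):
    halt(msg="Cannot find fulltext to process", exit_code=1)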
Example #44
def fill_self_cites_tables(rank_method_code, config):
    """
    This will fill the self-cites tables with data

    The purpose of this function is to fill these tables on a website that
    never ran the self-cites daemon

    This is an optimization for running on empty tables; the result should
    match what compute_and_store_self_citations produces.
    """
    begin_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    algorithm = config['algorithm']
    tags = get_authors_tags()
    selfcites_dic = {}
    all_ids = intbitset(run_sql('SELECT id FROM bibrec ORDER BY id'))
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)
    if algorithm == 'friends':
        # We only need this table for the friends algorithm or assimilated ones
        # Fill intermediary tables
        for index, recid in enumerate(all_ids):
            if index % 1000 == 0:
                msg = 'intermediate %d/%d' % (index, len(all_ids))
                task_update_progress(msg)
                write_message(msg)
                task_sleep_now_if_required()
            update_self_cites_tables(recid, config, tags)
    # Fill self-cites table
    for index, recid in enumerate(all_ids):
        if index % 1000 == 0:
            msg = 'final %d/%d' % (index, len(all_ids))
            task_update_progress(msg)
            write_message(msg)
            task_sleep_now_if_required()
        compute_and_store_self_citations(recid,
                                         tags,
                                         citations_fun,
                                         selfcites_dic)
    intoDB(selfcites_dic, begin_date, rank_method_code)
    store_weights_cache(selfcites_dic)
Example #45
def create_update_jobs_by_recids(recids,
                                 batch_template_file,
                                 job_directory=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS
                                 ):
    """ Creates the job description files to update all given recids
    @param recids: Iterable set of recids
    @type recids: iterable
    @param batch_template_file: fullpath to the template for the update
    @type batch_template_file: string
    @param job_directory: fullpath to the directory storing the job files
    @type job_directory: string
    """
    batch_template = json_decode_file(batch_template_file)
    for recid in recids:
        task_update_progress("Creating Update Job for %d" % recid)
        write_message("Creating Update Job for %d" % recid)
        job = dict(batch_template)
        job['recid'] = recid
        timestamp = generate_timestamp()
        job_filename = "update_%d_%s.job" % (recid, timestamp)
        create_job_from_dictionary(job, job_filename, job_directory)
    return 1
Example #46
def process_updates(rank_method_code):
    """
    This is what gets executed first when the task is started.
    It handles the --rebuild option. If that option is not specified
    we fall back to process_one().
    """
    write_message("Running rank method: %s" % rank_method_code, verbose=0)

    selfcites_config = read_configuration(rank_method_code)
    config = {
        'algorithm': selfcites_config.get(rank_method_code, "algorithm"),
        'friends_threshold': selfcites_config.get(rank_method_code, "friends_threshold")
    }
    quick = task_get_option("quick") != "no"
    if not quick:
        return rebuild_tables(rank_method_code, config)

    tags = get_authors_tags()
    recids, end_date = fetch_concerned_records(rank_method_code,
                                               task_get_option("id"))
    citations_fun = get_citations_fun(config['algorithm'])
    weights = fromDB(rank_method_code)

    write_message("recids %s" % str(recids))

    total = len(recids)
    for count, recid in enumerate(recids):
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count + 1, total)
        task_update_progress(msg)
        write_message(msg)

        process_one(recid, tags, citations_fun, weights)

    intoDB(weights, end_date, rank_method_code)
    store_weights_cache(weights)

    write_message("Complete")
    return True
Example #47
def fill_self_cites_tables(rank_method_code, config):
    """
    This will fill the self-cites tables with data

    The purpose of this function is to fill these tables on a website that
    never ran the self-cites daemon

    This is an optimization for running on empty tables; the result should
    match what compute_and_store_self_citations produces.
    """
    begin_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    algorithm = config['algorithm']
    tags = get_authors_tags()
    selfcites_dic = {}
    all_ids = intbitset(run_sql('SELECT id FROM bibrec ORDER BY id'))
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)
    if algorithm == 'friends':
        # We only need this table for the friends algorithm or assimilated ones
        # Fill intermediary tables
        for index, recid in enumerate(all_ids):
            if index % 1000 == 0:
                msg = 'intermediate %d/%d' % (index, len(all_ids))
                task_update_progress(msg)
                write_message(msg)
                task_sleep_now_if_required()
            update_self_cites_tables(recid, config, tags)
    # Fill self-cites table
    for index, recid in enumerate(all_ids):
        if index % 1000 == 0:
            msg = 'final %d/%d' % (index, len(all_ids))
            task_update_progress(msg)
            write_message(msg)
            task_sleep_now_if_required()
        compute_and_store_self_citations(recid, tags, citations_fun,
                                         selfcites_dic)
    intoDB(selfcites_dic, begin_date, rank_method_code)
    store_weights_cache(selfcites_dic)
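The index % 1000 guards above are a throttling pattern worth noting: progress reporting and sleep checks are cheap but not free, so they only run once every N records. A self-contained sketch of the same shape, where report and maybe_sleep are placeholders for task_update_progress and task_sleep_now_if_required:

def process_all(record_ids, work, report, maybe_sleep, every=1000):
    # Throttled bookkeeping: reporting once per `every` records keeps the
    # progress display useful without dominating the runtime.
    total = len(record_ids)
    for index, recid in enumerate(record_ids):
        if index % every == 0:
            report('%d/%d' % (index, total))
            maybe_sleep()  # honour bibsched sleep/stop requests here only
        work(recid)

process_all(range(5000), work=lambda r: None,
            report=lambda msg: None, maybe_sleep=lambda: None)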
Example #48
def run_bibsort_rebalance(method_list = None):
    """Rebalances all buckets for the methods in method_list"""
    bibsort_methods, errors = get_bibsort_methods_details(method_list)
    if errors:
        return False
    if not bibsort_methods:
        write_message('No methods found... exiting rebalancing.')
        return True
    # check if there are only ranking methods -> no need for recids
    rnk_methods = get_rnk_methods(bibsort_methods)
    non_rnk_method = [method for method in bibsort_methods.keys() if method not in rnk_methods]

    write_message('Running rebalancing for methods: %s' % bibsort_methods.keys())

    if non_rnk_method:  # we also have 'normal' (non-RNK) methods, so we need the recids
        recids = get_all_recids(including_deleted=False)
        write_message('Rebalancing will run for %s records.'
                      % str(len(recids)), verbose=5)
        task_sleep_now_if_required(can_stop_too=True)
    else:
        recids = intbitset([])
        write_message('Rebalancing will run only for RNK methods')
    for name in bibsort_methods:
        task_update_progress('Rebalancing %s method.' % name)
        write_message('Starting sorting the data for %s method ... '
                      % name.upper())
        executed_ok = run_sorting_method(recids, name,
                                         bibsort_methods[name]['id'],
                                         bibsort_methods[name]['definition'],
                                         bibsort_methods[name]['washer'])
        if not executed_ok:
            write_message('Method %s could not be executed correctly.'
                          % name, sys.stderr)
            return False
        write_message('Done.')
        task_sleep_now_if_required(can_stop_too=True)
    task_update_progress('Rebalancing done.')
    return True
Example #49
def process_updates(rank_method_code):
    """
    This is what gets executed first when the task is started.
    It handles the --rebuild option. If that option is not specified,
    we fall back to process_one().
    """
    selfcites_config = read_configuration(rank_method_code)
    config = {
        'algorithm': selfcites_config.get(rank_method_code, "algorithm"),
        'friends_threshold': selfcites_config.get(rank_method_code, "friends_threshold")
    }
    begin_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    quick = task_get_option("quick") != "no"
    if not quick:
        return rebuild_tables(config)

    write_message("Starting")

    tags = get_authors_tags()
    recids = fetch_concerned_records(rank_method_code)
    citations_fun = get_citations_fun(config['algorithm'])

    write_message("recids %s" % str(recids))

    total = len(recids)
    for count, recid in enumerate(recids):
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count + 1, total)
        task_update_progress(msg)
        write_message(msg)

        process_one(recid, tags, citations_fun)

    store_last_updated(rank_method_code, begin_date)

    write_message("Complete")
    return True
Example #50
def fetch_updated_arxiv_records(date):
    """Fetch all the arxiv records modified since the last run"""
    def check_arxiv(recid):
        """Returns True for arxiv papers"""
        for report_number in get_fieldvalues(recid, '037__9'):
            if report_number == 'arXiv':
                return True
        return False

    # Fetch all records inserted since last run
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
          "WHERE `modification_date` >= %s " \
          "ORDER BY `modification_date`"
    records = run_sql(sql, [date.isoformat()])
    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]

    # Show all records for debugging purposes
    if task_get_option('verbose') >= 9:
        write_message('recids:', verbose=9)
        for recid, mod_date in records:
            write_message("* %s, %s" % (recid, mod_date), verbose=9)

    task_update_progress("Done fetching %s arXiv record ids" % len(records))
    return records
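The check_arxiv predicate above keys on MARC field 037, subfield 9 (the source) being 'arXiv'. A minimal illustration over an in-memory record, with get_fieldvalues replaced by a dictionary lookup (the MARC tag is from the snippet; the record shape is an assumption):

def check_arxiv(record_fields):
    # record_fields maps a MARC tag like '037__9' to its list of values.
    return any(value == 'arXiv' for value in record_fields.get('037__9', []))

assert check_arxiv({'037__9': ['arXiv']})
assert not check_arxiv({'037__9': ['DESY']})
assert not check_arxiv({})  # no 037__9 field at all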
Example #52
def task_run_core():
    """ Walks through all directories where metadata files are located
        and uploads them.
        Files are then moved to the corresponding DONE folders.
    """
    daemon_dir = (CFG_BATCHUPLOADER_DAEMON_DIR
                  if CFG_BATCHUPLOADER_DAEMON_DIR.startswith('/')
                  else os.path.join(CFG_PREFIX, CFG_BATCHUPLOADER_DAEMON_DIR))
    # Metadata upload or document upload, depending on the --documents option
    if not task_get_option('documents'):
        # Metadata upload
        parent_dir = daemon_dir + "/metadata/"
        progress = 0
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        for folder in ["insert/", "append/", "correct/", "replace/"]:
            files_dir = parent_dir + folder
            files_done_dir = files_dir + "DONE/"
            try:
                files = os.listdir(files_dir)
            except OSError as e:
                os.mkdir(files_dir)
                files = []
                write_message(e, sys.stderr)
            # Create directory DONE/ if it doesn't exist
            try:
                os.mkdir(files_done_dir)
            except OSError:
                # Directory exists
                pass
            for metafile in files:
                if os.path.isfile(os.path.join(files_dir, metafile)):
                    # Create temporary file to be uploaded
                    (fd, filename) = tempfile.mkstemp(prefix=metafile + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_", dir=CFG_TMPSHAREDDIR)
                    shutil.copy(os.path.join(files_dir, metafile), filename)
                    # Send bibsched task
                    mode = "-" + folder[0]
                    jobid = str(task_low_level_submission('bibupload', 'batchupload', mode, filename))
                    # Move file to done folder
                    filename = metafile + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_" + jobid
                    os.rename(os.path.join(files_dir, metafile), os.path.join(files_done_dir, filename))
                    task_sleep_now_if_required(can_stop_too=True)
            progress += 1
            task_update_progress("Done %d out of 4." % progress)
    else:
        # Documents upload
        parent_dir = daemon_dir + "/documents/"
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        matching_order = CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY
        for folder in ["append/", "revise/"]:
            try:
                os.mkdir(parent_dir + folder)
            except OSError:
                # Directory may already exist
                pass
            for matching in matching_order:
                errors = document_upload(folder=parent_dir + folder, matching=matching, mode=folder[:-1])[0]
                if not errors:
                    break # All documents succeeded with that matching
                for error in errors:
                    write_message("File: %s - %s with matching %s" % (error[0], error[1], matching), sys.stderr)
            task_sleep_now_if_required(can_stop_too=True)
    return 1
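Note the mode = "-" + folder[0] trick above: the four folder names were chosen so that their first letter is exactly the short bibupload mode flag. Spelling the mapping out:

# insert/ -> -i, append/ -> -a, correct/ -> -c, replace/ -> -r
for folder in ["insert/", "append/", "correct/", "replace/"]:
    mode = "-" + folder[0]
    print("%-9s -> %s" % (folder, mode))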
Example #53
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task"""

    # First gather recids to process
    recids = task_get_option('recids')
    if recids:
        start_date = None
        recids = [(recid, None) for recid in recids]
    else:
        start_date = datetime.now()
        dummy, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)

    updated_recids = set()

    try:

        for count, (recid, dummy) in enumerate(recids):
            if count % 50 == 0:
                msg = 'Done %s of %s' % (count, len(recids))
                write_message(msg)
                task_update_progress(msg)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message('processing %s' % recid, verbose=9)
            try:
                if process_one(recid):
                    updated_recids.add(recid)
                time.sleep(6)
            except AlreadyHarvested:
                write_message('already harvested successfully')
                time.sleep(6)
            except FoundExistingPdf:
                write_message('pdf already attached (matching md5)')
                time.sleep(6)
            except PdfNotAvailable:
                write_message("no pdf available")
                time.sleep(20)
            except InvenioFileDownloadError as e:
                write_message("failed to download: %s" % e)
                time.sleep(20)

    finally:
        # We want to process updated records even in case we are interrupted
        msg = 'Updated %s records' % len(updated_recids)
        write_message(msg)
        task_update_progress(msg)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True
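The try/finally layout above is the important part: even if the loop is interrupted (a bibsched stop, or an error escaping the inner handlers), the already-updated records still get their follow-up tasks submitted. A stripped-down sketch of the same shape:

def run(recids, process_one, submit_followup):
    updated = set()
    try:
        for recid in recids:
            if process_one(recid):  # may raise at any point
                updated.add(recid)
    finally:
        # Runs on success, on exception and on interruption alike,
        # so no processed record ever misses its follow-up task.
        if updated:
            submit_followup(updated)

def submit_followup(recids):
    # Stand-in for submit_fixmarc_task / submit_refextract_task.
    assert recids == set([1, 2, 3])

run([1, 2, 3], process_one=lambda r: True, submit_followup=submit_followup)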
Example #54
def run_bibsort_update(recids=None, method_list=None):
    """Updates bibsort tables for the methods in method_list
    and for the records in recids.

    If recids is None: recids = all records that have been modified
    or inserted since last update

    If method_list is None: method_list = all the methods available
    in bsrMETHOD table"""

    write_message('Initial data for run_bibsort_update method: '
                  'number of recids = %s; method_list = %s'
                  % (len(recids) if recids else 0, method_list), verbose=5)
    write_message('Updating sorting data.')

    bibsort_methods, errors = get_bibsort_methods_details(method_list)
    if errors:
        return False
    method_list = bibsort_methods.keys()
    if not method_list:
        write_message('No methods found in bsrMETHOD table... exiting.')
        return True

    # we could have 4 types of methods:
    # (i) RNK methods -> they should be rebalanced, not updated
    # (ii) RNK methods to delete -> we should delete their data
    # (iii) non-RNK methods to update
    # (iv) non-RNK methods that are new -> they should be rebalanced (sorted), not updated
    # check which of the methods are RNK methods (they do not need modified recids)
    rnk_methods = get_rnk_methods(bibsort_methods)
    rnk_methods_updated, rnk_methods_deleted = get_modified_rnk_methods(rnk_methods, bibsort_methods)
    # check which of the methods have no data, so they are actually new,
    # so they need balancing (sorting) instead of updating
    non_rnk_methods = [method for method in bibsort_methods.keys() if method not in rnk_methods]
    non_rnk_methods_updated, non_rnk_methods_inserted = get_modified_non_rnk_methods(non_rnk_methods)

    # (i) + (iv)
    methods_to_balance = rnk_methods_updated + non_rnk_methods_inserted
    if methods_to_balance:  # several methods require rebalancing (sorting) and not updating
        return run_bibsort_rebalance(methods_to_balance)

    # (ii)
    # remove the data for the ranking methods that have been deleted
    for method in rnk_methods_deleted:
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Deleting data for method %s" % method)
        write_message('Starting deleting the data for RNK method %s' % method, verbose=5)
        executed_ok = delete_bibsort_data_for_method(bibsort_methods[method]['id'])
        if not executed_ok:
            write_message('Method %s could not be deleted correctly, aborting...'
                          % method, sys.stderr)
            return False

    # (iii)
    # methods to actually update
    if non_rnk_methods_updated:  # we want to update some 'normal' (non-RNK) tables, so we need recids
        update_timestamp = False
        if not recids:
            recids = get_modified_or_inserted_recs(non_rnk_methods_updated)
            if recids == 0:  # error signal
                return False
            if not recids:
                write_message("No records inserted or modified in bibrec table "
                              "since the last update of bsrMETHODDATA.")
                return True
            write_message("These records have been recently modified/inserted: %s"
                          % str(recids), verbose=5)
            update_timestamp = True
        recids_i = intbitset(recids)
        for method in non_rnk_methods_updated:
            task_sleep_now_if_required(can_stop_too=True)
            task_update_progress("Updating method %s" % method)
            write_message('Starting updating method %s' % method, verbose=5)
            executed_ok = update_bibsort_tables(recids_i, method, update_timestamp)
            if not executed_ok:
                write_message('Method %s could not be executed correctly, aborting...'
                              % method, sys.stderr)
                return False
    return True
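The four-way classification in the comments above reduces to simple set arithmetic. A hedged sketch with hard-coded example data (the get_modified_* helper names come from the snippet; their outputs here are invented):

# Pretend these came from get_modified_rnk_methods / get_modified_non_rnk_methods:
rnk_methods_updated = ['citation']      # (i)   rebalance
rnk_methods_deleted = ['old_rank']      # (ii)  delete data
non_rnk_methods_updated = ['title']     # (iii) incremental update
non_rnk_methods_inserted = ['year']     # (iv)  rebalance (no data yet)

methods_to_balance = rnk_methods_updated + non_rnk_methods_inserted
assert methods_to_balance == ['citation', 'year']
# Rebalancing short-circuits the run: deletions and updates happen on
# a later pass, exactly as run_bibsort_update returns early above.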
Example #55
def get_citation_informations(recid_list,
                              tags,
                              config,
                              fetch_catchup_info=True):
    """Scans the collections searching references (999C5x -fields) and
       citations for items in the recid_list
       returns a 4 list of dictionaries that contains the citation information
       of cds records
       examples: [ {} {} {} {} ]
                 [ {5: 'SUT-DP-92-70-5'},
                   { 93: ['astro-ph/9812088']},
                   { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ]
        NB: stuff here is for analysing new or changed records.
        see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'hdl': {},
        'isbn': {},
        'record_id': {},
    }

    references_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'record_id': {},
        'isbn': {},
        'hdl': {},
    }

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:

    for done, recid in enumerate(recid_list):
        if done % 10 == 0:
            task_sleep_now_if_required()

        if done % 50 == 0:
            mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
            write_message(mesg)
            task_update_progress(mesg)

        record = get_record(recid)
        records_info['record_id'][recid] = [unicode(recid)]

        function = config.get("rank_method", "function")
        if config.get(function, 'collections'):
            if recid not in recids_cache(config.get(function, 'collections')):
                # do not treat this record since it is not in the collections
                # we want to process
                continue
        elif recid in deleted_recids_cache():
            # do not treat this record since it was deleted; we
            # skip it like this in case it was only soft-deleted
            # e.g. via bibedit (i.e. when collection tag 980 is
            # DELETED but other tags like report number or journal
            # publication info remained the same, so the calls to
            # get_fieldvalues() below would return old values)
            continue

        if tags['refs_report_number']:
            references_info['report-numbers'][recid] = [
                t.value
                for t in record.find_subfields(tags['refs_report_number'])
            ]
            msg = "references_info['report-numbers'][%s] = %r" \
                        % (recid, references_info['report-numbers'][recid])
            write_message(msg, verbose=9)
        if tags['refs_journal']:
            references_info['journals'][recid] = []
            for ref in record.find_subfields(tags['refs_journal']):
                try:
                    # Inspire specific parsing
                    journal, volume, page = ref.value.split(',')
                except ValueError:
                    pass
                else:
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        alt_ref = ','.join([journal, alt_volume, page])
                        references_info['journals'][recid] += [alt_ref]
                references_info['journals'][recid] += [ref.value]
            msg = "references_info['journals'][%s] = %r" \
                              % (recid, references_info['journals'][recid])
            write_message(msg, verbose=9)
        if tags['refs_doi']:
            references = [
                t.value for t in record.find_subfields(tags['refs_doi'])
            ]
            dois = []
            hdls = []
            for ref in references:
                if ref.startswith("hdl:"):
                    hdls.append(ref[4:])
                elif ref.startswith("doi:"):
                    dois.append(ref[4:])
                else:
                    dois.append(ref)
            references_info['doi'][recid] = dois
            references_info['hdl'][recid] = hdls

            msg = "references_info['doi'][%s] = %r" % (recid, dois)
            write_message(msg, verbose=9)
            msg = "references_info['hdl'][%s] = %r" % (recid, hdls)
            write_message(msg, verbose=9)

        if tags['refs_record_id']:
            references_info['record_id'][recid] = [
                t.value for t in record.find_subfields(tags['refs_record_id'])
            ]
            msg = "references_info['record_id'][%s] = %r" \
                                   % (recid, references_info['record_id'][recid])
            write_message(msg, verbose=9)
        if tags['refs_isbn']:
            references_info['isbn'][recid] = [
                t.value for t in record.find_subfields(tags['refs_isbn'])
            ]
            msg = "references_info['isbn'][%s] = %r" \
                                   % (recid, references_info['isbn'][recid])
            write_message(msg, verbose=9)

        if not fetch_catchup_info:
            # We do not need the extra info
            continue

        if tags['record_pri_number'] or tags['record_add_number']:
            records_info['report-numbers'][recid] = []

            if tags['record_pri_number']:
                records_info['report-numbers'][recid] += [
                    t.value
                    for t in record.find_subfields(tags['record_pri_number'])
                ]

            if tags['record_add_number']:
                records_info['report-numbers'][recid] += [
                    t.value
                    for t in record.find_subfields(tags['record_add_number'])
                ]

            msg = "records_info[%s]['report-numbers'] = %r" \
                        % (recid, records_info['report-numbers'][recid])
            write_message(msg, verbose=9)

        if tags['doi']:
            records_info['doi'][recid] = []
            records_info['hdl'][recid] = []
            for tag in tags['doi']:
                for field in record.find_fields(tag[:5]):
                    if 'DOI' in field.get_subfield_values('2'):
                        dois = field.get_subfield_values('a')
                        records_info['doi'][recid].extend(dois)
                    elif 'HDL' in field.get_subfield_values('2'):
                        hdls = field.get_subfield_values('a')
                        records_info['hdl'][recid].extend(hdls)

            msg = "records_info[%s]['doi'] = %r" \
                                      % (recid, records_info['doi'][recid])
            write_message(msg, verbose=9)
            msg = "records_info[%s]['hdl'] = %r" \
                                      % (recid, records_info['hdl'][recid])
            write_message(msg, verbose=9)

        if tags['isbn']:
            records_info['isbn'][recid] = []
            for tag in tags['isbn']:
                values = [t.value for t in record.find_subfields(tag)]
                records_info['isbn'][recid] += values

            msg = "records_info[%s]['isbn'] = %r" \
                                      % (recid, records_info['isbn'][recid])
            write_message(msg, verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags['publication']:
            records_info['journals'][recid] = get_journal_info(record, tags)
            msg = "records_info[%s]['journals'] = %r" \
                                 % (recid, records_info['journals'][recid])
            write_message(msg, verbose=9)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    end_time = os.times()[4]
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % (end_time - begin_time))

    return records_info, references_info
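The Inspire-specific parsing above splits a journal reference into journal, volume and page, and also indexes an alternate volume spelling. A plausible, self-contained sketch of that normalisation; get_alt_volume's exact rule is not shown in the snippet, so the letter prefix/suffix swap below is an assumption:

import re

def get_alt_volume(volume):
    # Assumed rule: move a single volume letter between prefix and suffix,
    # so 'B482' also matches records stored as '482B' and vice versa.
    if re.match(r'^[a-zA-Z]\d+$', volume):
        return volume[1:] + volume[0]
    if re.match(r'^\d+[a-zA-Z]$', volume):
        return volume[-1] + volume[:-1]
    return None

ref = 'Phys.Lett.,B482,417'
journal, volume, page = ref.split(',')
variants = [ref]
alt = get_alt_volume(volume)
if alt:
    variants.append(','.join([journal, alt, page]))
assert variants == ['Phys.Lett.,B482,417', 'Phys.Lett.,482B,417']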
Example #56
def ref_analyzer(citation_informations, updated_recids, tags, config):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations = {}
    for recid in updated_recids:
        citations[recid] = set()
    references = {}
    for recid in updated_recids:
        references[recid] = set()

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_cites(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        citations[citee].add(citer)
        if citer in updated_recids:
            references[citer].add(citee)

    def add_to_refs(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        if citee in updated_recids:
            citations[citee].add(citer)
        references[citer].add(citee)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in iteritems(references_info['report-numbers']):
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber,
                                               f=field,
                                               config=config)
            write_message("These match searching %s in %s: %s" %
                          (refnumber, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in iteritems(references_info['journals']):
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in iteritems(references_info['doi']):
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Try to find references based on 999C5a (hdl references)
    # e.g. 4263537/4000
    write_message("Phase 4: HDL references")
    done = 0
    for thisrecid, refs in iteritems(references_info['hdl']):
        step("HDL references", thisrecid, done, len(references_info['hdl']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'hdl'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' HDL value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t5 = os.times()[4]

    # Try to find references based on 999C50
    # e.g. 1244
    write_message("Phase 5: Record ID references")
    done = 0
    for thisrecid, refs in iteritems(references_info['record_id']):
        step("Record ID references", thisrecid, done,
             len(references_info['record_id']))
        done += 1
        field = "001"
        for recid in (r for r in refs if r):
            valid = get_recids_matching_query(p=recid, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (recid, field, list(valid)),
                          verbose=9)
            if valid:
                add_to_refs(thisrecid, valid[0])

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t6 = os.times()[4]

    # Try to find references based on 999C5i
    # e.g. 978-3-942171-73-1
    write_message("Phase 6: ISBN references")
    done = 0
    for thisrecid, refs in iteritems(references_info['isbn']):
        step("ISBN references", thisrecid, done, len(references_info['isbn']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'isbn'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)),
                          verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' ISBN value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t7 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 7: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in iteritems(records_info['report-numbers']):
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                                re.escape(std_reportcode)
                recids = get_recids_matching_query(
                    p=report_pattern,
                    f=tags['refs_report_number'],
                    m='r',
                    config=config)
            else:
                recids = get_recids_matching_query(
                    p=reportcode, f=tags['refs_report_number'], config=config)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 8: journals catchup")
    done = 0
    t8 = os.times()[4]
    for thisrecid, rec_journals in iteritems(records_info['journals']):
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = get_recids_matching_query(p=journal,
                                               f=tags['refs_journal'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 9: DOI catchup")
    done = 0
    t9 = os.times()[4]
    for thisrecid, dois in iteritems(records_info['doi']):
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            recids = get_recids_matching_query(p=doi,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (doi, tags['refs_doi'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 10: HDL catchup")
    done = 0
    t10 = os.times()[4]
    for thisrecid, hdls in iteritems(records_info['hdl']):
        step("HDL catchup", thisrecid, done, len(records_info['hdl']))
        done += 1

        for hdl in hdls:
            recids = get_recids_matching_query(p=hdl,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (hdl, tags['refs_doi'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 11: ISBN catchup")
    done = 0
    t11 = os.times()[4]
    for thisrecid, isbns in iteritems(records_info['isbn']):
        step("ISBN catchup", thisrecid, done, len(records_info['isbn']))
        done += 1

        for isbn in isbns:
            recids = get_recids_matching_query(p=isbn,
                                               f=tags['refs_isbn'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (isbn, tags['refs_isbn'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    write_message("Phase 12: Record ID catchup")
    done = 0
    t12 = os.times()[4]
    for thisrecid, record_ids in iteritems(records_info['record_id']):
        step("Record ID catchup", thisrecid, done,
             len(records_info['record_id']))
        done += 1

        for record_id in record_ids:
            recids = get_recids_matching_query(p=record_id,
                                               f=tags['refs_record_id'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (record_id, tags['refs_record_id'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(iteritems(citations), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(iteritems(references), 10)))
        write_message("size: %s" % len(references))

    t13 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "and generating the dictionaries:")
    write_message("... checking ref report numbers: %.2f sec" % (t2 - t1))
    write_message("... checking ref journals: %.2f sec" % (t3 - t2))
    write_message("... checking ref DOI: %.2f sec" % (t4 - t3))
    write_message("... checking ref HDL: %.2f sec" % (t5 - t4))
    write_message("... checking ref Record ID: %.2f sec" % (t6 - t5))
    write_message("... checking ref ISBN: %.2f sec" % (t7 - t6))
    write_message("... checking rec report numbers: %.2f sec" % (t8 - t7))
    write_message("... checking rec journals: %.2f sec" % (t9 - t8))
    write_message("... checking rec DOI: %.2f sec" % (t10 - t9))
    write_message("... checking rec HDL: %.2f sec" % (t11 - t10))
    write_message("... checking rec ISBN: %.2f sec" % (t12 - t11))
    write_message("... checking rec Record ID: %.2f sec" % (t13 - t12))
    write_message("... total time of ref_analyzer: %.2f sec" % (t13 - t1))

    return citations, references
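The add_to_cites / add_to_refs helpers above maintain two mirrored maps, citations (x is cited by y) and references (x cites y), while only fully tracking records in the updated set. A tiny standalone illustration of the invariant:

updated_recids = {1, 2}
citations = {recid: set() for recid in updated_recids}   # citee -> citers
references = {recid: set() for recid in updated_recids}  # citer -> citees

def add_to_refs(citer, citee):
    # Called while walking citer's own reference list.
    if citer == citee:  # guard against self-citation artefacts
        return
    if citee in updated_recids:
        citations[citee].add(citer)
    references[citer].add(citee)

add_to_refs(1, 2)  # record 1 cites record 2
add_to_refs(1, 1)  # ignored: self-reference
assert citations == {1: set(), 2: {1}}
assert references == {1: {2}, 2: set()}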
Example #57
def task_run_core():
    """ Walks through all directories where metadata files are located
        and uploads them.
        Files are then moved to the corresponding DONE folders.
    """
    daemon_dir = (CFG_BATCHUPLOADER_DAEMON_DIR
                  if CFG_BATCHUPLOADER_DAEMON_DIR.startswith('/')
                  else os.path.join(CFG_PREFIX, CFG_BATCHUPLOADER_DAEMON_DIR))
    # Metadata upload or document upload, depending on the --documents option
    if not task_get_option('documents'):
        # Metadata upload
        parent_dir = daemon_dir + "/metadata/"
        progress = 0
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        list_of_folders = [
            "insert", "append", "correct", "replace", "holdingpen"
        ]
        for folder in list_of_folders:
            files_dir = os.path.join(parent_dir, folder)
            files_done_dir = os.path.join(files_dir, "DONE")
            try:
                files = os.listdir(files_dir)
            except OSError as e:
                os.mkdir(files_dir)
                files = []
                write_message(e, sys.stderr)
                write_message("Created new folder %s" % (files_dir, ))
            # Create directory DONE/ if it doesn't exist
            try:
                os.mkdir(files_done_dir)
            except OSError:
                # Directory exists
                pass
            for metafile in files:
                if os.path.isfile(os.path.join(files_dir, metafile)):
                    # Create temporary file to be uploaded
                    (fd, filename) = tempfile.mkstemp(
                        prefix=metafile + "_" +
                        time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
                        dir=CFG_TMPSHAREDDIR)
                    shutil.copy(os.path.join(files_dir, metafile), filename)
                    # Send bibsched task
                    mode = "--" + folder
                    jobid = str(
                        task_low_level_submission('bibupload', 'batchupload',
                                                  mode, filename))
                    # Move file to done folder
                    filename = metafile + "_" + time.strftime(
                        "%Y%m%d%H%M%S", time.localtime()) + "_" + jobid
                    os.rename(os.path.join(files_dir, metafile),
                              os.path.join(files_done_dir, filename))
                    task_sleep_now_if_required(can_stop_too=True)
            progress += 1
            task_update_progress("Done %d out of %d." %
                                 (progress, len(list_of_folders)))
    else:
        # Documents upload
        parent_dir = daemon_dir + "/documents/"
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        matching_order = CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY
        for folder in ["append/", "revise/"]:
            try:
                os.mkdir(parent_dir + folder)
            except OSError:
                # Directory may already exist
                pass
            for matching in matching_order:
                errors = document_upload(folder=parent_dir + folder,
                                         matching=matching,
                                         mode=folder[:-1])[0]
                if not errors:
                    break  # All documents succeeded with that matching
                for error in errors:
                    write_message(
                        "File: %s - %s with matching %s" %
                        (error[0], error[1], matching), sys.stderr)
            task_sleep_now_if_required(can_stop_too=True)
    return 1
Example #58
def process_batch_job(batch_job_file):
    """ Processes a batch job description dictionary

    @param batch_job_file: a fullpath to a batch job file
    @type batch_job_file: string
    @return: 1 if the process was successful, 0 if not
    @rtype: int
    """
    from invenio.legacy.bibdocfile.cli import cli_fix_marc

    def upload_marcxml_file(marcxml):
        """ Creates a temporary marcxml file and sends it to bibupload
        """
        xml_filename = 'bibencode_'+ str(batch_job['recid']) + '_' + str(uuid.uuid4()) + '.xml'
        xml_filename = os.path.join(invenio.config.CFG_TMPSHAREDDIR, xml_filename)
        xml_file = open(xml_filename, 'w')
        xml_file.write(marcxml)
        xml_file.close()
        targs = ['-c', xml_filename]
        task_low_level_submission('bibupload', 'bibencode', *targs)

    #---------#
    # GENERAL #
    #---------#

    _task_write_message("----------- Handling Master -----------")

    ## Check the validity of the batch file here
    batch_job = json_decode_file(batch_job_file)

    ## Sanitise batch description and raise errors
    batch_job = sanitise_batch_job(batch_job)

    ## Check if the record exists
    # if record_exists(batch_job['recid']) < 1:
    #     raise Exception("Record not found")

    recdoc = BibRecDocs(batch_job['recid'])

    #--------------------#
    # UPDATE FROM MASTER #
    #--------------------#

    ## We want to add new stuff to the video's record, using the master as input
    if getval(batch_job, 'update_from_master'):
        found_master = False
        bibdocs = recdoc.list_bibdocs()
        for bibdoc in bibdocs:
            bibdocfiles = bibdoc.list_all_files()
            for bibdocfile in bibdocfiles:
                comment = bibdocfile.get_comment()
                description = bibdocfile.get_description()
                subformat = bibdocfile.get_subformat()
                m_comment = getval(batch_job, 'bibdoc_master_comment', comment)
                m_description = getval(batch_job, 'bibdoc_master_description', description)
                m_subformat = getval(batch_job, 'bibdoc_master_subformat', subformat)
                if (comment == m_comment and
                    description == m_description and
                    subformat == m_subformat):
                    found_master = True
                    batch_job['input'] = bibdocfile.get_full_path()
                    ## Get the aspect ratio from the record
                    try:
                        ## Assumes pbcore metadata mapping
                        batch_job['aspect'] = get_fieldvalues(batch_job['recid'], CFG_BIBENCODE_ASPECT_RATIO_MARC_FIELD)[0]
                    except IndexError:
                        pass
                    break
            if found_master:
                break
        if not found_master:
            _task_write_message("Video master for record %d not found"
                          % batch_job['recid'])
            task_update_progress("Video master for record %d not found"
                                 % batch_job['recid'])
            ## Maybe send an email?
            return 1

    ## Clean the job to do no upscaling etc
    if getval(batch_job, 'assure_quality'):
        batch_job = clean_job_for_quality(batch_job)

    global _BATCH_STEPS
    _BATCH_STEPS = len(batch_job['jobs'])

    ## Generate the docname from the input filename's name or given name
    bibdoc_video_docname, bibdoc_video_extension = decompose_file(batch_job['input'])[1:]
    if not bibdoc_video_extension or getval(batch_job, 'bibdoc_master_extension'):
        bibdoc_video_extension = getval(batch_job, 'bibdoc_master_extension')
    if getval(batch_job, 'bibdoc_master_docname'):
        bibdoc_video_docname = getval(batch_job, 'bibdoc_master_docname')

    write_message("Creating BibDoc for %s" % bibdoc_video_docname)
    ## If the bibdoc exists, receive it
    if bibdoc_video_docname in recdoc.get_bibdoc_names():
        bibdoc_video = recdoc.get_bibdoc(bibdoc_video_docname)
    ## Create a new bibdoc if it does not exist
    else:
        bibdoc_video = recdoc.add_bibdoc(docname=bibdoc_video_docname)

    ## Get the directory of the newly created bibdoc to copy stuff there
    bibdoc_video_directory = bibdoc_video.get_base_dir()

    #--------#
    # MASTER #
    #--------#
    if not getval(batch_job, 'update_from_master'):
        if getval(batch_job, 'add_master'):
            ## Generate the right name for the master
            ## The master should be hidden first and then renamed
            ## when it is really available
            ## !!! FIX !!!
            _task_write_message("Adding %s master to the BibDoc"
                          % bibdoc_video_docname)
            master_format = compose_format(
                                    bibdoc_video_extension,
                                    getval(batch_job, 'bibdoc_master_subformat', 'master')
                                    )
            ## If a file of the same format is there, something is wrong, remove it!
            ## it might be caused by a previous corrupted submission etc.
            if bibdoc_video.format_already_exists_p(master_format):
                bibdoc_video.delete_file(master_format, 1)
            bibdoc_video.add_file_new_format(
                    batch_job['input'],
                    version=1,
                    description=getval(batch_job, 'bibdoc_master_description'),
                    comment=getval(batch_job, 'bibdoc_master_comment'),
                    docformat=master_format
                    )

    #-----------#
    # JOBS LOOP #
    #-----------#

    return_code = 1
    global _BATCH_STEP

    for job in batch_job['jobs']:

        _task_write_message("----------- Job %s of %s -----------"
                           % (_BATCH_STEP, _BATCH_STEPS))

        ## Try to substitute docname with master docname
        if getval(job, 'bibdoc_docname'):
            job['bibdoc_docname'] = Template(job['bibdoc_docname']).safe_substitute({'bibdoc_master_docname': bibdoc_video_docname})

        #-------------#
        # TRANSCODING #
        #-------------#

        if job['mode'] == 'encode':

            ## Skip the job if assure_quality is not set and marked as fallback
            if not getval(batch_job, 'assure_quality') and getval(job, 'fallback'):
                continue

            if getval(job, 'profile'):
                profile = get_encoding_profile(job['profile'])
            else:
                profile = None
            ## We need an extension defined for the video container
            bibdoc_video_extension = getval(job, 'extension',
                                            getval(profile, 'extension'))
            if not bibdoc_video_extension:
                raise Exception("No container/extension defined")
            ## Get the docname and subformat
            bibdoc_video_subformat = getval(job, 'bibdoc_subformat')
            bibdoc_slave_video_docname = getval(job, 'bibdoc_docname', bibdoc_video_docname)
            ## The subformat is incompatible with ffmpeg's naming convention
            ## We do the encoding without and rename it afterwards
            bibdoc_video_fullpath = compose_file(
                                                 bibdoc_video_directory,
                                                 bibdoc_video_extension
                                                 )
            _task_write_message("Transcoding %s to %s;%s" % (bibdoc_slave_video_docname,
                                bibdoc_video_extension,
                                bibdoc_video_subformat))
            ## We encode now directly into the bibdocs directory
            encoding_result = encode_video(
                 input_file=batch_job['input'],
                 output_file=bibdoc_video_fullpath,
                 acodec=getval(job, 'audiocodec'),
                 vcodec=getval(job, 'videocodec'),
                 abitrate=getval(job, 'audiobitrate'),
                 vbitrate=getval(job, 'videobitrate'),
                 resolution=getval(job, 'resolution'),
                 passes=getval(job, 'passes', 1),
                 special=getval(job, 'special'),
                 specialfirst=getval(job, 'specialfirst'),
                 specialsecond=getval(job, 'specialsecond'),
                 metadata=getval(job, 'metadata'),
                 width=getval(job, 'width'),
                 height=getval(job, 'height'),
                 aspect=getval(batch_job, 'aspect'), # Aspect for every job
                 profile=getval(job, 'profile'),
                 update_fnc=_task_update_overall_status,
                 message_fnc=_task_write_message
                 )
            return_code &= encoding_result
            ## only on success
            if encoding_result:
                ## Rename it, adding the subformat
                os.rename(bibdoc_video_fullpath,
                          compose_file(bibdoc_video_directory,
                                       bibdoc_video_extension,
                                       bibdoc_video_subformat,
                                       1,
                                       bibdoc_slave_video_docname)
                          )
                #bibdoc_video._build_file_list()
                bibdoc_video.touch()
                bibdoc_video._sync_to_db()
                bibdoc_video_format = compose_format(bibdoc_video_extension,
                                                     bibdoc_video_subformat)
                if getval(job, 'bibdoc_comment'):
                    bibdoc_video.set_comment(getval(job, 'bibdoc_comment'),
                                              bibdoc_video_format)
                if getval(job, 'bibdoc_description'):
                    bibdoc_video.set_description(getval(job, 'bibdoc_description'),
                                                 bibdoc_video_format)

        #------------#
        # EXTRACTION #
        #------------#

        # if there are multiple extraction jobs, all the produced files
        # with the same name will be in the same bibdoc! Make sure that
        # you use different subformats or docname templates to avoid
        # conflicts.

        if job['mode'] == 'extract':
            if getval(job, 'profile'):
                profile = get_extract_profile(job['profile'])
            else:
                profile = {}
            bibdoc_frame_subformat = getval(job, 'bibdoc_subformat')
            _task_write_message("Extracting frames to temporary directory")
            tmpdir = invenio.config.CFG_TMPDIR + "/" + str(uuid.uuid4())
            os.mkdir(tmpdir)
            # Move this to the batch description
            bibdoc_frame_docname = getval(job, 'bibdoc_docname', bibdoc_video_docname)
            tmpfname = (tmpdir + "/" + bibdoc_frame_docname + '.'
                        + getval(profile, 'extension',
                        getval(job, 'extension', 'jpg')))
            extraction_result = extract_frames(input_file=batch_job['input'],
                           output_file=tmpfname,
                           size=getval(job, 'size'),
                           positions=getval(job, 'positions'),
                           numberof=getval(job, 'numberof'),
                           width=getval(job, 'width'),
                           height=getval(job, 'height'),
                           aspect=getval(batch_job, 'aspect'),
                           profile=getval(job, 'profile'),
                           update_fnc=_task_update_overall_status,
                           )
            return_code &= extraction_result

            ## only on success:
            if extraction_result:
                ## For every filename in the temporary directory, create a bibdoc
                ## that contains all sizes of the extracted frame
                files = os.listdir(tmpdir)
                for filename in files:
                    ## The docname was altered by BibEncode extract through substitution
                    ## Retrieve it from the filename again
                    bibdoc_frame_docname, bibdoc_frame_extension = os.path.splitext(filename)
                    _task_write_message("Creating new bibdoc for %s" % bibdoc_frame_docname)
                    ## If the bibdoc exists, receive it
                    if bibdoc_frame_docname in recdoc.get_bibdoc_names():
                        bibdoc_frame = recdoc.get_bibdoc(bibdoc_frame_docname)
                    ## Create a new bibdoc if it does not exist
                    else:
                        bibdoc_frame = recdoc.add_bibdoc(docname=bibdoc_frame_docname)

                    ## The filename including path from tmpdir
                    fname = os.path.join(tmpdir, filename)

                    bibdoc_frame_format = compose_format(bibdoc_frame_extension, bibdoc_frame_subformat)
                    ## Same as with the master: if the format already exists,
                    ## override it, because something went wrong before
                    if bibdoc_frame.format_already_exists_p(bibdoc_frame_format):
                        bibdoc_frame.delete_file(bibdoc_frame_format, 1)
                    _task_write_message("Adding %s jpg;%s to BibDoc"
                                  % (bibdoc_frame_docname,
                                     getval(job, 'bibdoc_subformat')))
                    bibdoc_frame.add_file_new_format(
                                    fname,
                                    version=1,
                                    description=getval(job, 'bibdoc_description'),
                                    comment=getval(job, 'bibdoc_comment'),
                                    docformat=bibdoc_frame_format)
            ## Remove the temporary folders
            _task_write_message("Removing temporary directory")
            shutil.rmtree(tmpdir)

        _BATCH_STEP = _BATCH_STEP + 1

    #-----------------#
    # FIX BIBDOC/MARC #
    #-----------------#

    _task_write_message("----------- Handling MARCXML -----------")

    ## Fix the BibDoc for all the videos previously created
    _task_write_message("Updating BibDoc of %s" % bibdoc_video_docname)
    bibdoc_video._build_file_list()

    ## Fix the MARC
    _task_write_message("Fixing MARC")
    cli_fix_marc({}, [batch_job['recid']], False)

    if getval(batch_job, 'collection'):
        ## Make the record visible by moving it into the given collection
        marcxml = ("<record><controlfield tag=\"001\">%d</controlfield>"
                   "<datafield tag=\"980\" ind1=\" \" ind2=\" \">"
                   "<subfield code=\"a\">%s</subfield></datafield></record>"
                   ) % (batch_job['recid'], batch_job['collection'])
        upload_marcxml_file(marcxml)

    #---------------------#
    # ADD MASTER METADATA #
    #---------------------#

    if getval(batch_job, 'add_master_metadata'):
        _task_write_message("Adding master metadata")
        pbcore = pbcore_metadata(input_file=getval(batch_job, 'input'),
                                 pbcoreIdentifier=batch_job['recid'],
                                 aspect_override=getval(batch_job, 'aspect'))
        ## Local import aliased so it does not shadow the builtin format()
        from invenio_formatter.engines.xslt import format as xslt_format
        marcxml = xslt_format(pbcore, CFG_BIBENCODE_PBCORE_MARC_XSLT)
        upload_marcxml_file(marcxml)

    #------------------#
    # ADD MARC SNIPPET #
    #------------------#

    if getval(batch_job, 'marc_snippet'):
        with open(getval(batch_job, 'marc_snippet')) as marc_snippet:
            marcxml = marc_snippet.read()
        upload_marcxml_file(marcxml)

    #--------------#
    # DELETE INPUT #
    #--------------#

    if getval(batch_job, 'delete_input'):
        _task_write_message("Deleting input file")
        # only if the whole batch was successful
        if return_code:
            # only if the input path matches the given pattern
            if getval(batch_job, 'delete_input_pattern', '') in getval(batch_job, 'input'):
                try:
                    os.remove(getval(batch_job, 'input'))
                except OSError:
                    pass

    #--------------#
    # NOTIFICATION #
    #--------------#

    ## Send Notification emails on errors
    if not return_code:
        if getval(batch_job, 'notify_user'):
            _notify_error_user(getval(batch_job, 'notify_user'),
                               getval(batch_job, 'submission_filename', batch_job['input']),
                               getval(batch_job, 'recid'),
                               getval(batch_job, 'submission_title', ""))
            _task_write_message("Notify user because of an error")
        if getval(batch_job, 'notify_admin'):
            _task_write_message("Notify admin because of an error")
            if type(getval(batch_job, 'notify_admin') == type(str()) ):
                _notify_error_admin(batch_job,
                                    getval(batch_job, 'notify_admin'))

            else:
                _notify_error_admin(batch_job)
    else:
        if getval(batch_job, 'notify_user'):
            _task_write_message("Notify user because of success")
            _notify_success_user(getval(batch_job, 'notify_user'),
                                 getval(batch_job, 'submission_filename', batch_job['input']),
                                 getval(batch_job, 'recid'),
                                 getval(batch_job, 'submission_title', ""))
    return 1
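Note: the getval helper used throughout the batch engine above behaves like
a tolerant dictionary accessor with an optional default. A minimal sketch of
such a helper (illustrative only; the real BibEncode utility may differ in
details):

def getval(dictionary, key, default=None):
    """Return dictionary[key] if the key exists, else the given default."""
    return dictionary.get(key, default)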
Exemple #59
0
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    host = CFG_DATABASE_HOST
    port = CFG_DATABASE_PORT
    connection = None
    active_queues = []
    try:
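        ## Two-phase slave dump: the first run detaches the replication
        ## slave and re-submits this task with --dump-on-slave-helper; the
        ## helper run then performs the actual dump against the slave.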
        if task_get_option('slave') and not task_get_option('dump_on_slave_helper_mode'):
            connection = get_connection_for_dump_on_slave()
            write_message("Dump on slave requested")
            write_message("... checking if slave is well up...")
            check_slave_is_up(connection)
            write_message("... checking if slave is in consistent state...")
            check_slave_is_in_consistent_state(connection)
            write_message("... detaching slave database...")
            detach_slave(connection)
            write_message("... scheduling dump on slave helper...")
            helper_arguments = []
            if task_get_option("number"):
                helper_arguments += ["--number", str(task_get_option("number"))]
            if task_get_option("output"):
                helper_arguments += ["--output", str(task_get_option("output"))]
            if task_get_option("params"):
                helper_arguments += ["--params", str(task_get_option("params"))]
            if task_get_option("ignore_tables"):
                helper_arguments += ["--ignore-tables", str(task_get_option("ignore_tables"))]
            if task_get_option("compress"):
                helper_arguments += ["--compress"]
            if task_get_option("slave"):
                helper_arguments += ["--slave", str(task_get_option("slave"))]
            helper_arguments += ['-N', 'slavehelper', '--dump-on-slave-helper']
            task_id = task_low_level_submission('dbdump', task_get_task_param('user'), '-P4', *helper_arguments)
            write_message("Slave scheduled with ID %s" % task_id)
            task_update_progress("DONE")
            return True
        elif task_get_option('dump_on_slave_helper_mode'):
            write_message("Dumping on slave mode")
            connection = get_connection_for_dump_on_slave()
            write_message("... checking if slave is well down...")
            check_slave_is_down(connection)
            host = CFG_DATABASE_SLAVE

        task_update_progress("Reading parameters")
        write_message("Reading parameters started")
        output_dir = task_get_option('output', CFG_LOGDIR)
        output_num = task_get_option('number', 5)
        params = task_get_option('params', None)
        compress = task_get_option('compress', False)
        slave = task_get_option('slave', False)
        ignore_tables = task_get_option('ignore_tables', None)
        if ignore_tables:
            ignore_tables = get_table_names(ignore_tables)
        else:
            ignore_tables = None

        output_file_suffix = task_get_task_param('task_starting_time')
        output_file_suffix = output_file_suffix.replace(' ', '_') + '.sql'
        if compress:
            output_file_suffix = "%s.gz" % (output_file_suffix,)
        write_message("Reading parameters ended")

        if task_get_option('disable_workers'):
            active_queues = get_queues()
            if active_queues:
                write_message("Suspend workers and wait for any running tasks to complete")
                suspend_queues(active_queues)
                write_message("Workers suspended")

        # make dump:
        task_update_progress("Dumping database")
        write_message("Database dump started")

        if slave:
            output_file_prefix = 'slave-%s-dbdump-' % (CFG_DATABASE_NAME,)
        else:
            output_file_prefix = '%s-dbdump-' % (CFG_DATABASE_NAME,)
        output_file = output_file_prefix + output_file_suffix
        dump_path = os.path.join(output_dir, output_file)
        dump_database(dump_path,
                      host=host,
                      port=port,
                      params=params,
                      compress=compress,
                      ignore_tables=ignore_tables)
        write_message("Database dump ended")
    finally:
        for queue in active_queues:
            enable_queue(queue)
        if connection and task_get_option('dump_on_slave_helper_mode'):
            write_message("Reattaching slave")
            attach_slave(connection)
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_file_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
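Note: _delete_old_dumps above prunes previous dump files so that only the
newest ones remain. A minimal sketch of such a pruning helper (illustrative
only; it assumes lexicographic filename order matches chronological order,
which holds for the timestamp suffix built above):

import os

def _delete_old_dumps(dirname, filename_prefix, number_to_keep):
    """Remove dump files in dirname that start with filename_prefix,
    keeping only the newest number_to_keep of them."""
    candidates = sorted(fname for fname in os.listdir(dirname)
                        if fname.startswith(filename_prefix))
    victims = candidates[:-number_to_keep] if number_to_keep > 0 else candidates
    for fname in victims:
        os.remove(os.path.join(dirname, fname))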