def fetch_index_update():
    """Return the date up to which records can safely be processed.

    This is the last-update value of the 'citation' rank method.  When
    CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID is enabled, the value returned by
    fetch_bibauthorid_last_update() is taken into account as well and the
    smaller of the two is returned.
    """
    citation_date = get_bibrankmethod_lastupdate('citation')

    if not CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID:
        return citation_date

    # Both sources must have caught up, hence the minimum of the two.
    return min(citation_date, fetch_bibauthorid_last_update())
Exemple #2
0
def fetch_index_update():
    """Return the date up to which records can safely be processed.

    This is the last-update value of the 'citation' rank method.  When
    CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID is enabled, the value returned by
    fetch_bibauthorid_last_update() is taken into account as well and the
    smaller of the two is returned.
    """
    citation_date = get_bibrankmethod_lastupdate('citation')

    if not CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID:
        return citation_date

    # Both sources must have caught up, hence the minimum of the two.
    return min(citation_date, fetch_bibauthorid_last_update())
def fetch_concerned_records(name, ids_param):
    """Select the records to process, either explicitly or by date window.

    @param name: rank method name whose last-update value opens the date
        window (only used when ids_param is empty)
    @param ids_param: iterable of (first, last) inclusive ID ranges; when
        truthy it takes precedence over the date window
    @return: tuple (recids, end_date); end_date is None when the records
        were selected through ids_param
    """
    if not ids_param:
        # No explicit IDs: delegate to fetch_records() for the window
        # between the method's last update and the index update date.
        window_start = get_bibrankmethod_lastupdate(name)
        window_end = fetch_index_update()
        return fetch_records(window_start, window_end), window_end

    selected = intbitset()
    for first, last in ids_param:
        # Ranges are inclusive on both ends.
        selected += range(first, last + 1)
    return selected, None
Exemple #4
0
def fetch_concerned_records(name, ids_param):
    """Select the records to process, either explicitly or by date window.

    @param name: rank method name whose last-update value opens the date
        window (only used when ids_param is empty)
    @param ids_param: iterable of (first, last) inclusive ID ranges; when
        truthy it takes precedence over the date window
    @return: tuple (recids, end_date); end_date is None when the records
        were selected through ids_param
    """
    if not ids_param:
        # No explicit IDs: delegate to fetch_records() for the window
        # between the method's last update and the index update date.
        window_start = get_bibrankmethod_lastupdate(name)
        window_end = fetch_index_update()
        return fetch_records(window_start, window_end), window_end

    selected = intbitset()
    for first, last in ids_param:
        # Ranges are inclusive on both ends.
        selected += range(first, last + 1)
    return selected, None
def fetch_concerned_records(name):
    """Return fetch_records() for the window since `name` was last updated.

    The window starts at the last-update value of rank method `name` and
    ends at the date returned by fetch_index_update().
    """
    return fetch_records(get_bibrankmethod_lastupdate(name),
                         fetch_index_update())
Exemple #6
0
def bibreformat_task(fmt, recids, without_fmt, process):
    """Run one BibReformat pass for a single output format.

    @param fmt: output format to use; for "HDREF" the set of records is
        extended with the result of get_cited_by() for some of the input
        records (see related_records below)
    @param recids: iterable of record IDs to reformat
    @param without_fmt: only used for reporting; when truthy its length is
        logged as the number of records without existing cache
    @param process: when false the formatting step is skipped and only the
        bookkeeping and statistics are performed
    @return: None
    """
    write_message("Processing format %s" % fmt)

    # os.times()[4] is the elapsed real time; sampled again at the end.
    started_at = os.times()[4]
    run_stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    citation_run = get_bibrankmethod_lastupdate('citation')

    def related_records(batch, already_done):
        """Extend `batch` in place (HDREF only) and drop handled IDs.

        Both arguments are modified in place: IDs present in
        `already_done` are removed from `batch`, and what remains is then
        recorded in `already_done`.
        """
        if fmt == "HDREF" and batch:
            # HDREF is the references tab: it has to be rebuilt when the
            # record changes and also when one of its citations changes.
            id_list = ','.join(str(recid) for recid in batch)
            query = """SELECT id, modification_date FROM bibrec
                     WHERE id in (%s)""" % id_list
            # Timestamps are compared as "%Y-%m-%d %H:%M:%S" strings.
            older = intbitset([
                recid for recid, modified in run_sql(query)
                if modified.strftime("%Y-%m-%d %H:%M:%S") < citation_run
            ])
            for recid in older:
                batch |= intbitset(get_cited_by(recid))

        # Never hand out the same record twice.
        batch -= already_done
        already_done += batch
        return batch

    def recid_chunker(candidates):
        """Yield the record IDs to process, gathered in batches of 5000."""
        already_done = intbitset()
        batch = intbitset()

        for candidate in candidates:
            if len(batch) == 5000:
                for ready in related_records(batch, already_done):
                    yield ready
                already_done += batch
                batch = intbitset()

            if candidate not in already_done:
                batch.add(candidate)

        # Flush the last, possibly partial, batch.
        if batch:
            for ready in related_records(batch, already_done):
                yield ready

    recIDs = list(recid_chunker(recids))

    # The list of record IDs is complete; report before formatting.
    write_message("Records to be processed: %d" % len(recIDs))
    if without_fmt:
        write_message("Out of it records without existing cache: %d" %
                      len(without_fmt))

    # Counters reported in the final statistics.
    total_rec = 0   # total number of records
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    if process:
        done, format_time, upload_time = iterate_over_new(recIDs, fmt)
        total_rec += done
        tbibformat += format_time
        tbibupload += upload_time

    # Store last run time (the moment this task started).
    if task_has_option("last"):
        write_message("storing run date to %s" % run_stamp)
        store_last_updated(fmt, run_stamp)

    # Final statistics.
    elapsed = os.times()[4] - started_at
    write_message("total records processed: %d" % total_rec)
    write_message("total processing time: %2f sec" % elapsed)
    write_message("Time spent on external call (os.system):")
    write_message(" bibformat: %2f sec" % tbibformat)
    write_message(" bibupload: %2f sec" % tbibupload)
Exemple #7
0
def bibreformat_task(fmt, recids, without_fmt, process):
    """Run one BibReformat pass for a single output format.

    @param fmt: output format to use; for "HDREF" the set of records is
        extended with the result of get_cited_by() for some of the input
        records (see related_records below)
    @param recids: iterable of record IDs to reformat
    @param without_fmt: only used for reporting; when truthy its length is
        logged as the number of records without existing cache
    @param process: when false the formatting step is skipped and only the
        bookkeeping and statistics are performed
    @return: None
    """
    write_message("Processing format %s" % fmt)

    # os.times()[4] is the elapsed real time; sampled again at the end.
    started_at = os.times()[4]
    run_stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    citation_run = get_bibrankmethod_lastupdate('citation')

    def related_records(batch, already_done):
        """Extend `batch` in place (HDREF only) and drop handled IDs.

        Both arguments are modified in place: IDs present in
        `already_done` are removed from `batch`, and what remains is then
        recorded in `already_done`.
        """
        if fmt == "HDREF" and batch:
            # HDREF is the references tab: it has to be rebuilt when the
            # record changes and also when one of its citations changes.
            id_list = ','.join(str(recid) for recid in batch)
            query = """SELECT id, modification_date FROM bibrec
                     WHERE id in (%s)""" % id_list
            # Timestamps are compared as "%Y-%m-%d %H:%M:%S" strings.
            older = intbitset([
                recid for recid, modified in run_sql(query)
                if modified.strftime("%Y-%m-%d %H:%M:%S") < citation_run
            ])
            for recid in older:
                batch |= intbitset(get_cited_by(recid))

        # Never hand out the same record twice.
        batch -= already_done
        already_done += batch
        return batch

    def recid_chunker(candidates):
        """Yield the record IDs to process, gathered in batches of 5000."""
        already_done = intbitset()
        batch = intbitset()

        for candidate in candidates:
            if len(batch) == 5000:
                for ready in related_records(batch, already_done):
                    yield ready
                already_done += batch
                batch = intbitset()

            if candidate not in already_done:
                batch.add(candidate)

        # Flush the last, possibly partial, batch.
        if batch:
            for ready in related_records(batch, already_done):
                yield ready

    recIDs = list(recid_chunker(recids))

    # The list of record IDs is complete; report before formatting.
    write_message("Records to be processed: %d" % len(recIDs))
    if without_fmt:
        write_message("Out of it records without existing cache: %d" %
                      len(without_fmt))

    # Counters reported in the final statistics.
    total_rec = 0   # total number of records
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    if process:
        done, format_time, upload_time = iterate_over_new(recIDs, fmt)
        total_rec += done
        tbibformat += format_time
        tbibupload += upload_time

    # Store last run time (the moment this task started).
    if task_has_option("last"):
        write_message("storing run date to %s" % run_stamp)
        store_last_updated(fmt, run_stamp)

    # Final statistics.
    elapsed = os.times()[4] - started_at
    write_message("total records processed: %d" % total_rec)
    write_message("total processing time: %2f sec" % elapsed)
    write_message("Time spent on external call (os.system):")
    write_message(" bibformat: %2f sec" % tbibformat)
    write_message(" bibupload: %2f sec" % tbibupload)
Exemple #8
0
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process, recids):
    """
    BibReformat main task

    Gathers the record IDs to reformat (explicit IDs, CDS search query,
    raw SQL queries and, for HDREF, citing records), runs the formatter
    over them and logs timing statistics.

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases (for selecting records). It is handed to without_fmt() when process_format is set
    @param sql_queries: a list of sql queries to be executed to select records to reformat.
    @param cds_query: a search query to be executed to select records to reformat; the keys 'field', 'collection', 'pattern' and 'matching' are read
    @param process_format: when true, records without cache are fetched through without_fmt(sql) and processed as a second list
    @param process: when false, nothing is formatted and only the counts and statistics are logged
    @param recids: record IDs to reformat; copied, never modified in place
    @return: None
    """
    t1 = os.times()[4]

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    # Work on a private copy: the in-place unions below must not leak into
    # the set object owned by the caller.
    recIDs = intbitset(recids)

    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None, of='id',
                                                   c=cds_query['collection'],
                                                   p=cds_query['pattern'],
                                                   f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    # Skip the HDREF step for an empty set: "WHERE id in ()" is not valid
    # SQL and would make run_sql() fail.
    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        # Separate name so the `sql` parameter is not shadowed.
        date_query = """SELECT id, modification_date FROM bibrec
                 WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        def check_date(mod_date):
            # NOTE(review): assumes both values are of comparable types
            # -- confirm against get_bibrankmethod_lastupdate().
            return mod_date < latest_bibrank_run
        # Only records last modified before the latest citation run are kept.
        recIDs = intbitset([recid for recid, mod_date in run_sql(date_query)
                            if check_date(mod_date)])
        # Iterate over a snapshot: growing a set while looping over it makes
        # the result depend on the iterator implementation.
        for r in intbitset(recIDs):
            recIDs |= intbitset(get_cited_by(r))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs)
                                                       + len(without_format)))
        write_message("Out of it records without existing cache: %d" % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop

    total_rec = 0   # Total number of records
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    if process:
        # FIXME: remove this when migration from php to python bibformat
        # is done
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:
            iterate = iterate_over_old
        else:
            iterate = iterate_over_new

        ### Iterate over all records prepared in lists I (option)
        (total_rec_1, tbibformat_1, tbibupload_1) = iterate(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

        ### Iterate over all records prepared in list II (no_format)
        if process_format:
            (total_rec_2, tbibformat_2, tbibupload_2) = iterate(without_format,
                                                                fmt)
            total_rec += total_rec_2
            tbibformat += tbibformat_2
            tbibupload += tbibupload_2

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process,
                     recids):
    """
    BibReformat main task

    Gathers the record IDs to reformat (explicit IDs, CDS search query,
    raw SQL queries and, for HDREF, citing records), runs the formatter
    over them, optionally stores the run date and logs timing statistics.

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases (for selecting records). It is handed to without_fmt() when process_format is set
    @param sql_queries: a list of sql queries to be executed to select records to reformat.
    @param cds_query: a search query to be executed to select records to reformat; the keys 'field', 'collection', 'pattern' and 'matching' are read
    @param process_format: when true, records without cache are fetched through without_fmt(sql) and processed as a second list
    @param process: when false, nothing is formatted and only the counts and statistics are logged
    @param recids: a list of record IDs to reformat; copied into a new intbitset
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now()

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(
                perform_request_search(req=None,
                                       of='id',
                                       c=cds_query['collection'],
                                       p=cds_query['pattern'],
                                       f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        # For HDREF the stored run date is the latest citation run, not now.
        # NOTE(review): presumably so that records dropped by the date
        # filter below are picked up by the next run -- confirm.
        start_date = latest_bibrank_run
        # Separate name so the `sql` parameter is not shadowed.
        date_query = """SELECT id, modification_date FROM bibrec
                 WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        def check_date(mod_date):
            # NOTE(review): assumes both values are of comparable types
            # -- confirm against get_bibrankmethod_lastupdate().
            return mod_date < latest_bibrank_run
        # Only records last modified before the latest citation run are kept.
        recIDs = intbitset([recid for recid, mod_date in run_sql(date_query)
                            if check_date(mod_date)])
        # Iterate over a snapshot: growing a set while looping over it makes
        # the result depend on the iterator implementation.
        for r in intbitset(recIDs):
            recIDs |= intbitset(get_cited_by(r))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs)
                                                       + len(without_format)))
        write_message("Out of it records without existing cache: %d" %
                      len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop

    total_rec = 0  # Total number of records
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    if process:
        # FIXME: remove this when migration from php to python bibformat
        # is done
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:
            iterate = iterate_over_old
        else:
            iterate = iterate_over_new

        ### Iterate over all records prepared in lists I (option)
        (total_rec_1, tbibformat_1,
         tbibupload_1) = iterate(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

        ### Iterate over all records prepared in list II (no_format)
        if process_format:
            (total_rec_2, tbibformat_2,
             tbibupload_2) = iterate(without_format, fmt)
            total_rec += total_rec_2
            tbibformat += tbibformat_2
            tbibupload += tbibupload_2

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
Exemple #10
0
def fetch_concerned_records(name):
    """Return fetch_records() for the window since `name` was last updated.

    The window starts at the last-update value of rank method `name` and
    ends at the date returned by fetch_index_update().
    """
    return fetch_records(get_bibrankmethod_lastupdate(name),
                         fetch_index_update())

def wait_for_task(task_id):
    """Block until the schTASK row with id `task_id` has status 'DONE'.

    The status is polled every 5 seconds.  Only 'DONE' ends the loop; any
    other status value keeps the function waiting.
    """
    status_query = 'SELECT status FROM schTASK WHERE id = %s'
    while True:
        rows = run_sql(status_query, [task_id])
        if rows[0][0] == 'DONE':
            return
        time.sleep(5)


def submit(recids):
    """Submit a bibreformat task for `recids` and wait until it is done.

    The record IDs are passed to the task as a comma-separated '-i' value;
    the output format comes from the module-level FORMAT.
    """
    # Single-argument parenthesised form prints the same text on Python 2.
    print('submitting %s' % str(recids))
    id_list = ','.join(str(recid) for recid in recids)
    task_id = task_low_level_submission(
        'bibreformat', 'catchup-doi', '-o', FORMAT, '-P', '5', '-i', id_list)
    wait_for_task(task_id)


max_id = run_sql("SELECT max(id) FROM bibrec")[0][0]
latest_bibrank_run = get_bibrankmethod_lastupdate('citation')


recids = xrange(1, max_id + 1)
to_update = []

for recid in recids:
    if recid % 50 == 0:
        print '%s of %s' % (recid, max_id)

    ret = run_sql('SELECT id FROM bibrec WHERE id = %s', [recid])
    if not ret:
        continue

    ret = run_sql('SELECT id_bibrec FROM bibfmt WHERE format = %s AND id_bibrec = %s', [FORMAT, recid])
    if not ret: