Example no. 1
def bst_prodsync(method='afs', with_citations='yes', with_claims='yes', skip_collections=''):
    """
    Synchronize to either 'afs' or 'redis'

    with_citations: yes/no, whether records involved in newly updated citations need to be re-exported.
    with_claims: yes/no, whether records involved in new claims need to be re-exported.
    skip_collections: comma-separated list of values; records having 980:VALUE are ignored,
        e.g. skip_collections='HEP,HEPNAMES,HEPHIDDEN'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'

    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date>=%s", (last_run, )))
            compacttime = last_run.replace('-', '').replace(' ', '').replace(':', '')
            notimechangerecs = search_unit("%s->20250101000000" % compacttime, f='005', m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                for citee, citer in run_sql("SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s", (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                modified_records |= intbitset(run_sql("SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s", (last_run, )))
                modified_records |= intbitset(run_sql('SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d'
                                                      ' ON p.personid = d.personid WHERE d.tag = "canonical_name" and d.last_updated>=%s', (last_run, )))
    except IOError:
        # Default to everything
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    # Split and drop empty entries; list.remove('') would raise
    # ValueError whenever skip_collections is non-empty.
    skip_collections = [coll for coll in skip_collections.split(',') if coll]
    for collection in skip_collections:
        modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        open(lastrun_path, "w").write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            open(lastrun_path, "w").write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
Example no. 2
def create(default_data=True):
    """Creates database tables from sqlalchemy models"""

    print ">>> Going to create tables..."

    from sqlalchemy import event
    from invenio.dateutils import get_time_estimator
    from invenio.inveniocfg import test_db_connection
    from invenio.sqlalchemyutils import db, autodiscover_models

    test_db_connection()
    autodiscover_models()

    def cfv_after_create(target, connection, **kw):
        print
        print ">>> Modifing table structure..."
        from invenio.dbquery import run_sql
        run_sql('ALTER TABLE collection_field_fieldvalue DROP PRIMARY KEY')
        run_sql(
            'ALTER TABLE collection_field_fieldvalue ADD INDEX id_collection(id_collection)'
        )
        run_sql(
            'ALTER TABLE collection_field_fieldvalue CHANGE id_fieldvalue id_fieldvalue mediumint(9) unsigned'
        )
        #print run_sql('SHOW CREATE TABLE collection_field_fieldvalue')

    from invenio.websearch_model import CollectionFieldFieldvalue
    event.listen(CollectionFieldFieldvalue.__table__, "after_create",
                 cfv_after_create)

    tables = db.metadata.sorted_tables
    N = len(tables)

    prefix = '>>> Creating %d tables ...' % N

    e = get_time_estimator(N)
    created = 0

    for i, table in enumerate(tables):
        try:
            print_progress(1.0 * i / N,
                           prefix=prefix,
                           suffix=str(datetime.timedelta(seconds=e()[0])))
            table.create(bind=db.engine)
            created += 1
        except Exception:
            print '\r', '>>> problem with creating table', table

    print

    if created == N:
        print ">>> Tables created successfully."
    else:
        print "ERROR: not all tables were properly created."
        print ">>> Created", created, 'out of', N

    populate(default_data)
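create() relies on SQLAlchemy's DDL event system to patch one table right after it is created, which is easy to miss inside the larger function. A self-contained sketch of the same after_create hook; the demo table and the ALTER statement here are hypothetical stand-ins, not Invenio schema:

from sqlalchemy import (Column, Integer, MetaData, Table, create_engine,
                        event, text)

metadata = MetaData()
demo = Table('demo', metadata, Column('id', Integer, primary_key=True))

def demo_after_create(target, connection, **kw):
    # Fires right after CREATE TABLE for this one table, on the same
    # connection, before metadata.create_all() returns.
    connection.execute(text('ALTER TABLE demo ADD COLUMN note VARCHAR(64)'))

event.listen(demo, 'after_create', demo_after_create)

engine = create_engine('sqlite:///:memory:')
metadata.create_all(bind=engine)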
Example no. 3
def bst_dump_records():
    try:
        os.makedirs(os.path.join(CFG_WEBDIR, 'dumps'))
    except OSError:
        pass
    html_index = open(os.path.join(CFG_WEBDIR, 'dumps', '.inspire-dump.html'),
                      "w")
    print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
    for collection in CFG_EXPORTED_COLLECTIONS:
        task_update_progress(collection)
        print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
            'prefix': CFG_SITE_URL,
            'collection': collection,
            'date': time.ctime()
        }
        write_message("Preparing %s-records.xml.gz" % collection)
        output_path = os.path.join(CFG_WEBDIR, 'dumps',
                                   '.%s-records.xml.gz' % collection)
        output = gzip.open(output_path, "w")
        print >> output, "<collection>"
        reclist = get_collection_reclist(collection)
        tot = len(reclist)
        time_estimator = get_time_estimator(tot)
        for i, recid in enumerate(reclist):
            with run_ro_on_slave_db():
                print >> output, format_record(recid, 'xme', user_info={})[0]
            time_estimation = time_estimator()[1]
            if (i + 1) % 100 == 0:
                task_update_progress(
                    "%s %s (%s%%) -> %s" %
                    (collection, recid, (i + 1) * 100 / tot,
                     time.strftime("%Y-%m-%d %H:%M:%S",
                                   time.localtime(time_estimation))))
                task_sleep_now_if_required()
        print >> output, "</collection>"
        output.close()
        write_message("Computing checksum")
        print >> open(output_path + '.md5', "w"), calculate_md5(output_path)
        os.rename(
            output_path,
            os.path.join(CFG_WEBDIR, 'dumps',
                         '%s-records.xml.gz' % collection))
        os.rename(
            output_path + '.md5',
            os.path.join(CFG_WEBDIR, 'dumps',
                         '%s-records.xml.gz.md5' % collection))
        write_message("DONE")
    print >> html_index, "</ul></body></html>"
    html_index.close()
    os.rename(os.path.join(CFG_WEBDIR, 'dumps', '.inspire-dump.html'),
              os.path.join(CFG_WEBDIR, 'dumps', 'inspire-dump.html'))
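Note the naming discipline throughout bst_dump_records: every artifact is first written under a dot-prefixed name ('.%s-records.xml.gz', '.inspire-dump.html') and only renamed into place once complete. Since os.rename is atomic on POSIX within a single filesystem, web clients never download a half-written dump. The pattern in isolation, as a sketch:

import os

def publish_atomically(directory, name, data):
    # Write under a hidden temporary name first, then rename into place.
    # rename() is atomic on POSIX when both paths live on the same
    # filesystem, so readers see either the old file or the new one,
    # never a partial write.
    tmp_path = os.path.join(directory, '.' + name)
    final_path = os.path.join(directory, name)
    with open(tmp_path, 'wb') as handle:
        handle.write(data)
    os.rename(tmp_path, final_path)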
Example no. 4
def bst_dump_records():
    try:
        os.makedirs(os.path.join(CFG_WEBDIR, "dumps"))
    except OSError:
        pass
    html_index = open(os.path.join(CFG_WEBDIR, "dumps", ".inspire-dump.html"), "w")
    print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
    for collection in CFG_EXPORTED_COLLECTIONS:
        task_update_progress(collection)
        print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
            "prefix": CFG_SITE_URL,
            "collection": collection,
            "date": time.ctime(),
        }
        write_message("Preparing %s-records.xml.gz" % collection)
        output_path = os.path.join(CFG_WEBDIR, "dumps", ".%s-records.xml.gz" % collection)
        output = gzip.open(output_path, "w")
        print >> output, "<collection>"
        reclist = get_collection_reclist(collection)
        tot = len(reclist)
        time_estimator = get_time_estimator(tot)
        for i, recid in enumerate(reclist):
            print >> output, format_record(recid, "xme", user_info={})[0]
            time_estimation = time_estimator()[1]
            if (i + 1) % 100 == 0:
                task_update_progress(
                    "%s %s (%s%%) -> %s"
                    % (
                        collection,
                        recid,
                        (i + 1) * 100 / tot,
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_estimation)),
                    )
                )
                task_sleep_now_if_required()
        print >> output, "</collection>"
        output.close()
        write_message("Computing checksum")
        print >> open(output_path + ".md5", "w"), calculate_md5(output_path)
        os.rename(output_path, os.path.join(CFG_WEBDIR, "dumps", "%s-records.xml.gz" % collection))
        os.rename(output_path + ".md5", os.path.join(CFG_WEBDIR, "dumps", "%s-records.xml.gz.md5" % collection))
        write_message("DONE")
    print >> html_index, "</ul></body></html>"
    html_index.close()
    os.rename(
        os.path.join(CFG_WEBDIR, "dumps", ".inspire-dump.html"), os.path.join(CFG_WEBDIR, "dumps", "inspire-dump.html")
    )
Example no. 5
def bst_prodsync(method='afs'):
    """
    Synchronize to either 'afs' or 'redis'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'

    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR,
                                'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        modified_records = intbitset(
            run_sql("SELECT id FROM bibrec WHERE modification_date>=%s",
                    (last_run, )))
        for citee, citer in run_sql(
                "SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s",
                (last_run, )):
            modified_records.add(citer)
        modified_records |= intbitset(
            run_sql(
                "SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s",
                (last_run, )))
    except IOError:
        # No last-run timestamp yet: default to syncing everything
        modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        open(lastrun_path, "w").write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            open(lastrun_path, "w").write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
Example no. 6
def bst_prodsync(method='afs'):
    """
    Synchronize to either 'afs' or 'redis'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'

    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        modified_records = intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date>=%s", (last_run, )))
        for citee, citer in run_sql("SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s", (last_run, )):
            modified_records.add(citer)
        modified_records |= intbitset(run_sql("SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s", (last_run, )))
    except IOError:
        # No last-run timestamp yet: default to syncing everything
        modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
    else:
        redis_sync(reversed(modified_records), time_estimator, tot)
    open(lastrun_path, "w").write(future_lastrun)
    write_message("DONE!")
Example no. 7
def bst_prodsync(method='afs',
                 with_citations='yes',
                 with_claims='yes',
                 skip_collections=''):
    """
    Synchronize to either 'afs' or 'redis'

    with_citations: yes/no, whether records involved in newly updated citations need to be re-exported.
    with_claims: yes/no, whether records involved in new claims need to be re-exported.
    skip_collections: comma-separated list of values; records having 980:VALUE are ignored,
        e.g. skip_collections='HEP,HEPNAMES,HEPHIDDEN'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'

    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR,
                                'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            modified_records = intbitset(
                run_sql("SELECT id FROM bibrec WHERE modification_date>=%s",
                        (last_run, )))
            compacttime = last_run.replace('-', '').replace(' ', '').replace(':', '')
            notimechangerecs = search_unit("%s->20250101000000" % compacttime,
                                           f='005',
                                           m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                for citee, citer in run_sql(
                        "SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s",
                        (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                modified_records |= intbitset(
                    run_sql(
                        "SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s",
                        (last_run, )))
                modified_records |= intbitset(
                    run_sql(
                        "SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d"
                        " ON p.personid = d.personid WHERE d.last_updated>=%s",
                        (last_run, )))
    except IOError:
        # Default to everything
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    # Split and drop empty entries; list.remove('') would raise
    # ValueError whenever skip_collections is non-empty.
    skip_collections = [coll for coll in skip_collections.split(',') if coll]
    for collection in skip_collections:
        modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        open(lastrun_path, "w").write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            open(lastrun_path, "w").write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
Example no. 8
def drop(yes_i_know=False):
    """Drops database tables"""

    print ">>> Going to drop tables and related data on filesystem ..."

    from sqlalchemy import event
    from invenio.dateutils import get_time_estimator
    from invenio.textutils import wrap_text_in_a_box, wait_for_user
    from invenio.webstat import destroy_customevents
    from invenio.inveniocfg import test_db_connection
    from invenio.sqlalchemyutils import db
    from invenio.bibdocfile import _make_base_dir

    ## Step 0: confirm deletion
    wait_for_user(
        wrap_text_in_a_box(
            """WARNING: You are going to destroy your database tables and related data on filesystem!"""
        ))

    ## Step 1: test database connection
    test_db_connection()

    ## Step 2: disable foreign key checks
    if db.engine.name == 'mysql':
        db.engine.execute('SET FOREIGN_KEY_CHECKS=0;')

    ## Step 3: destroy associated data
    try:
        msg = destroy_customevents()
        if msg:
            print msg
    except Exception:
        print "ERROR: Could not destroy customevents."

    ## FIXME: move to bibedit_model
    def bibdoc_before_drop(target, connection_dummy, **kw_dummy):
        print
        print ">>> Going to remove records data..."
        for (docid, ) in db.session.query(target.c.id).all():
            directory = _make_base_dir(docid)
            if os.path.isdir(directory):
                print '    >>> Removing files for docid =', docid
                shutil.rmtree(directory)
        db.session.commit()
        print ">>> Data has been removed."

    from invenio.bibedit_model import Bibdoc
    event.listen(Bibdoc.__table__, "before_drop", bibdoc_before_drop)

    tables = list(reversed(db.metadata.sorted_tables))
    N = len(tables)

    prefix = '>>> Dropping %d tables ...' % N

    e = get_time_estimator(N)
    dropped = 0

    for i, table in enumerate(tables):
        try:
            print_progress(1.0 * i / N,
                           prefix=prefix,
                           suffix=str(datetime.timedelta(seconds=e()[0])))
            table.drop(bind=db.engine)
            dropped += 1
        except Exception:
            print '\r', '>>> problem with dropping table', table

    print
    if dropped == N:
        print ">>> Tables dropped successfully."
    else:
        print "ERROR: not all tables were properly dropped."
        print ">>> Dropped", dropped, 'out of', N