def bst_prodsync(method='afs', with_citations='yes', with_claims='yes', skip_collections=''):
    """
    Synchronize to either 'afs' or 'redis'.

    with_citations: yes/no, whether records whose citations changed need to be re-exported.
    with_claims: yes/no, whether records involved in a new claim need to be re-exported.
    skip_collections: comma-separated list of values; records carrying 980:VALUE are ignored,
        e.g. skip_collections='HEP,HEPNAMES,HEPHIDDEN'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'
    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date>=%s", (last_run, )))
            compacttime = last_run.replace('-', '').replace(' ', '').replace(':', '')
            notimechangerecs = search_unit("%s->20250101000000" % compacttime, f='005', m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                for citee, citer in run_sql("SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s", (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                modified_records |= intbitset(run_sql("SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s", (last_run, )))
                modified_records |= intbitset(run_sql('SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d'
                                                      ' ON p.personid = d.personid WHERE d.tag = "canonical_name" and d.last_updated>=%s', (last_run, )))
    except IOError:
        # Default to everything
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    # Ignore empty entries coming from the default '' or stray commas
    skip_collections = [coll for coll in skip_collections.split(',') if coll]
    for collection in skip_collections:
        modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        open(lastrun_path, "w").write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            open(lastrun_path, "w").write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
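
# Hypothetical invocation sketch (not part of the original module), assuming a
# fully configured Invenio environment; all parameter values are illustrative.
if __name__ == '__main__':
    # Sync via AFS, skip citation-driven re-exports, and ignore any record
    # carrying 980:HEPHIDDEN.
    bst_prodsync(method='afs',
                 with_citations='no',
                 with_claims='yes',
                 skip_collections='HEPHIDDEN')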
def create(default_data=True):
    """Creates database tables from sqlalchemy models"""
    print ">>> Going to create tables..."

    from sqlalchemy import event
    from invenio.dateutils import get_time_estimator
    from invenio.inveniocfg import test_db_connection
    from invenio.sqlalchemyutils import db, autodiscover_models

    test_db_connection()
    autodiscover_models()

    def cfv_after_create(target, connection, **kw):
        print
        print ">>> Modifying table structure..."
        from invenio.dbquery import run_sql
        run_sql('ALTER TABLE collection_field_fieldvalue DROP PRIMARY KEY')
        run_sql('ALTER TABLE collection_field_fieldvalue ADD INDEX id_collection(id_collection)')
        run_sql('ALTER TABLE collection_field_fieldvalue CHANGE id_fieldvalue id_fieldvalue mediumint(9) unsigned')
        #print run_sql('SHOW CREATE TABLE collection_field_fieldvalue')

    from invenio.websearch_model import CollectionFieldFieldvalue
    event.listen(CollectionFieldFieldvalue.__table__, "after_create", cfv_after_create)

    tables = db.metadata.sorted_tables
    N = len(tables)
    prefix = '>>> Creating %d tables ...' % N
    e = get_time_estimator(N)
    created = 0

    for i, table in enumerate(tables):
        try:
            print_progress(1.0 * i / N, prefix=prefix,
                           suffix=str(datetime.timedelta(seconds=e()[0])))
            table.create(bind=db.engine)
            created += 1
        except:
            print '\r', '>>> problem with creating table', table

    print
    if created == N:
        print ">>> Tables created successfully."
    else:
        print "ERROR: not all tables were properly created."
        print ">>> Created", created, 'out of', N

    populate(default_data)
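
# Hedged, self-contained sketch (not part of the original module) of the
# SQLAlchemy "after_create" table hook pattern that create() relies on for
# patching collection_field_fieldvalue. Table and function names below are
# invented for illustration; it runs against an in-memory SQLite database.
def _after_create_hook_demo():
    from sqlalchemy import create_engine, event, MetaData, Table, Column, Integer

    metadata = MetaData()
    example = Table('example', metadata, Column('id', Integer, primary_key=True))

    def example_after_create(target, connection, **kw):
        # Fires right after the CREATE TABLE statement for `target` is emitted.
        print ">>> created table:", target.name

    event.listen(example, "after_create", example_after_create)
    engine = create_engine('sqlite://')
    metadata.create_all(bind=engine)  # emits CREATE TABLE and fires the hook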
def bst_dump_records():
    try:
        os.makedirs(os.path.join(CFG_WEBDIR, 'dumps'))
    except OSError:
        pass
    html_index = open(os.path.join(CFG_WEBDIR, 'dumps', '.inspire-dump.html'), "w")
    print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
    for collection in CFG_EXPORTED_COLLECTIONS:
        task_update_progress(collection)
        print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
            'prefix': CFG_SITE_URL,
            'collection': collection,
            'date': time.ctime()
        }
        write_message("Preparing %s-records.xml.gz" % collection)
        output_path = os.path.join(CFG_WEBDIR, 'dumps', '.%s-records.xml.gz' % collection)
        output = gzip.open(output_path, "w")
        print >> output, "<collection>"
        reclist = get_collection_reclist(collection)
        tot = len(reclist)
        time_estimator = get_time_estimator(tot)
        for i, recid in enumerate(reclist):
            with run_ro_on_slave_db():
                print >> output, format_record(recid, 'xme', user_info={})[0]
            time_estimation = time_estimator()[1]
            if (i + 1) % 100 == 0:
                task_update_progress(
                    "%s %s (%s%%) -> %s" % (collection, recid, (i + 1) * 100 / tot,
                                            time.strftime("%Y-%m-%d %H:%M:%S",
                                                          time.localtime(time_estimation))))
                task_sleep_now_if_required()
        print >> output, "</collection>"
        output.close()
        write_message("Computing checksum")
        print >> open(output_path + '.md5', "w"), calculate_md5(output_path)
        os.rename(output_path,
                  os.path.join(CFG_WEBDIR, 'dumps', '%s-records.xml.gz' % collection))
        os.rename(output_path + '.md5',
                  os.path.join(CFG_WEBDIR, 'dumps', '%s-records.xml.gz.md5' % collection))
        write_message("DONE")
    print >> html_index, "</ul></body></html>"
    html_index.close()
    os.rename(os.path.join(CFG_WEBDIR, 'dumps', '.inspire-dump.html'),
              os.path.join(CFG_WEBDIR, 'dumps', 'inspire-dump.html'))
def bst_dump_records():
    try:
        os.makedirs(os.path.join(CFG_WEBDIR, "dumps"))
    except OSError:
        pass
    html_index = open(os.path.join(CFG_WEBDIR, "dumps", ".inspire-dump.html"), "w")
    print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
    for collection in CFG_EXPORTED_COLLECTIONS:
        task_update_progress(collection)
        print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
            "prefix": CFG_SITE_URL,
            "collection": collection,
            "date": time.ctime(),
        }
        write_message("Preparing %s-records.xml.gz" % collection)
        output_path = os.path.join(CFG_WEBDIR, "dumps", ".%s-records.xml.gz" % collection)
        output = gzip.open(output_path, "w")
        print >> output, "<collection>"
        reclist = get_collection_reclist(collection)
        tot = len(reclist)
        time_estimator = get_time_estimator(tot)
        for i, recid in enumerate(reclist):
            print >> output, format_record(recid, "xme", user_info={})[0]
            time_estimation = time_estimator()[1]
            if (i + 1) % 100 == 0:
                task_update_progress(
                    "%s %s (%s%%) -> %s"
                    % (
                        collection,
                        recid,
                        (i + 1) * 100 / tot,
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_estimation)),
                    )
                )
                task_sleep_now_if_required()
        print >> output, "</collection>"
        output.close()
        write_message("Computing checksum")
        print >> open(output_path + ".md5", "w"), calculate_md5(output_path)
        os.rename(output_path, os.path.join(CFG_WEBDIR, "dumps", "%s-records.xml.gz" % collection))
        os.rename(output_path + ".md5", os.path.join(CFG_WEBDIR, "dumps", "%s-records.xml.gz.md5" % collection))
        write_message("DONE")
    print >> html_index, "</ul></body></html>"
    html_index.close()
    os.rename(
        os.path.join(CFG_WEBDIR, "dumps", ".inspire-dump.html"),
        os.path.join(CFG_WEBDIR, "dumps", "inspire-dump.html"),
    )
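
# Hedged sketch (not part of the original module) of the hidden-file-then-rename
# idiom both dump variants above use: write under a dot-prefixed name, then
# os.rename() into place so clients never fetch a half-written dump. The helper
# and its arguments are invented for illustration.
def _publish_atomically(directory, name, payload):
    import os
    hidden = os.path.join(directory, '.' + name)
    final = os.path.join(directory, name)
    with open(hidden, 'wb') as out:
        out.write(payload)
    os.rename(hidden, final)  # atomic on POSIX when both paths share a filesystem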
def bst_prodsync(method='afs'):
    """
    Synchronize to either 'afs' or 'redis'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'
    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        modified_records = intbitset(
            run_sql("SELECT id FROM bibrec WHERE modification_date>=%s", (last_run, )))
        for citee, citer in run_sql(
                "SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s", (last_run, )):
            modified_records.add(citer)
        modified_records |= intbitset(
            run_sql("SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s", (last_run, )))
    except IOError:
        # Default to the epoch
        modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        open(lastrun_path, "w").write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            open(lastrun_path, "w").write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
def bst_prodsync(method='afs'):
    """
    Synchronize to either 'afs' or 'redis'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'
    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        modified_records = intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date>=%s", (last_run, )))
        for citee, citer in run_sql("SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s", (last_run, )):
            modified_records.add(citer)
        modified_records |= intbitset(run_sql("SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s", (last_run, )))
    except IOError:
        # Default to the epoch
        modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
    else:
        redis_sync(reversed(modified_records), time_estimator, tot)
    open(lastrun_path, "w").write(future_lastrun)
    write_message("DONE!")
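
# Hedged sketch (not part of the original module) of the lastrun-checkpoint
# idiom the bst_prodsync variants rely on: the timestamp is captured *before*
# the SELECTs run and persisted only after a successful sync, so records
# modified mid-run are picked up again on the next pass. Helper names invented.
def _read_last_run(path):
    try:
        return open(path).read().strip()
    except IOError:
        return None  # no checkpoint yet: caller should sync everything

def _write_last_run(path, started_at):
    # `started_at` must be the datetime captured before querying,
    # never datetime.now() at completion time.
    open(path, "w").write(started_at.strftime('%Y-%m-%d %H:%M:%S'))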
def bst_prodsync(method='afs', with_citations='yes', with_claims='yes', skip_collections=''):
    """
    Synchronize to either 'afs' or 'redis'.

    with_citations: yes/no, whether records whose citations changed need to be re-exported.
    with_claims: yes/no, whether records involved in a new claim need to be re-exported.
    skip_collections: comma-separated list of values; records carrying 980:VALUE are ignored,
        e.g. skip_collections='HEP,HEPNAMES,HEPHIDDEN'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'
    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            modified_records = intbitset(
                run_sql("SELECT id FROM bibrec WHERE modification_date>=%s", (last_run, )))
            compacttime = last_run.replace('-', '').replace(' ', '').replace(':', '')
            notimechangerecs = search_unit("%s->20250101000000" % compacttime, f='005', m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                for citee, citer in run_sql(
                        "SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s",
                        (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                modified_records |= intbitset(
                    run_sql("SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s",
                            (last_run, )))
                modified_records |= intbitset(
                    run_sql("SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d"
                            " ON p.personid = d.personid WHERE d.last_updated>=%s",
                            (last_run, )))
    except IOError:
        # Default to everything
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    # Ignore empty entries coming from the default '' or stray commas
    skip_collections = [coll for coll in skip_collections.split(',') if coll]
    for collection in skip_collections:
        modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        open(lastrun_path, "w").write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            open(lastrun_path, "w").write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
def drop(yes_i_know=False):
    """Drops database tables"""
    print ">>> Going to drop tables and related data on filesystem ..."

    from sqlalchemy import event
    from invenio.dateutils import get_time_estimator
    from invenio.textutils import wrap_text_in_a_box, wait_for_user
    from invenio.webstat import destroy_customevents
    from invenio.inveniocfg import test_db_connection
    from invenio.sqlalchemyutils import db
    from invenio.bibdocfile import _make_base_dir

    ## Step 0: confirm deletion
    wait_for_user(wrap_text_in_a_box(
        """WARNING: You are going to destroy your database tables and related data on filesystem!"""))

    ## Step 1: test database connection
    test_db_connection()

    ## Step 2: disable foreign key checks
    if db.engine.name == 'mysql':
        db.engine.execute('SET FOREIGN_KEY_CHECKS=0;')

    ## Step 3: destroy associated data
    try:
        msg = destroy_customevents()
        if msg:
            print msg
    except:
        print "ERROR: Could not destroy customevents."

    ## FIXME: move to bibedit_model
    def bibdoc_before_drop(target, connection_dummy, **kw_dummy):
        print
        print ">>> Going to remove records data..."
        for (docid, ) in db.session.query(target.c.id).all():
            directory = _make_base_dir(docid)
            if os.path.isdir(directory):
                print '  >>> Removing files for docid =', docid
                shutil.rmtree(directory)
        db.session.commit()
        print ">>> Data has been removed."

    from invenio.bibedit_model import Bibdoc
    event.listen(Bibdoc.__table__, "before_drop", bibdoc_before_drop)

    tables = list(reversed(db.metadata.sorted_tables))
    N = len(tables)
    prefix = '>>> Dropping %d tables ...' % N
    e = get_time_estimator(N)
    dropped = 0

    for i, table in enumerate(tables):
        try:
            print_progress(1.0 * i / N, prefix=prefix,
                           suffix=str(datetime.timedelta(seconds=e()[0])))
            table.drop(bind=db.engine)
            dropped += 1
        except:
            print '\r', '>>> problem with dropping table', table

    print
    if dropped == N:
        print ">>> Tables dropped successfully."
    else:
        print "ERROR: not all tables were properly dropped."
        print ">>> Dropped", dropped, 'out of', N
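
# Hedged, self-contained sketch (not part of the original module) of the
# "before_drop" cleanup hook pattern drop() uses for Bibdoc: filesystem data
# tied to table rows is removed just before the table itself disappears. The
# table, helper names, and /tmp layout are invented; runs on in-memory SQLite.
def _before_drop_hook_demo():
    import os
    import shutil
    from sqlalchemy import create_engine, event, MetaData, Table, Column, Integer

    metadata = MetaData()
    docs = Table('docs', metadata, Column('id', Integer, primary_key=True))

    def docs_before_drop(target, connection, **kw):
        # Remove per-row directories while the table can still be queried.
        for (docid, ) in connection.execute(target.select()):
            directory = os.path.join('/tmp/docs', str(docid))  # invented layout
            if os.path.isdir(directory):
                shutil.rmtree(directory)

    event.listen(docs, "before_drop", docs_before_drop)
    engine = create_engine('sqlite://')
    metadata.create_all(bind=engine)
    metadata.drop_all(bind=engine)  # emits DROP TABLE, firing the hook first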