def handle(self, **options):
    LOGGER.info("indexing titles")
    index_titles()
    LOGGER.info("finished indexing titles")
    LOGGER.info("indexing pages")
    index_pages()
    LOGGER.info("finished indexing pages")
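# Usage sketch: assuming the handler above lives in a management command
# module (the module/command name is not shown here, so "index" below is a
# hypothetical name), it can be run from the shell with
#   python manage.py index
# or invoked programmatically from other Django code:
from django.core.management import call_command

call_command("index")  # runs handle(): indexes titles, then pages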
def xml_file_handler(self, marc_xml, skip_index):
    self.xml_start = datetime.now()
    results = title_loader.load(marc_xml)
    if not skip_index:
        # need to index any titles that we just created
        self.stdout.write("indexing new titles")
        index_titles(since=self.xml_start)
    return results
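# A minimal sketch of what an incremental `index_titles(since=...)` could
# look like, assuming it filters on the `version` timestamp field used by
# the title_sync command below; the per-title `index_title` helper is
# hypothetical, named here only for illustration.
def index_titles(since=None):
    titles = Title.objects.all()
    if since is not None:
        # only index titles created or updated after the load started
        titles = titles.filter(version__gte=since)
    for title in titles:
        index_title(title)  # hypothetical: push one title into the search index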
def handle(self, *args, **options):
    def get_immediate_subdirectories(a_dir):
        return [
            name for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name))
        ]

    def slack(message):
        sc.api_call("chat.postMessage", channel="#ghnp", text=message)

    start = datetime.now()
    sc = SlackClient(settings.SLACK_KEY)
    loader = BatchLoader()

    new_batches_location = '/opt/chronam/data/chronamftp/new_batches/'
    replacement_batches_location = '/opt/chronam/data/chronamftp/replacement_batches/'
    nonlccn_location = '/opt/chronam/data/nonlccn/'
    batch_drop = '/opt/chronam/data/dlg_batches/drop/'

    # GET LIST OF BATCHES TO LOAD
    new_batches = get_immediate_subdirectories(new_batches_location)
    replacement_batches = get_immediate_subdirectories(replacement_batches_location)

    # CHECK new_batches FOR finalMARC FOLDERS
    # iterate over a copy, since removing items from the list being
    # iterated would silently skip elements
    new_title_folders = []
    for folder in list(new_batches):
        if 'MARC' in folder:
            new_title_folders.append(folder)
            new_batches.remove(folder)

    # ISSUE STARTING NOTIFICATIONS
    slack('Starting DLG Batch Load Process! Found `%s` new batches and '
          '`%s` replacement batches available to load.'
          % (len(new_batches), len(replacement_batches)))

    # RUN KEVIN'S RSYNC COMMANDS, WAIT
    # shell=True is required so the shell expands the * glob; passing the
    # whole command as a single list element would fail to execute
    slack('RSync of batches is starting')
    start_time = time.time()
    slack('Copying new batches')
    subprocess.call(
        'rsync -rav --progress %s* %s' % (new_batches_location, batch_drop),
        shell=True)
    slack('Copying replacement batches')
    subprocess.call(
        'rsync -rav --progress %s* %s' % (replacement_batches_location, batch_drop),
        shell=True)
    duration = time.time() - start_time
    slack('RSync of new and replacement batches completed in %s seconds' % duration)

    # LOAD NEW TITLES IF PRESENT
    if new_title_folders:
        slack('Also found `%s` title MARC files to process.' % len(new_title_folders))
        for nt in new_title_folders:
            for nt_f in os.listdir(os.path.join(new_batches_location, nt)):
                if nt_f.endswith('.xml'):
                    marc_file = os.path.join(nonlccn_location, nt_f)
                    copyfile(os.path.join(new_batches_location, nt, nt_f), marc_file)
                    title_load_results = title_loader.load(marc_file)
                    if title_load_results[1]:
                        slack('New title created from `%s`.' % nt_f)
                    if title_load_results[2]:
                        slack('Title updated from `%s`.' % nt_f)
                    if title_load_results[3]:
                        slack('Error on title load from `%s`.' % nt_f)
        index_titles(start)
        slack('Finished loading titles.')

    # PURGE REPLACEMENT BATCHES
    if replacement_batches:
        slack('Purging batches destined for replacement.')
        for r_b in replacement_batches:
            # an incoming verNN batch replaces the previous version, so map
            # each version number down by one to find the batch to purge
            batch_to_purge = r_b.replace('ver02', 'ver01') \
                                .replace('ver03', 'ver02') \
                                .replace('ver04', 'ver03') \
                                .replace('ver05', 'ver04') \
                                .replace('ver06', 'ver05') \
                                .replace('ver07', 'ver06') \
                                .replace('ver08', 'ver07')
            slack('Purging `%s`.' % batch_to_purge)
            loader.purge_batch(batch_to_purge)
        start_time = time.time()
        solr = SolrConnection(settings.SOLR)
        solr.optimize()
        # parenthesize the subtraction: '%' binds tighter than '-'
        slack('Index optimize complete in `%s` seconds.' % (time.time() - start_time))

    # LOAD ALL BATCHES
    # start with replacement batches
    final_loader = batch_loader.BatchLoader(process_ocr=True,
                                            process_coordinates=True)
    if replacement_batches:
        replace_start = time.time()
        for replacement in replacement_batches:
            final_loader.load_batch('drop/%s' % replacement, strict=False)
            slack('Loaded replacement batch `%s`.' % replacement)
        slack('All replacement batches loaded in `%s` seconds.'
              % (time.time() - replace_start))

    # load new batches
    if new_batches:
        new_start = time.time()
        for new in new_batches:
            final_loader.load_batch('drop/%s' % new, strict=False)
            slack('Loaded new batch `%s`.' % new)
        slack('All new batches loaded in `%s` seconds.' % (time.time() - new_start))

    slack('Batch loading job complete!')
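# The rsync calls above rely on shell=True so the shell expands the `*`
# glob. An equivalent sketch that avoids spawning a shell expands the glob
# in Python and passes an argument list instead:
import glob
import subprocess

sources = glob.glob('/opt/chronam/data/chronamftp/new_batches/*')
subprocess.call(['rsync', '-rav', '--progress'] + sources +
                ['/opt/chronam/data/dlg_batches/drop/'])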
class Command(BaseCommand):
    skip_essays = make_option('--skip-essays',
                              action='store_true',
                              dest='skip_essays',
                              default=False,
                              help='Skip essay loading.')
    pull_title_updates = make_option('--pull-title-updates',
                                     action='store_true',
                                     dest='pull_title_updates',
                                     default=False,
                                     help='Pull down a new set of titles.')
    option_list = BaseCommand.option_list + (skip_essays, pull_title_updates)
    help = 'Runs title pull and title load for a complete title refresh.'
    args = ''

    def find_titles_not_updated(self, limited=True):
        _logger.info("Looking for titles not yet updated.")
        if Title.objects.count() == 0:
            _logger.info("Total number of titles not updated: 0")
            return Title.objects.values()
        elif limited:
            titles = Title.objects.order_by('-version').values(
                'lccn_orig', 'oclc', 'version')
            end = titles[0]['version']
        else:
            titles = Title.objects.order_by('-version')
            end = titles[0].version

        start = end - timedelta(weeks=2)
        titles = titles.exclude(version__range=(start, end))
        _logger.info("Total number of titles not updated: %s" % len(titles))
        return titles

    def pull_lccn_updates(self, titles):
        start = datetime.now()
        for t in titles:
            call_command('pull_titles', lccn=t['lccn_orig'], oclc=t['oclc'])
        end = datetime.now()
        total_time = end - start
        _logger.info('total time for pull_lccn_updates: %s' % total_time)
        return

    def handle(self, *args, **options):
        start = datetime.now()
        _logger.info("Starting title sync process.")

        # only load titles if the BIB_STORAGE is there, not always the case
        # for folks in the opensource world
        bib_in_settings = validate_bib_dir()
        if bib_in_settings:
            worldcat_dir = bib_in_settings + '/worldcat_titles/'

            pull_titles = bool(options['pull_title_updates']
                               and hasattr(settings, "WORLDCAT_KEY"))
            if pull_titles:
                call_command('pull_titles')

            _logger.info("Starting load of OCLC titles.")
            bulk_dir = worldcat_dir + 'bulk'
            if os.path.isdir(bulk_dir):
                call_command('load_titles', bulk_dir, skip_index=True)

            tnu = self.find_titles_not_updated()

            # Only update by individual lccn if there are records that need updating.
            if pull_titles and len(tnu):
                _logger.info("Pulling titles from OCLC by individual lccn & oclc num.")
                self.pull_lccn_updates(tnu)

            _logger.info("Loading titles from second title pull.")
            lccn_dir = worldcat_dir + 'lccn'
            if os.path.isdir(lccn_dir):
                call_command('load_titles', lccn_dir, skip_index=True)

            tnu = self.find_titles_not_updated(limited=False)
            _logger.info("Running pre-deletion checks for these titles.")

        if bib_in_settings:
            if len(tnu):
                # Delete titles that haven't been updated and have no issues attached.
                for title in tnu:
                    issues = title.issues.all()
                    error = "DELETION ERROR: Title %s has " % title
                    error_end = "It will not be deleted."
                    if issues:
                        _logger.warning(error + 'issues.' + error_end)
                        continue
                    delete_txt = (title.name, title.lccn, title.oclc)
                    _logger.info('TITLE DELETED: %s, lccn: %s, oclc: %s' % delete_txt)
                    title.delete()

        # Load holdings for all remaining titles.
        call_command('load_holdings')

        # overlay place info harvested from dbpedia onto the places table
        try:
            self.load_place_links()
        except Exception as e:
            _logger.exception(e)

        index.index_titles()

        # Time of full process run
        end = datetime.now()
        total_time = end - start
        _logger.info('start time: %s' % start)
        _logger.info('end time: %s' % end)
        _logger.info('total time: %s' % total_time)
        _logger.info("title_sync done.")
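# Usage sketch for the options defined above; the command name `title_sync`
# is inferred from the "title_sync done." log message, the flags come from
# the option_list:
#
#   python manage.py title_sync                       # full title refresh
#   python manage.py title_sync --skip-essays         # skip essay loading
#   python manage.py title_sync --pull-title-updates  # pull a fresh title set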
def handle(self, *args, **options):
    start = datetime.now()
    LOGGER.info("Starting title sync process.")

    # only load titles if the BIB_STORAGE is there, not always the case
    # for folks in the opensource world
    bib_in_settings = validate_bib_dir()
    if bib_in_settings:
        worldcat_dir = bib_in_settings + '/worldcat_titles/'

        pull_titles = bool(options['pull_title_updates']
                           and hasattr(settings, "WORLDCAT_KEY"))
        if pull_titles:
            call_command('pull_titles')

        LOGGER.info("Starting load of OCLC titles.")
        bulk_dir = worldcat_dir + 'bulk'
        if os.path.isdir(bulk_dir):
            call_command('load_titles', bulk_dir, skip_index=True)

        tnu = self.find_titles_not_updated()

        # Only update by individual lccn if there are records that need updating.
        if pull_titles and len(tnu):
            LOGGER.info("Pulling titles from OCLC by individual lccn & oclc num.")
            self.pull_lccn_updates(tnu)

        LOGGER.info("Loading titles from second title pull.")
        lccn_dir = worldcat_dir + 'lccn'
        if os.path.isdir(lccn_dir):
            call_command('load_titles', lccn_dir, skip_index=True)

        tnu = self.find_titles_not_updated(limited=False)
        LOGGER.info("Running pre-deletion checks for these titles.")

    # Make sure that our essays are up to date
    if not options['skip_essays']:
        load_essays(settings.ESSAYS_FEED)

    if bib_in_settings:
        if len(tnu):
            # Delete titles that haven't been updated and have neither
            # essays nor issues attached.
            for title in tnu:
                essays = title.essays.all()
                issues = title.issues.all()
                error = "DELETION ERROR: Title %s has " % title
                error_end = "It will not be deleted."
                # must be `and`, not `or`: delete only when BOTH essays
                # and issues are absent, otherwise warn and keep the title
                if not essays and not issues:
                    delete_txt = (title.name, title.lccn, title.oclc)
                    LOGGER.info('TITLE DELETED: %s, lccn: %s, oclc: %s' % delete_txt)
                    title.delete()
                elif essays:
                    LOGGER.warning(error + 'essays.' + error_end)
                    continue
                elif issues:
                    LOGGER.warning(error + 'issues.' + error_end)
                    continue

    # Load holdings for all remaining titles.
    call_command('load_holdings')

    # overlay place info harvested from dbpedia onto the places table
    try:
        self.load_place_links()
    except Exception as e:
        LOGGER.exception(e)

    index.index_titles()

    # Time of full process run
    end = datetime.now()
    total_time = end - start
    LOGGER.info('start time: %s' % start)
    LOGGER.info('end time: %s' % end)
    LOGGER.info('total time: %s' % total_time)
    LOGGER.info("title_sync done.")
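# The deletion guard above is easy to get backwards; a quick illustration
# of why the condition must be `and` rather than `or` (plain Python,
# illustrative values only):
essays, issues = [], ['one issue']           # title with issues but no essays
assert (not essays or not issues) is True    # `or`: would wrongly delete it
assert (not essays and not issues) is False  # `and`: keeps the title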