def solr_reindex(self):
    """
    Reindex all of this user's entries. Used when switching to/from
    "private" status.
    """
    solr_conn = SolrConnection(settings.SOLR_URL)
    # Start by deleting 'em all
    solr_conn.delete_query('user:%s' % self.user.id)
    entries = Entry.objects.filter(user=self.user)
    docs = []
    # Arbitrary constant: how many docs to buffer per add_many() call.
    SLICE_SIZE = 50
    for s in range(0, entries.count(), SLICE_SIZE):
        entry_slice = entries[s:s + SLICE_SIZE]
        for entry in entry_slice:
            docs.append(entry.solr_doc)
            if len(docs) == SLICE_SIZE:
                try:
                    solr_conn.add_many(docs)
                except:
                    # should log the failed batch appropriately here
                    pass
                docs = []
    # Don't miss the leftovers
    solr_conn.add_many(docs)
    solr_conn.commit()
    solr_conn.optimize()
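The slice-then-buffer pattern above keeps memory bounded by never materializing the whole queryset at once. A minimal sketch of the same idea as a reusable generator, assuming a Django queryset; queryset_in_chunks and chunk_size are illustrative names, not part of the original code:

def queryset_in_chunks(queryset, chunk_size=50):
    # Yield successive windows of a queryset. Each slice becomes its own
    # LIMIT/OFFSET query, so only chunk_size rows are loaded at a time.
    total = queryset.count()
    for start in range(0, total, chunk_size):
        yield queryset[start:start + chunk_size]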
def handle(self, *args, **options):
    self.stdout.write("Optimizing Solr index %s" % settings.SOLR)
    solr = SolrConnection(settings.SOLR)
    start_time = default_timer()
    solr.optimize()
    elapsed = default_timer() - start_time
    self.stdout.write("Solr took %0.3f seconds to optimize %s"
                      % (elapsed, settings.SOLR))
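A hedged usage sketch for a command like the one above: Django management commands can also be invoked from code with call_command. The command name 'optimize_solr' is an assumption for illustration; use whatever name the command module is actually registered under:

from django.core.management import call_command

call_command('optimize_solr')  # assumed command name; runs handle() above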
class Command(BaseCommand):
    user_option = optparse.make_option(
        '--user',
        action='store',
        dest='user',
        help='name of user whose entries to index')
    option_list = BaseCommand.option_list + (user_option,)
    help = "index all or user-specific entries in solr"
    args = 'an optional username'

    def handle(self, *args, **options):
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print 'indexing user'
            self.index_entries(user=options['user'])
        else:
            print 'indexing everything'
            self.index_entries()
        print 'committing'
        self.solr.commit()
        print 'optimizing'
        self.solr.optimize()

    def index_entries(self, user=''):
        counter = 0
        entries = m.Entry.objects.all()
        if user:
            entries = entries.filter(user__username=user)
        docs = []
        print 'entry count:', entries.count()
        SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY
        for s in range(0, entries.count(), SLICE_SIZE):
            print 'indexing %s to %s...' % (s, s + SLICE_SIZE)
            entry_slice = entries[s:s + SLICE_SIZE]
            for entry in entry_slice:
                counter += 1
                docs.append(entry.solr_doc)
                if len(docs) == MAX_DOCS_PER_ADD:
                    try:
                        self.solr.add_many(docs)
                    except:
                        print 'BAD RECORD:', [d['id'] for d in docs]
                    docs = []
                    reset_queries()
                    if counter % (COMMIT_FREQUENCY * MAX_DOCS_PER_ADD) == 0:
                        print 'committing at count:', counter
                        self.solr.commit()
        # Don't miss the leftovers
        self.solr.add_many(docs)
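To make the commit cadence above concrete: documents are buffered and sent to Solr in groups of MAX_DOCS_PER_ADD, and a commit is issued after every COMMIT_FREQUENCY such adds. The values below are assumptions for illustration, not from the original code:

MAX_DOCS_PER_ADD = 100   # docs per add_many() call (assumed value)
COMMIT_FREQUENCY = 50    # add_many() calls between commits (assumed value)
# one queryset slice per commit: 100 * 50 = 5000 docs
SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY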
def handle(self, batch_name=None, *args, **options):
    if len(args) != 0:
        raise CommandError('Usage is purge_batch %s' % self.args)
    loader = BatchLoader()
    try:
        log.info("purging batch '%s'", batch_name)
        loader.purge_batch(batch_name)
        if options['optimize']:
            log.info("optimizing solr")
            solr = SolrConnection(settings.SOLR)
            solr.optimize()
            log.info("optimizing MySQL OCR table")
            cursor = connection.cursor()
            cursor.execute("OPTIMIZE TABLE core_ocr")
            log.info("finished optimizing")
    except BatchLoaderException, e:
        log.exception(e)
        raise CommandError("unable to purge batch. check the purge_batch log for clues")
def handle(self, batch_location=None, *args, **options):
    if len(args) != 0:
        raise CommandError('Usage is purge_batch %s' % self.args)
    loader = BatchLoader()
    try:
        log.info("purging batch %s", batch_location)
        loader.purge_batch(batch_location)
        if options['optimize']:
            log.info("optimizing solr")
            solr = SolrConnection(settings.SOLR)
            solr.optimize()
            log.info("optimizing MySQL OCR table")
            cursor = connection.cursor()
            cursor.execute("OPTIMIZE TABLE core_ocr")
            log.info("finished optimizing")
    except BatchLoaderException, e:
        log.exception(e)
        raise CommandError("unable to purge batch. check the purge_batch log for clues")
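A hedged invocation sketch for the purge command in either form above. The command name purge_batch comes from its usage string; the example batch name and the registration of the --optimize option (implied by options['optimize']) are assumptions:

from django.core.management import call_command

call_command('purge_batch', 'batch_example_ver01', optimize=True)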
def handle(self, *args, **options):

    def get_immediate_subdirectories(a_dir):
        return [name for name in os.listdir(a_dir)
                if os.path.isdir(os.path.join(a_dir, name))]

    def slack(message):
        sc.api_call("chat.postMessage", channel="#ghnp", text=message)

    start = datetime.now()
    sc = SlackClient(settings.SLACK_KEY)
    loader = BatchLoader()
    new_batches_location = '/opt/chronam/data/chronamftp/new_batches/'
    replacement_batches_location = '/opt/chronam/data/chronamftp/replacement_batches/'
    nonlccn_location = '/opt/chronam/data/nonlccn/'
    batch_drop = '/opt/chronam/data/dlg_batches/drop/'

    # GET LIST OF BATCHES TO LOAD
    new_batches = get_immediate_subdirectories(new_batches_location)
    replacement_batches = get_immediate_subdirectories(replacement_batches_location)

    # CHECK new_batches FOR finalMARC FOLDERS
    # (iterate over a copy: removing items from a list while iterating it
    # skips elements)
    new_title_folders = []
    for folder in list(new_batches):
        if 'MARC' in folder:
            new_title_folders.append(folder)
            new_batches.remove(folder)

    # ISSUE STARTING NOTIFICATIONS
    slack('Starting DLG Batch Load Process! Found `%s` new batches and '
          '`%s` replacement batches available to load.'
          % (len(new_batches), len(replacement_batches)))

    # RUN KEVIN'S RSYNC COMMANDS, WAIT
    # (shell=True so the shell expands the * glob; a bare single-string
    # argument list would be treated as the program name and fail)
    slack('RSync of batches is starting')
    start_time = time.time()
    slack('Copying new batches')
    subprocess.call(
        'rsync -rav --progress /opt/chronam/data/chronamftp/new_batches/* '
        '/opt/chronam/data/dlg_batches/drop/', shell=True)
    slack('Copying replacement batches')
    subprocess.call(
        'rsync -rav --progress /opt/chronam/data/chronamftp/replacement_batches/* '
        '/opt/chronam/data/dlg_batches/drop/', shell=True)
    duration = time.time() - start_time
    slack('RSync of new and replacement batches completed in %s seconds' % duration)

    # LOAD NEW TITLES IF PRESENT
    if new_title_folders:
        slack('Also found `%s` title MARC files to process.' % len(new_title_folders))
        for nt in new_title_folders:
            for nt_f in os.listdir(os.path.join(new_batches_location, nt)):
                if nt_f.endswith('.xml'):
                    marc_file = os.path.join(nonlccn_location, nt_f)
                    copyfile(os.path.join(new_batches_location, nt, nt_f), marc_file)
                    title_load_results = title_loader.load(marc_file)
                    if title_load_results[1]:
                        slack('New title created from `%s`.' % nt_f)
                    if title_load_results[2]:
                        slack('Title updated from `%s`.' % nt_f)
                    if title_load_results[3]:
                        slack('Error on title load from `%s`' % nt_f)
        index_titles(start)
        slack('Finished loading titles.')

    # PURGE REPLACEMENT BATCHES
    if replacement_batches:
        slack('Purging batches destined for replacement.')
        for r_b in replacement_batches:
            # an incoming verNN batch replaces the ver(NN-1) batch on disk
            batch_to_purge = r_b.replace('ver02', 'ver01')\
                                .replace('ver03', 'ver02')\
                                .replace('ver04', 'ver03')\
                                .replace('ver05', 'ver04')\
                                .replace('ver06', 'ver05')\
                                .replace('ver07', 'ver06')\
                                .replace('ver08', 'ver07')
            slack('Purging `%s`.' % batch_to_purge)
            loader.purge_batch(batch_to_purge)
        start_time = time.time()
        solr = SolrConnection(settings.SOLR)
        solr.optimize()
        slack('Index optimize complete in `%s` seconds.'
              % (time.time() - start_time))

    # LOAD ALL BATCHES
    # start with replacement batches
    final_loader = batch_loader.BatchLoader(process_ocr=True,
                                            process_coordinates=True)
    if replacement_batches:
        replace_start = time.time()
        for replacement in replacement_batches:
            final_loader.load_batch('drop/%s' % replacement, strict=False)
            slack('Loaded replacement batch `%s`.' % replacement)
        slack('All replacement batches loaded in `%s` seconds.'
              % (time.time() - replace_start))

    # load new batches
    if new_batches:
        new_start = time.time()
        for new in new_batches:
            final_loader.load_batch('drop/%s' % new, strict=False)
            slack('Loaded new batch `%s`.' % new)
        slack('All new batches loaded in `%s` seconds.'
              % (time.time() - new_start))

    slack('Batch loading job complete!')
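The chained replace() calls above decrement a batch's verNN suffix so the previous version can be purged before its replacement is loaded. A more general sketch of the same logic using a regex, assuming the version suffix ends the batch name; decrement_version is an illustrative helper, not part of the original code:

import re

def decrement_version(batch_name):
    # 'batch_foo_ver03' -> 'batch_foo_ver02'
    return re.sub(r'ver(\d{2})$',
                  lambda m: 'ver%02d' % (int(m.group(1)) - 1),
                  batch_name)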