Example #1
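A model method that reindexes all of a user's entries: it deletes the user's existing documents from Solr, re-adds them in fixed-size slices, then commits and optimizes the index.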
def solr_reindex(self):
    """
    Reindex all entries.  Used when switching to/from "private" status.
    """
    solr_conn = SolrConnection(settings.SOLR_URL)
    # Start by deleting all of this user's documents from the index.
    solr_conn.delete_query('user:%s' % self.user.id)
    entries = Entry.objects.filter(user=self.user)
    docs = []
    # Batch size for each add_many() call.
    SLICE_SIZE = 50
    for s in range(0, entries.count(), SLICE_SIZE):
        entry_slice = entries[s:s + SLICE_SIZE]
        for entry in entry_slice:
            docs.append(entry.solr_doc)
            if len(docs) == SLICE_SIZE:
                try:
                    solr_conn.add_many(docs)
                except Exception:
                    # TODO: log the failed slice rather than dropping it silently.
                    pass
                docs = []
    # Don't miss the leftovers
    if docs:
        solr_conn.add_many(docs)
    solr_conn.commit()
    solr_conn.optimize()
Example #2
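A management command that optimizes a Solr index and reports how long the optimize took.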
def handle(self, *args, **options):
    self.stdout.write("Optimizing Solr index %s" % settings.SOLR)
    solr = SolrConnection(settings.SOLR)
    start_time = default_timer()
    solr.optimize()
    elapsed = default_timer() - start_time
    self.stdout.write("Solr took %0.3f seconds to optimize %s" %
                      (elapsed, settings.SOLR))
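All of these snippets share one pattern: open a connection, queue documents, then commit and optimize. A minimal sketch of that pattern, assuming the solrpy-style client used above (a SolrConnection with add_many/commit/optimize) and a Django settings module that defines the Solr URL:

from django.conf import settings
from solr import SolrConnection  # assumption: the solrpy package, as in the snippets

solr = SolrConnection(settings.SOLR)
docs = [{'id': '1', 'text': 'hello'}]  # hypothetical documents
solr.add_many(docs)   # queue the documents for indexing
solr.commit()         # make the queued documents searchable
solr.optimize()       # merge index segments; expensive, so run it after bulk loads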
Example #3
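A management command that indexes all entries, or only one user's, adding documents in batches and committing periodically before the final commit and optimize.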
class Command(BaseCommand):
    user_option = optparse.make_option(
        '--user',
        action='store',
        dest='user',
        help='name of user whose entries to purge')
    option_list = BaseCommand.option_list + (user_option, )
    help = "index all or user-specific entries in solr"
    args = 'an optional username'

    def handle(self, *args, **options):
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print "indexing user"
            self.index_entries(user=options['user'])
        else:
            print 'indexing everything'
            self.index_entries()
        print 'committing'
        self.solr.commit()
        print 'optimizing'
        self.solr.optimize()

    def index_entries(self, user=''):
        counter = 0
        entries = m.Entry.objects.all()
        if user:
            entries = entries.filter(user__username=user)
        docs = []
        print 'entry count:', entries.count()
        SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY
        # Step through the queryset in SLICE_SIZE chunks.
        for s in range(0, entries.count(), SLICE_SIZE):
            print 'indexing %s to %s...' % (s, s + SLICE_SIZE)
            entry_slice = entries[s:s + SLICE_SIZE]
            for entry in entry_slice:
                counter += 1
                docs.append(entry.solr_doc)
                if len(docs) == MAX_DOCS_PER_ADD:
                    try:
                        self.solr.add_many(docs)
                    except Exception:
                        print 'BAD RECORD:', [d['id'] for d in docs]
                    docs = []
                    reset_queries()
                    if counter % (COMMIT_FREQUENCY * MAX_DOCS_PER_ADD) == 0:
                        print 'committing at count:', counter
                        self.solr.commit()
        # Don't miss the leftovers
        if docs:
            self.solr.add_many(docs)
Example #4
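A management command that purges a batch by name and, when --optimize is passed, optimizes both the Solr index and the MySQL OCR table.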
    def handle(self, batch_name=None, *args, **options):
        if len(args) != 0:
            raise CommandError('Usage is purge_batch %s' % self.args)

        loader = BatchLoader()
        try:
            log.info("purging batch '%s'", batch_name)
            loader.purge_batch(batch_name)
            if options['optimize']:
                log.info("optimizing solr")
                solr = SolrConnection(settings.SOLR)
                solr.optimize()
                log.info("optimizing MySQL OCR table")
                cursor = connection.cursor()
                cursor.execute("OPTIMIZE TABLE core_ocr")
                log.info("finished optimizing")
        except BatchLoaderException, e:
            log.exception(e)
            raise CommandError("unable to purge batch. check the purge_batch log for clues")
Example #5
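A variant of the previous command that identifies the batch by its location rather than its name.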
    def handle(self, batch_location=None, *args, **options):
        if len(args) != 0:
            raise CommandError('Usage is purge_batch %s' % self.args)

        loader = BatchLoader()
        try:
            log.info("purging batch %s", batch_location)
            loader.purge_batch(batch_location)
            if options['optimize']:
                log.info("optimizing solr")
                solr = SolrConnection(settings.SOLR)
                solr.optimize()
                log.info("optimizing MySQL OCR table")
                cursor = connection.cursor()
                cursor.execute("OPTIMIZE TABLE core_ocr")
                log.info("finished optimizing")
        except BatchLoaderException, e:
            log.exception(e)
            raise CommandError("unable to purge batch. check the purge_batch log for clues")
Example #6
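A management command that loads new and replacement newspaper batches from an FTP drop: it rsyncs the batches over, loads any new title MARC records, purges batches being replaced (optimizing Solr afterwards), loads everything, and posts progress to Slack along the way.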
    def handle(self, *args, **options):
        def get_immediate_subdirectories(a_dir):
            return [
                name for name in os.listdir(a_dir)
                if os.path.isdir(os.path.join(a_dir, name))
            ]

        def slack(message):
            sc.api_call("chat.postMessage", channel="#ghnp", text=message)

        start = datetime.now()

        sc = SlackClient(settings.SLACK_KEY)

        loader = BatchLoader()

        new_batches_location = '/opt/chronam/data/chronamftp/new_batches/'
        replacement_batches_location = '/opt/chronam/data/chronamftp/replacement_batches/'
        nonlccn_location = '/opt/chronam/data/nonlccn/'
        batch_drop = '/opt/chronam/data/dlg_batches/drop/'

        # GET LIST OF BATCHES TO LOAD
        new_batches = get_immediate_subdirectories(new_batches_location)
        replacement_batches = get_immediate_subdirectories(
            replacement_batches_location)

        # CHECK new_batches FOR finalMARC FOLDERS
        new_title_folders = []
        # Iterate over a copy: removing items from new_batches while
        # iterating it directly would skip elements.
        for folder in list(new_batches):
            if 'MARC' in folder:
                new_title_folders.append(folder)
                new_batches.remove(folder)

        # ISSUE STARTING NOTIFICATIONS
        slack(
            'Starting DLG Batch Load Process! Found `%s` new batches and `%s` replacement batches available to load.'
            % (len(new_batches), len(replacement_batches)))

        # RUN KEVIN'S RSYNC COMMANDS, WAIT
        slack('RSync of batches is starting')
        start_time = time.time()
        slack('Copying new batches')
        # shell=True so the * glob expands; a single command string inside a
        # list would be treated as an (unfindable) executable name.
        subprocess.call(
            'rsync -rav --progress /opt/chronam/data/chronamftp/new_batches/* '
            '/opt/chronam/data/dlg_batches/drop/',
            shell=True)
        slack('Copying replacement batches')
        subprocess.call(
            'rsync -rav --progress /opt/chronam/data/chronamftp/replacement_batches/* '
            '/opt/chronam/data/dlg_batches/drop/',
            shell=True)
        duration = time.time() - start_time
        slack('RSync of new and replacement batches completed in %s seconds' %
              duration)

        # LOAD NEW TITLES IF PRESENT
        if new_title_folders:
            slack('Also found `%s` title MARC files to process.' %
                  len(new_title_folders))
            for nt in new_title_folders:
                for nt_f in os.listdir(os.path.join(new_batches_location, nt)):
                    if nt_f.endswith('.xml'):
                        marc_file = os.path.join(nonlccn_location, nt_f)
                        copyfile(os.path.join(new_batches_location, nt, nt_f),
                                 marc_file)
                        title_load_results = title_loader.load(marc_file)
                        if title_load_results[1]:
                            slack('New title created from `%s`.' % nt_f)
                        if title_load_results[2]:
                            slack('Title updated from `%s`.' % nt_f)
                        if title_load_results[3]:
                            slack('Error on title load from `%s`' % nt_f)
            index_titles(start)
            slack('Finished loading titles.')

        # PURGE REPLACEMENT BATCHES
        if replacement_batches:
            slack('Purging batches destined for replacement.')
            for r_b in replacement_batches:
                batch_to_purge = r_b.replace('ver02','ver01')\
                    .replace('ver03','ver02')\
                    .replace('ver04','ver03')\
                    .replace('ver05','ver04')\
                    .replace('ver06','ver05')\
                    .replace('ver07','ver06')\
                    .replace('ver08','ver07')
                slack('Purging `%s`.' % batch_to_purge)
                loader.purge_batch(batch_to_purge)
            start_time = time.time()
            solr = SolrConnection(settings.SOLR)
            solr.optimize()
            # Parenthesize the subtraction: % binds tighter than -, so the
            # original expression raised a TypeError.
            slack('Index optimize complete in `%s` seconds.' %
                  (time.time() - start_time))

        # LOAD ALL BATCHES
        # start with replacement batches
        final_loader = batch_loader.BatchLoader(process_ocr=True,
                                                process_coordinates=True)
        if replacement_batches:
            replace_start = time.time()
            for replacement in replacement_batches:
                final_loader.load_batch('drop/%s' % replacement, strict=False)
                slack('Loaded replacement batch `%s`.' % replacement)
            slack('All replacement batches loaded in `%s` seconds.' %
                  (time.time() - replace_start))
        # load new batches
        if new_batches:
            new_start = time.time()
            for new in new_batches:
                final_loader.load_batch('drop/%s' % new, strict=False)
                slack('Loaded new batch `%s`.' % new)
            slack('All new batches loaded in `%s` seconds.' %
                  (time.time() - new_start))

        slack('Batch loading job complete!')