Example #1
    def handle(self, *file_or_urls, **options):
        new = options.get('new')
        if new:
            data = '<delete><query>*:*</query></delete>'
            r = urllib2.Request(conf.settings.SOLR_URL + 'update?commit=true')
            r.add_header('Content-Type', 'text/xml')
            r.add_data(data)
            f = urllib2.urlopen(r)
            print "Solr response to deletion request:"
            print f.read()
        if file_or_urls:
            parser = options.get('parser')
            module = None
            if parser:
                if parser.endswith('.py'):
                    parser = parser[:-3]
                module = __import__('kochief.discovery.parsers.' + parser, globals(), 
                        locals(), [parser])
        for file_or_url in file_or_urls:
            if not module:
                # guess parser based on file extension
                if file_or_url.endswith('.mrc'):
                    import kochief.discovery.parsers.marc as module
            if not module:
                raise mb.CommandError("Please specify a parser.")
            print "Converting %s to CSV ..." % file_or_url
            t1 = time.time()
            data_handle = urllib.urlopen(file_or_url)
            # Open the CSV outside the try block so the finally clause never
            # sees an unbound csv_handle if the open itself fails.
            csv_handle = open(CSV_FILE, 'w')
            try:
                record_count = module.write_csv(data_handle, csv_handle,
                        collections=options.get('collections'))
            finally:
                csv_handle.close()
            t2 = time.time()
            load_solr(CSV_FILE)
            t3 = time.time()
            os.remove(CSV_FILE)
            p_time = (t2 - t1) / 60
            l_time = (t3 - t2) / 60
            t_time = p_time + l_time
            rate = record_count / (t3 - t1)
            print """Processing took %0.3f minutes.
Loading took %0.3f minutes.  
That's %0.3f minutes total for %d records, 
at a rate of %0.3f records per second.
""" % (p_time, l_time, t_time, record_count, rate)
Example #2
    def handle(self, *file_or_urls, **options):
        new = options.get('new')
        if new:
            # create/replace index
            pass
        if file_or_urls:
            parser = options.get('parser')
            module = None
            if parser:
                if parser.endswith('.py'):
                    parser = parser[:-3]
                module = __import__('kochief.discovery.parsers.' + parser, globals(), 
                        locals(), [parser])
        for file_or_url in file_or_urls:
            if not module:
                # guess parser based on file extension
                if file_or_url.endswith('.mrc'):
                    import kochief.discovery.parsers.marc as module
            if not module:
                raise mb.CommandError("Please specify a parser.")
            print "Converting %s to CSV ..." % file_or_url
            t1 = time.time()
            data_handle = urllib.urlopen(file_or_url)
            csv_handle = open(CSV_FILE, 'w')
            try:
                record_count = module.write_csv(data_handle, csv_handle)
            finally:
                csv_handle.close()
            t2 = time.time()
            load_solr(CSV_FILE)
            t3 = time.time()
            os.remove(CSV_FILE)
            p_time = (t2 - t1) / 60
            l_time = (t3 - t2) / 60
            t_time = p_time + l_time
            rate = record_count / (t3 - t1)
            print """Processing took %0.3f minutes.
Loading took %0.3f minutes.  
That's %0.3f minutes total for %d records, 
at a rate of %0.3f records per second.
""" % (p_time, l_time, t_time, record_count, rate)
Example #3
    def handle(self, *file_or_urls, **options):
        print( 'options, ```%s```' % options )
        new = options.get('new')
        delete_rec = options.get('delete_rec')
        expired = options.get('expired')
        not_updated = options.get('not_updated')
        delete_set_url = options.get('delete_set')
        optimize = options.get('optimize')
        if delete_rec:
            data = '<delete><query>id:%s</query></delete>' % delete_rec
            r = urllib2.Request(conf.settings.SOLR_URL + 'update?commit=true')
            r.add_header('Content-Type', 'text/xml')
            r.add_data(data)
            f = urllib2.urlopen(r)
            print "Solr response to deletion request for rec with id: %s" % delete_rec
            print f.read()
        if delete_set_url:
            #Pass in result set url
            # delete_set_response = simplejson.load(urllib2.urlopen(delete_set_url.strip('"')))
            file_handler = urllib2.urlopen( delete_set_url.strip('"') )
            delete_set_response = json.loads( file_handler.read() )
            delete_set = []
            solr_data = ""
            for doc in delete_set_response['response']['docs']:
                delete_set.append(doc['id'])
                solr_data += '<query>id:%s</query>' % doc['id']
            delete_set = ",".join(delete_set)
            print "Deleting %s" % delete_set
            data = '<delete>%s</delete>' % solr_data
            r = urllib2.Request(conf.settings.SOLR_URL + 'update?commit=true')
            r.add_header('Content-Type', 'text/xml')
            r.add_data(data)
            f = urllib2.urlopen(r)
            print "Solr response to deletion request"
            print f.read()
        if new:
            data = '<delete><query>*:*</query></delete>'
            r = urllib2.Request(conf.settings.SOLR_URL + 'update?commit=true')
            r.add_header('Content-Type', 'text/xml')
            r.add_data(data)
            f = urllib2.urlopen(r)
            print "Solr response to deletion request:"
            print f.read()
        if expired:
            data = '<delete><query>%s</query></delete>' % conf.settings.EXPIRED_RECORDS_QUERY
            r = urllib2.Request(conf.settings.SOLR_URL + 'update?commit=true')
            r.add_header('Content-Type', 'text/xml')
            r.add_data(data)
            f = urllib2.urlopen(r)
            print "Solr response to deletion request for records with a cat date older than the time specified in settings.py."
            print f.read()
        if not_updated:
            data = '<delete><query>%s</query></delete>' % conf.settings.NOT_UPDATED_RECORDS_QUERY
            r = urllib2.Request(conf.settings.SOLR_URL + 'update?commit=true')
            r.add_header('Content-Type', 'text/xml')
            r.add_data(data)
            f = urllib2.urlopen(r)
            print "Solr response to deletion request for records not updated since the time specified in settings.py."
            print f.read()
        if optimize:
            print "Will optimize: %s." % conf.settings.SOLR_URL
            data = '<optimize/>'
            r = urllib2.Request(conf.settings.SOLR_URL + 'update?commit=true')
            r.add_header('Content-Type', 'text/xml')
            r.add_data(data)
            f = urllib2.urlopen(r)
            print "Solr response to optimize request."
            print f.read()
        if file_or_urls:
            parser = options.get('parser')
            module = None
            if parser:
                if parser.endswith('.py'):
                    parser = parser[:-3]
                module = __import__('kochief.discovery.parsers.' + parser, globals(),
                        locals(), [parser])
        for file_or_url in file_or_urls:
            if not module:
                # guess parser based on file extension
                if file_or_url.endswith('.mrc'):
                    import kochief.discovery.parsers.marc as module
            if not module:
                raise mb.CommandError("Please specify a parser.")
            print "Converting %s to CSV ..." % file_or_url
            t1 = time.time()
            try:
                data_handle = urllib.urlopen(file_or_url)
            except IOError:
                # On Windows, urllib can fail to open a local file path.
                data_handle = open(file_or_url)
            # Open in binary mode; on Windows, text mode writes blank rows into the CSV.
            csv_handle = open(CSV_FILE, 'wb')
            try:
                record_count = module.write_csv(data_handle, csv_handle,
                        collections=options.get('collections'))
            finally:
                csv_handle.close()
            t2 = time.time()
            load_solr(CSV_FILE)
            t3 = time.time()
            os.remove(CSV_FILE)
            p_time = (t2 - t1) / 60
            l_time = (t3 - t2) / 60
            t_time = p_time + l_time
            rate = record_count / (t3 - t1)
            print """Processing took %0.3f minutes.
Loading took %0.3f minutes.
That's %0.3f minutes total for %d records,
at a rate of %0.3f records per second.
""" % (p_time, l_time, t_time, record_count, rate)