# Imports assumed by the snippets below (Python 2-era; on Python 3,
# BadStatusLine lives in http.client):
import logging
from time import sleep
from httplib import BadStatusLine

import simplejson
from solr import SolrConnection

logger = logging.getLogger(__name__)


def oralookup(
    pid=None,
    uuid=None,
    fields_to_return="f_name, f_subject, f_keyphrase, faculty, f_institution, thesis_type, content_type, collection",
):
    s = SolrConnection("ora.ouls.ox.ac.uk:8080")
    results = {}
    if pid:
        # Escape the colon so Solr does not treat it as a field separator
        pid = r"\:".join(pid.split(":"))
        query = "id:%s" % pid
    elif uuid:
        query = r"id:uuid\:%s" % uuid
    else:
        return results
    # Run the actual query (3 tries, failover)
    r = None
    tries = 0
    while tries != 3:
        try:
            r = s.search(q=query, wt="json", fl=fields_to_return)
            logger.debug("Solr response: %s" % r)
            tries = 3
        except BadStatusLine:
            sleep(0.5)
            tries = tries + 1
    if r is None:
        logger.warning("Solr endpoint did not answer after 3 tries")
        return {}
    try:
        results = simplejson.loads(r)
        assert results["response"]["numFound"] == 1
        return results["response"]["docs"][0]
    except ValueError:
        logger.warning("Couldn't parse JSON response from Solr endpoint: %s" % r)
        return {}
    except AssertionError:
        logger.warning("Couldn't assert that only a single result was fetched: %s" % results)
        return {}
def titlelookup(pid=None, uuid=None):
    s = SolrConnection("ora.ouls.ox.ac.uk:8080")
    results = {}
    if pid:
        pid = r"\:".join(pid.split(":"))
        query = "id:%s" % pid
    elif uuid:
        query = r"id:uuid\:%s" % uuid
    else:
        return results
    # Run the actual query (3 tries, failover)
    r = None
    tries = 0
    while tries != 3:
        try:
            r = s.search(q=query, wt="json", fl="title")
            logger.debug("Solr response: %s" % r)
            tries = 3
        except BadStatusLine:
            sleep(0.5)
            tries = tries + 1
    if r is None:
        logger.warning("Solr endpoint did not answer after 3 tries")
        return {}
    try:
        results = simplejson.loads(r)
        assert results["response"]["numFound"] == 1
        doc = results["response"]["docs"][0]
        return doc["title"]
    except ValueError:
        logger.warning("Couldn't parse JSON response from Solr endpoint: %s" % r)
        return {}
    except AssertionError:
        logger.warning("Couldn't assert that only a single result was fetched: %s" % results)
        return {}
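A minimal usage sketch for the two helpers above (the identifiers are placeholders, not real ORA records):

record = oralookup(pid="ora:1234")  # hypothetical pid
if record:
    print(record.get("thesis_type"))

print(titlelookup(uuid="deadbeef-0000"))  # hypothetical uuid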
Example #3
    def solr_delete(self):
        """
        Remove from solr index
        """
        solr_conn = SolrConnection(settings.SOLR_URL, persistent=False)
        solr_conn.delete_query('id:%s' % self.id)
        solr_conn.commit()
Example #4
def word_matches_for_page(page_id, words):
    """
    Gets a list of pre-analyzed words for a list of words on a particular
    page. So if you pass in 'manufacturer' you can get back a list like
    ['Manufacturer', 'manufacturers', 'MANUFACTURER'] etc ...
    """
    solr = SolrConnection(settings.SOLR)

    # Make sure page_id is of type str, else the following string
    # operation may result in a UnicodeDecodeError. For example, see
    # ticket #493
    if not isinstance(page_id, str):
        page_id = str(page_id)

    ocr_list = ["ocr"]
    ocr_list.extend(["ocr_%s" % l for l in settings.SOLR_LANGUAGES])
    ocrs = " OR ".join([query_join(words, o) for o in ocr_list])
    q = "id:%s AND (%s)" % (page_id, ocrs)
    params = {
        "hl.snippets": 100,
        "hl.requireFieldMatch": "true",
        "hl.maxAnalyzedChars": "102400"
    }
    response = solr.query(q, fields=["id"], highlight=ocr_list, **params)

    if page_id not in response.highlighting:
        return []

    words = set()
    for ocr in ocr_list:
        if ocr in response.highlighting[page_id]:
            for context in response.highlighting[page_id][ocr]:
                words.update(find_words(context))
    return list(words)
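A usage sketch for the function above (assuming a Django settings module with SOLR and SOLR_LANGUAGES configured, plus the query_join and find_words helpers these examples rely on; the page id is a placeholder):

variants = word_matches_for_page("/lccn/sn99999999/1900-01-01/ed-1/seq-1/",
                                 ["manufacturer"])
print(variants)  # e.g. ['Manufacturer', 'manufacturers', 'MANUFACTURER']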
Example #5
def main():
    opts, args = parse_opts()
    logging.basicConfig(filename=opts.log_file, level=logging.DEBUG,
                        format='[%(asctime)s|%(levelname)s|%(name)s|%(threadName)s|%(message)s]')

    solr = SolrConnection(opts.solr_uri)
    protocol = LineProtocol()
    for request in protocol.input():
        try:
            query = build_query(request)
            if query is None:
                protocol.output(query_failed(), True)
                continue
            log.debug("Running query: " + str(query))
            results = solr.search(**query)
            if results is None:
                protocol.output({'code' : 400})
                continue
            resp = json.loads(results)
            ret = {
                'code' : 200,
                'json' : resp['response']
            }
            protocol.output(ret, True)
        except Exception:
            log.exception("Uncaught exception")
    return 0
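For context, a minimal stand-in for the LineProtocol helper used above might look like this (purely illustrative; the real class is not shown in this listing):

import json
import sys

class LineProtocol(object):
    """Hypothetical line protocol: one JSON request per stdin line,
    one JSON response per stdout line."""

    def input(self):
        for line in sys.stdin:
            yield json.loads(line)

    def output(self, payload, flush=False):
        sys.stdout.write(json.dumps(payload) + "\n")
        if flush:
            sys.stdout.flush()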
Example #6
def titlelookup(pid=None, uuid=None):
    s = SolrConnection("ora.ouls.ox.ac.uk:8080")
    results = {}
    if pid:
        pid = r"\:".join(pid.split(":"))
        query = "id:%s" % pid
    elif uuid:
        query = r"id:uuid\:%s" % uuid
    else:
        return results
    # Run the actual query (3 tries, failover)
    r = None
    tries = 0
    while tries != 3:
        try:
            r = s.search(q=query, wt="json", fl="title")
            logger.debug("Solr response: %s" % r)
            tries = 3
        except BadStatusLine:
            sleep(0.5)
            tries = tries + 1
    if r is None:
        logger.warning("Solr endpoint did not answer after 3 tries")
        return {}
    try:
        results = simplejson.loads(r)
        assert results["response"]["numFound"] == 1
        doc = results["response"]["docs"][0]
        return doc["title"]
    except ValueError:
        logger.warning("Couldn't parse JSON response from Solr endpoint: %s" % r)
        return {}
    except AssertionError:
        logger.warning("Couldn't assert that only a single result was fetched: %s" % results)
        return {}
Example #7
    def solr_index(self):
        """
        Write out to solr
        """
        solr_conn = SolrConnection(settings.SOLR_URL, persistent=False)
        solr_conn.add(**self.solr_doc)
        solr_conn.commit()
Example #8
    def _get_count(self):
        "Returns the total number of objects, across all pages."
        if self._count is None:
            solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
            solr_response = solr.query(self._q, fields=['id'])
            self._count = int(solr_response.results.numFound)
        return self._count
Example #9
    def _get_count(self):
        "Returns the total number of objects, across all pages."
        if self._count is None:
            solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
            solr_response = solr.query(self._q, fields=['id'])
            self._count = int(solr_response.results.numFound)
        return self._count
Example #10
def word_matches_for_page(page_id, words):
    """
    Gets a list of pre-analyzed words for a list of words on a particular
    page. So if you pass in 'manufacturer' you can get back a list like
    ['Manufacturer', 'manufacturers', 'MANUFACTURER'] etc ...
    """
    solr = SolrConnection(settings.SOLR)

    # Make sure page_id is of type str, else the following string
    # operation may result in a UnicodeDecodeError. For example, see
    # ticket #493
    if not isinstance(page_id, str):
        page_id = str(page_id)

    ocr_list = ['ocr', ]
    ocr_list.extend(['ocr_%s' % l for l in settings.SOLR_LANGUAGES])
    ocrs = ' OR '.join([query_join(words, o) for o in ocr_list])
    q = 'id:%s AND (%s)' % (page_id, ocrs)
    params = {"hl.snippets": 100, "hl.requireFieldMatch": 'true', "hl.maxAnalyzedChars": '102400'}
    response = solr.query(q, fields=['id'], highlight=ocr_list, **params)

    if page_id not in response.highlighting:
        return []

    words = set()
    for ocr in ocr_list:
        if ocr in response.highlighting[page_id]:
            for context in response.highlighting[page_id][ocr]:
                words.update(find_words(context))
    return list(words)
Example #11
def index_evidence(evidence):
    evidence_medicine_list = []

    evidence_medicine = MedicineEvidenceSummary.objects.filter(evidence=evidence.id)
    for evimed in evidence_medicine:
        if evimed.medicine.name not in evidence_medicine_list:
            evidence_medicine_list.append(evimed.medicine.name)

    # Try to create a connection to a solr server and send the evidence document
    try:
        solr = SolrConnection(settings.SOLR_URL)
        solr.add(
            id="evidence-%s-%s" % (evidence.language, evidence.id),
            type="evidence",
            title=evidence.title,
            description=evidence.description,
            context=evidence.context,
            question=evidence.question,
            link=evidence.link,
            file=evidence.file,
            language=evidence.language,
            evidence_medicine=evidence_medicine_list,
        )
        solr.commit()
    except Exception:
        return False

    return True
Example #12
def oralookup(
    pid=None,
    uuid=None,
    fields_to_return="f_name, f_subject, f_keyphrase, faculty, f_institution, thesis_type, content_type, collection",
    endpoint="http://ora.ouls.ox.ac.uk:8080/solr/select",
):
    s = SolrConnection(endpoint)
    results = {}
    if pid:
        pid = r"\:".join(pid.split(":"))
        query = "id:%s" % pid
    elif uuid:
        query = r"id:uuid\:%s" % uuid
    else:
        return results
    # Run the actual query (3 tries, failover)
    r = None
    tries = 0
    while tries != 3:
        try:
            r = s.query(q=query, fields=fields_to_return)
            logger.debug("Solr response: %s" % r.header)
            tries = 3
        except BadStatusLine:
            sleep(0.5)
            tries = tries + 1
    if r is None:
        logger.warning("Solr endpoint did not answer after 3 tries")
        return {}
    try:
        # This variant gets parsed results back, so no JSON decoding is needed
        assert len(r.results) == 1
        return r.results[0]
    except AssertionError:
        logger.warning("Couldn't assert that only a single result was fetched: %s" % results)
        return {}
Example #13
def index_title(title, solr=None):
    if solr is None:
        solr = SolrConnection(settings.SOLR)
    _log.info("indexing title: lccn=%s" % title.lccn)
    try:
        solr.add(**title.solr_doc)
    except Exception as e:
        _log.exception(e)
Example #14
def _solr_connection():
    from solr import SolrConnection
    url = config['proactive_disclosure.solr_url']
    user = config.get('proactive_disclosure.solr_user')
    password = config.get('proactive_disclosure.solr_password')
    if user is not None and password is not None:
        return SolrConnection(url, http_user=user, http_pass=password)
    return SolrConnection(url)
Example #15
def index_title(title, solr=None):
    if solr is None:
        solr = SolrConnection(settings.SOLR)
    _log.info("indexing title: lccn=%s" % title.lccn)
    try:
        solr.add(**title.solr_doc)
    except Exception as e:
        _log.exception(e)
Example #16
def make_connection():
    from solr import SolrConnection
    if solr_user is not None and solr_password is not None:
        return SolrConnection(solr_url,
                              http_user=solr_user,
                              http_pass=solr_password)
    else:
        return SolrConnection(solr_url)
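The function above reads module-level solr_url, solr_user, and solr_password globals; a minimal sketch of how they might be populated (the values are placeholders, not from the source):

solr_url = "http://localhost:8983/solr/core0"  # hypothetical endpoint
solr_user = None      # or e.g. "indexer"
solr_password = None  # or a secret from your settings store

conn = make_connection()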
Example #17
def _solr_connection():
    from solr import SolrConnection
    url = config['ati_summaries.solr_url']
    user = config.get('ati_summaries.solr_user')
    password = config.get('ati_summaries.solr_password')
    if user is not None and password is not None:
        return SolrConnection(url, http_user=user, http_pass=password)
    return SolrConnection(url)
Example #18
def index_title(title, solr=None):
    if solr is None:
        solr = SolrConnection(settings.SOLR)
    LOGGER.info("indexing title: lccn=%s", title.lccn)
    try:
        solr.add(**title.solr_doc)
    except Exception as e:
        LOGGER.exception(e)
Example #19
    def handle(self, *args, **options):
        self.stdout.write("Optimizing Solr index %s" % settings.SOLR)
        solr = SolrConnection(settings.SOLR)
        start_time = default_timer()
        solr.optimize()
        elapsed = default_timer() - start_time
        self.stdout.write("Solr took %0.3f seconds to optimize %s" %
                          (elapsed, settings.SOLR))
Example #20
def index_title(title, solr=None):
    if solr is None:
        solr = SolrConnection(settings.SOLR)
    LOGGER.info("indexing title: lccn=%s", title.lccn)
    try:
        solr.add(**title.solr_doc)
    except Exception as e:
        LOGGER.exception(e)
Example #21
    def __init__(self, process_ocr=True, process_coordinates=True):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        self.PROCESS_OCR = process_ocr
        self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates
Example #22
def get_page_text(page):
    no_text = ["Text not available"]
    solr = SolrConnection(settings.SOLR)
    query = 'id:"%s"' % page.url
    solr_results = solr.query(query)
    results_attribute = getattr(solr_results, 'results', None)
    if isinstance(results_attribute, list) and len(results_attribute) > 0:
        return results_attribute[0].get('ocr', no_text)
    else:
        return no_text
Example #23
def index_title(title, solr=None):
    if solr is None:
        solr = SolrConnection(settings.SOLR)

    LOGGER.debug("indexing title: lccn=%s", title.lccn)

    try:
        solr.add(**title.solr_doc)
    except Exception:
        LOGGER.exception("Unable to index title %s", title)
Example #24
def make_connection():
    from solr import SolrConnection
    solr_url, solr_user, solr_password = SolrSettings.get()
    assert solr_url is not None
    if solr_user is not None and solr_password is not None:
        return SolrConnection(solr_url,
                              http_user=solr_user,
                              http_pass=solr_password)
    else:
        return SolrConnection(solr_url)
Example #25
def get_page_text(page):
    no_text = ["Text not available"]
    solr = SolrConnection(settings.SOLR)
    query = 'id:"%s"' % page.url
    solr_results = solr.query(query)
    results_attribute = getattr(solr_results, 'results', None)
    if isinstance(results_attribute, list) and len(results_attribute) > 0:
        return results_attribute[0].get('ocr', no_text)
    else:
        return no_text
Example #26
    def solrconn(self, core='books'):
        try:
            solr_base_and_core = "%s/%s" % (solr_base, core)
            self.conn = SolrConnection(host=solr_host, solrBase=solr_base_and_core,
                                       username=solr_uname, password=solr_pswd)
            self.connstatus = True
            return True
        except Exception:
            # solr connection error
            return False
Example #27
def similar_pages(page):
    solr = SolrConnection(settings.SOLR)
    d = page.issue.date_issued
    date = '{0:04d}{1:02d}{2:02d}'.format(d.year, d.month, d.day)

    query = '+type:page AND date:%s AND %s AND NOT(lccn:%s)' % (
        date,
        query_join([p.city for p in page.issue.title.places.all()], 'city'),
        page.issue.title.lccn)
    response = solr.query(query, rows=25)
    results = response.results
    return [utils.get_page(**urlresolvers.resolve(r['id']).kwargs)
            for r in results]
Example #28
class Command(BaseCommand):
    user_option = optparse.make_option(
        '--user',
        action='store',
        dest='user',
        help='name of user whose entries to purge')
    option_list = BaseCommand.option_list + (user_option, )
    help = "index all or user-specific entries in solr"
    args = 'an optional username'

    def handle(self, *args, **options):
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print('indexing user')
            self.index_entries(user=options['user'])
        else:
            print('indexing everything')
            self.index_entries()
        print('committing')
        self.solr.commit()
        print('optimizing')
        self.solr.optimize()

    def index_entries(self, user=''):
        counter = 0
        entries = m.Entry.objects.all()
        if user:
            entries = entries.filter(user__username=user)
        docs = []
        print('entry count: %s' % entries.count())
        SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY
        slices = [x for x in range(entries.count())
                  if x % SLICE_SIZE == 0]
        for s in slices:
            print('indexing %s to %s...' % (s, s + SLICE_SIZE))
            entry_slice = entries[s:s + SLICE_SIZE]
            for entry in entry_slice:
                counter += 1
                docs.append(entry.solr_doc)
                if len(docs) == MAX_DOCS_PER_ADD:
                    try:
                        self.solr.add_many(docs)
                    except Exception:
                        print('BAD RECORD: %s' % [d['id'] for d in docs])
                    docs = []
                    reset_queries()
                    if counter % (COMMIT_FREQUENCY * MAX_DOCS_PER_ADD) == 0:
                        print('committing at count: %s' % counter)
                        self.solr.commit()
        # Don't miss the leftovers
        self.solr.add_many(docs)
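The batching above depends on two module-level tuning constants; illustrative values (the actual values are not shown in this listing):

MAX_DOCS_PER_ADD = 100   # hypothetical: docs sent per add_many() call
COMMIT_FREQUENCY = 50    # hypothetical: add_many() calls between commits
# With these values, each queryset slice covers 100 * 50 = 5000 entries.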
Example #29
def similar_pages(page):
    solr = SolrConnection(settings.SOLR)
    d = page.issue.date_issued
    date = '{0:04d}{1:02d}{2:02d}'.format(d.year, d.month, d.day)

    query = '+type:page AND date:%s AND %s AND NOT(lccn:%s)' % (
        date,
        query_join([p.city for p in page.issue.title.places.all()], 'city'),
        page.issue.title.lccn)
    response = solr.query(query, rows=25)
    results = response.results
    return [utils.get_page(**urlresolvers.resolve(r['id']).kwargs)
            for r in results]
Example #30
def delete_title(title, solr=None):
    if not solr:
        solr = SolrConnection(settings.SOLR)

    if isinstance(title, models.Title):
        title_id = title.url
    else:
        title_id = title

    q = "+type:title +id:%s" % title_id
    solr.delete_query(q)
    LOGGER.info("deleted title %s from the index", title)
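A usage sketch for delete_title (the title id is a placeholder; a models.Title instance also works, in which case its url attribute is used as the id):

solr = SolrConnection(settings.SOLR)
delete_title("/lccn/sn99999999/", solr=solr)  # hypothetical title id
solr.commit()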
Example #31
    def handle(self, *args, **options):
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print('indexing user')
            self.index_entries(user=options['user'])
        else:
            print('indexing everything')
            self.index_entries()
        print('committing')
        self.solr.commit()
        print('optimizing')
        self.solr.optimize()
Example #32
def execute_solr_query(query, fields, sort, sort_order, rows, start):
    solr = SolrConnection(settings.SOLR) # TODO: maybe keep connection around?
    solr_response = solr.query(query,
                               fields=['lccn', 'title',
                                       'edition',
                                       'place_of_publication',
                                       'start_year', 'end_year',
                                       'language'],
                               rows=rows,
                               sort=sort,
                               sort_order=sort_order,
                               start=start)
    return solr_response
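A usage sketch (the query string and sort field are illustrative; note that this version ignores its fields argument and hardcodes the field list):

response = execute_solr_query('+type:title AND language:eng', fields=None,
                              sort='start_year', sort_order='asc',
                              rows=10, start=0)
for doc in response.results:
    print("%s %s" % (doc['lccn'], doc['title']))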
Example #33
def _refresh(field=None, data=None, path=None, isCron=None):
    from solr import SolrConnection
    from ID3 import *
    s = SolrConnection(SOLR_URL)
    if path and path != '*':
        # called by user
        pathsArr = path.split(',')
    else:
        # called from cron
        pathsArr = folderpaths
    # Handles modify and add; deletion is handled in search, when a file
    # is in solr but no longer on the path.
    for path in pathsArr:
        for root, dirnames, filenames in os.walk(path):
            for extension in ['txt', 'log', 'py', 'pl', 'sql', 'mp3']:
                for filename in fnmatch.filter(filenames, '*.' + extension):
                    fullName = os.path.join(root, filename)
                    if os.path.getsize(fullName) > 8800000:
                        continue
                    # On cron runs, only index files modified in the last 24 hours
                    if not isCron or (time.time() - os.path.getmtime(fullName) < 24 * 60 * 60):
                        try:
                            if filename.endswith(('.txt', '.log', '.py', '.pl', '.sql')):
                                data = open(fullName, 'r').read()
                                data = filterTxt(data)
                            else:
                                # Index mp3 files by their ID3 tags
                                data = ''
                                audiofile = ID3(fullName)
                                audiofilekeys = audiofile.keys()
                                if 'TITLE' in audiofilekeys:
                                    data += audiofile['TITLE'] + " "
                                if 'ARTIST' in audiofilekeys:
                                    data += audiofile['ARTIST'] + " "
                                if 'ALBUM' in audiofilekeys:
                                    data += audiofile['ALBUM'] + " "
                                data = data.strip()
                            fullName = filterTxt(fullName)
                            filename = filterTxt(filename)
                            s.add(id=fullName, name=filename, txt=data)
                            s.commit()
                        except Exception:
                            # Skip files that fail to parse or index
                            pass
                        gc.collect()
Example #34
class Command(BaseCommand):
    user_option = optparse.make_option('--user',
        action='store', dest='user',
        help='name of user whose entries to purge')
    option_list = BaseCommand.option_list + (user_option,)
    help = "index all or user-specific entries in solr"
    args = 'an optional username'

    def handle(self, *args, **options):
        self.solr = SolrConnection(SOLR_URL)
        self.cursor = connection.cursor()
        if options['user']:
            print('indexing user')
            self.index_entries(user=options['user'])
        else:
            print('indexing everything')
            self.index_entries()
        print('committing')
        self.solr.commit()
        print('optimizing')
        self.solr.optimize()

    def index_entries(self, user=''):
        counter = 0
        entries = m.Entry.objects.all()
        if user:
            entries = entries.filter(user__username=user)
        docs = []
        print('entry count: %s' % entries.count())
        SLICE_SIZE = MAX_DOCS_PER_ADD * COMMIT_FREQUENCY
        slices = [x for x in range(entries.count())
                  if x % SLICE_SIZE == 0]
        for s in slices:
            print('indexing %s to %s...' % (s, s + SLICE_SIZE))
            entry_slice = entries[s:s + SLICE_SIZE]
            for entry in entry_slice:
                counter += 1
                docs.append(entry.solr_doc)
                if len(docs) == MAX_DOCS_PER_ADD:
                    try:
                        self.solr.add_many(docs)
                    except Exception:
                        print('BAD RECORD: %s' % [d['id'] for d in docs])
                    docs = []
                    reset_queries()
                    if counter % (COMMIT_FREQUENCY * MAX_DOCS_PER_ADD) == 0:
                        print('committing at count: %s' % counter)
                        self.solr.commit()
        # Don't miss the leftovers
        self.solr.add_many(docs)
Example #35
    def handle(self, **options):
        solr = SolrConnection(SOLR_URL)
        if options['user']:
            solr.delete_query('user:%s' % options['user'])
        else:
            solr.delete_query('id:[* TO *]')
        solr.commit()
Example #36
    def handle(self, **options):
        solr = SolrConnection(settings.SOLR)
        if options['batch']:
            solr.delete_query('batch: %s' % options['batch'])
        else:
            solr.delete_query('id:[* TO *]')
        solr.commit()
Example #37
    def __init__(self, channel_id):

        self.channel_id = channel_id

        # Build up a Solr query
        filters = []
        filters.append('type:request')
        filters.append('channel_id:%s' % channel_id)

        # Make the request to Solr
        solr = SolrConnection(settings.SOLR_URL)
        response = solr.select(q=' AND '.join(filters), rows=10,
                               fields='datetime, id', sort='datetime',
                               sort_order='asc')

        self.requests = response.results
Example #38
def index_titles(since=None):
    """Index all the titles and holdings that are modeled in the database.
    If you pass in a datetime object as the since parameter, only title
    records that have been created since that time will be indexed.
    """
    cursor = connection.cursor()
    solr = SolrConnection(settings.SOLR)
    if since:
        cursor.execute("SELECT lccn FROM core_title WHERE created >= '%s'" %
                       since)
    else:
        solr.delete_query('type:title')
        cursor.execute("SELECT lccn FROM core_title")

    count = 0
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        title = models.Title.objects.get(lccn=row[0])
        index_title(title, solr)
        count += 1
        if count % 100 == 0:
            _log.info("indexed %s titles" % count)
            reset_queries()
            solr.commit()
    solr.commit()
Example #39
def execute_solr_query(query, fields, sort, sort_order, rows, start):
    # default arg_separator - underscore won't work if fields to facet on
    # themselves have underscores in them
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    solr_response = solr.query(query,
                               fields=[
                                   'lccn', 'title', 'edition',
                                   'place_of_publication', 'start_year',
                                   'end_year', 'language'
                               ],
                               rows=rows,
                               sort=sort,
                               sort_order=sort_order,
                               start=start)
    return solr_response
Example #40
def index_missing_pages():
    """
    index all pages that are missing from solr in the database
    """
    solr = SolrConnection(settings.SOLR)
    count = 0
    pages = models.Page.objects.filter(indexed=False).all()
    number_of_pages = len(pages)
    for page in pages:
        LOGGER.info("[%s of %s] indexing page: %s", count, number_of_pages, page.url)
        solr.add(**page.solr_doc)
        count += 1
        page.indexed = True
        page.save()
    solr.commit()
Example #41
def execute_solr_query(query, fields, sort, sort_order, rows, start):
    # default arg_separator - underscore won't work if fields to facet on
    # themselves have underscores in them
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    solr_response = solr.query(query,
                               fields=['lccn', 'title',
                                       'edition',
                                       'place_of_publication',
                                       'start_year', 'end_year',
                                       'language'],
                               rows=rows,
                               sort=sort,
                               sort_order=sort_order,
                               start=start)
    return solr_response
Example #42
    def page(self, number):
        """
        Override the page method in Paginator since Solr has already
        paginated stuff for us.
        """

        number = self.validate_number(number)

        # figure out the solr query and execute it
        solr = SolrConnection(
            settings.SOLR)  # TODO: maybe keep connection around?
        start = self.per_page * (number - 1)
        params = {
            "hl.snippets": 100,  # TODO: make this unlimited
            "hl.requireFieldMatch": 'true',  # limits highlighting slop
            "hl.maxAnalyzedChars": '102400',  # increased from default 51200
        }
        sort_field, sort_order = _get_sort(self.query.get('sort'),
                                           in_pages=True)
        solr_response = solr.query(self._q,
                                   fields=[
                                       'id', 'title', 'date', 'sequence',
                                       'edition_label', 'section_label'
                                   ],
                                   highlight=self._ocr_list,
                                   rows=self.per_page,
                                   sort=sort_field,
                                   sort_order=sort_order,
                                   start=start,
                                   **params)

        pages = []
        for result in solr_response.results:
            page = models.Page.lookup(result['id'])
            if not page:
                continue
            words = set()
            coords = solr_response.highlighting[result['id']]
            for ocr in self._ocr_list:
                for s in coords.get(ocr) or []:
                    words.update(find_words(s))
            page.words = sorted(words, key=lambda v: v.lower())

            page.highlight_url = self.highlight_url(page.url, page.words,
                                                    number, len(pages))
            pages.append(page)

        return Page(pages, number, self)
Example #43
    def finished(self, **kwargs):

        source_id = kwargs['source_id']
        track_id = kwargs['track_id']

        # Build up a Solr query
        filters = []
        filters.append('type:request')
        filters.append('channel_id:%s' % self.channel_id)
        filters.append('request_source_id:%s' % source_id)
        filters.append('request_track_id:%s' % track_id)

        # Make the request to Solr
        solr = SolrConnection(settings.SOLR_URL)
        solr.delete_query(' AND '.join(filters))
        solr.commit()
Example #44
def index_titles(since=None):
    """Index all the titles and holdings that are modeled in the database.
    If you pass in a datetime object as the since parameter, only title
    records that have been created since that time will be indexed.
    """
    cursor = connection.cursor()
    solr = SolrConnection(settings.SOLR)
    if since:
        cursor.execute("SELECT lccn FROM core_title WHERE created >= '%s'" % since)
    else:
        solr.delete_query('type:title')
        cursor.execute("SELECT lccn FROM core_title")

    count = 0
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        title = models.Title.objects.get(lccn=row[0])
        index_title(title, solr)
        count += 1
        if count % 100 == 0:
            LOGGER.info("indexed %s titles", count)
            reset_queries()
            solr.commit()
    solr.commit()
Example #45
    def handle(self, **options):
        solr = SolrConnection(SOLR_URL)
        if options["user"]:
            solr.delete_query("user:%s" % options["user"])
        else:
            solr.delete_query("id:[* TO *]")
        solr.commit()
Example #46
    def handle(self, **options):
        solr = SolrConnection(settings.SOLR)
        if options['batch']:
            solr.delete_query('batch: %s' % options['batch'])
        else:
            solr.delete_query('id:[* TO *]')
        solr.commit()
Example #47
def build_index(**kwargs):
    """
    gets product/sku information from cps DB and indexes them in solr

    existing solr index is wiped before indexing. Revisit if this strategy
    does not work
    """
    # index status log message granularity
    log_index_status_chunks = 25000
    solr = SolrConnection(settings.SOLR)
    clear_index(solr=solr)
    count = 0
    fieldnames = (
        "name",
        "id",
        "description",
        "long_description",
        "age",
        "gender",
        "brand",
        "str_brand",
        "merchant",
        "str_merchant",
        "category",
        "str_category",
        "price",
        "sale_price",
        "buy_url",
        "image",
    )
    start = datetime.now()
    log.info("Reading product info from the database.....")
    products = db.get_cps_data()
    log.info("Building SOLR index.....")
    for product in products:
        try:
            product_record = dict(zip(fieldnames, product))
            solr.add(**product_record)
            count += 1
        except Exception as e:
            log.exception(e)
            continue
        if count % log_index_status_chunks == 0:
            log.info("Indexed %d products in %s" % (count, datetime.now() - start))
Example #48
    def page(self, number):
        """
        Override the page method in Paginator since Solr has already
        paginated stuff for us.
        """

        number = self.validate_number(number)

        # figure out the solr query and execute it
        solr = SolrConnection(settings.SOLR) # TODO: maybe keep connection around?
        start = self.per_page * (number - 1)
        params = {"hl.snippets": 100, # TODO: make this unlimited
            "hl.requireFieldMatch": 'true', # limits highlighting slop
            "hl.maxAnalyzedChars": '102400', # increased from default 51200
            }
        sort_field, sort_order = _get_sort(self.query.get('sort'), in_pages=True)
        solr_response = solr.query(self._q,
                                   fields=['id', 'title', 'date', 'sequence',
                                           'edition_label', 'section_label'],
                                   highlight=self._ocr_list,
                                   rows=self.per_page,
                                   sort=sort_field,
                                   sort_order=sort_order,
                                   start=start,
                                   **params)

        pages = []
        for result in solr_response.results:
            page = models.Page.lookup(result['id'])
            if not page:
                continue
            words = set()
            coords = solr_response.highlighting[result['id']]
            for ocr in self._ocr_list:
                for s in coords.get(ocr) or []:
                    words.update(find_words(s))
            page.words = sorted(words, key=lambda v: v.lower())

            page.highlight_url = self.highlight_url(page.url,
                                                    page.words,
                                                    number, len(pages))
            pages.append(page)

        return Page(pages, number, self)
Example #49
    def __init__(self, query):
        self.query = query.copy()

        # figure out the solr query
        q = title_search(self.query)

        try:
            page = int(self.query.get('page'))
        except (TypeError, ValueError):
            page = 1

        try:
            rows = int(self.query.get('rows'))
        except (TypeError, ValueError):
            rows = 50
        start = rows * (page - 1)

        # determine sort order
        sort_field, sort_order = _get_sort(self.query.get('sort'))

        # execute query
        solr = SolrConnection(
            settings.SOLR)  # TODO: maybe keep connection around?
        solr_response = solr.query(q,
                                   fields=[
                                       'lccn', 'title', 'edition',
                                       'place_of_publication', 'start_year',
                                       'end_year', 'language'
                                   ],
                                   rows=rows,
                                   sort=sort_field,
                                   sort_order=sort_order,
                                   start=start)

        # convert the solr documents to Title models
        # could use solr doc instead of going to db, if performance requires it
        lccns = [d['lccn'] for d in solr_response.results]
        results = []
        for lccn in lccns:
            try:
                title = models.Title.objects.get(lccn=lccn)
                results.append(title)
            except models.Title.DoesNotExist:
                pass  # TODO: log exception
Example #50
    def __init__(self, query):
        self.query = query.copy()

        # figure out the solr query
        q = title_search(self.query)

        try:
            page = int(self.query.get('page'))
        except (TypeError, ValueError):
            page = 1

        try:
            rows = int(self.query.get('rows'))
        except (TypeError, ValueError):
            rows = 50
        start = rows * (page - 1)

        # determine sort order
        sort_field, sort_order = _get_sort(self.query.get('sort'))

        # execute query
        solr = SolrConnection(settings.SOLR) # TODO: maybe keep connection around?
        solr_response = solr.query(q,
                                   fields=['lccn', 'title',
                                           'edition',
                                           'place_of_publication',
                                           'start_year', 'end_year',
                                           'language'],
                                   rows=rows,
                                   sort=sort_field,
                                   sort_order=sort_order,
                                   start=start)

        # convert the solr documents to Title models
        # could use solr doc instead of going to db, if performance requires it
        lccns = [d['lccn'] for d in solr_response.results]
        results = []
        for lccn in lccns:
            try:
                title = models.Title.objects.get(lccn=lccn)
                results.append(title)
            except models.Title.DoesNotExist:
                pass  # TODO: log exception
Example #51
    def handle(self, batch_name=None, *args, **options):
        if len(args) != 0:
            raise CommandError('Usage is purge_batch %s' % self.args)

        loader = BatchLoader()
        try:
            log.info("purging batch '%s'", batch_name)
            loader.purge_batch(batch_name)
            if options['optimize']:
                log.info("optimizing solr")
                solr = SolrConnection(settings.SOLR)
                solr.optimize()
                log.info("optimizing MySQL OCR table")
                cursor = connection.cursor()
                cursor.execute("OPTIMIZE TABLE core_ocr")
                log.info("finished optimizing")
        except BatchLoaderException as e:
            log.exception(e)
            raise CommandError("unable to purge batch. check the purge_batch log for clues")
Example #52
    def handle(self, batch_location=None, *args, **options):
        if len(args) != 0:
            raise CommandError('Usage is purge_batch %s' % self.args)

        loader = BatchLoader()
        try:
            log.info("purging batch %s", batch_location)
            loader.purge_batch(batch_location)
            if options['optimize']:
                log.info("optimizing solr")
                solr = SolrConnection(settings.SOLR)
                solr.optimize()
                log.info("optimizing MySQL OCR table")
                cursor = connection.cursor()
                cursor.execute("OPTIMIZE TABLE core_ocr")
                log.info("finished optimizing")
        except BatchLoaderException as e:
            log.exception(e)
            raise CommandError("unable to purge batch. check the purge_batch log for clues")
Example #53
    def __init__(self):

        # Build up a Solr query
        filters = []
        filters.append("type:channel")

        # Make the request to Solr
        solr = SolrConnection(settings.SOLR_URL)
        response = solr.select(
            q=" AND ".join(filters), rows=10, fields="datetime, channel_id", sort="channel_id", sort_order="asc"
        )

        # Restore the persisted channels
        for doc in response.results:

            channel_id = doc["channel_id"]

            # Create the channel in the URL hierarchy
            self.__dict__[channel_id] = ChannelResource.Channel(channel_id)
Example #54
def solr_connection(ini_prefix):
    """
    Set up solr connection
    :param ini_prefix: prefix to use in specifying .ini file keys (e.g.,
        ati_summaries to use config setting ati_summaries.solr_url etc.)
    :ptype ini_prefix: str

    :return a solr connection from configured URL, user, password settings
    :rtype object
    """
    from solr import SolrConnection
    url = config.get('{0:s}.solr_url'.format(ini_prefix))
    user = config.get('{0:s}.solr_user'.format(ini_prefix))
    password = config.get('{0:s}.solr_password'.format(ini_prefix))
    if url is None:
        raise KeyError('{0:s}.solr_url'.format(ini_prefix))
    if user is not None and password is not None:
        return SolrConnection(url, http_user=user, http_pass=password)
    return SolrConnection(url)
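A usage sketch, reusing the ini prefix mentioned in the docstring (the document fields are placeholders):

conn = solr_connection('ati_summaries')
conn.add(id='doc-1', title='example document')  # hypothetical document
conn.commit()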
Example #55
def get_connection():
    """Returns the global Solr connection, or creates one, as required."""
    global _solr

    if _solr:
        return _solr

    _solr = SolrConnection(url, http_user=http_user, http_pass=http_pass)

    return _solr
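The function memoizes the connection in the module-level _solr global (url, http_user, and http_pass are also assumed to be module globals); the effect:

conn1 = get_connection()
conn2 = get_connection()
assert conn1 is conn2  # the same SolrConnection instance is reused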
Example #56
class MLGBsolr:
    def __init__(self):
        self.s_result = ()
        self.conn = None
        self.connstatus = False
        self.req = None

    def solrconn(self, core='books'):
        try:
            solr_base_and_core = "%s/%s" % (solr_base, core)
            self.conn = SolrConnection(host=solr_host, solrBase=solr_base_and_core,
                                       username=solr_uname, password=solr_pswd)
            self.connstatus = True
            return True
        except Exception:
            # solr connection error
            return False

    def solrquery(self, para, core='books'):
        if self.solrconn(core):
            try:
                self.search = self.conn.search(para)
                self.connstatus = True
                return True
            except Exception:
                # solr query failed
                return False
        return False

    def solrresults(self, para, Facet=False, core='books'):
        if self.solrquery(para, core):
            rsp = simplejson.loads(self.search)

            s_numFound = rsp['response'].get('numFound', None)
            s_docs = rsp['response'].get('docs', None)
            s_params = rsp['responseHeader'].get('params', None)
            s_rows = s_params.get('rows', None)
            s_start = s_params.get('start', None)
            s_q = s_params.get('q', None)
            if Facet:
                s_facet_fields = rsp['facet_counts'].get('facet_fields', None)

            self.s_result = {
                'numFound': s_numFound,
                'search_params': s_params,
                'query': s_q,
                'start': s_start,
                'rows': s_rows,
                'docs': s_docs
            }
            if Facet:
                self.s_result['facet'] = s_facet_fields


    def __unicode__(self):
        return self.s_result
Example #57
    def index(self):

        # Build up a Solr query
        filters = []
        filters.append('type:request')
        filters.append('channel_id:%s' % self.channel_id)

        # Make the request to Solr
        solr = SolrConnection(settings.SOLR_URL)
        response = solr.select(q=' AND '.join(filters), rows=0, fields='',
                               facet='true', facet_field='request_track_id',
                               facet_mincount=1, facet_limit=10,
                               facet_sort='count')

        results = []
        for track_id in response.facet_counts['facet_fields']['request_track_id'].keys():

            source_id = 'grooveshark'

            # Build up a Solr query
            filters = []
            filters.append('type:track')
            filters.append('request_source_id:%s' % source_id)
            filters.append('request_track_id:%s' % track_id)

            # Make the request to Solr
            track_response = solr.select(q=' AND '.join(filters),
                                         fields='track_artist, track_album, track_title')

            if len(track_response.results) == 1:
                results.append({
                    'track_artist': track_response.results[0]['track_artist'],
                    'track_album': track_response.results[0]['track_album'],
                    'track_title': track_response.results[0]['track_title'],

                    'request_source_id': source_id,
                    'request_track_id': track_id,

                    'votes': response.facet_counts['facet_fields']['request_track_id'][track_id],
                })

        results = sorted(results, key=lambda result: -result['votes'])

        cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
        return json.dumps({'playlist': results}, ensure_ascii=False, indent=4).encode('utf-8')
Example #58
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database
    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """

    solr = SolrConnection(settings.SOLR)

    titles = models.Title.objects.all()
    if since:
        titles = titles.filter(created__gte=since)

    titles = titles.prefetch_related("languages", "alt_titles", "subjects",
                                     "notes", "places", "urls", "essays",
                                     "country", "holdings")

    count = 0

    for chunk in sliced(titles, 500):
        docs = []

        for title in chunk:
            try:
                docs.append(title.solr_doc)
            except Exception:
                LOGGER.exception("Unable to index title %s", title)

        solr.add_many(docs)

        reset_queries()
        solr.commit()

        count += len(chunk)
        LOGGER.info("indexed %d titles", count)

    lccns = set(models.Title.objects.values_list("lccn", flat=True))

    for result in solr.query("+type:title", fields=["id", "lccn"]):
        stale_id = result["id"]
        lccn = result["lccn"]
        if lccn not in lccns:
            LOGGER.warning("Removing stale title %s from the search index",
                           stale_id)
            delete_title(stale_id, solr=solr)

    solr.commit()
Example #59
    def test_index_pages(self):
        solr = SolrConnection(settings.SOLR)
        solr.delete_query('type:page')
        solr.commit()
        self.assertEqual(si.page_count(), 0)
        si.index_pages()
        self.assertEqual(si.page_count(), 2)