def _get_count(self):
    "Returns the total number of objects, across all pages."
    if self._count is None:
        solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
        solr_response = solr.query(self._q, fields=['id'])
        self._count = int(solr_response.results.numFound)
    return self._count
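# A count-only query does not need any stored fields transferred back; a
# minimal sketch, assuming the same solrpy SolrConnection API and a Solr URL
# like settings.SOLR above (rows=0 asks Solr for the hit count without
# returning any documents):
from solr import SolrConnection

def count_documents(q, solr_url):
    solr = SolrConnection(solr_url)
    # rows=0: fetch only the header/numFound, no result documents
    response = solr.query(q, fields=['id'], rows=0)
    return int(response.results.numFound)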
def word_matches_for_page(page_id, words):
    """
    Gets a list of pre-analyzed words for a list of words on a particular
    page. So if you pass in 'manufacturer' you can get back a list like
    ['Manufacturer', 'manufacturers', 'MANUFACTURER'] etc...
    """
    solr = SolrConnection(settings.SOLR)

    # Make sure page_id is of type str, else the following string
    # operation may result in a UnicodeDecodeError. For example, see
    # ticket #493
    if not isinstance(page_id, str):
        page_id = str(page_id)

    ocr_list = ["ocr"]
    ocr_list.extend(["ocr_%s" % l for l in settings.SOLR_LANGUAGES])
    ocrs = " OR ".join([query_join(words, o) for o in ocr_list])
    q = "id:%s AND (%s)" % (page_id, ocrs)
    params = {
        "hl.snippets": 100,
        "hl.requireFieldMatch": "true",
        "hl.maxAnalyzedChars": "102400",
    }
    response = solr.query(q, fields=["id"], highlight=ocr_list, **params)

    if page_id not in response.highlighting:
        return []

    words = set()
    for ocr in ocr_list:
        if ocr in response.highlighting[page_id]:
            for context in response.highlighting[page_id][ocr]:
                words.update(find_words(context))
    return list(words)
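# query_join is referenced throughout these snippets but never defined in
# them; a plausible minimal sketch (hypothetical, not necessarily the
# project's actual helper) that ORs each word against a single field:
def query_join(values, field):
    # e.g. query_join(['cat', 'dog'], 'ocr') -> 'ocr:("cat" OR "dog")'
    return '%s:(%s)' % (field, ' OR '.join('"%s"' % v for v in values))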
def oralookup(pid=None, uuid=None,
              fields_to_return="f_name, f_subject, f_keyphrase, faculty, f_institution, thesis_type, content_type, collection",
              endpoint="http://ora.ouls.ox.ac.uk:8080/solr/select"):
    s = SolrConnection(endpoint)
    results = {}
    if pid:
        pid = "\:".join(pid.split(":"))  # escape the colon for Solr
        query = "id:%s" % pid
    elif uuid:
        query = "id:uuid\:%s" % uuid
    else:
        return results

    # Run the actual query (3 tries, failover)
    r = None
    tries = 0
    while tries < 3:
        try:
            r = s.query(q=query, fields=fields_to_return)
            logger.debug("Solr response: %s" % r.header)
            break
        except BadStatusLine:
            sleep(0.5)
            tries += 1

    try:
        # r stays None if all three attempts failed with BadStatusLine
        assert r is not None and len(r.results) == 1
        return r.results[0]
    except ValueError:
        logger.warn("Couldn't parse json response from Solr endpoint: %s" % r)
        return {}
    except AssertionError:
        logger.warn("Couldn't assert that only a single result was fetched: %s" % results)
        return {}
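# Example call (hypothetical identifier); returns the single matching Solr
# document as a dict, or {} on failure:
record = oralookup(pid="ora:1234")
name = record.get("f_name")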
def get_page_text(page):
    no_text = ["Text not available"]
    solr = SolrConnection(settings.SOLR)
    query = 'id:"%s"' % page.url
    solr_results = solr.query(query)
    results_attribute = getattr(solr_results, 'results', None)
    if isinstance(results_attribute, list) and len(results_attribute) > 0:
        return results_attribute[0].get('ocr', no_text)
    else:
        return no_text
def similar_pages(page):
    solr = SolrConnection(settings.SOLR)
    d = page.issue.date_issued
    # build a zero-padded yyyymmdd date string
    date = '{0:04d}{1:02d}{2:02d}'.format(d.year, d.month, d.day)
    cities = query_join([p.city for p in page.issue.title.places.all()], 'city')
    query = '+type:page AND date:%s AND %s AND NOT(lccn:%s)' % (
        date, cities, page.issue.title.lccn)
    response = solr.query(query, rows=25)
    return [utils.get_page(**urlresolvers.resolve(r['id']).kwargs)
            for r in response.results]
def execute_solr_query(query, fields, sort, sort_order, rows, start):
    # default arg_separator - underscore won't work if fields to facet on
    # themselves have underscores in them
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    # NOTE: the `fields` argument is currently ignored in favor of this
    # hard-coded field list
    solr_response = solr.query(query,
                               fields=['lccn', 'title', 'edition',
                                       'place_of_publication', 'start_year',
                                       'end_year', 'language'],
                               rows=rows,
                               sort=sort,
                               sort_order=sort_order,
                               start=start)
    return solr_response
def page(self, number):
    """
    Override the page method in Paginator since Solr has already
    paginated stuff for us.
    """
    number = self.validate_number(number)

    # figure out the solr query and execute it
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    start = self.per_page * (number - 1)
    params = {
        "hl.snippets": 100,  # TODO: make this unlimited
        "hl.requireFieldMatch": 'true',  # limits highlighting slop
        "hl.maxAnalyzedChars": '102400',  # increased from default 51200
    }
    sort_field, sort_order = _get_sort(self.query.get('sort'), in_pages=True)
    solr_response = solr.query(self._q,
                               fields=['id', 'title', 'date', 'sequence',
                                       'edition_label', 'section_label'],
                               highlight=self._ocr_list,
                               rows=self.per_page,
                               sort=sort_field,
                               sort_order=sort_order,
                               start=start,
                               **params)

    pages = []
    for result in solr_response.results:
        page = models.Page.lookup(result['id'])
        if not page:
            continue
        words = set()
        coords = solr_response.highlighting[result['id']]
        for ocr in self._ocr_list:
            for s in coords.get(ocr) or []:
                words.update(find_words(s))
        page.words = sorted(words, key=lambda v: v.lower())
        page.highlight_url = self.highlight_url(page.url, page.words,
                                                number, len(pages))
        pages.append(page)
    return Page(pages, number, self)
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database

    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """
    solr = SolrConnection(settings.SOLR)
    titles = models.Title.objects.all()
    if since:
        titles = titles.filter(created__gte=since)

    titles = titles.prefetch_related("languages", "alt_titles", "subjects",
                                     "notes", "places", "urls", "essays",
                                     "country", "holdings")

    count = 0
    for chunk in sliced(titles, 500):
        docs = []
        for title in chunk:
            try:
                docs.append(title.solr_doc)
            except Exception:
                LOGGER.exception("Unable to index title %s", title)
        solr.add_many(docs)
        reset_queries()
        solr.commit()
        count += len(chunk)
        LOGGER.info("indexed %d titles", count)

    lccns = set(models.Title.objects.values_list("lccn", flat=True))
    for result in solr.query("+type:title", fields=["id", "lccn"]):
        stale_id = result["id"]
        lccn = result["lccn"]
        if lccn not in lccns:
            LOGGER.warning("Removing stale title %s from the search index",
                           stale_id)
            delete_title(stale_id, solr=solr)
    solr.commit()
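# `sliced` chunks the queryset into batches of 500 so each add/commit stays
# small; it is not defined in these snippets. A minimal equivalent sketch
# (matching more_itertools.sliced for sliceable inputs; Django QuerySets
# support slicing):
def sliced(seq, n):
    i = 0
    while True:
        chunk = seq[i:i + n]
        if not chunk:  # empty slice means we ran off the end
            break
        yield chunk
        i += n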
class QuoteResource:
    def __init__(self):
        self.solr = SolrConnection('http://localhost:8983/solr')

    @cherrypy.expose
    def index(self, callback=None, person=None, topic=None):
        filters = ['type:quote']
        if person:
            filters.append('person_t:%s' % person)
        if topic:
            filters.append('quote_t:%s' % topic)
        results = self.solr.query(q=' AND '.join(filters), rows=100)

        docs = []
        timeline = {
            'timeline': {
                'headline': 'OnTheRecord',
                'type': 'default',
                'startDate': '2012,1,1',
                'text': 'We help you track quotations from politicians over time',
            }
        }
        for result in results:
            doc = {
                "startDate": result['date'].strftime('%Y,%m,%d'),
                "headline": result['person'],
                "text": '<a href="' + result['url'] + '">' + result['title'] + '</a>',
                "asset": {
                    "media": "<blockquote>\"" + result['quote'] + "\"</blockquote>",
                    "credit": "",
                    "caption": ""
                }
            }
            docs.append(doc)
        timeline['timeline']['date'] = docs

        cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
        return json.dumps(timeline, ensure_ascii=False, indent=4).encode('utf-8')
def __init__(self, query):
    self.query = query.copy()

    # figure out the solr query
    q = title_search(self.query)

    try:
        page = int(self.query.get('page'))
    except (TypeError, ValueError):
        page = 1

    try:
        rows = int(self.query.get('rows'))
    except (TypeError, ValueError):
        rows = 50

    start = rows * (page - 1)

    # determine sort order
    sort_field, sort_order = _get_sort(self.query.get('sort'))

    # execute query
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    solr_response = solr.query(q,
                               fields=['lccn', 'title', 'edition',
                                       'place_of_publication', 'start_year',
                                       'end_year', 'language'],
                               rows=rows,
                               sort=sort_field,
                               sort_order=sort_order,
                               start=start)

    # convert the solr documents to Title models
    # could use solr doc instead of going to db, if performance requires it
    lccns = [d['lccn'] for d in solr_response.results]
    results = []
    for lccn in lccns:
        try:
            title = models.Title.objects.get(lccn=lccn)
            results.append(title)
        except models.Title.DoesNotExist:
            pass  # TODO: log exception
class PersonResource:
    def __init__(self):
        self.solr = SolrConnection('http://localhost:8983/solr')

    @cherrypy.expose
    def index(self):
        results = self.solr.query('*:*', facet='true', facet_field='person')
        for person in results.facet_counts[u'facet_fields'][u'person']:
            print person
        cherrypy.response.headers['Content-Type'] = 'application/json; charset=utf-8'
        return json.dumps({'test': 'test', 'data': 'data'},
                          ensure_ascii=False, indent=4).encode('utf-8')
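# solrpy returns facet data as a nested dict keyed by field name, mapping
# each facet value to its hit count; a minimal sketch of reading the
# value/count pairs (`results` is the faceted solrpy response from above):
for person, count in results.facet_counts[u'facet_fields'][u'person'].items():
    print person, count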
def setUpClass(cls):
    # First, add a folio to Solr so that the image_uri can be retrieved
    # during the MEI conversion. Using curl here because it turned out to be
    # easier than solrconn.add and gives better error messages
    os.system("curl {0}/update/?commit=true -H 'Content-Type: text/xml' -d '<add><doc>\
        <field name=\"id\">testid</field>\
        <field name=\"type\">cantusdata_folio</field>\
        <field name=\"manuscript_id\">{1}</field>\
        <field name=\"number\">{2}</field>\
        <field name=\"image_uri\">{3}</field>\
        </doc></add>'".format(settings.SOLR_SERVER, MEI_FIXTURE_ID,
                              MEI_FIXTURE_FOLIO, MEI_FIXTURE_URI))

    docs = list(MEIConverter.process_file(MEI_FIXTURE, MEI_FIXTURE_SIGLUM,
                                          MEI_FIXTURE_ID))

    # Sanity check
    solrconn = SolrConnection(settings.SOLR_SERVER)
    prequery = solrconn.query('type:cantusdata_music_notation AND manuscript:'
                              + MEI_FIXTURE_SIGLUM)
    assert prequery.numFound == 0, \
        'MEI was already in the database when loading the test fixture'

    solrconn.add_many(docs)
    solrconn.commit()
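# The same fixture document could be added through solrpy itself instead of
# shelling out to curl (the comment above notes curl gave better error
# messages); a minimal sketch using the add()/commit() calls already shown
# in these snippets, with the same fixture constants:
solrconn = SolrConnection(settings.SOLR_SERVER)
solrconn.add(id="testid",
             type="cantusdata_folio",
             manuscript_id=MEI_FIXTURE_ID,
             number=MEI_FIXTURE_FOLIO,
             image_uri=MEI_FIXTURE_URI)
solrconn.commit()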
if not options.wiki:
    raise Exception('A wiki is required, passed as host name')

conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr')
query = ["host:'%s'" % (options.wiki)]
query += ['ns:%d ' % (int(options.namespace))]
if options.start_date or options.end_date:
    start = options.start_date + 'T00:00:00.000Z' if options.start_date else '*'
    end = options.end_date + 'T00:00:00.000Z' if options.end_date else '*'
    query += ['created:[%s TO %s]' % (start, end)]

response = conn.query(' AND '.join(query),
                      fields=['html_en', 'nolang_txt', 'html'])
paginator = SolrPaginator(response)
print paginator.count, 'results to chomp through...'

polarities, subjectivities = [], []
for page in paginator.page_range:
    for doc in paginator.page(page).object_list:
        sent = sentiment.sentiment(
            doc.get('html_en', doc.get('nolang_txt', doc.get('html'))))
        if sent == (0, 0):
            continue
        polarities.append(sent[0])
        subjectivities.append(sent[1])
    # progress marker every ~10% of pages (assumes at least 10 pages,
    # otherwise the modulo divisor would be zero)
    if page % int(paginator.num_pages / 10) == 0:
        print "========", "On page", page, "of", paginator.num_pages, "======="
"-w", "--wiki", dest="wiki", action="store", default=None, help="Specifies the wiki to perform calculations against" ) parser.add_option( "-n", "--sents", dest="num_sents", action="store", default=5, help="Specifies the number of sentences to write" ) (options, args) = parser.parse_args() if options.id: query = "id:%s" % (options.id) elif options.wiki: query = "host:'%s' AND ns:0" % (options.wiki) else: raise Exception("A wiki or ID is required, passed as host name") conn = SolrConnection("http://search-s10.prod.wikia.net:8983/solr") response = conn.query(query, fields=["html_en", "nolang_txt", "html", "title", "title_en", "id"]) paginator = SolrPaginator(response) summarizer = SimpleSummarizer() for page in paginator.page_range: for doc in paginator.page(page).object_list: text = doc.get("html_en", doc.get("nolang_txt", doc.get("html"))) title = doc.get("title_en", doc.get("title", doc["id"])) summed = summarizer.get_summarized(text, options.num_sents) print "\t\t=======", title, "=======" print "\t" + "\n\t".join([sent for sent in summed if not sent.startswith("Contents")]) print "\t\t====================================="
def search(field, data, path, hlength, mode):
    from termcolor import colored
    from solr import SolrConnection

    # search solr, get filePath, do a grep and show the line
    s = SolrConnection(SOLR_URL)
    if field == 'name':
        query = 'name:"' + data + '"'
        response = s.query(query)
    elif field == 'txt':
        query = 'txt:"' + data + '"'
        if hlength:
            response = s.query(query, fl='id,name', highlight=True,
                               fields='txt', hl_q=query, hl_fragsize=hlength,
                               hl_snippets=1000, hl_bs_type='SENTENCE')
        else:
            response = s.query(query, fl='id,name')
    else:
        query = 'name:"' + data + '" OR txt:"' + data + '"'
        if hlength:
            response = s.query(query, fl='id,name', highlight=True,
                               fields='txt', hl_q=query, hl_fragsize=hlength,
                               hl_snippets=1000, hl_bs_type='SENTENCE')
        else:
            response = s.query(query, fl='id,name')

    if hlength and field != 'name':
        hlength = int(hlength)
        for id in response.highlighting:
            if os.path.isfile(id):
                if response.highlighting[id]:
                    # show the highlighted snippets, with the match in red
                    for txt in response.highlighting[id]['txt']:
                        txt = txt.strip()
                        startpos = txt.index('<em>')
                        endpos = txt.rindex('</em>')
                        print (txt[:startpos] +
                               colored(txt[startpos + 4:endpos], 'red') +
                               txt[endpos + 5:]).replace('<em>', '').replace('</em>', '')
                else:
                    # no snippets returned: grep the file contents directly
                    fdata = open(id, 'r').read().decode('raw_unicode_escape').replace('\n', ' ').replace('\t', ' ')
                    fdata = filter(lambda x: x in string.printable, fdata)
                    for m in re.finditer(data, fdata):
                        start = max(m.start() - hlength, 0)
                        end = min(m.end() + hlength, len(fdata))
                        print (fdata[start:m.start()] +
                               colored(fdata[m.start():m.end()], 'red') +
                               fdata[m.end():end]).replace('<em>', '').replace('</em>', '')
                if id.endswith('.mp3'):
                    if mode == 'slow':
                        x = raw_input('press `y` to play, `n` to move forward \n')
                        if x == 'y':
                            subprocess.call(["afplay", id])
                    else:
                        print '\t To open the file press cmd + double click '
                        print colored("file://" + id, 'blue')
                        print '\n \n'
                if mode == 'slow':
                    raw_input('press any key to continue \n')
            else:
                # the indexed file no longer exists; drop it from the index
                s.delete_query('id:' + id)
    else:
        for hit in response.results:
            if hit['id']:
                if hit['id'].endswith('.mp3'):
                    if mode == 'slow':
                        x = raw_input('press `y` to play, `n` to move forward \n')
                        if x == 'y':
                            subprocess.call(["afplay", hit['id']])
                    else:
                        print '\t To open the file press cmd + double click '
                        print colored("file://" + hit['id'], 'blue')
                        print '\n \n'
                    if mode == 'slow':
                        raw_input('press any key to continue \n')
                else:
                    s.delete_query('id:' + hit['id'])
def title_count():
    solr = SolrConnection(settings.SOLR)
    return solr.query('type:title', fields=['id']).numFound
dest="limit", action="store", default=None, help="Specifies the document size of the calculation set") (options, args) = parser.parse_args() if not options.wiki: raise Exception('A wiki is required, passed as host name') conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr') ne = [] query = ["host:'%s'" % (options.wiki), 'ns:0'] response = conn.query(' AND '.join(query), fields=['html_en', 'nolang_txt', 'html'], sort='backlinks desc', limit=100) paginator = SolrPaginator(response) for page in paginator.page_range: for doc in paginator.page(page).object_list: try: text = doc.get('html_en', doc.get('nolang_txt', doc.get('html'))) corpus = NECorpus(text) print corpus.nes() except UnicodeEncodeError: pass
"-l", "--limit", dest="limit", action="store", default=None, help="Specifies the document size of the calculation set", ) (options, args) = parser.parse_args() if not options.wiki: raise Exception("A wiki is required, passed as host name") conn = SolrConnection("http://search-s10.prod.wikia.net:8983/solr") ne = [] query = ["host:'%s'" % (options.wiki), "ns:0"] response = conn.query(" AND ".join(query), fields=["html_en", "nolang_txt", "html"], sort="backlinks desc", limit=100) paginator = SolrPaginator(response) for page in paginator.page_range: for doc in paginator.page(page).object_list: try: text = doc.get("html_en", doc.get("nolang_txt", doc.get("html"))) corpus = NECorpus(text) print corpus.nes() except UnicodeEncodeError: pass
def page(self, number):
    """
    Override the page method in Paginator since Solr has already
    paginated stuff for us.
    """
    number = self.validate_number(number)

    # figure out the solr query and execute it
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    start = self.per_page * (number - 1)
    params = {
        "hl.snippets": 100,  # TODO: make this unlimited
        "hl.requireFieldMatch": 'true',  # limits highlighting slop
        "hl.maxAnalyzedChars": '102400',  # increased from default 51200
    }
    params.update(self.facet_params)
    sort_field, sort_order = _get_sort(self.query.get('sort'), in_pages=True)
    solr_response = solr.query(self._q,
                               fields=['id', 'title', 'date', 'month', 'day',
                                       'sequence', 'edition_label',
                                       'section_label'],
                               highlight=self._ocr_list,
                               rows=self.per_page,
                               sort=sort_field,
                               sort_order=sort_order,
                               start=start,
                               **params)
    solr_facets = solr_response.facet_counts

    # sort each facet field by number of hits
    facets = {
        'city': _sort_facets_asc(solr_facets, 'city'),
        'county': _sort_facets_asc(solr_facets, 'county'),
        'frequency': _sort_facets_asc(solr_facets, 'frequency'),
        'language': _sort_facets_asc(solr_facets, 'language'),
        'state': _sort_facets_asc(solr_facets, 'state'),
    }
    # sort by year (desc)
    facets['year'] = sorted(
        list(solr_facets['facet_ranges']['year']['counts'].items()),
        key=lambda k: k[0], reverse=True)
    facet_gap = self.facet_params['f_year_facet_range_gap']
    if facet_gap > 1:
        facets['year'] = [('%s-%d' % (y[0], int(y[0]) + facet_gap - 1), y[1])
                          for y in facets['year']]

    pages = []
    for result in solr_response.results:
        page = models.Page.lookup(result['id'])
        if not page:
            continue
        words = set()
        coords = solr_response.highlighting[result['id']]
        for ocr in self._ocr_list:
            for s in coords.get(ocr) or []:
                words.update(find_words(s))
        page.words = sorted(words, key=lambda v: v.lower())
        page.highlight_url = self.highlight_url(page.url, page.words)
        pages.append(page)

    solr_page = Page(pages, number, self)
    solr_page.facets = facets
    return solr_page
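# _sort_facets_asc is not defined in these snippets; a plausible sketch
# (hypothetical, not necessarily the project's helper) that pulls one facet
# field out of the solrpy facet_counts dict and orders its value/count
# pairs by ascending hit count, per the helper's name:
def _sort_facets_asc(solr_facets, field):
    items = solr_facets.get('facet_fields', {}).get(field, {}).items()
    return sorted(items, key=lambda item: item[1])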
def page(self, number):
    """
    Override the page method in Paginator since Solr has already
    paginated stuff for us.
    """
    number = self.validate_number(number)

    # figure out the solr query and execute it
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    start = self.per_page * (number - 1)
    params = {
        "hl.snippets": 100,  # TODO: make this unlimited
        "hl.requireFieldMatch": 'true',  # limits highlighting slop
        "hl.maxAnalyzedChars": '102400',  # increased from default 51200
        # "hl.method": 'unified'
    }
    params.update(self.facet_params)
    sort_field, sort_order = _get_sort(self.query.get('sort'), in_pages=True)
    solr_response = solr.query(self._q,
                               fields="*",
                               # highlight=self._ocr_list,
                               highlight='ocr_vector',
                               rows=self.per_page,
                               sort=sort_field,
                               sort_order=sort_order,
                               start=start,
                               **params)
    solr_facets = solr_response.facet_counts

    facets = dict()
    facets['year'] = sorted(solr_facets.get('facet_fields')['year'].items())
    # sort the remaining facets by hit count, descending
    # (Python 2 cmp-style sorted(iterable, cmp, key, reverse))
    for field in ('lccn', 'county', 'region', 'city', 'newspaper_type'):
        facets[field] = sorted(solr_facets.get('facet_fields')[field].items(),
                               lambda x, y: x - y, lambda k: k[1], True)

    pages = []
    for result in solr_response.results:
        page = models.Page.lookup(result['id'])
        if not page:
            continue
        words = set()
        coords = solr_response.highlighting[result['id']]
        # for ocr in self._ocr_list:
        for s in coords.get('ocr_vector') or []:
            words.update(find_words(s))
        page.words = sorted(words, key=lambda v: v.lower())
        page.highlight_url = self.highlight_url(page.url, page.words,
                                                number, len(pages))
        pages.append(page)

    solr_page = Page(pages, number, self)
    solr_page.facets = facets
    return solr_page
def page(self, number):
    """
    Override the page method in Paginator since Solr has already
    paginated stuff for us.
    """
    number = self.validate_number(number)

    # figure out the solr query and execute it
    solr = SolrConnection(settings.SOLR)  # TODO: maybe keep connection around?
    start = self.per_page * (number - 1)
    params = {
        "hl.snippets": 100,  # TODO: make this unlimited
        "hl.requireFieldMatch": 'true',  # limits highlighting slop
        "hl.maxAnalyzedChars": '102400',  # increased from default 51200
    }
    params.update(self.facet_params)
    sort_field, sort_order = _get_sort(self.query.get('sort'), in_pages=True)
    solr_response = solr.query(self._q,
                               fields=['id', 'title', 'date', 'month', 'day',
                                       'sequence', 'edition_label',
                                       'section_label'],
                               highlight=self._ocr_list,
                               rows=self.per_page,
                               sort=sort_field,
                               sort_order=sort_order,
                               start=start,
                               **params)
    solr_facets = solr_response.facet_counts

    # sort states and counties by number of hits per state (desc), using
    # Python 2 cmp-style sorted(iterable, cmp, key, reverse)
    facets = {
        'state': sorted(solr_facets.get('facet_fields')['state'].items(),
                        lambda x, y: x - y, lambda k: k[1], True),
        'county': sorted(solr_facets.get('facet_fields')['county'].items(),
                         lambda x, y: x - y, lambda k: k[1], True),
    }
    # sort by year (desc)
    facets['year'] = sorted(
        solr_facets['facet_ranges']['year']['counts'].items(),
        lambda x, y: int(x) - int(y), lambda k: k[0], True)
    facet_gap = self.facet_params['f_year_facet_range_gap']
    if facet_gap > 1:
        facets['year'] = [('%s-%d' % (y[0], int(y[0]) + facet_gap - 1), y[1])
                          for y in facets['year']]

    pages = []
    for result in solr_response.results:
        page = models.Page.lookup(result['id'])
        if not page:
            continue
        words = set()
        coords = solr_response.highlighting[result['id']]
        for ocr in self._ocr_list:
            for s in coords.get(ocr) or []:
                words.update(find_words(s))
        page.words = sorted(words, key=lambda v: v.lower())
        page.highlight_url = self.highlight_url(page.url, page.words,
                                                number, len(pages))
        pages.append(page)

    solr_page = Page(pages, number, self)
    solr_page.facets = facets
    return solr_page
# initialize query so the concatenation below is safe when only a free-form
# query is given without a wiki
query = ''
if options.wiki:
    query = 'host:%s' % (options.wiki)
elif not options.query:
    raise Exception('A wiki is required, passed as host name')
if options.query:
    query = (query + ' ' + options.query).strip()

specifier = options.wiki if options.wiki else str(os.getpid())
conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr')
print query
response = conn.query(query,
                      fields=['html_en', 'nolang_txt', 'html', 'title',
                              'title_en', 'id'],
                      rows=100)
paginator = SolrPaginator(response)


def initialize_dir(page):
    paths = [options.dest, specifier, str(page)]
    fullpath = ''
    for path in paths:
        fullpath += path + '/'
        if not os.path.exists(fullpath):
            os.mkdir(fullpath)
    return fullpath


for page in paginator.page_range:
    pagedir = initialize_dir(page)
    lockfilepath = pagedir + '/LOCK'
    with open(lockfilepath, 'w') as lockfile:
"--wiki", dest="wiki", action="store", default=None, help="Specifies the wiki to perform calculations against") (options, args) = parser.parse_args() if options.id: query = 'id:%s' % (options.id) elif options.wiki: query = "host:'%s' AND ns:0" % (options.wiki) else: raise Exception('A wiki or ID is required, passed as host name') conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr') response = conn.query( query, fields=['html_en', 'nolang_txt', 'html', 'title', 'title_en', 'id']) paginator = SolrPaginator(response) tool = ReadabilityTool(lang='eng') for page in paginator.page_range: for doc in paginator.page(page).object_list: text = doc.get('html_en', doc.get('nolang_txt', doc.get('html'))) title = doc.get('title_en', doc.get('title', doc['id'])) print "=======", title, "=======" tool.getReportAll(text.encode('utf8')) print "====================================="