def query_dockets(query_string):
    """Identify the d_pks for all the dockets that we need to export

    :param query_string: The query to run as a URL-encoded string (typically
    starts with 'q='). E.g. 'q=foo&type=r&order_by=dateFiled+asc&court=dcd'
    :return: a set of docket PKs to export
    """
    main_query = build_main_query_from_query_string(
        query_string,
        {"fl": ["docket_id"]},
        {"group": True, "facet": False, "highlight": False},
    )
    main_query["group.limit"] = 0
    main_query["sort"] = "dateFiled asc"
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    search = si.query().add_extra(**main_query)
    page_size = 1000
    paginator = Paginator(search, page_size)
    d_pks = set()
    for page_number in paginator.page_range:
        page = paginator.page(page_number)
        for item in page:
            d_pks.add(item["groupValue"])
    logger.info(
        "After %s pages, got back %s results.",
        len(paginator.page_range),
        len(d_pks),
    )
    return d_pks

def items(self, obj):
    """Do a Solr query here. Return the first 20 results"""
    search_form = SearchForm(obj.GET)
    if search_form.is_valid():
        cd = search_form.cleaned_data
        order_by = 'dateFiled'
        if cd['type'] == 'o':
            solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
        elif cd['type'] == 'r':
            solr = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
        else:
            return []
        main_params = search_utils.build_main_query(cd, highlight=False,
                                                    facet=False)
        main_params.update({
            'sort': '%s desc' % order_by,
            'rows': '20',
            'start': '0',
            'caller': 'SearchFeed',
        })
        # Eliminate items that lack the ordering field.
        main_params['fq'].append('%s:[* TO *]' % order_by)
        return solr.query().add_extra(**main_params).execute()
    else:
        return []

def index_sitemap_maker(request):
    """Generate a sitemap index page

    Counts the number of cases in the site, divides by `items_per_sitemap`,
    and provides links to the resulting sitemap pages.
    """
    connection_string_sitemap_path_pairs = (
        (settings.SOLR_OPINION_URL, reverse('opinion_sitemap'), False),
        (settings.SOLR_RECAP_URL, reverse('recap_sitemap'), True),
        (settings.SOLR_AUDIO_URL, reverse('oral_argument_sitemap'), False),
        (settings.SOLR_PEOPLE_URL, reverse('people_sitemap'), False),
    )
    sites = []
    for connection_string, path, group in connection_string_sitemap_path_pairs:
        conn = ExtraSolrInterface(connection_string)
        count = conn.query().add_extra(**make_index_params(group)).count()
        num_pages = count // items_per_sitemap + 1
        for i in range(1, num_pages + 1):
            sites.append('https://www.courtlistener.com%s?p=%s' % (path, i))

    # Random additional sitemaps.
    sites.extend([
        'https://www.courtlistener.com%s' % reverse('simple_pages_sitemap'),
        'https://www.courtlistener.com/sitemap-visualizations.xml',
    ])

    xml = loader.render_to_string('sitemap_index.xml', {'sitemaps': sites})

    # These links contain case names, so they should get crawled but not
    # indexed
    response = HttpResponse(xml, content_type='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response

def make_solr_sitemap(request, solr_url, params, changefreq,
                      low_priority_pages, url_field):
    solr = ExtraSolrInterface(solr_url)
    page = int(request.GET.get('p', 1))
    params['start'] = (page - 1) * items_per_sitemap
    results = solr.query().add_extra(**params).execute()

    urls = []
    cl = 'https://www.courtlistener.com'
    for result in results:
        result = normalize_grouping(result)
        url_strs = ['%s%s' % (cl, result[url_field])]
        if result.get('local_path') and \
                not result['local_path'].endswith('.xml'):
            url_strs.append('%s/%s' % (cl, result['local_path']))

        item = {}
        for url_str in url_strs:
            item['location'] = url_str
            item['changefreq'] = changefreq
            item['lastmod'] = result['timestamp']
            if any(s in url_str for s in low_priority_pages):
                item['priority'] = '0.3'
            else:
                item['priority'] = '0.5'
            urls.append(item.copy())

    xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls}))
    response = HttpResponse(xml, content_type='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response

def items(self, obj):
    """Do a Solr query here. Return the first 20 results"""
    search_form = SearchForm(obj.GET)
    if search_form.is_valid():
        cd = search_form.cleaned_data
        order_by = "dateFiled"
        if cd["type"] == SEARCH_TYPES.OPINION:
            solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
        elif cd["type"] == SEARCH_TYPES.RECAP:
            solr = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
        else:
            return []
        main_params = search_utils.build_main_query(
            cd, highlight=False, facet=False
        )
        main_params.update(
            {
                "sort": "%s desc" % order_by,
                "rows": "20",
                "start": "0",
                "caller": "SearchFeed",
            }
        )
        # Eliminate items that lack the ordering field.
        main_params["fq"].append("%s:[* TO *]" % order_by)
        items = solr.query().add_extra(**main_params).execute()
        solr.conn.http_connection.close()
        return items
    else:
        return []

def coverage_data(request, version, court):
    """Provides coverage data for a court.

    Responds to either AJAX or regular requests.
    """
    if court != "all":
        court_str = get_object_or_404(Court, pk=court).pk
    else:
        court_str = "all"
    q = request.GET.get("q")
    si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
    facet_field = "dateFiled"
    response = (
        si.query()
        .add_extra(**build_coverage_query(court_str, q, facet_field))
        .execute()
    )
    si.conn.http_connection.close()
    counts = response.facet_counts.facet_ranges[facet_field]["counts"]
    counts = strip_zero_years(counts)

    # Calculate the totals
    annual_counts = {}
    total_docs = 0
    for date_string, count in counts:
        annual_counts[date_string[:4]] = count
        total_docs += count

    return JsonResponse(
        {"annual_counts": annual_counts, "total": total_docs}, safe=True
    )

def search_db_for_fullcitation(
    full_citation: FullCaseCitation,
) -> SolrResponse:
    """For a citation object, try to match it to an item in the database using
    a variety of heuristics.

    Returns:
      - a Solr Result object with the results, or an empty list if no hits
    """
    if not hasattr(full_citation, "citing_opinion"):
        full_citation.citing_opinion = None

    # TODO: Create shared solr connection for all queries
    si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
    main_params: SearchParam = {
        "q": "*",
        "fq": [
            "status:Precedential",  # Non-precedential documents aren't cited
        ],
        "caller": "citation.match_citations.match_citation",
    }
    if full_citation.citing_opinion is not None:
        # Eliminate self-cites.
        main_params["fq"].append("-id:%s" % full_citation.citing_opinion.pk)

    # Set up filter parameters
    if full_citation.year:
        start_year = end_year = full_citation.year
    else:
        start_year, end_year = get_years_from_reporter(full_citation)
        if (
            full_citation.citing_opinion is not None
            and full_citation.citing_opinion.cluster.date_filed
        ):
            end_year = min(
                end_year, full_citation.citing_opinion.cluster.date_filed.year
            )
    main_params["fq"].append(
        "dateFiled:%s" % build_date_range(start_year, end_year)
    )

    if full_citation.court:
        main_params["fq"].append("court_exact:%s" % full_citation.court)

    # Take 1: Use a phrase query to search the citation field.
    main_params["fq"].append('citation:("%s")' % full_citation.base_citation())
    results = si.query().add_extra(**main_params).execute()
    si.conn.http_connection.close()
    if len(results) == 1:
        return results
    if len(results) > 1:
        if (
            full_citation.citing_opinion is not None
            and full_citation.defendant
        ):
            # Refine using defendant, if there is one
            results = case_name_query(
                si, main_params, full_citation, full_citation.citing_opinion
            )
        return results

    # Give up.
    return []

def docket_pks_for_query(query_string):
    """Yield docket PKs for a query by iterating over the full result set

    :param query_string: The query to run as a URL-encoded string (typically
    starts with 'q='). E.g. 'q=foo&type=r&order_by=dateFiled+asc&court=dcd'
    :return: The next docket PK in the results
    """
    main_query = build_main_query_from_query_string(
        query_string,
        {"fl": ["docket_id"]},
        {"group": True, "facet": False, "highlight": False},
    )
    main_query["group.limit"] = 0
    main_query["sort"] = "dateFiled asc"
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    search = si.query().add_extra(**main_query)
    si.conn.http_connection.close()
    page_size = 100
    paginator = Paginator(search, page_size)
    for page_number in paginator.page_range:
        page = paginator.page(page_number)
        for item in page:
            yield item["groupValue"]

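# A minimal usage sketch for docket_pks_for_query above, not part of the
# original module. The query string is a hypothetical example in the same
# URL-encoded format as the docstring, and the sketch assumes the RECAP Solr
# index referenced by settings.SOLR_RECAP_URL is reachable. Because the
# function is a generator, docket PKs stream out page by page instead of
# being collected into a set as in query_dockets above.
def print_docket_pks_example():
    query = "q=&type=r&order_by=dateFiled+asc&court=dcd"  # hypothetical query
    for d_pk in docket_pks_for_query(query):
        print(d_pk)
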
def get_citing_clusters_with_cache(
    cluster: OpinionCluster,
) -> Tuple[list, int]:
    """Use Solr to get clusters citing the one we're looking at

    :param cluster: The cluster we're targeting
    :type cluster: OpinionCluster
    :return: A tuple of the list of solr results and the number of results
    """
    cache_key = "citing:%s" % cluster.pk
    cache = caches["db_cache"]
    cached_results = cache.get(cache_key)
    if cached_results is not None:
        return cached_results

    # Cache miss. Get the citing results from Solr
    sub_opinion_pks = cluster.sub_opinions.values_list("pk", flat=True)
    ids_str = " OR ".join([str(pk) for pk in sub_opinion_pks])
    q = {
        "q": "cites:(%s)" % ids_str,
        "rows": 5,
        "start": 0,
        "sort": "citeCount desc",
        "caller": "view_opinion",
        "fl": "absolute_url,caseName,dateFiled",
    }
    conn = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
    results = conn.query().add_extra(**q).execute()
    conn.conn.http_connection.close()
    citing_clusters = list(results)
    citing_cluster_count = results.result.numFound
    a_week = 60 * 60 * 24 * 7
    cache.set(cache_key, (citing_clusters, citing_cluster_count), a_week)

    return citing_clusters, citing_cluster_count

def make_solr_sitemap(request, solr_url, params, changefreq,
                      low_priority_pages, url_field):
    solr = ExtraSolrInterface(solr_url)
    page = int(request.GET.get('p', 1))
    court = request.GET['court']
    params['start'] = (page - 1) * items_per_sitemap
    params['fq'] = ['court_exact:%s' % court]
    results = solr.query().add_extra(**params).execute()

    urls = []
    cl = 'https://www.courtlistener.com'
    for result in results:
        result = normalize_grouping(result)
        url_strs = ['%s%s' % (cl, result[url_field])]
        if result.get('local_path') and \
                not result['local_path'].endswith('.xml'):
            url_strs.append('%s/%s' % (cl, result['local_path']))

        item = {}
        for url_str in url_strs:
            item['location'] = url_str
            item['changefreq'] = changefreq
            item['lastmod'] = result['timestamp']
            if any(s in url_str for s in low_priority_pages):
                item['priority'] = '0.3'
            else:
                item['priority'] = '0.5'
            urls.append(item.copy())

    xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls}))
    response = HttpResponse(xml, content_type='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response

def __init__(self, main_query, offset, limit, type=None, length=None):
    super(SolrList, self).__init__()
    self.main_query = main_query
    self.offset = offset
    self.limit = limit
    self.type = type
    self._item_cache = []
    if self.type == 'o':
        self.conn = ExtraSolrInterface(
            settings.SOLR_OPINION_URL,
            mode='r',
        )
    elif self.type == 'oa':
        self.conn = ExtraSolrInterface(
            settings.SOLR_AUDIO_URL,
            mode='r',
        )
    elif self.type == 'r':
        self.conn = ExtraSolrInterface(
            settings.SOLR_RECAP_URL,
            mode='r',
        )
    elif self.type == 'p':
        self.conn = ExtraSolrInterface(
            settings.SOLR_PEOPLE_URL,
            mode='r',
        )
    self._length = length

def make_solr_sitemap(request, solr_url, params, changefreq,
                      low_priority_pages, url_field):
    solr = ExtraSolrInterface(solr_url)
    page = int(request.GET.get("p", 1))
    court = request.GET["court"]
    params["start"] = (page - 1) * items_per_sitemap
    params["fq"] = ["court_exact:%s" % court]
    results = solr.query().add_extra(**params).execute()
    solr.conn.http_connection.close()
    urls = []
    cl = "https://www.courtlistener.com"
    for result in results:
        result = normalize_grouping(result)
        url_strs = ["%s%s" % (cl, result[url_field])]
        if result.get("local_path") and not result["local_path"].endswith(
            ".xml"
        ):
            url_strs.append("%s/%s" % (cl, result["local_path"]))

        item = {}
        for url_str in url_strs:
            item["location"] = url_str
            item["changefreq"] = changefreq
            item["lastmod"] = result["timestamp"]
            if any(s in url_str for s in low_priority_pages):
                item["priority"] = "0.3"
            else:
                item["priority"] = "0.5"
            urls.append(item.copy())

    xml = smart_str(loader.render_to_string("sitemap.xml", {"urlset": urls}))
    response = HttpResponse(xml, content_type="application/xml")
    response["X-Robots-Tag"] = "noindex, noodp, noarchive, noimageindex"
    return response

def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    self.verbosity = int(options.get("verbosity", 1))
    self.options = options
    self.noinput = options["noinput"]
    if not self.options["optimize_everything"]:
        self.solr_url = options["solr_url"]
        self.si = ExtraSolrInterface(self.solr_url, mode="rw")
        self.type = options["type"]

    if options["update"]:
        if self.verbosity >= 1:
            self.stdout.write("Running in update mode...\n")
        if options.get("everything"):
            self.add_or_update_all()
        elif options.get("datetime"):
            self.add_or_update_by_datetime(options["datetime"])
        elif options.get("query"):
            self.stderr.write("Updating by query not implemented.")
            sys.exit(1)
        elif options.get("items"):
            self.add_or_update(*options["items"])

    elif options.get("delete"):
        if self.verbosity >= 1:
            self.stdout.write("Running in deletion mode...\n")
        if options.get("everything"):
            self.delete_all()
        elif options.get("datetime"):
            self.delete_by_datetime(options["datetime"])
        elif options.get("query"):
            self.delete_by_query(options["query"])
        elif options.get("items"):
            self.delete(*options["items"])

    if options.get("do_commit"):
        self.si.commit()

    if options.get("optimize"):
        self.optimize()

    if options.get("optimize_everything"):
        self.optimize_everything()

    self.si.conn.http_connection.close()

    if not any(
        [
            options["update"],
            options.get("delete"),
            options.get("do_commit"),
            options.get("optimize"),
            options.get("optimize_everything"),
        ]
    ):
        self.stderr.write(
            "Error: You must specify whether you wish to "
            "update, delete, commit, or optimize your "
            "index.\n"
        )
        sys.exit(1)

def __init__(self, *args, **kwargs):
    super(Command, self).__init__(*args, **kwargs)
    self.connections = {
        'o': ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r'),
        'oa': ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r'),
    }
    self.options = {}
    self.valid_ids = {}

def make_court_variable():
    courts = Court.objects.exclude(jurisdiction=Court.TESTING_COURT)
    si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
    response = si.query().add_extra(**build_court_count_query()).execute()
    si.conn.http_connection.close()
    court_count_tuples = response.facet_counts.facet_fields["court_exact"]
    courts = annotate_courts_with_counts(courts, court_count_tuples)
    return courts

def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query)
    si.conn.http_connection.close()

    q = options["queue"]
    recap_user = User.objects.get(username="******")
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options["offset"]:
                i += 1
                continue
            if i >= options["limit"] > 0:
                break
            logger.info(
                "Doing row %s: rd: %s, docket: %s",
                i,
                result["id"],
                result["docket_id"],
            )
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(result["id"], session.cookies).set(
                    queue=q
                ),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(result["id"], recap_user.pk).set(
                    queue=q
                ),
                # And then process that using the normal machinery.
                process_recap_attachment.s(tag_names=[TAG_PHASE_1]).set(
                    queue=q
                ),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break

def __init__(self, *args, **kwargs):
    super(Command, self).__init__(*args, **kwargs)
    self.connections = {
        "o": ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r"),
        "oa": ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode="r"),
        "r": ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r"),
    }
    self.options = {}
    self.valid_ids = {}

def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False, 'highlight': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing item %s w/rd: %s, d: %s", i, result['id'],
                    result['docket_id'])

        try:
            rd = RECAPDocument.objects.get(pk=result['id'])
        except RECAPDocument.DoesNotExist:
            logger.warn("Unable to find RECAP Document with id %s",
                        result['id'])
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()

def items(self, obj):
    solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
    params = {
        'q': '*',
        'sort': 'dateArgued desc',
        'rows': '20',
        'start': '0',
        'caller': 'AllJurisdictionsPodcast',
    }
    return solr.query().add_extra(**params).execute()

def items(self, obj):
    solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode="r")
    params = {
        "q": "*",
        "sort": "dateArgued desc",
        "rows": "20",
        "start": "0",
        "caller": "AllJurisdictionsPodcast",
    }
    return solr.query().add_extra(**params).execute()

def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    self.verbosity = int(options.get('verbosity', 1))
    self.options = options
    self.noinput = options['noinput']
    if not self.options['optimize_everything']:
        self.solr_url = options['solr_url']
        self.si = ExtraSolrInterface(self.solr_url, mode='rw')
        self.type, self.type_str = options['type']

    if options['update']:
        if self.verbosity >= 1:
            self.stdout.write('Running in update mode...\n')
        if options.get('everything'):
            self.add_or_update_all()
        elif options.get('datetime'):
            self.add_or_update_by_datetime(options['datetime'])
        elif options.get('query'):
            self.stderr.write("Updating by query not implemented.")
            sys.exit(1)
        elif options.get('items'):
            self.add_or_update(*options['items'])

    elif options.get('delete'):
        if self.verbosity >= 1:
            self.stdout.write('Running in deletion mode...\n')
        if options.get('everything'):
            self.delete_all()
        elif options.get('datetime'):
            self.delete_by_datetime(options['datetime'])
        elif options.get('query'):
            self.delete_by_query(options['query'])
        elif options.get('items'):
            self.delete(*options['items'])

    if options.get('do_commit'):
        self.si.commit()

    if options.get('optimize'):
        self.optimize()

    if options.get('optimize_everything'):
        self.optimize_everything()

    if not any([
        options['update'],
        options.get('delete'),
        options.get('do_commit'),
        options.get('optimize'),
        options.get('optimize_everything'),
    ]):
        self.stderr.write('Error: You must specify whether you wish to '
                          'update, delete, commit, or optimize your '
                          'index.\n')
        sys.exit(1)

def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            i += 1
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result["id"])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result["docket_id"])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
        i += 1

def items(self, obj):
    """Do a Solr query here. Return the first 20 results"""
    solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
    params = {
        "q": "*",
        "sort": "dateFiled desc",
        "rows": "20",
        "start": "0",
        "caller": "AllJurisdictionsFeed",
    }
    return solr.query().add_extra(**params).execute()

def items(self, obj):
    """Do a Solr query here. Return the first 20 results"""
    solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
    params = {
        'q': '*',
        'sort': 'dateFiled desc',
        'rows': '20',
        'start': '0',
        'caller': 'AllJurisdictionsFeed',
    }
    return solr.query().add_extra(**params).execute()

def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.sis = {
        SEARCH_TYPES.OPINION: ExtraSolrInterface(
            settings.SOLR_OPINION_URL, mode="r"
        ),
        SEARCH_TYPES.ORAL_ARGUMENT: ExtraSolrInterface(
            settings.SOLR_AUDIO_URL, mode="r"
        ),
        SEARCH_TYPES.RECAP: ExtraSolrInterface(
            settings.SOLR_RECAP_URL, mode="r"
        ),
    }
    self.options = {}
    self.valid_ids = {}

def optimize_everything(self):
    """Run the optimize command on all indexes."""
    urls = settings.SOLR_URLS.values()
    self.stdout.write("Found %s indexes. Optimizing...\n" % len(urls))
    for url in urls:
        self.stdout.write(" - {url}\n".format(url=url))
        try:
            si = ExtraSolrInterface(url)
        except EnvironmentError:
            self.stderr.write(" Couldn't load schema!")
            continue
        si.optimize()
    self.stdout.write('Done.\n')

def __init__(self, main_query, offset, type, length=None):
    super(SolrList, self).__init__()
    self.main_query = main_query
    self.offset = offset
    self.type = type
    self._item_cache = []
    if self.type == SEARCH_TYPES.OPINION:
        self.conn = ExtraSolrInterface(
            settings.SOLR_OPINION_URL,
            mode="r",
        )
    elif self.type == SEARCH_TYPES.ORAL_ARGUMENT:
        self.conn = ExtraSolrInterface(
            settings.SOLR_AUDIO_URL,
            mode="r",
        )
    elif self.type == SEARCH_TYPES.RECAP:
        self.conn = ExtraSolrInterface(
            settings.SOLR_RECAP_URL,
            mode="r",
        )
    elif self.type == SEARCH_TYPES.PEOPLE:
        self.conn = ExtraSolrInterface(
            settings.SOLR_PEOPLE_URL,
            mode="r",
        )
    self._length = length

def items(self, obj):
    """
    Returns a list of items to publish in this feed.
    """
    solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode="r")
    params = {
        "q": "*",
        "fq": "court_exact:%s" % obj.pk,
        "sort": "dateArgued desc",
        "rows": "20",
        "start": "0",
        "caller": "JurisdictionPodcast",
    }
    return solr.query().add_extra(**params).execute()

def get_docket_ids(main_query):
    """Get the docket IDs for a query dict.

    :returns: a set() of docket IDs
    """
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    docket_ids = set()
    for result in results:
        docket_ids.add(result['docket_id'])
    logger.info("Got %s docket IDs back from Solr." % len(docket_ids))
    return docket_ids

def items(self, obj):
    """
    Returns a list of items to publish in this feed.
    """
    solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
    params = {
        'q': '*',
        'fq': 'court_exact:%s' % obj.pk,
        'sort': 'dateArgued desc',
        'rows': '20',
        'start': '0',
        'caller': 'JurisdictionPodcast',
    }
    return solr.query().add_extra(**params).execute()

def items(self, obj):
    """Do a Solr query here. Return the first 20 results"""
    solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r")
    params = {
        "q": "*",
        "fq": "court_exact:%s" % obj.pk,
        "sort": "dateFiled desc",
        "rows": "20",
        "start": "0",
        "caller": "JurisdictionFeed",
    }
    items = solr.query().add_extra(**params).execute()
    solr.conn.http_connection.close()
    return items

def get_attachment_pages(options):
    """Find docket entries that look like invoices and get their attachment
    pages.
    """
    page_size = 100
    main_query = build_main_query_from_query_string(
        Q_DOCS_ONLY,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query)

    q = options['queue']
    recap_user = User.objects.get(username='******')
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    paginator = Paginator(results, page_size)
    i = 0
    for page_number in range(1, paginator.num_pages + 1):
        paged_results = paginator.page(page_number)
        for result in paged_results.object_list:
            if i < options['offset']:
                i += 1
                continue
            if i >= options['limit'] > 0:
                break
            logger.info("Doing row %s: rd: %s, docket: %s", i,
                        result['id'], result['docket_id'])
            throttle.maybe_wait()
            chain(
                # Query the attachment page and process it
                get_attachment_page_by_rd.s(
                    result['id'], session.cookies).set(queue=q),
                # Take that in a new task and make a PQ object
                make_attachment_pq_object.s(
                    result['id'], recap_user.pk).set(queue=q),
                # And then process that using the normal machinery.
                process_recap_attachment.s(
                    tag_names=[TAG_PHASE_1]).set(queue=q),
            ).apply_async()
            i += 1
        else:
            # Inner loop exited normally (didn't "break")
            continue
        # Inner loop broke. Break outer loop too.
        break

def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q, min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            i += 1
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            i += 1
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            i += 1
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], 'search.RECAPDocument').set(queue=q),
        ).apply_async()
        i += 1

def items(self, obj):
    search_form = SearchForm(obj.GET)
    if search_form.is_valid():
        cd = search_form.cleaned_data
        solr = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
        main_params = search_utils.build_main_query(cd, highlight=False,
                                                    facet=False)
        main_params.update({
            'sort': 'dateArgued desc',
            'rows': '20',
            'start': '0',
            'caller': 'SearchFeed',
        })
        return solr.query().add_extra(**main_params).execute()
    else:
        return []

def items(self, obj):
    """Do a Solr query here. Return the first 20 results"""
    search_form = SearchForm(obj.GET)
    if search_form.is_valid():
        cd = search_form.cleaned_data
        if cd['type'] == 'o':
            solr = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
        elif cd['type'] == 'r':
            solr = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
        else:
            # Unsupported search type; nothing to feed.
            return []
        main_params = search_utils.build_main_query(cd, highlight=False)
        main_params.update({
            'sort': 'dateFiled desc',
            'rows': '20',
            'start': '0',
            'caller': 'SearchFeed',
        })
        return solr.query().add_extra(**main_params).execute()
    else:
        return []

def get_expected_item_count(self):
    # OpinionsSitemap uses the Solr index to generate the page, so the only
    # accurate count comes from the index itself, which will also be based
    # on the fixtures.
    conn = ExtraSolrInterface(settings.SOLR_OPINION_URL)
    params = make_sitemap_solr_params('dateFiled asc', 'o_sitemap')
    params['rows'] = items_per_sitemap
    params['fq'] = ['court_exact:%s' % self.court_id]

    r = conn.query().add_extra(**params).execute()

    # The underlying SitemapTest relies on counting url elements in the XML
    # response. This logic mimics the creation of the XML, so we at least
    # know what count we *should* be getting if the SitemapTest's
    # HTTP client-based test gets an HTTP 200.
    count = 0
    for result in r:
        if result.get('local_path'):
            count += 2
        else:
            count += 1
    return count

def get_solr_result_objects(cd, facet):
    """Note that this doesn't run the query yet. Not until the pagination is
    run.
    """
    search_type = cd['type']
    if search_type == 'o':
        si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
        results = si.query().add_extra(**build_main_query(cd, facet=facet))
    elif search_type == 'r':
        si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
        results = si.query().add_extra(**build_main_query(cd, facet=facet))
    elif search_type == 'oa':
        si = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
        results = si.query().add_extra(**build_main_query(cd, facet=facet))
    elif search_type == 'p':
        si = ExtraSolrInterface(settings.SOLR_PEOPLE_URL, mode='r')
        results = si.query().add_extra(**build_main_query(cd, facet=facet))
    else:
        raise NotImplementedError("Unknown search type: %s" % search_type)
    return results

def do_search(request, rows=20, order_by=None, type=None, facet=True):
    query_citation = None
    error = False
    paged_results = None
    search_form = SearchForm(request.GET)
    courts = Court.objects.filter(in_use=True)
    if search_form.is_valid():
        cd = search_form.cleaned_data
        # Allows an override by calling methods.
        if order_by is not None:
            cd['order_by'] = order_by
        if type is not None:
            cd['type'] = type
        search_form = _clean_form(request, cd, courts)

        if cd['type'] == 'o':
            si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
            results = si.query().add_extra(**build_main_query(cd, facet=facet))
            query_citation = get_query_citation(cd)
        elif cd['type'] == 'r':
            si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
            results = si.query().add_extra(**build_main_query(cd, facet=facet))
        elif cd['type'] == 'oa':
            si = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
            results = si.query().add_extra(**build_main_query(cd, facet=facet))
        elif cd['type'] == 'p':
            si = ExtraSolrInterface(settings.SOLR_PEOPLE_URL, mode='r')
            results = si.query().add_extra(**build_main_query(cd, facet=facet))

        # Set up pagination
        try:
            if cd['type'] == 'r':
                rows = 10
            paginator = Paginator(results, rows)
            page = request.GET.get('page', 1)
            try:
                paged_results = paginator.page(page)
            except PageNotAnInteger:
                paged_results = paginator.page(1)
            except EmptyPage:
                # Page is out of range (e.g. 9999), deliver last page.
                paged_results = paginator.page(paginator.num_pages)
        except Exception as e:
            # Catches any Solr errors, and aborts.
            logger.warning("Error loading pagination on search page with "
                           "request: %s" % request.GET)
            logger.warning("Error was: %s" % e)
            if settings.DEBUG is True:
                traceback.print_exc()
            error = True

        # Post processing of the results
        regroup_snippets(paged_results)
    else:
        error = True

    courts, court_count_human, court_count = merge_form_with_courts(
        courts, search_form)

    return {
        'results': paged_results,
        'search_form': search_form,
        'courts': courts,
        'court_count_human': court_count_human,
        'court_count': court_count,
        'query_citation': query_citation,
        'facet_fields': make_stats_variable(search_form, paged_results),
        'error': error,
    }

def do_search(request, rows=20, order_by=None, type=None):
    search_form = SearchForm(request.GET)
    if search_form.is_valid():
        cd = search_form.cleaned_data
        # Allows an override by calling methods.
        if order_by is not None:
            cd['order_by'] = order_by
        if type is not None:
            cd['type'] = type
        search_form = _clean_form(request, cd)
        try:
            query_citation = None
            status_facets = None
            if cd['type'] == 'o':
                si = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode='r')
                stat_facet_fields = place_facet_queries(cd, si)
                status_facets = make_stats_variable(stat_facet_fields,
                                                    search_form)
                query_citation = get_query_citation(cd)
                results = si.query().add_extra(**build_main_query(cd))
            elif cd['type'] == 'r':
                si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
                results = si.query().add_extra(**build_main_query(cd))
            elif cd['type'] == 'oa':
                si = ExtraSolrInterface(settings.SOLR_AUDIO_URL, mode='r')
                results = si.query().add_extra(**build_main_query(cd))
            elif cd['type'] == 'p':
                si = ExtraSolrInterface(settings.SOLR_PEOPLE_URL, mode='r')
                results = si.query().add_extra(**build_main_query(cd))

            courts = Court.objects.filter(in_use=True)
            courts, court_count_human, court_count = merge_form_with_courts(
                courts, search_form
            )
        except Exception as e:
            if settings.DEBUG is True:
                traceback.print_exc()
            logger.warning("Error loading search with request: %s"
                           % request.GET)
            logger.warning("Error was %s" % e)
            return {'error': True}
    else:
        # Invalid form, send it back
        logger.warning("Invalid form when loading search page with "
                       "request: %s" % request.GET)
        return {'error': True}

    # Set up pagination
    try:
        paginator = Paginator(results, rows)
        page = request.GET.get('page', 1)
        try:
            paged_results = paginator.page(page)
        except PageNotAnInteger:
            # If page is not an integer, deliver first page.
            paged_results = paginator.page(1)
        except EmptyPage:
            # If page is out of range (e.g. 9999), deliver last page of
            # results.
            paged_results = paginator.page(paginator.num_pages)
    except Exception as e:
        # Catches any Solr errors, and aborts.
        logger.warning("Error loading pagination on search page with "
                       "request: %s" % request.GET)
        logger.warning("Error was: %s" % e)
        if settings.DEBUG is True:
            traceback.print_exc()
        return {'error': True}

class SolrList(object):
    """This implements a yielding list object that fetches items as they are
    queried.
    """

    def __init__(self, main_query, offset, limit, type=None, length=None):
        super(SolrList, self).__init__()
        self.main_query = main_query
        self.offset = offset
        self.limit = limit
        self.type = type
        self._item_cache = []
        if self.type == 'o':
            self.conn = ExtraSolrInterface(
                settings.SOLR_OPINION_URL,
                mode='r',
            )
        elif self.type == 'oa':
            self.conn = ExtraSolrInterface(
                settings.SOLR_AUDIO_URL,
                mode='r',
            )
        elif self.type == 'r':
            self.conn = ExtraSolrInterface(
                settings.SOLR_RECAP_URL,
                mode='r',
            )
        elif self.type == 'p':
            self.conn = ExtraSolrInterface(
                settings.SOLR_PEOPLE_URL,
                mode='r',
            )
        self._length = length

    def __len__(self):
        if self._length is None:
            mq = self.main_query.copy()  # local copy for manipulation
            mq['caller'] = 'api_search_count'
            count = self.conn.query().add_extra(**mq).count()
            self._length = count
        return self._length

    def __iter__(self):
        for item in range(0, len(self)):
            try:
                yield self._item_cache[item]
            except IndexError:
                yield self.__getitem__(item)

    def __getitem__(self, item):
        self.main_query['start'] = self.offset
        r = self.conn.query().add_extra(**self.main_query).execute()
        if r.group_field is None:
            # Pull the text snippet up a level
            for result in r.result.docs:
                result['snippet'] = '…'.join(
                    result['solr_highlights']['text'])
                self._item_cache.append(SolrObject(initial=result))
        else:
            # Grouped results
            for group in getattr(r.groups, r.group_field)['groups']:
                snippets = []
                for doc in group['doclist']['docs']:
                    for snippet in doc['solr_highlights']['text']:
                        if snippet not in snippets:
                            snippets.append(snippet)
                doc0 = group['doclist']['docs'][0]
                doc0['snippet'] = '…'.join(snippets)
                self._item_cache.append(SolrObject(initial=doc0))

        # Now, assuming our _item_cache is all set, we just get the item.
        if isinstance(item, slice):
            s = slice(item.start - int(self.offset),
                      item.stop - int(self.offset),
                      item.step)
            return self._item_cache[s]
        else:
            # Not slicing.
            try:
                return self._item_cache[item]
            except IndexError:
                # No results!
                return []

    def append(self, p_object):
        """Lightly override the append method so we get items duplicated in
        our cache.
        """
        self._item_cache.append(p_object)
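
# A minimal usage sketch for SolrList above, not part of the original module.
# Because SolrList implements __len__ and a slice-aware __getitem__, it can be
# handed to Django's Paginator much like a queryset. `cd` is assumed to be a
# SearchForm's cleaned_data dict (as in do_search above), and build_main_query
# is the same helper used there, so the query includes the highlighting that
# __getitem__ expects; the function name and 'caller' label are hypothetical.
def paginate_solr_list_example(cd):
    main_query = build_main_query(cd, facet=False)
    main_query['caller'] = 'api_search_example'  # hypothetical caller label
    results = SolrList(main_query=main_query, offset=0, limit=20,
                       type=cd['type'])
    paginator = Paginator(results, 20)
    return paginator.page(1)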