def sqs_apply_custom_facets(custom_facets, sqs=None, exclude_facets=None):
    """
    Return a DESearchQuerySet narrowed by the given saved searches.

    Inputs:
    :custom_facets: Queryset of CustomFacets to apply to sqs, required
    :sqs: Haystack SearchQuerySet, optional; a fresh DESearchQuerySet is
          created when omitted
    :exclude_facets: Queryset of ExcludeFacets to exclude from sqs, optional
    """
    result = DESearchQuerySet() if sqs is None else sqs

    # Apply the combined SearchQuery of each facet group to the queryset.
    # Exclusions are narrowed out first, then the positive custom facets
    # are narrowed in.
    if exclude_facets:
        exclude_sq = create_sq(exclude_facets)
        if exclude_sq:
            result = result.narrow_exclude(exclude_sq.build_query())

    if custom_facets:
        include_sq = create_sq(custom_facets)
        if include_sq:
            result = result.narrow(include_sq.build_query())

    return result
def _build_facet_queries(custom_facets):
    """
    Register each custom facet's saved querystring as a facet query.

    Returns a (tagged_facets, sqs) tuple where tagged_facets maps
    querystring -> {'custom_facet': facet} and sqs is a fresh
    DESearchQuerySet with every querystring applied via query_facet.
    """
    sqs = DESearchQuerySet()
    tagged = {}
    for facet in custom_facets:
        querystring = facet.saved_querystring
        tagged[querystring] = {'custom_facet': facet}
        sqs = sqs.query_facet(querystring)
    return tagged, sqs
def get_jobs(custom_facets=None, exclude_facets=None, jsids=None,
             default_sqs=None, filters=None, fields=None, facet_limit=250,
             facet_sort="count", facet_offset=None, mc=1,
             sort_order='relevance', fl=search_fields):
    """
    Returns 3-tuple containing a DESearchQuerySet object, a set of facet
    counts that have been filtered, and a set of unfiltered facet counts.

    The unfiltered facet count object is used by the primary nav box to
    build items and options.

    Inputs:
    :custom_facets: A list of saved searches (Custom Facets) to apply to
                    sqs. Defaults to site's default custom facets set in
                    Middleware
    :exclude_facets: Queryset of ExcludeFacets to exclude from sqs
    :jsids: Business unit ids used to narrow the queryset
    :default_sqs: Starting search query set
    :filters: Dictionary of filter terms in field_name:search_term format
    :fields: Optional field list; applied after the fl restriction
    :sort_order: Key into sort_order_mapper; unknown keys fall back to
                 '-score'
    :fl: Default field list retrieved for each result

    The following inputs are Solr parameters.
    :facet_limit: max number of facets to return per field. -1=unlimited
    :facet_sort: How to sort facets
    :facet_offset: offset into the facet list
    :mc: mincount - Smallest facet size to return
    """
    # Use a None sentinel instead of a literal {} default: a mutable
    # default dict would be shared across all calls.
    if filters is None:
        filters = {}

    if default_sqs is not None:
        sqs = default_sqs
    else:
        sqs = DESearchQuerySet()
    sqs = sqs_apply_custom_facets(custom_facets, sqs, exclude_facets)
    sqs = _sqs_narrow_by_buid_and_site_package(sqs, buids=jsids)

    # Limit the retrieved results to only fields that are actually needed.
    sqs = sqs.fields(fl)
    sqs = sqs.order_by(sort_order_mapper.get(sort_order, '-score'))

    # The boost function added to this search query set scales relevancy
    # scores by a factor of 1/2 at ~6 months (1.8e-11 ms) in all future
    # queries
    sqs = sqs.bf('recip(ms(NOW/HOUR,salted_date),1.8e-9,1,1)')

    if fields:
        # NOTE(review): this is a second fields() call after the fl
        # restriction above — confirm DESearchQuerySet.fields() replaces
        # rather than accumulates on repeated calls.
        sqs = sqs.fields(fields)
    if facet_offset:
        sqs = sqs.facet_offset(facet_offset)
    if facet_limit > 0:
        sqs = sqs.facet_limit(facet_limit)

    sqs = sqs.facet_sort(facet_sort).facet_mincount(mc)
    sqs = sqs.facet("city_slab").facet("state_slab").facet("country_slab")\
             .facet("moc_slab").facet("title_slab").facet("full_loc")\
             .facet("company_slab").facet("buid").facet("mapped_moc_slab")

    # When get_jobs is called from job_listing_by_slug_tag, sqs already has
    # site default facets and filters from URL applied. The call to
    # filter_sqs concatenates the querystring (q=querystring) with itself,
    # adding + operators and causing parsing errors for more complex custom
    # facets. Can't remove now until we verify other views don't rely on
    # this call to filter_sqs.
    # Jason McLaughlin 09-07-2012
    return filter_sqs(sqs, filters)
def bread_box_location_heading(location_slug_value, jobs=None):
    """
    Build a human-readable location heading for a location slug.

    Inputs:
    :location_slug_value: slug path such as 'usa/new-york'; may be empty
    :jobs: optional sliceable of jobs; the first job is preferred as the
           source of the location text

    Returns the heading string, or None when no slug was supplied.
    """
    if not location_slug_value:
        return None

    location_slug_value = location_slug_value.strip('/')
    location = None
    locations = location_slug_value.split('/')
    loc_length = len(locations)

    # Prefer the location carried by an actual job. TypeError is caught
    # alongside IndexError: with the default jobs=None, jobs[0] raises
    # TypeError, which previously escaped as an unhandled exception.
    try:
        location = location_from_job(jobs[0], loc_length)
    except (IndexError, TypeError):
        # We don't have a job
        pass

    if not location:
        # We didn't have a valid job to pull the location, state,
        # or country from. Ask Solr for one job matching the slug.
        sqs = DESearchQuerySet()
        jobs = filter_sqs_by_location(sqs, location_slug_value)[:1]
        try:
            location = location_from_job(jobs[0], loc_length)
        except IndexError:
            pass

    if not location:
        # Solr has no results for it at all either. Resort to
        # title casing the location term.
        location = " ".join(locations)
        location = location.replace('-', ' ').title()

    return location
def get_job(request, job_id):
    """
    Return the first indexed job matching job_id, or None when job_id is
    falsy or no job matches.

    Lookup field is chosen by length: ids longer than 31 characters are
    treated as GUIDs, shorter ones as uids.
    """
    # Guard against a missing id BEFORE calling len(): the original
    # ordering raised TypeError for job_id=None instead of returning None.
    if not job_id:
        return None
    search_type = 'guid' if len(job_id) > 31 else 'uid'
    try:
        query = "%s:(%s)" % (search_type, job_id)
        return DESearchQuerySet().narrow(query)[0]
    except IndexError:
        return None
def test_unicode_title(self):
    """
    Integration test: unicode facet keywords/titles must survive the
    import -> Solr -> view -> JSON feed -> sitemap pipeline.
    """
    # Test imports
    group = factories.GroupFactory()
    self.site.group = group
    self.site.business_units.add(self.businessunit)
    self.site.save()
    import_jobs.update_solr(self.buid, download=False, delete_feed=False,
                            data_dir='seo/tests/data/')
    solr_jobs = self.conn.search("*:*")
    resp = self.client.get('/')
    # Home page total must equal everything indexed in Solr.
    self.assertEqual(resp.context['total_jobs_count'], solr_jobs.hits)

    # test standard facets against Haystack query
    standard_cf = factories.CustomFacetFactory.build(
        # default facet will return both jobs
        name="Keyword Facet",
        group=group,
        show_production=True)
    standard_cf.save()
    # Unicode keyword attached after the initial save.
    standard_cf.keyword.add(u'Ключевые')
    standard_cf.save()
    standard_site_facet = factories.SeoSiteFacetFactory(
        seosite=self.site,
        customfacet=standard_cf,
        facet_type=factories.SeoSiteFacet.STANDARD)
    standard_site_facet.save()

    # test standard facets against Haystack query
    standard_cf2 = factories.CustomFacetFactory.build(
        # default facet will return both jobs
        name='Country Facet',
        country='United States',
        group=group,
        show_production=True)
    standard_cf2.save()
    standard_site_facet2 = factories.SeoSiteFacetFactory(
        seosite=self.site,
        customfacet=standard_cf2,
        facet_type=factories.SeoSiteFacet.STANDARD)
    standard_site_facet2.save()

    resp = self.client.get('/keyword-facet/new-jobs/',
                           HTTP_HOST=self.site.domain, follow=True)
    sqs = DESearchQuerySet().filter(text=u'Ключевые')
    self.assertEqual(len(resp.context['default_jobs']), sqs.count())
    for facet_widget in resp.context['widgets']:
        # Ensure that no standard facet has more results than current
        # search results
        for count_tuple in facet_widget.items:
            self.assertTrue(sqs.count() >= count_tuple[1])

    # Test default site facets against PySolr query
    from django.core.cache import cache
    cache.clear()
    default_cf = factories.CustomFacetFactory.build(
        name="Default Facet",
        title=u"Специалист",
        group=group,
        show_production=True)
    default_cf.save()
    default_site_facet = factories.SeoSiteFacetFactory(
        seosite=self.site,
        facet_type=factories.SeoSiteFacet.DEFAULT,
        customfacet=default_cf)
    default_site_facet.save()
    resp = self.client.get('/jobs/', HTTP_HOST=self.site.domain,
                           follow=True)
    total_jobs = resp.context['total_jobs_count']
    solr_jobs = self.conn.search(q=u"title:Специалист")
    self.assertEqual(total_jobs, solr_jobs.hits)
    self.assertEqual(len(resp.context['default_jobs']), total_jobs)
    for facet_widget in resp.context['widgets']:
        # NOTE(review): `sqs` here is still the keyword-facet queryset
        # built above, not one for the default facet — confirm this
        # comparison is intentional.
        for count_tuple in facet_widget.items:
            self.assertTrue(sqs.count() >= count_tuple[1])

    # Feed test
    resp = self.client.get('/feed/json', HTTP_HOST=self.site.domain)
    jobs = json.loads(resp.content)
    self.assertEqual(len(jobs), total_jobs)
    for job in jobs:
        resp = self.client.get(job['url'], HTTP_HOST=self.site.domain,
                               follow=False)
        # Each feed entry redirects to the my.jobs view-source URL.
        self.assertEqual(resp.status_code, 302)
        expected = 'http://my.jobs/%s%d?my.jobs.site.id=%s' %\
            (job['guid'],
             settings.FEED_VIEW_SOURCES['json'],
             str(self.site.pk))
        self.assertEqual(resp['Location'], expected)

    # Sitemap index Test - Since sitemap only builds out updates from the
    # last 30 days, this test will eventually be checking 0 jobs in sitemap
    # TODO, find a way to keep feed dates current. We might be able to use
    # the mock library to override datetime functions
    resp = self.client.get('/sitemap.xml', HTTP_HOST=self.site.domain)
    root = etree.fromstring(resp.content)
    self.assertGreater(len(root), 0)
    crawled_jobs = 0
    for loc, lastmod in root:
        self.assertTrue(loc.text)
        resp = self.client.get(loc.text, HTTP_HOST=self.site.domain)
        self.assertEqual(resp.status_code, 200)
        # Get the first daily sitemap
        urlset = etree.fromstring(resp.content)
        # Check each job in daily sitemap - I'm a bot
        for loc, _, _, _ in urlset:
            resp = self.client.get(loc.text, HTTP_HOST=self.site.domain)
            self.assertEqual(resp.status_code, 200)
            self.assertIn(str(resp.context['the_job'].uid), loc.text)
            crawled_jobs += 1
def prepare_sqs_from_search_params(params, sqs=None):
    """
    Apply q/location/moc/company search parameters to a DESearchQuerySet.

    Inputs:
    :params: mapping with optional keys 'q', 'location', 'moc', 'moc_id',
             'company', 'exact_title'
    :sqs: queryset to build on; a fresh DESearchQuerySet when omitted

    Returns the queryset with filters applied and highlighting enabled.
    """
    # We usually search description twice, so we need a higher boost on
    # title to overcome that.
    title_boost = 10

    raw_title = params.get('q')
    raw_location = params.get('location')
    raw_moc = params.get('moc')
    raw_moc_id = params.get('moc_id')
    raw_company = params.get('company')
    exact_title = bool(params.get('exact_title'))

    if sqs is None:
        sqs = DESearchQuerySet()

    # The Haystack API does not allow for boosting terms in individual
    # fields, so the boosted title query is built by hand in the format
    # Solr documents for field-level boosts: 'q=title:(Accountant)^2'.
    # Parens (instead of quotes) let Solr parse more complex title
    # searches. The string is then handed to SQ via Raw, which ensures
    # exact title matches rank above plain full-text matches.
    cleaned = {value: _clean(value)
               for value in (raw_title, raw_location, raw_moc,
                             raw_moc_id, raw_company) if value}
    q_val = cleaned.get(raw_title)
    moc_val = cleaned.get(raw_moc)
    moc_id_val = cleaned.get(raw_moc_id)
    loc_val = cleaned.get(raw_location)

    # When 'q' is present, filter in two places: the full document
    # ('text'/'content') and the boosted 'title' field, so title matches
    # outrank documents that merely mention the term somewhere.
    if q_val:
        # Escape dashes surrounded by spaces, since they probably aren't
        # intended as negation:
        #   Retail -Sales  searches Retail excluding Sales
        #   Retail - Sales searches 'Retail - Sales'
        wrapped = "(%s)" % transform_search(raw_title.replace(' - ', ' \\- '))
        boosted = u"({t})^{b}".format(t=wrapped, b=title_boost)
        if exact_title:
            sqs = sqs.filter(title_exact__exact=wrapped)
        else:
            content_sq = SQ(content=Raw("((%s))^1" % wrapped))
            title_sq = SQ(title=Raw(boosted))
            # Query description directly so highlighting matches the
            # exact term and not a stem.
            description_sq = SQ(description=Raw(wrapped))
            sqs = sqs.filter(content_sq | title_sq |
                             description_sq).highlight()

    # Location filtering uses the cleaned value against full_loc.
    if loc_val:
        sqs = sqs.filter(full_loc=loc_val)

    if raw_company:
        sqs = sqs.filter(company_exact__exact=raw_company)

    if moc_val:
        # Sites with custom MOC-Onet mappings (SITE_BUIDS set) search the
        # jobs' mapped_moc* fields rather than the plain moc* fields.
        prefix = 'mapped_' if settings.SITE_BUIDS else ''
        if moc_id_val:
            moc_filter = SQ(**{'%smocid' % prefix: moc_id_val})
        else:
            moc_filter = SQ(SQ(**{'%smoc' % prefix: moc_val}) |
                            SQ(**{'%smoc_slab' % prefix: moc_val}))
        sqs = sqs.filter(moc_filter)

    return sqs.highlight()
def facet_data(jsids):
    """
    Return raw facet counts for the full_loc/title/country/state fields,
    narrowed by buid/site package.

    NOTE(review): `jsids` is accepted but not forwarded to
    _sqs_narrow_by_buid_and_site_package — confirm whether the narrowing
    helper picks up ids from elsewhere or this is an oversight.
    """
    sqs = (DESearchQuerySet()
           .facet_limit(-1)
           .facet_sort("count")
           .facet_mincount(1))
    for field in ("full_loc", "title", "country", "state"):
        sqs = sqs.facet(field)
    sqs = _sqs_narrow_by_buid_and_site_package(sqs)
    return sqs.facet_counts()['fields']
def test_unicode_title(self):
    """
    Integration test: unicode facet keywords/titles must survive the
    import -> Solr -> view -> JSON feed -> sitemap pipeline.
    """
    # Test imports
    group = factories.GroupFactory()
    self.site.group = group
    self.site.business_units.add(self.businessunit)
    self.site.save()
    import_jobs.update_solr(self.buid, download=False, delete_feed=False,
                            data_dir='seo/tests/data/')
    solr_jobs = self.conn.search("*:*")
    resp = self.client.get('/')
    # Home page total must equal everything indexed in Solr.
    self.assertEqual(resp.context['total_jobs_count'], solr_jobs.hits)

    # test standard facets against Haystack query
    standard_cf = factories.CustomFacetFactory.build(
        # default facet will return both jobs
        name="Keyword Facet",
        group=group,
        show_production=True)
    standard_cf.save()
    # Unicode keyword attached after the initial save.
    standard_cf.keyword.add(u'Ключевые')
    standard_cf.save()
    standard_site_facet = factories.SeoSiteFacetFactory(
        seosite=self.site,
        customfacet=standard_cf,
        facet_type=factories.SeoSiteFacet.STANDARD)
    standard_site_facet.save()

    # test standard facets against Haystack query
    standard_cf2 = factories.CustomFacetFactory.build(
        # default facet will return both jobs
        name='Country Facet',
        country='United States',
        group=group,
        show_production=True)
    standard_cf2.save()
    standard_site_facet2 = factories.SeoSiteFacetFactory(
        seosite=self.site,
        customfacet=standard_cf2,
        facet_type=factories.SeoSiteFacet.STANDARD)
    standard_site_facet2.save()

    resp = self.client.get('/keyword-facet/new-jobs/',
                           HTTP_HOST=self.site.domain, follow=True)
    sqs = DESearchQuerySet().filter(text=u'Ключевые')
    self.assertEqual(len(resp.context['default_jobs']), sqs.count())
    for facet_widget in resp.context['widgets']:
        # Ensure that no standard facet has more results than current
        # search results
        for count_tuple in facet_widget.items:
            self.assertTrue(sqs.count() >= count_tuple[1])

    # Test default site facets against PySolr query
    from django.core.cache import cache
    cache.clear()
    default_cf = factories.CustomFacetFactory.build(
        name="Default Facet",
        title=u"Специалист",
        group=group,
        show_production=True)
    default_cf.save()
    default_site_facet = factories.SeoSiteFacetFactory(
        seosite=self.site,
        facet_type=factories.SeoSiteFacet.DEFAULT,
        customfacet=default_cf)
    default_site_facet.save()
    resp = self.client.get('/jobs/', HTTP_HOST=self.site.domain,
                           follow=True)
    total_jobs = resp.context['total_jobs_count']
    solr_jobs = self.conn.search(q=u"title:Специалист")
    self.assertEqual(total_jobs, solr_jobs.hits)
    self.assertEqual(len(resp.context['default_jobs']), total_jobs)
    for facet_widget in resp.context['widgets']:
        # NOTE(review): `sqs` here is still the keyword-facet queryset
        # built above, not one for the default facet — confirm this
        # comparison is intentional.
        for count_tuple in facet_widget.items:
            self.assertTrue(sqs.count() >= count_tuple[1])

    # Feed test
    resp = self.client.get('/feed/json', HTTP_HOST=self.site.domain)
    jobs = json.loads(resp.content)
    self.assertEqual(len(jobs), total_jobs)
    for job in jobs:
        resp = self.client.get(job['url'], HTTP_HOST=self.site.domain,
                               follow=False)
        # Each feed entry redirects to the my.jobs view-source URL.
        self.assertEqual(resp.status_code, 302)
        expected = 'https://my.jobs/%s%d?my.jobs.site.id=%s' %\
            (job['guid'],
             settings.FEED_VIEW_SOURCES['json'],
             str(self.site.pk))
        self.assertEqual(resp['Location'], expected)

    # Sitemap index Test - Since sitemap only builds out updates from the
    # last 30 days, this test will eventually be checking 0 jobs in sitemap
    # TODO, find a way to keep feed dates current. We might be able to use
    # the mock library to override datetime functions
    resp = self.client.get('/sitemap.xml', HTTP_HOST=self.site.domain)
    root = etree.fromstring(resp.content)
    self.assertGreater(len(root), 0)
    crawled_jobs = 0
    for loc, lastmod in root:
        self.assertTrue(loc.text)
        resp = self.client.get(loc.text, HTTP_HOST=self.site.domain)
        self.assertEqual(resp.status_code, 200)
        # Get the first daily sitemap
        urlset = etree.fromstring(resp.content)
        # Check each job in daily sitemap - I'm a bot
        for loc, _, _, _ in urlset:
            resp = self.client.get(loc.text, HTTP_HOST=self.site.domain)
            self.assertEqual(resp.status_code, 200)
            self.assertIn(str(resp.context['the_job'].uid), loc.text)
            crawled_jobs += 1
def prepare_sqs_from_search_params(params, sqs=None):
    """
    Filter a DESearchQuerySet using q/location/moc/company parameters.

    Inputs:
    :params: mapping with optional keys 'q', 'location', 'moc', 'moc_id',
             'company', 'exact_title'
    :sqs: queryset to build on; a fresh DESearchQuerySet when omitted

    Returns the queryset with filters applied and highlighting enabled.
    """
    # Description is effectively searched twice, so the title term gets a
    # large boost to outrank it.
    boost = 10

    query_term = params.get('q')
    location_term = params.get('location')
    moc_term = params.get('moc')
    moc_id_term = params.get('moc_id')
    company_term = params.get('company')
    want_exact_title = bool(params.get('exact_title'))

    if sqs is None:
        sqs = DESearchQuerySet()

    # Haystack cannot boost a term within a single field, so the boosted
    # title clause is assembled in the raw Solr syntax documented for
    # field boosts ('q=title:(Accountant)^2'); parens rather than quotes
    # keep complex title searches parseable. The raw string is wrapped in
    # SQ so exact title hits rank above plain full-text hits.
    cleaned = {original: _clean(original)
               for original in (query_term, location_term, moc_term,
                                moc_id_term, company_term) if original}
    q_val = cleaned.get(query_term)
    moc_val = cleaned.get(moc_term)
    moc_id_val = cleaned.get(moc_id_term)
    loc_val = cleaned.get(location_term)

    # A 'q' value is applied both to the full document and to the boosted
    # title field so that title matches surface first.
    if q_val:
        # Escape dashes surrounded by spaces — they probably aren't
        # intended as negation:
        #   Retail -Sales  -> Retail excluding Sales
        #   Retail - Sales -> 'Retail - Sales'
        term = "(%s)" % transform_search(query_term.replace(' - ', ' \\- '))
        boosted_term = u"({t})^{b}".format(t=term, b=boost)
        if want_exact_title:
            sqs = sqs.filter(title_exact__exact=term)
        else:
            # Description is queried directly so highlighting matches the
            # exact term rather than a stem.
            combined = (SQ(content=Raw("((%s))^1" % term)) |
                        SQ(title=Raw(boosted_term)) |
                        SQ(description=Raw(term)))
            sqs = sqs.filter(combined).highlight()

    if loc_val:
        sqs = sqs.filter(full_loc=loc_val)

    if company_term:
        sqs = sqs.filter(company_exact__exact=company_term)

    if moc_val:
        # When the SeoSite defines custom MOC-Onet mappings
        # (SITE_BUIDS), search the mapped_moc* fields instead.
        prefix = 'mapped_' if settings.SITE_BUIDS else ''
        if moc_id_val:
            moc_sq = SQ(**{'%smocid' % prefix: moc_id_val})
        else:
            moc_sq = SQ(SQ(**{'%smoc' % prefix: moc_val}) |
                        SQ(**{'%smoc_slab' % prefix: moc_val}))
        sqs = sqs.filter(moc_sq)

    return sqs.highlight()
def _clean(term):
    """Sanitize a raw search term via Haystack's query cleaning."""
    query = DESearchQuerySet().query
    return query.clean(term)
def get_jobs(custom_facets=None, exclude_facets=None, jsids=None,
             default_sqs=None, filters=None, fields=None, facet_limit=250,
             facet_sort="count", facet_offset=None, mc=1,
             sort_order='relevance', fl=search_fields,
             additional_fields=None):
    """
    Returns 3-tuple containing a DESearchQuerySet object, a set of facet
    counts that have been filtered, and a set of unfiltered facet counts.

    The unfiltered facet count object is used by the primary nav box to
    build items and options.

    Inputs:
    :custom_facets: A list of saved searches (Custom Facets) to apply to
                    sqs. Defaults to site's default custom facets set in
                    Middleware
    :exclude_facets: Queryset of ExcludeFacets to exclude from sqs
    :jsids: Business unit ids used to narrow the queryset
    :default_sqs: Starting search query set
    :filters: Dictionary of filter terms in field_name:search_term format
    :fields: Field list that replaces fl when given
    :sort_order: Key into sort_order_mapper; unknown keys fall back to
                 '-score'
    :fl: Default field list retrieved for each result
    :additional_fields: List of fields prepended to the chosen field list

    The following inputs are Solr parameters.
    :facet_limit: max number of facets to return per field. -1=unlimited
    :facet_sort: How to sort facets
    :facet_offset: offset into the facet list
    :mc: mincount - Smallest facet size to return
    """
    # Use a None sentinel instead of a literal {} default: a mutable
    # default dict would be shared across all calls.
    if filters is None:
        filters = {}

    if default_sqs is not None:
        sqs = default_sqs
    else:
        sqs = DESearchQuerySet()
    sqs = sqs_apply_custom_facets(custom_facets, sqs, exclude_facets)
    sqs = _sqs_narrow_by_buid_and_site_package(sqs, buids=jsids)
    sqs = sqs.order_by(sort_order_mapper.get(sort_order, '-score'))

    # The boost function added to this search query set scales relevancy
    # scores by a factor of 1/2 at ~6 months (1.8e-11 ms) in all future
    # queries
    sqs = sqs.bf('recip(ms(NOW/HOUR,salted_date),1.8e-9,1,1)')

    # Limit the retrieved results to only fields that are actually needed:
    # explicit `fields` wins over `fl`, and `additional_fields` are
    # prepended to whichever was chosen.
    if fields:
        field_list = list(fields)
    else:
        field_list = list(fl)
    if additional_fields:
        field_list = additional_fields + field_list
    sqs = sqs.fields(field_list)

    if facet_offset:
        sqs = sqs.facet_offset(facet_offset)
    if facet_limit > 0:
        sqs = sqs.facet_limit(facet_limit)

    sqs = sqs.facet_sort(facet_sort).facet_mincount(mc)
    sqs = sqs.facet("city_slab").facet("state_slab").facet("country_slab")\
             .facet("moc_slab").facet("title_slab").facet("full_loc")\
             .facet("company_slab").facet("buid").facet("mapped_moc_slab")

    # When get_jobs is called from job_listing_by_slug_tag, sqs already has
    # site default facets and filters from URL applied. The call to
    # filter_sqs concatenates the querystring (q=querystring) with itself,
    # adding + operators and causing parsing errors for more complex custom
    # facets. Can't remove now until we verify other views don't rely on
    # this call to filter_sqs.
    # Jason McLaughlin 09-07-2012
    return filter_sqs(sqs, filters)