def proxy_solr(self, action):
    """Proxy a Solr query from the current request straight through to Solr.

    Query parameters come from the request URL when present, otherwise the
    request body is interpreted as a Python-literal dict.  The raw Solr
    response is returned with a Content-Type matching the requested ``wt``
    response writer (default: xml).
    """
    url = urlparse.urlparse(h.full_current_url())
    if url.query != '':
        data = urlparse.parse_qs(urllib.unquote(url.query).decode('utf-8'))
    else:
        # POST case: body is expected to be a Python-literal dict.
        # NOTE(review): literal_eval accepts any Python literal — confirm
        # upstream validation restricts this to trusted callers.
        data = ast.literal_eval(p.toolkit.request.body)
    content_type = data.get('wt', 'xml')
    if isinstance(content_type, list):
        # parse_qs returns each value as a list; keep the first entry.
        content_type = content_type[0]
    ckan_response = p.toolkit.response
    ckan_response.content_type = CONTENT_TYPES[content_type]
    solr_response = ''
    if content_type == 'csv':
        ckan_response.headers['Content-Disposition'] = 'attachment; filename=query.csv'
        # Prepend a UTF-8 BOM so spreadsheet apps detect the encoding.
        solr_response = str(codecs.BOM_UTF8)
    conn = make_connection()
    try:
        solr_response += conn.raw_query(**data)
        ckan_response.body = solr_response
    except SolrException as e:  # fixed py2-only `except SolrException, e`
        # Mirror Solr's error status and body back to the client.
        ckan_response.status_int = e.httpcode
        ckan_response.status = str(e.httpcode) + ' ' + e.reason
        ckan_response.body = e.body
def more_like_this(pkg, count=5):
    """Return up to *count* datasets similar to *pkg*, via Solr MoreLikeThis.

    :param pkg: package dict; only ``pkg['id']`` is used.
    :param count: maximum number of similar datasets to return.
    """
    from ckan.common import config
    import ckan.plugins.toolkit as toolkit
    from ckan.lib.search.common import make_connection
    solr = make_connection()
    query = 'id:"{}"'.format(pkg['id'])
    # Fields Solr compares for similarity, and fields returned per hit.
    fields_to_compare = 'text title notes'
    fields_to_return = 'name title score'
    site_id = config.get('ckan.site_id')
    # Restrict matches to public, active datasets of this CKAN instance.
    filter_query = '''
        +site_id:"{}" +dataset_type:dataset +state:active +capacity:public
    '''.format(site_id)
    results = solr.more_like_this(q=query, mltfl=fields_to_compare,
                                  fl=fields_to_return, fq=filter_query,
                                  rows=count)
    # we want the dataset objects for each item in docs
    datasets = []
    for record in results.docs:
        context = {}
        # NOTE(review): the action's return value is discarded; this relies
        # on package_show storing the package into the mutable ``context``
        # dict as a side effect — confirm that is the intended contract.
        toolkit.get_action('package_show')(context, {'id': record['name']})
        datasets.append(context['package'])
    return datasets
def get_index(self, reference):
    """Return the indexed Solr document for a package, by name or id.

    :raises SearchError: on a Solr error, local-parameter queries, or when
        the package is not present in the index.
    """
    q_string = 'name:"%s" OR id:"%s"' % (reference, reference)
    query = dict(
        rows=1,
        q=q_string,
        wt='json',
        fq='site_id:"%s"' % config.get('ckan.site_id'),
    )
    try:
        has_local_params = query['q'].startswith('{!')
    except KeyError:
        has_local_params = False
    if has_local_params:
        # Refuse Solr local-parameter syntax outright.
        raise SearchError('Local parameters are not supported.')
    conn = make_connection(decode_dates=False)
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.search(**query)
    except pysolr.SolrError as e:
        raise SearchError('SOLR returned an error running query: %r Error: %r' % (query, e))
    if solr_response.hits == 0:
        raise SearchError('Dataset not found in the search index: %s' % reference)
    return solr_response.docs[0]
def harvest_source_index_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but
    keeps the source itself.  This is useful to clean history of long running
    harvest sources to start again fresh.

    :param id: the id of the harvest source to clear
    :type id: string
    '''
    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id')
    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)
    # Use the canonical id from the source record.
    harvest_source_id = source.id
    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        # Commit immediately unless disabled in config.
        if asbool(config.get('ckan.search.solr_commit', 'true')):
            conn.commit()
    except Exception as e:  # fixed py2-only `except Exception, e`
        log.exception(e)
        raise SearchIndexError(e)
def get_index(self, reference):
    """Fetch the Solr document for the package identified by *reference*.

    *reference* may be a package name or id.  Raises ``SearchError`` for
    Solr failures, unsupported local-parameter queries, or a missing
    dataset.
    """
    site_id = config.get('ckan.site_id')
    query = {}
    query['rows'] = 1
    query['q'] = 'name:"%s" OR id:"%s"' % (reference, reference)
    query['wt'] = 'json'
    query['fq'] = 'site_id:"%s"' % site_id
    try:
        if query['q'].startswith('{!'):
            raise SearchError('Local parameters are not supported.')
    except KeyError:
        pass
    conn = make_connection(decode_dates=False)
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.search(**query)
    except pysolr.SolrError as e:
        raise SearchError(
            'SOLR returned an error running query: %r Error: %r' %
            (query, e))
    if solr_response.hits == 0:
        raise SearchError('Dataset not found in the search index: %s' %
                          reference)
    return solr_response.docs[0]
def get_product_issues(context, data_dict):
    # noinspection PyUnresolvedReferences
    """
    Returns a list of the issues for the product ID
    :param: productId: A non-data product ID.

    :return: A dictionary containing the issues for the specified product
    :rtype: dict
    """
    product_id = _get_or_bust(data_dict, 'productId')
    slr = make_connection()
    raw = slr.raw_query(
        q='top_parent_id:{pid}'.format(pid=product_id),
        group='true',
        group_field='issue_number_int',
        wt='json',
        sort='issue_number_int desc',
        # FIXME: We need to actually paginate on this, but the daily
        # team will not accept it (yet).
        rows='2000000')
    response = json.loads(raw)
    issue_no_group = response['grouped']['issue_number_int']
    issues = []
    for grp in issue_no_group['groups']:
        issues.append({
            'issue': grp['groupValue'],
            'number_articles': grp['doclist']['numFound'],
        })
    return issues
def tag_counts(context, data_dict):
    """Get the most popular tag counts (Not all tags).

    This is a much faster implementation that the current ckan tag counts
    by directly going into Solr and doing a facet search on tags
    """
    from ckan.lib.search.common import make_connection, SearchError, SearchQueryError
    params = {
        'rows': 0,
        'q': '*:*',
        'wt': 'json',
        'fq': 'site_id:"%s"' % config.get('ckan.site_id'),
        'facet': 'true',
        'facet.field': 'tags',
    }
    try:
        conn = make_connection()
        raw_response = conn.raw_query(**params)
        parsed = json.loads(raw_response)
        # Solr returns a flat [tag, count, tag, count, ...] list;
        # fold it into [tag, count] pairs.
        solr_tags = parsed["facet_counts"]["facet_fields"]["tags"]
        results = [[name, cnt]
                   for name, cnt in zip(solr_tags[0::2], solr_tags[1::2])]
    except Exception as e:
        raise SearchError("Failed to obtain and parse tag counts. " + str(e))
    return results
def harvest_source_index_clear(context, data_dict):
    """Delete every Solr document belonging to one harvest source.

    The source itself is kept.  The Solr connection is always closed, and a
    commit is issued unless ``ckan.search.solr_commit`` is disabled.
    Returns a dict with the id of the cleared source.
    """
    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id', None)
    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)
    harvest_source_id = source.id
    site_id = config.get('ckan.site_id')
    solr_commit = asbool(config.get('ckan.search.solr_commit', 'true'))
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, site_id)
    conn = make_connection()
    try:
        conn.delete_query(query)
        if solr_commit:
            conn.commit()
    except Exception as e:
        log.exception(e)
        raise SearchIndexError(e)
    finally:
        conn.close()
    return {'id': harvest_source_id}
# NOTE(review): this definition is truncated — it ends with a ``finally:``
# clause that has no body (the cleanup suite appears to have been lost
# upstream), so it is not valid Python as-is.  Reproduced verbatim below;
# restore the finally suite (the sibling implementation closes the
# connection there) before use.
def harvest_source_index_clear(context, data_dict): ''' Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself. This is useful to clean history of long running harvest sources to start again fresh. :param id: the id of the harvest source to clear :type id: string ''' check_access('harvest_source_clear', context, data_dict) harvest_source_id = data_dict.get('id') source = HarvestSource.get(harvest_source_id) if not source: log.error('Harvest source %s does not exist', harvest_source_id) raise NotFound('Harvest source %s does not exist' % harvest_source_id) harvest_source_id = source.id conn = make_connection() query = ''' +%s:"%s" +site_id:"%s" ''' % ( 'harvest_source_id', harvest_source_id, config.get('ckan.site_id')) solr_commit = toolkit.asbool(config.get('ckan.search.solr_commit', 'true')) if toolkit.check_ckan_version(max_version='2.5.99'): # conn is solrpy try: conn.delete_query(query) if solr_commit: conn.commit() except Exception, e: log.exception(e) raise SearchIndexError(e) finally:
def commit():
    """Issue a Solr commit without waiting for the new searcher.

    :raises SearchIndexError: if connecting or committing fails.
    """
    try:
        conn = make_connection()
        # wait_searcher=False returns as soon as the commit is accepted.
        conn.commit(wait_searcher=False)
    except Exception as e:  # fixed py2-only `except Exception, e`
        log.exception(e)
        raise SearchIndexError(e)
def proxy_solr(self, action):
    """Proxy a Solr query (GET query string or literal-dict POST body) to
    Solr and relay the raw response, with Content-Type chosen from ``wt``.
    """
    url = urlparse.urlparse(h.full_current_url())
    if url.query != '':
        data = urlparse.parse_qs(urllib.unquote(url.query).decode('utf-8'))
    else:
        # NOTE(review): literal_eval accepts any Python literal — confirm
        # upstream validation restricts this to trusted callers.
        data = ast.literal_eval(p.toolkit.request.body)
    content_type = data.get('wt', 'xml')
    if isinstance(content_type, list):
        # parse_qs values are lists; take the first entry.
        content_type = content_type[0]
    ckan_response = p.toolkit.response
    ckan_response.content_type = CONTENT_TYPES[content_type]
    solr_response = ''
    if content_type == 'csv':
        ckan_response.headers[
            'Content-Disposition'] = 'attachment; filename=query.csv'
        # UTF-8 BOM so spreadsheet apps detect the encoding.
        solr_response = str(codecs.BOM_UTF8)
    conn = make_connection()
    try:
        solr_response += conn.raw_query(**data)
        ckan_response.body = solr_response
    except SolrException as e:  # fixed py2-only `except SolrException, e`
        ckan_response.status_int = e.httpcode
        ckan_response.status = str(e.httpcode) + ' ' + e.reason
        ckan_response.body = e.body
def get_product_issues(context, data_dict):
    # noinspection PyUnresolvedReferences
    """
    Returns a list of the issues for the product ID
    :param: productId: A non-data product ID.

    :return: A dictionary containing the issues for the specified product
    :rtype: dict
    """
    product_id = _get_or_bust(data_dict, 'productId')
    slr = make_connection()
    solr_params = {
        'q': 'top_parent_id:{pid}'.format(pid=product_id),
        'group': 'true',
        'group_field': 'issue_number_int',
        'wt': 'json',
        'sort': 'issue_number_int desc',
        # FIXME: We need to actually paginate on this, but the daily
        # team will not accept it (yet).
        'rows': '2000000',
    }
    response = json.loads(slr.raw_query(**solr_params))
    groups = response['grouped']['issue_number_int']['groups']
    return [{'issue': g['groupValue'],
             'number_articles': g['doclist']['numFound']}
            for g in groups]
def get_similar_datasets(id, max_num=5):
    '''
    Get similar datasets for a dataset.

    :param string id: ID of the target dataset. This must be the actual ID,
        passing the name is not supported.

    :param int max_num: Maximum number of datasets to return.

    :return: A list of similar dataset dicts sorted by decreasing score.
    '''
    solr = make_connection()
    query = 'id:"{}"'.format(id)
    # Compare documents on the full-text field only.
    fields_to_compare = 'text'
    fields_to_return = 'id validated_data_dict score'
    site_id = config.get('ckan.site_id')
    # Restrict matches to public, active datasets of this instance.
    filter_query = '''
        +site_id:"{}" +dataset_type:dataset +state:active +capacity:public
    '''.format(site_id)
    results = solr.more_like_this(q=query,
                                  mltfl=fields_to_compare,
                                  fl=fields_to_return,
                                  fq=filter_query,
                                  rows=max_num)
    # Removed leftover debug print() calls that duplicated these
    # log.debug lines on stdout.
    log.debug('Similar datasets for {}:'.format(id))
    for doc in results.docs:
        log.debug('    {id} (score {score})'.format(**doc))
    return [json.loads(doc['validated_data_dict']) for doc in results.docs]
def get_all_entity_ids(self, max_results=1000):
    """
    Return a list of the IDs of all indexed packages.
    """
    # Restrict to active packages of this CKAN instance.
    fq = '+site_id:"%s" ' % config.get('ckan.site_id')
    fq = fq + "+state:active "
    conn = make_connection()
    response = conn.search("*:*", fq=fq, rows=max_results, fields='id')
    return [doc.get('id') for doc in response.docs]
def get_index(self, reference):
    """Query Solr for the package identified by *reference* (name or id).

    :raises SearchError: if Solr returns an error for the query.
    """
    query = {
        "rows": 1,
        "q": 'name:"%s" OR id:"%s"' % (reference, reference),
        "wt": "json",
        "fq": 'site_id:"%s"' % config.get("ckan.site_id"),
    }
    conn = make_connection()
    log.debug("Package query: %r" % query)
    try:
        # NOTE(review): the visible block discards solr_response — the
        # result-extraction tail may have been lost upstream; confirm.
        solr_response = conn.raw_query(**query)
    except SolrException as e:  # fixed py2-only `except SolrException, e`
        raise SearchError("SOLR returned an error running query: %r Error: %r" % (query, e.reason))
def get_index(self, reference):
    """Query Solr for the package identified by *reference* (name or id).

    :raises SearchError: if Solr returns an error for the query.
    """
    query = {
        'rows': 1,
        # Quote the reference so names containing Solr query syntax
        # (spaces, colons, ...) match literally — consistent with the
        # sibling get_index implementations in this file.
        'q': 'name:"%s" OR id:"%s"' % (reference, reference),
        'wt': 'json',
        'fq': 'site_id:"%s"' % config.get('ckan.site_id')}
    conn = make_connection()
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.raw_query(**query)
    except SolrException as e:  # fixed py2-only `except SolrException, e`
        raise SearchError('SOLR returned an error running query: %r Error: %r' % (query, e.reason))
def ogdch_autosuggest(context, data_dict):
    """Return autocomplete terms from the per-language Solr suggester.

    Requires ``q`` and ``lang`` in data_dict; ``fq`` is optional and is
    always combined with a 'NOT private' restriction.
    """
    q = get_or_bust(data_dict, 'q')
    lang = get_or_bust(data_dict, 'lang')
    fq = data_dict.get('fq', '')
    # Never suggest private content, regardless of caller-supplied fq.
    if fq:
        fq = 'NOT private AND %s' % fq
    else:
        fq = 'NOT private'

    # parse language from values like de_CH
    if len(lang) > 2:
        lang = lang[:2]

    if lang not in ['en', 'it', 'de', 'fr']:
        raise ValidationError('lang must be one of [en, it, de, fr]')

    # One suggest handler/dictionary per language.
    handler = '/suggest_%s' % lang
    suggester = 'ckanSuggester_%s' % lang
    solr = make_connection()
    try:
        log.debug(
            'Loading suggestions for %s (lang: %s, fq: %s)' % (q, lang, fq)
        )
        results = solr.search(
            '',
            search_handler=handler,
            **{'suggest.q': q, 'suggest.count': 10, 'suggest.cfq': fq}
        )
        # NOTE(review): `.values()[0]` only works on Python 2 — on Python 3
        # dict_values is not indexable; confirm the target interpreter.
        suggestions = results.raw_response['suggest'][suggester].values()[0]  # noqa

        def highlight(term, q):
            # Wrap the first case-insensitive, accent-insensitive match of
            # q within term in <b> tags; leave already-highlighted terms.
            if '<b>' in term:
                return term
            clean_q = unidecode(q)
            clean_term = unidecode(term)

            re_q = re.escape(clean_q)
            m = re.search(re_q, clean_term, re.I)
            if m:
                replace_text = term[m.start():m.end()]
                term = term.replace(replace_text, '<b>%s</b>' % replace_text)
            return term

        terms = [highlight(suggestion['term'], q) for suggestion in suggestions['suggestions']]  # noqa
        # Deduplicate; NOTE(review): set() does not preserve ranking order.
        return list(set(terms))
    except pysolr.SolrError as e:
        log.exception('Could not load suggestions from solr: %s' % e)
        raise ActionError('Error retrieving suggestions from solr')
def get_index(self, reference):
    """Query Solr (pysolr backend) for the package named/id'd *reference*.

    :raises SearchError: if Solr returns an error for the query.
    """
    query = {
        'rows': 1,
        'q': 'name:"%s" OR id:"%s"' % (reference, reference),
        'wt': 'json',
        'fq': 'site_id:"%s"' % config.get('ckan.site_id')}
    conn = make_connection(decode_dates=False)
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.search(**query)
    except pysolr.SolrError as e:  # fixed py2-only `except pysolr.SolrError, e`
        raise SearchError('SOLR returned an error running query: %r Error: %r' % (query, e))
def proxy_solr(self, action):
    """Proxy a GET Solr query taken from the current request URL to Solr,
    relaying the raw response with Content-Type chosen from ``wt``.
    """
    url = urlparse.urlparse(h.full_current_url())
    query = urlparse.parse_qs(urllib.unquote(url.query).decode('utf-8'))
    # parse_qs values are lists; take the first `wt` entry (default: xml).
    content_type = query.get('wt', ['xml'])[0]
    ckan_response = p.toolkit.response
    ckan_response.content_type = CONTENT_TYPES[content_type]
    conn = make_connection()
    try:
        solr_response = conn.raw_query(**query)
        ckan_response.body = solr_response
    except SolrException as e:  # fixed py2-only `except SolrException, e`
        # Mirror Solr's error status and body back to the client.
        ckan_response.status_int = e.httpcode
        ckan_response.status = str(e.httpcode) + ' ' + e.reason
        ckan_response.body = e.body
def textcomplete(self):
    """proxies an textcomplete query to the solr suggest search-handler"""
    # TODO: must autocomplete take the map-extent in consideration?
    q = request.params["q"]
    handler = SearchHandler(make_connection(), '/suggest')
    res = handler(q=q, wt='json')
    payload = []
    if res:
        suggestions = res.spellcheck['suggestions']
        if q in suggestions.keys():
            payload = suggestions[q]['suggestion']
    return json.dumps(payload)
def get_all_entity_ids(self, max_results=1000):
    """
    Return a list of the IDs of all indexed packages.
    """
    fq = '+site_id:"%s" ' % config.get("ckan.site_id")
    fq = fq + "+state:active "
    conn = make_connection()
    try:
        response = conn.query("*:*", fq=fq, rows=max_results, fields="id")
    finally:
        # Always release the Solr connection.
        conn.close()
    return [hit.get("id") for hit in response.results]
def get_index(self, reference):
    """Query Solr for the package identified by *reference* (name or id).

    :raises SearchError: if Solr returns an error for the query.
    """
    query = {
        'rows': 1,
        # Quote the reference so names containing Solr query syntax match
        # literally — consistent with the sibling get_index variants.
        'q': 'name:"%s" OR id:"%s"' % (reference, reference),
        'wt': 'json',
        'fq': 'site_id:"%s"' % config.get('ckan.site_id')
    }
    conn = make_connection()
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.raw_query(**query)
    except SolrException as e:  # fixed py2-only `except SolrException, e`
        raise SearchError(
            'SOLR returned an error running query: %r Error: %r' %
            (query, e.reason))
def update_package_stats(package_id, stats):
    """Write stat counters onto a package's Solr document.

    Each entry in *stats* is stored as a zero-padded ``extras_<key>``
    string so lexicographic sorting matches numeric order.  Errors are
    logged and swallowed (best-effort update).
    """
    try:
        conn = make_connection()
        query = "id:%s" % package_id
        res = conn.search(q=query)
        if res and res.docs:
            pkg_dict = res.docs[0]
            for key, value in stats.items():
                # Right-justify to 24 chars so string sort == numeric sort.
                pkg_dict["extras_%s" % key] = str(value or '0').rjust(24, '0')
            if '_version_' in pkg_dict:
                # Drop the optimistic-concurrency field so Solr accepts the add.
                del pkg_dict['_version_']
            conn.add(docs=[pkg_dict], commit=True)
    except pysolr.SolrError as e:  # fixed py2-only `except pysolr.SolrError, e`
        log.error("Solr returned error: %s", e)
        log.exception(e)
    return
def get_index(self, reference):
    """
    For a given package reference (ID or name), returns the
    record for it from the SOLR index.

    :raises SearchError: if Solr returns an error for the query.
    """
    query = {
        "rows": 1,
        # Quote the reference so names with Solr query syntax match
        # literally — consistent with the sibling get_index variants.
        "q": 'name:"%s" OR id:"%s"' % (reference, reference),
        "wt": "json",
        "fq": 'site_id:"%s"' % config.get("ckan.site_id"),
    }
    conn = make_connection()
    log.debug("Package query: %r" % query)
    try:
        solr_response = conn.raw_query(**query)
    except SolrException as e:  # fixed py2-only `except SolrException, e`
        raise SearchError("SOLR returned an error running query: %r Error: %r" % (query, e.reason))
def increment_total_downloads(package_id):
    """Increment ``extras_total_downloads`` on the package's Solr document.

    Stored zero-padded to 24 chars so string sort matches numeric order.
    Errors are logged and swallowed (best-effort update).
    """
    try:
        conn = make_connection()
        query = "id:%s" % package_id
        res = conn.search(q=query)
        if res and res.docs:
            pkg_dict = res.docs[0]
            total_downloads = int(pkg_dict.get('extras_total_downloads', 0))
            total_downloads += 1
            pkg_dict["extras_total_downloads"] = str(total_downloads).rjust(
                24, '0')
            if '_version_' in pkg_dict:
                # Drop the optimistic-concurrency field so Solr accepts the add.
                del pkg_dict['_version_']
            conn.add(docs=[pkg_dict], commit=True)
    except pysolr.SolrError as e:  # fixed py2-only `except pysolr.SolrError, e`
        log.error("Solr returned error: %s", e)
        log.exception(e)
    return
def harvest_source_index_clear(context, data_dict):
    """Delete every Solr document belonging to one harvest source.

    :raises NotFound: if the harvest source does not exist.
    :raises SearchIndexError: if the Solr delete/commit fails.
    """
    check_access("harvest_source_clear", context, data_dict)
    harvest_source_id = data_dict.get("id", None)
    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error("Harvest source %s does not exist", harvest_source_id)
        raise NotFound("Harvest source %s does not exist" % harvest_source_id)
    harvest_source_id = source.id
    conn = make_connection()
    query = """ +%s:"%s" +site_id:"%s" """ % ("harvest_source_id", harvest_source_id, config.get("ckan.site_id"))
    try:
        conn.delete_query(query)
        # Commit immediately unless disabled in config.
        if asbool(config.get("ckan.search.solr_commit", "true")):
            conn.commit()
    except Exception as e:  # fixed py2-only `except Exception, e`
        log.exception(e)
        raise SearchIndexError(e)
def harvest_source_index_clear(context, data_dict):
    """Delete every Solr document belonging to one harvest source.

    :raises NotFound: if the harvest source does not exist.
    :raises SearchIndexError: if the Solr delete/commit fails.
    """
    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id', None)
    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)
    harvest_source_id = source.id
    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        # Commit immediately unless disabled in config.
        if asbool(config.get('ckan.search.solr_commit', 'true')):
            conn.commit()
    except Exception as e:  # fixed py2-only `except Exception, e`
        log.exception(e)
        raise SearchIndexError(e)
def delete_asset(ast_dict, defer_commit=False):
    """Remove asset document(s) from the Solr index.

    Scope depends on ``ast_dict``: ``remove_all_assets`` clears every asset,
    ``whole_resource`` clears all assets of one resource, otherwise a single
    asset (``id`` + ``assetID``) is removed.

    :raises SearchIndexError: if the Solr delete/commit fails.
    """
    conn = make_connection()
    if ast_dict.get('remove_all_assets'):
        index = ''
    elif ast_dict.get('whole_resource'):
        index = ' +id:{id} '.format(id=ast_dict['whole_resource'])
    else:
        index = ' +index_id:\"{index}\"'.format(
            index=_get_index_id(ast_dict['id'], ast_dict['assetID'])
        )
    query = "+{type}:{asset} {index} +site_id:\"{site}\"".format(
        type=TYPE_FIELD, asset=ASSET_TYPE, index=index,
        site=config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        if not defer_commit:
            conn.commit()
    except Exception as e:  # fixed py2-only `except Exception, e`
        log.exception(e)
        raise SearchIndexError(e)
def get_connection(self):
    u'''Creates new raw connection to Solr.
    '''
    conn = make_connection()
    return conn
def run(query):
    '''
    Performs a asset search using the given query.

    @param query - dictionary with keys like: q, fq, sort, rows, facet
    @return - dictionary with keys results and count

    May raise SearchQueryError or SearchError.
    '''
    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results
    rows_to_return = min(1000, int(query.get('rows', 20)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if not '+site_id:' in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')
    # hide x-family image types unless the caller filtered on type/mimetype
    if not '+type:' in q and not '+mimetype:' in q and not '+type:' in fq and not '+mimetype:' in fq:
        fq += ' -type:image/x* -mimetype:image/x* '

    # filter for asset entity_type
    if not '+entity_type:' in fq:
        fq += " +entity_type:asset"
    if not '+state:' in q and not '+state:' in fq:
        fq += " -state:hidden -state:deleted"

    user = c.userobj
    # if user and (user.sysadmin or user.email.endswith('@act.gov.au')): pass
    if user and user.sysadmin:
        # sysadmins see everything — no private-package exclusions
        pass
    else:
        # Collect the orgs the user belongs to (plus child orgs and
        # sibling orgs via the parent), then exclude private packages
        # owned by any org outside that set.
        user_groups = []
        if user:
            for group in user.get_groups():
                user_groups.append(group.id)
                # get all child orgs
                user_groups.extend([
                    item.table_id for item in filter(
                        lambda x: x.capacity == 'child_organization' and x.state == 'active',
                        group.member_all
                    )
                ])
                # get all brothers
                parents = model.Session.query(model.Group)\
                    .filter(model.Group.id.in_([
                        item.table_id for item in filter(
                            lambda x: x.capacity == 'parent_organization' and x.state == 'active',
                            group.member_all
                        )
                    ])).all()
                for parent in parents:
                    user_groups.extend([
                        item.table_id for item in filter(
                            lambda x: x.capacity == 'child_organization' and x.state == 'active',
                            parent.member_all
                        )
                    ])
        private_query = model.Session.query(model.Package.id, model.Package.owner_org).\
            filter(model.Package.private == True)
        if user_groups:
            private_query = private_query.filter(~model.Package.owner_org.in_(user_groups))
        private = private_query.all()
        for id in private:
            fq += " -package_id:{id}".format(id=id[0])
    query['fq'] = [fq]
    fq_list = query.get('fq_list', [])
    query['fq'].extend(fq_list)

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit', config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the asset ID and search scores
    query['fl'] = query.get('fl', 'data_dict')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search and do use dismax.
    defType = query.get('defType', 'dismax')
    if ':' not in query['q'] or defType == 'edismax':
        query['defType'] = defType
        query['tie'] = query.get('tie', '0.1')
        # this minimum match is explained
        # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
        query['mm'] = query.get('mm', '2<-1 5<80%')
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection()
    # log.debug('Asset query: %r' % query)
    try:
        solr_response = conn.raw_query(**query)
    except Exception as e:  # fixed py2-only `except Exception, e`
        # BUGFIX: the old code formatted `e.reason`, but a generic
        # Exception has no .reason attribute, so the handler itself raised
        # AttributeError and masked the real error.  Format the exception.
        raise SearchError('SOLR returned an error running query: %r Error: %r' % (query, e))
def run(self, query):
    '''
    Performs a dataset search using the given query.

    @param query - dictionary with keys like: q, fq, sort, rows, facet
    @return - dictionary with keys results and count

    May raise SearchQueryError or SearchError.
    '''
    assert isinstance(query, (dict, MultiDict))
    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [
            s for s in set(query.keys()) - VALID_SOLR_PARAMETERS
        ]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results
    rows_to_return = min(1000, int(query.get('rows', 10)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # order by score if no 'sort' term given
    order_by = query.get('sort')
    if order_by == 'rank' or order_by is None:
        query['sort'] = 'score desc, name asc'

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if not '+site_id:' in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if not '+state:' in fq:
        fq += " +state:active"
    query['fq'] = fq

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get(
        'facet.limit', config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search and do use dismax.
    if ':' not in query['q']:
        query['defType'] = 'dismax'
        query['tie'] = '0.1'
        # this minimum match is explained
        # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
        query['mm'] = '2<-1 5<80%'
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection()
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.raw_query(**query)
    except SolrException as e:  # fixed py2-only `except SolrException, e`
        raise SearchError(
            'SOLR returned an error running query: %r Error: %r' %
            (query, e.reason))
def index_asset(ast_dict, defer_commit=False):
    """Normalize an asset dict and send it to the Solr index.

    Python-2-only code (uses ``unicode``, ``except x, e`` syntax and
    ``dict.keys()`` list concatenation).  Mutates and re-builds *ast_dict*
    heavily before posting it with ``conn.add_many``.
    """
    if ast_dict is None:
        return
    ast_dict[TYPE_FIELD] = ASSET_TYPE
    ast_dict['capacity'] = 'public'
    if not ast_dict.get('package_id'):
        # NOTE(review): `resource` is not defined in this scope — this
        # branch raises NameError if package_id is missing; confirm.
        ast_dict['package_id'] = session.query(model.Resource).filter_by(id=resource).first().get_package_id()
    # Sentinel date used to detect "dateutil filled in the blanks".
    bogus_date = datetime.datetime(1, 1, 1)
    try:
        ast_dict['metadata_created'] = parse(ast_dict['lastModified'][:19], default=bogus_date).isoformat() + 'Z'
    except ValueError:
        ast_dict['metadata_created'] = None
    ast_dict['metadata_modified'] = datetime.datetime.now().isoformat()[:19] + 'Z'
    # metadata may arrive JSON-encoded (possibly double-encoded).
    if type(ast_dict['metadata']) in (unicode, str):
        try:
            ast_dict['metadata'] = json.loads(_unjson_base(ast_dict['metadata']))
        except ValueError:
            ast_dict['metadata'] = json.loads(_unjson(ast_dict['metadata']))
    # Copy selected EXIF keys, replacing ':' with '_' (Solr-safe names).
    try:
        if 'exif' in ast_dict['metadata']:
            for ex_key, ex_val in ast_dict['metadata']['exif'].items():
                if ex_key in ['EXIF:CreateDate', 'EXIF:Model', 'EXIF:Artist', 'EXIF_CreateDate', 'EXIF_Model', 'EXIF_Artist']:
                    if type(ex_val) in (unicode, str):
                        ast_dict['metadata'][ex_key.replace(':','_')] = ex_val
    except:
        pass
    for field in ('organization', 'text', 'notes'):
        if not ast_dict['metadata'].get(field):
            # NOTE(review): 'text' is remapped to the empty-string key —
            # looks intentional but undocumented; confirm.
            if field == 'text':
                field = ''
            ast_dict[field] = None
    # Fall back to metadata text/description for the notes field.
    if 'text' in ast_dict['metadata'] and not ast_dict['notes']:
        ast_dict['notes'] = ast_dict['metadata']['text']
    elif 'description' in ast_dict['metadata'] and not ast_dict['notes']:
        ast_dict['notes'] = ast_dict['metadata']['description']
    if not 'state' in ast_dict['metadata']:
        ast_dict['metadata']['state'] = 'active'
    # Mirror type <-> mimetype so both are always present.
    for field in (('type', 'mimetype'),('mimetype', 'type')):
        if field[0] in ast_dict['metadata'] and field[1] not in ast_dict['metadata']:
            ast_dict['metadata'][field[1]] = ast_dict['metadata'][field[0]]
    if not 'mimetype' in ast_dict['metadata']:
        ast_dict['metadata']['mimetype'] = 'image/jpeg'
    # Tags: accept comma-separated string or list-like; else empty.
    tags = ast_dict['metadata'].get('tags')
    if type(tags) in (str, unicode):
        tags = [name.strip() for name in tags.split(',') if name]
    if type(tags) not in (list, tuple, set):
        tags = []
    ast_dict['tags'] = tags
    # Snapshot of the full dict, stored alongside the indexed fields.
    ast_dict['data_dict'] = json.dumps(ast_dict)
    index_fields = RESERVED_FIELDS + ast_dict.keys()
    # include the extras in the main namespace
    extras = ast_dict['metadata']
    for extra in extras:
        key, value = extra, extras[extra]
        if isinstance(value, (tuple, list)):
            value = " ".join(map(unicode, value))
        key = ''.join([c for c in key if c in KEY_CHARS])
        ast_dict['extras_' + key] = value
        if key not in index_fields:
            ast_dict[key] = value
    ast_dict.pop('metadata', None)
    context = {'model': model}
    # clean the dict fixing keys
    new_dict = {}
    for key, value in ast_dict.items():
        key = key.encode('ascii', 'ignore')
        new_dict[key] = value
    ast_dict = new_dict
    for k in ('title', 'notes', 'title_string', 'name'):
        if k in ast_dict and ast_dict[k]:
            ast_dict[k] = escape_xml_illegal_chars(ast_dict[k])
    # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
    # with UTC time (i.e trailing Z)
    # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
    new_dict = {}
    for key, value in ast_dict.items():
        key = key.encode('ascii', 'ignore')
        if key.endswith('_date'):
            try:
                date = parse(value, default=bogus_date)
                if date != bogus_date:
                    value = date.isoformat() + 'Z'
                else:
                    # The date field was empty, so dateutil filled it with
                    # the default bogus date
                    value = None
            except ValueError:
                # unparseable date: drop the field entirely
                continue
        new_dict[key] = value
    ast_dict = new_dict
    # mark this CKAN instance as data source:
    ast_dict['site_id'] = config.get('ckan.site_id')
    # Strip a selection of the fields.
    # These fields are possible candidates for sorting search results on,
    # so we strip leading spaces because solr will sort " " before "a" or "A".
    for field_name in ['title', 'name']:
        try:
            value = ast_dict.get(field_name)
            if value:
                ast_dict[field_name] = value.lstrip()
        except KeyError:
            pass
    # add a unique index_id to avoid conflicts
    ast_dict['index_id'] = _get_index_id(ast_dict['id'], ast_dict['assetID'])
    # send to solr:
    try:
        conn = make_connection()
        commit = not defer_commit
        if not asbool(config.get('ckan.search.solr_commit', 'true')):
            commit = False
        conn.add_many([ast_dict], _commit=commit)
    except socket.error, e:
        err = 'Could not connect to Solr using {0}: {1}'.format(conn.url, str(e))
        log.error(err)
        raise SearchIndexError(err)
def run(self, query, permission_labels=None, **kwargs):
    '''
    Performs a dataset search using the given query.

    :param query: dictionary with keys like: q, fq, sort, rows, facet
    :type query: dict
    :param permission_labels: filter results to those that include at
        least one of these labels. None to not filter (return everything)
    :type permission_labels: list of unicode strings; or None

    :returns: dictionary with keys results and count

    May raise SearchQueryError or SearchError.
    '''
    assert isinstance(query, (dict, MultiDict))
    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [
            s for s in set(query.keys()) - VALID_SOLR_PARAMETERS
        ]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results
    rows_to_return = min(1000, int(query.get('rows', 10)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # Build fq as a list: caller fq + fq_list + mandatory filters.
    fq = []
    if 'fq' in query:
        fq.append(query['fq'])
    fq.extend(query.get('fq_list', []))

    # show only results from this CKAN instance
    fq.append('+site_id:%s' % solr_literal(config.get('ckan.site_id')))

    # filter for package status
    if not '+state:' in query.get('fq', ''):
        fq.append('+state:active')

    # only return things we should be able to see
    if permission_labels is not None:
        fq.append('+permission_labels:(%s)' % ' OR '.join(
            solr_literal(p) for p in permission_labels))
    query['fq'] = fq

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get(
        'facet.limit', config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search and do use dismax.
    defType = query.get('defType', 'dismax')
    if ':' not in query['q'] or defType == 'edismax':
        query['defType'] = defType
        query['tie'] = query.get('tie', '0.1')
        # this minimum match is explained
        # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
        query['mm'] = query.get('mm', '2<-1 5<80%')
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection(decode_dates=False)
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.search(**query)
    except pysolr.SolrError as e:
        # Error with the sort parameter.  You see slightly different
        # error messages depending on whether the SOLR JSON comes back
        # or Jetty gets in the way converting it to HTML - not sure why
        #
        if e.args and isinstance(e.args[0], str):
            if "Can't determine a Sort Order" in e.args[0] or \
               "Can't determine Sort Order" in e.args[0] or \
               'Unknown sort order' in e.args[0]:
                raise SearchQueryError('Invalid "sort" parameter')
        raise SearchError(
            'SOLR returned an error running query: %r Error: %r' %
            (query, e))

    self.count = solr_response.hits
    self.results = solr_response.docs

    # #1683 Filter out the last row that is sometimes out of order
    self.results = self.results[:rows_to_return]

    # get any extras and add to 'extras' dict
    for result in self.results:
        # NOTE(review): under Python 3 `filter` returns a lazy iterator —
        # it is consumed by the loop below, after which `if extra_keys:`
        # is always truthy; confirm intended behaviour on py3.
        extra_keys = filter(lambda x: x.startswith('extras_'), result.keys())
        extras = {}
        for extra_key in extra_keys:
            value = result.pop(extra_key)
            extras[extra_key[len('extras_'):]] = value
        if extra_keys:
            result['extras'] = extras

    # if just fetching the id or name, return a list instead of a dict
    if query.get('fl') in ['id', 'name']:
        self.results = [r.get(query.get('fl')) for r in self.results]

    # get facets and convert facets list to a dict
    self.facets = solr_response.facets.get('facet_fields', {})
    for field, values in six.iteritems(self.facets):
        # Solr facets come as a flat [value, count, ...] list.
        self.facets[field] = dict(zip(values[0::2], values[1::2]))

    return {'results': self.results, 'count': self.count}
def run(self, query):
    '''
    Perform a dataset search using the given query.

    The query may include highlighting parameters ('hl' and 'hl.*'),
    which are accepted in addition to the standard SOLR parameters and
    passed through to SOLR.

    :param query: dictionary with keys like: q, fq, sort, rows, facet
    :returns: dictionary with keys ``results`` and ``count``

    May raise SearchQueryError or SearchError.
    '''
    assert isinstance(query, (dict, MultiDict))

    # check that query keys are valid; highlighting params are allowed too
    valid_params = []
    invalid_params = []
    for key in query.keys():
        if key in VALID_SOLR_PARAMETERS or key == 'hl' or key.startswith('hl.'):
            valid_params.append(key)
        else:
            invalid_params.append(key)
    if len(invalid_params) > 0:
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    query = self.normalize_query_keys(query)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results (capped at 1000)
    rows_to_return = min(1000, int(query.get('rows', 10)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4 by requesting one extra row
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if '+site_id:' not in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if '+state:' not in fq:
        fq += " +state:active"
    query['fq'] = [fq]

    fq_list = query.get('fq_list', [])
    query['fq'].extend(fq_list)

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit',
                                     config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores; always include index_id
    query['fl'] = query.get('fl', 'name')
    query['fl'] = query['fl'] + ' index_id'

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search
    # and do not use dismax (unless edismax was explicitly requested).
    defType = query.get('defType', 'dismax')
    if ':' not in query['q'] or defType == 'edismax':
        query['defType'] = defType
        query['tie'] = query.get('tie', '0.1')
        # this minimum match is explained
        # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
        query['mm'] = query.get('mm', '2<-1 5<80%')
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection()
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.raw_query(**query)
    # BUG FIX: "except SolrException, e" is Python-2-only syntax (a
    # SyntaxError under Python 3); use the "as e" form like the other
    # handlers in this file.
    except SolrException as e:
        raise SearchError('SOLR returned an error running query: %r Error: %r' %
                          (query, e.reason))
def update_package_search_schema():
    """
    Register the custom dataset metadata fields with the SOLR search schema.

    Sends "add-field" requests for each custom field and "add-copy-field"
    requests copying the free-text fields into the catch-all "text" field,
    via the SOLR Schema API on the shared search connection. Responses are
    only logged.
    """
    # custom field name -> SOLR field type
    field_types = {
        "associated_tasks": "textgen",
        "collection_period": "textgen",
        "geographical_area": "textgen",
        "number_of_instances": "textgen",
        "number_of_attributes": "textgen",
        "pkg_description": "textgen",
        "creation_date": "date",
        "expiry_date": "date",
        "has_missing_values": "boolean",
    }
    # free-text fields that should also be searchable via the "text" field
    copy_to_text = (
        "associated_tasks",
        "collection_period",
        "geographical_area",
        "pkg_description",
    )

    conn = make_connection()
    path = "schema"
    for fieldname, fieldtype in field_types.items():
        # BUG FIX: the original hand-written payloads used an unquoted
        # `stored:` key, which is not valid JSON and is rejected by the
        # SOLR Schema API.  Build the payload with all keys quoted.
        payload = (
            '{"add-field":{"name": "%s", "type": "%s", '
            '"indexed": "true", "stored": "true"}}' % (fieldname, fieldtype)
        ).encode("utf-8")
        res = conn._send_request("post", path, payload)
        log.debug("Result of update {result}".format(result=res))
    for fieldname in copy_to_text:
        payload = (
            '{"add-copy-field":{"source": "%s", "dest": "text"}}' % fieldname
        ).encode("utf-8")
        res = conn._send_request("post", path, payload)
        log.debug("Result of update {result}".format(result=res))
def run(self, query):
    """
    Perform a dataset search using the given query.

    :param query: dictionary with keys like: q, fq, sort, rows, facet
    :returns: dictionary with keys ``results`` and ``count``

    May raise SearchQueryError or SearchError.
    """
    assert isinstance(query, (dict, MultiDict))
    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get("q")
    if not q or q == '""' or q == "''":
        query["q"] = "*:*"

    # number of results (capped at 1000)
    rows_to_return = min(1000, int(query.get("rows", 10)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4 by requesting one extra row
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query["rows"] = rows_to_query

    # show only results from this CKAN instance
    fq = query.get("fq", "")
    if "+site_id:" not in fq:
        fq += ' +site_id:"%s"' % config.get("ckan.site_id")

    # filter for package status
    if "+state:" not in fq:
        fq += " +state:active"
    query["fq"] = [fq]

    fq_list = query.get("fq_list", [])
    query["fq"].extend(fq_list)

    # faceting
    query["facet"] = query.get("facet", "true")
    query["facet.limit"] = query.get("facet.limit",
                                     config.get("search.facets.limit", "50"))
    query["facet.mincount"] = query.get("facet.mincount", 1)

    # return the package ID and search scores
    query["fl"] = query.get("fl", "name")

    # return results as json encoded string
    query["wt"] = query.get("wt", "json")

    # If the query has a colon in it then consider it a fielded search
    # and do not use dismax (unless edismax was explicitly requested).
    defType = query.get("defType", "dismax")
    if ":" not in query["q"] or defType == "edismax":
        query["defType"] = defType
        query["tie"] = query.get("tie", "0.1")
        # this minimum match is explained
        # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
        query["mm"] = query.get("mm", "2<-1 5<80%")
        query["qf"] = query.get("qf", QUERY_FIELDS)

    conn = make_connection()
    log.debug("Package query: %r" % query)
    try:
        solr_response = conn.raw_query(**query)
    # BUG FIX: "except SolrException, e" is Python-2-only syntax; use the
    # "as e" form for Python 3 compatibility.
    except SolrException as e:
        raise SearchError("SOLR returned an error running query: %r Error: %r" %
                          (query, e.reason))
def run(self, query, permission_labels=None, **kwargs):
    '''
    Perform a dataset search using the given query.

    :param query: dictionary with keys like: q, fq, sort, rows, facet
    :type query: dict
    :param permission_labels: filter results to those that include at
        least one of these labels. None to not filter (return everything)
    :type permission_labels: list of unicode strings; or None

    :returns: dictionary with keys results and count

    May raise SearchQueryError or SearchError.
    '''
    assert isinstance(query, (dict, MultiDict))
    # check that query keys are valid; IPackageController plugins may
    # extend the set of accepted parameters
    valid_solr_parameters = VALID_SOLR_PARAMETERS
    for item in plugins.PluginImplementations(plugins.IPackageController):
        if 'update_valid_solr_parameters' in dir(item):
            valid_solr_parameters = item.update_valid_solr_parameters(
                valid_solr_parameters)
    if not set(query.keys()) <= valid_solr_parameters:
        invalid_params = [
            s for s in set(query.keys()) - valid_solr_parameters
        ]
        raise SearchQueryError("Invalid search parameters: %s" %
                               invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results (capped at 1000)
    rows_to_return = min(1000, int(query.get('rows', 10)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4 by requesting one extra row
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    fq = []
    if 'fq' in query:
        fq.append(query['fq'])
    fq.extend(query.get('fq_list', []))

    # show only results from this CKAN instance
    fq.append('+site_id:%s' % solr_literal(config.get('ckan.site_id')))

    # filter for package status
    if '+state:' not in query.get('fq', ''):
        fq.append('+state:active')

    # only return things we should be able to see
    if permission_labels is not None:
        fq.append('+permission_labels:(%s)' % ' OR '.join(
            solr_literal(p) for p in permission_labels))
    query['fq'] = fq

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get(
        'facet.limit', config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search
    # and do not use dismax (unless edismax was explicitly requested).
    defType = query.get('defType', 'dismax')
    if ':' not in query['q'] or defType == 'edismax':
        query['defType'] = defType
        query['tie'] = query.get('tie', '0.1')
        # this minimum match is explained
        # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
        query['mm'] = query.get('mm', '2<-1 5<80%')
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection(decode_dates=False)
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.search(**query)
    # BUG FIX: "except pysolr.SolrError, e" is Python-2-only syntax; use
    # the "as e" form for Python 3 compatibility.
    except pysolr.SolrError as e:
        # Error with the sort parameter.  You see slightly different
        # error messages depending on whether the SOLR JSON comes back
        # or Jetty gets in the way converting it to HTML - not sure why
        #
        if e.args and isinstance(e.args[0], str):
            if "Can't determine a Sort Order" in e.args[0] or \
                    "Can't determine Sort Order" in e.args[0] or \
                    'Unknown sort order' in e.args[0]:
                raise SearchQueryError('Invalid "sort" parameter')
        raise SearchError(
            'SOLR returned an error running query: %r Error: %r' %
            (query, e))
def run(self, query, permission_labels=None, **kwargs):
    '''
    Perform a dataset search using the given query.

    :param query: dictionary with keys like: q, fq, sort, rows, facet
    :type query: dict
    :param permission_labels: filter results to those that include at
        least one of these labels. None to not filter (return everything)
    :type permission_labels: list of unicode strings; or None

    :returns: dictionary with keys results and count

    May raise SearchQueryError or SearchError.
    '''
    assert isinstance(query, (dict, MultiDict))
    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results
    rows_to_return = int(query.get('rows', 10))
    # query['rows'] should be a defaulted int, due to schema, but make
    # certain, for legacy tests
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4 by requesting one extra row
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    fq = []
    if 'fq' in query:
        fq.append(query['fq'])
    fq.extend(query.get('fq_list', []))

    # show only results from this CKAN instance
    fq.append('+site_id:%s' % solr_literal(config.get('ckan.site_id')))

    # filter for package status
    if '+state:' not in query.get('fq', ''):
        fq.append('+state:active')

    # only return things we should be able to see
    if permission_labels is not None:
        fq.append('+permission_labels:(%s)' % ' OR '.join(
            solr_literal(p) for p in permission_labels))
    query['fq'] = fq

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit',
                                     config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search
    # and do not use dismax (unless edismax was explicitly requested).
    defType = query.get('defType', 'dismax')
    if ':' not in query['q'] or defType == 'edismax':
        query['defType'] = defType
        query['tie'] = query.get('tie', '0.1')
        # this minimum match is explained
        # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
        query['mm'] = query.get('mm', '2<-1 5<80%')
        query['qf'] = query.get('qf', QUERY_FIELDS)

    # reject SOLR local parameters to avoid query injection via {!...}
    try:
        if query['q'].startswith('{!'):
            raise SearchError('Local parameters are not supported.')
    except KeyError:
        pass

    conn = make_connection(decode_dates=False)
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.search(**query)
    except pysolr.SolrError as e:
        # Error with the sort parameter.  You see slightly different
        # error messages depending on whether the SOLR JSON comes back
        # or Jetty gets in the way converting it to HTML - not sure why
        #
        if e.args and isinstance(e.args[0], str):
            if "Can't determine a Sort Order" in e.args[0] or \
                    "Can't determine Sort Order" in e.args[0] or \
                    'Unknown sort order' in e.args[0]:
                raise SearchQueryError('Invalid "sort" parameter')
        raise SearchError('SOLR returned an error running query: %r Error: %r' %
                          (query, e))

    self.count = solr_response.hits
    self.results = solr_response.docs

    # #1683 Filter out the last row that is sometimes out of order
    self.results = self.results[:rows_to_return]

    # get any extras and add to 'extras' dict
    for result in self.results:
        # BUG FIX: materialize the key list.  Under Python 3 the original
        # lazy filter() over result.keys() raised RuntimeError when
        # result.pop() mutated the dict mid-iteration, and a filter
        # object is always truthy, so every result grew an (often empty)
        # 'extras' key.
        extra_keys = [k for k in list(result.keys())
                      if k.startswith('extras_')]
        extras = {}
        for extra_key in extra_keys:
            value = result.pop(extra_key)
            extras[extra_key[len('extras_'):]] = value
        if extra_keys:
            result['extras'] = extras

    # if just fetching the id or name, return a list instead of a dict
    if query.get('fl') in ['id', 'name']:
        self.results = [r.get(query.get('fl')) for r in self.results]

    # get facets and convert facets list to a dict
    self.facets = solr_response.facets.get('facet_fields', {})
    for field, values in six.iteritems(self.facets):
        self.facets[field] = dict(zip(values[0::2], values[1::2]))

    return {'results': self.results, 'count': self.count}
def run(self, query):
    '''
    Perform a dataset search using the given query.

    :param query: dictionary with keys like: q, fq, sort, rows, facet
    :returns: dictionary with keys ``results`` and ``count``

    May raise SearchQueryError or SearchError.
    '''
    assert isinstance(query, (dict, MultiDict))
    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [
            s for s in set(query.keys()) - VALID_SOLR_PARAMETERS
        ]
        raise SearchQueryError("Invalid search parameters: %s" %
                               invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results (capped at 1000)
    rows_to_return = min(1000, int(query.get('rows', 10)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4 by requesting one extra row
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if '+site_id:' not in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if '+state:' not in fq:
        fq += " +state:active"
    query['fq'] = [fq]

    fq_list = query.get('fq_list', [])
    query['fq'].extend(fq_list)

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get(
        'facet.limit', config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search
    # and do not use dismax (unless edismax was explicitly requested).
    defType = query.get('defType', 'dismax')
    if ':' not in query['q'] or defType == 'edismax':
        query['defType'] = defType
        query['tie'] = query.get('tie', '0.1')
        # this minimum match is explained
        # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
        query['mm'] = query.get('mm', '2<-1 5<80%')
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection(decode_dates=False)
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.search(**query)
    # BUG FIX: "except pysolr.SolrError, e" is Python-2-only syntax; use
    # the "as e" form for Python 3 compatibility.
    except pysolr.SolrError as e:
        # Error with the sort parameter.  You see slightly different
        # error messages depending on whether the SOLR JSON comes back
        # or Jetty gets in the way converting it to HTML - not sure why
        #
        if e.args and isinstance(e.args[0], str):
            if "Can't determine a Sort Order" in e.args[0] or \
                    "Can't determine Sort Order" in e.args[0] or \
                    'Unknown sort order' in e.args[0]:
                raise SearchQueryError('Invalid "sort" parameter')
        raise SearchError(
            'SOLR returned an error running query: %r Error: %r' %
            (query, e))
def _run(query):
    '''
    Custom final preparation of the solr query and call to the solr api.

    :param query: dict with keys like q, fq, rows, facet.  An optional
        query['extras']['ext_boolean'] of 'all' / 'any' / 'exact'
        selects the dismax matching mode for non-fielded queries.
    '''
    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results
    rows_to_return = query.get('rows', 0)
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4 by requesting one extra row
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if '+site_id:' not in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if '+state:' not in fq:
        fq += " +state:active"
    query['fq'] = [fq]

    fq_list = query.get('fq_list', [])
    query['fq'].extend(fq_list)

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit',
                                     config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search
    # and do not use dismax.  (The unused local `defType` from the
    # original was removed; this variant always forces dismax.)
    boolean = query.get('extras', {}).get('ext_boolean', 'all')
    if boolean not in ['all', 'any', 'exact']:
        log.error('Ignoring unknown boolean search operator %r' % (boolean, ))
        boolean = 'all'
    if ':' not in query['q']:
        query['defType'] = 'dismax'
        query['tie'] = '0.1'
        if boolean == 'any':
            query['mm'] = '0'       # match any term
        elif boolean == 'all':
            query['mm'] = '100%'    # require every term
        elif boolean == 'exact':
            # quote the whole query, escaping embedded double quotes
            query['q'] = '"' + q.replace('"', '\\"') + '"'
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection()
    log.info('Package query: %r' % query)
    try:
        start_time = time.time()
        solr_response = conn.raw_query(**query)
        duration = time.time() - start_time
        # typo fixed: log message previously read "resilt"
        log.info("Solr returned the result after {0}".format(duration))
    # BUG FIX: "except SolrException, e" is Python-2-only syntax; use the
    # "as e" form for Python 3 compatibility.
    except SolrException as e:
        raise SearchError(
            'SOLR returned an error running query: %r Error: %r' %
            (query, e.reason))
def run(self, query):
    '''
    Perform a dataset search using the given query.

    :param query: dictionary with keys like: q, fq, sort, rows, facet
    :returns: dictionary with keys ``results`` and ``count``

    May raise SearchQueryError or SearchError.
    '''
    assert isinstance(query, (dict, MultiDict))
    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results (capped at 1000)
    rows_to_return = min(1000, int(query.get('rows', 10)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4 by requesting one extra row
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # order by score if no 'sort' term given
    order_by = query.get('sort')
    if order_by == 'rank' or order_by is None:
        query['sort'] = 'score desc, name asc'

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if '+site_id:' not in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if '+state:' not in fq:
        fq += " +state:active"
    query['fq'] = fq

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit',
                                     config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search
    # and do not use dismax.
    if ':' not in query['q']:
        query['defType'] = 'dismax'
        query['tie'] = '0.1'
        # this minimum match is explained
        # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
        query['mm'] = '2<-1 5<80%'
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection()
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.raw_query(**query)
    # BUG FIX: "except SolrException, e" is Python-2-only syntax; use the
    # "as e" form for Python 3 compatibility.
    except SolrException as e:
        raise SearchError('SOLR returned an error running query: %r Error: %r' %
                          (query, e.reason))
def run(self, query):
    '''
    Perform a dataset search using the given query.

    :param query: dictionary with keys like: q, fq, sort, rows, facet
    :returns: dictionary with keys ``results`` and ``count``

    May raise SearchQueryError or SearchError.
    '''
    assert isinstance(query, (dict, MultiDict))
    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
        invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
        raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results (capped at 1000)
    rows_to_return = min(1000, int(query.get('rows', 10)))
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        # in SOLR 1.4 by requesting one extra row
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if '+site_id:' not in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if '+state:' not in fq:
        fq += " +state:active"
    query['fq'] = [fq]

    fq_list = query.get('fq_list', [])
    query['fq'].extend(fq_list)

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit',
                                     config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search
    # and do not use dismax (unless edismax was explicitly requested).
    defType = query.get('defType', 'dismax')
    if ':' not in query['q'] or defType == 'edismax':
        query['defType'] = defType
        query['tie'] = query.get('tie', '0.1')
        # this minimum match is explained
        # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
        query['mm'] = query.get('mm', '2<-1 5<80%')
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection(decode_dates=False)
    log.debug('Package query: %r' % query)
    try:
        solr_response = conn.search(**query)
    # BUG FIX: "except pysolr.SolrError, e" is Python-2-only syntax; use
    # the "as e" form for Python 3 compatibility.
    except pysolr.SolrError as e:
        # Error with the sort parameter.  You see slightly different
        # error messages depending on whether the SOLR JSON comes back
        # or Jetty gets in the way converting it to HTML - not sure why
        #
        if e.args and isinstance(e.args[0], str):
            if "Can't determine a Sort Order" in e.args[0] or \
                    "Can't determine Sort Order" in e.args[0] or \
                    'Unknown sort order' in e.args[0]:
                raise SearchQueryError('Invalid "sort" parameter')
        raise SearchError('SOLR returned an error running query: %r Error: %r' %
                          (query, e))