Exemple #1
0
    def proxy_solr(self, action):
        """Forward the current request's parameters to Solr and relay the reply.

        The Solr parameters come from the URL query string when there is one,
        otherwise from the request body.  The ``wt`` parameter (default
        ``xml``) selects the response content type through ``CONTENT_TYPES``.
        For ``csv`` the reply is sent as an attachment and prefixed with a
        UTF-8 byte-order mark.  When Solr raises ``SolrException`` its status
        code, reason and body are written to the response instead.

        :param action: not used by this method
        """
        url = urlparse.urlparse(h.full_current_url())
        if url.query != '':
            data = urlparse.parse_qs(urllib.unquote(url.query).decode('utf-8'))
        else:
            # NOTE(review): the body is client-controlled; literal_eval only
            # accepts Python literals, but the result is still passed
            # unchecked to Solr as keyword arguments.
            data = ast.literal_eval(p.toolkit.request.body)
        content_type = data.get('wt', 'xml')
        # parse_qs produces a list per key; a literal body may hold a scalar.
        if isinstance(content_type, list):
            content_type = content_type[0]
        ckan_response = p.toolkit.response
        ckan_response.content_type = CONTENT_TYPES[content_type]
        solr_response = ''

        if content_type == 'csv':
            ckan_response.headers['Content-Disposition'] = 'attachment; filename=query.csv'
            # Start the CSV payload with the UTF-8 byte-order mark.
            solr_response = str(codecs.BOM_UTF8)

        conn = make_connection()
        try:
            solr_response += conn.raw_query(**data)
            ckan_response.body = solr_response
        except SolrException as e:
            # Mirror Solr's failure to the client.
            ckan_response.status_int = e.httpcode
            ckan_response.status = str(e.httpcode) + ' ' + e.reason
            ckan_response.body = e.body
def more_like_this(pkg, count=5):
    """Return the packages Solr's more-like-this query finds for *pkg*.

    The comparison uses the ``text``, ``title`` and ``notes`` fields and is
    restricted to active, public datasets of this site.

    :param pkg: dict with at least an ``id`` key
    :param count: maximum number of rows requested from Solr
    :returns: list of the ``context['package']`` values left behind by
        ``package_show`` for each Solr hit
    """
    from ckan.common import config
    import ckan.plugins.toolkit as toolkit
    from ckan.lib.search.common import make_connection

    def _load_package(name):
        # package_show is called for its side effect on the context: the
        # package object is read from context['package'] afterwards.
        context = {}
        toolkit.get_action('package_show')(context, {'id': name})
        return context['package']

    solr = make_connection()
    mlt_params = {
        'q': 'id:"{}"'.format(pkg['id']),
        'mltfl': 'text title notes',
        'fl': 'name title score',
    }

    mlt_params['fq'] = '''
        +site_id:"{}"
        +dataset_type:dataset
        +state:active
        +capacity:public
        '''.format(config.get('ckan.site_id'))
    mlt_params['rows'] = count

    results = solr.more_like_this(**mlt_params)

    return [_load_package(record['name']) for record in results.docs]
Exemple #3
0
    def get_index(self, reference):
        """Return the Solr index document for a single package.

        ``reference`` is matched against both the ``name`` and ``id`` fields,
        restricted to this site's ``site_id``.

        :param reference: package name or id
        :returns: the first document of the Solr response
        :raises SearchError: if the query starts with local parameters, Solr
            reports an error, or no document matches
        """
        name_or_id = 'name:"%s" OR id:"%s"' % (reference, reference)
        site_filter = 'site_id:"%s"' % config.get('ckan.site_id')
        query = {
            'rows': 1,
            'q': name_or_id,
            'wt': 'json',
            'fq': site_filter,
        }

        if query.get('q', '').startswith('{!'):
            raise SearchError('Local parameters are not supported.')

        solr = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            response = solr.search(**query)
        except pysolr.SolrError as err:
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, err))

        if response.hits == 0:
            raise SearchError(
                'Dataset not found in the search index: %s' % reference)
        return response.docs[0]
Exemple #4
0
def harvest_source_index_clear(context, data_dict):
    '''
    Deletes from the Solr search index every document of this site whose
    ``harvest_source_id`` matches the given harvest source.  Only the search
    index is touched here.

    :param id: the id of the harvest source to clear
    :type id: string

    :raises NotFound: if ``HarvestSource.get`` finds no source for the id
    :raises SearchIndexError: if deleting or committing fails
    '''

    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id')

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # Use the id stored on the source object from here on.
    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        # Committing can be switched off through configuration.
        if asbool(config.get('ckan.search.solr_commit', 'true')):
            conn.commit()
    except Exception as e:
        log.exception(e)
        raise SearchIndexError(e)
Exemple #5
0
    def get_index(self, reference):
        """Look up one package in the Solr index by name or id.

        :param reference: package name or id, matched against both fields
        :returns: the first document of the Solr response
        :raises SearchError: if the query starts with local parameters, Solr
            reports an error, or nothing is found
        """
        query = dict([
            ('rows', 1),
            ('q', 'name:"%s" OR id:"%s"' % (reference, reference)),
            ('wt', 'json'),
            ('fq', 'site_id:"%s"' % config.get('ckan.site_id')),
        ])

        q_value = query.get('q')
        if q_value is not None and q_value.startswith('{!'):
            raise SearchError('Local parameters are not supported.')

        solr = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            found = solr.search(**query)
        except pysolr.SolrError as err:
            failure = 'SOLR returned an error running query: %r Error: %r' % (
                query, err)
            raise SearchError(failure)

        if found.hits != 0:
            return found.docs[0]
        raise SearchError('Dataset not found in the search index: %s' %
                          reference)
Exemple #6
0
def get_product_issues(context, data_dict):
    # noinspection PyUnresolvedReferences
    """
    Returns the issues found in Solr for a product ID, grouped by
    ``issue_number_int`` and sorted by descending issue number.

    :param: productId: A non-data product ID.

    :return: One dict per issue with the keys ``issue`` (the group value)
        and ``number_articles`` (the group's ``numFound``)
    :rtype: list of dict
    """
    product_id = _get_or_bust(data_dict, 'productId')

    slr = make_connection()

    # FIXME: We need to actually paginate on this, but the daily
    #        team will not accept it (yet).
    raw_reply = slr.raw_query(
        q='top_parent_id:{pid}'.format(pid=product_id),
        group='true',
        group_field='issue_number_int',
        wt='json',
        sort='issue_number_int desc',
        rows='2000000')
    reply = json.loads(raw_reply)

    groups = reply['grouped']['issue_number_int']['groups']

    issues = []
    for group in groups:
        issues.append({
            'issue': group['groupValue'],
            'number_articles': group['doclist']['numFound'],
        })
    return issues
def tag_counts(context, data_dict):
    """Return tag counts from a Solr facet query on the ``tags`` field.

    Goes straight to Solr rather than through the CKAN tag API; only the
    tags Solr returns in the facet are included (not all tags).

    :returns: list of ``[tag, count]`` pairs
    :raises SearchError: if querying Solr or parsing its reply fails
    """
    from ckan.lib.search.common import make_connection, SearchError, SearchQueryError

    query = {
        'rows': 0,
        'q': '*:*',
        'wt': 'json',
        'fq': 'site_id:"%s"' % config.get('ckan.site_id'),
        'facet': 'true',
        'facet.field': 'tags'}

    try:
        solr = make_connection()
        payload = json.loads(solr.raw_query(**query))

        # The facet arrives as a flat list: tag, count, tag, count, ...
        flat = payload["facet_counts"]["facet_fields"]["tags"]
        results = [[flat[pos], flat[pos + 1]]
                   for pos in range(0, len(flat), 2)]
    except Exception as e:
        raise SearchError("Failed to obtain and parse tag counts. " + str(e))
    else:
        return results
def harvest_source_index_clear(context, data_dict):
    '''
    Delete from the Solr index every document of this site that carries the
    given harvest source's id, closing the connection afterwards.

    :param id: the id of the harvest source to clear
    :returns: ``{'id': <harvest source id>}``
    :raises NotFound: if no harvest source is found for the id
    :raises SearchIndexError: if deleting or committing fails
    '''
    check_access('harvest_source_clear', context, data_dict)
    requested_id = data_dict.get('id')

    source = HarvestSource.get(requested_id)
    if not source:
        log.error('Harvest source %s does not exist', requested_id)
        raise NotFound('Harvest source %s does not exist' % requested_id)

    # From here on work with the id stored on the source object.
    source_id = source.id

    solr = make_connection()
    delete_query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', source_id, config.get('ckan.site_id'))
    try:
        solr.delete_query(delete_query)
        commit_wanted = asbool(config.get('ckan.search.solr_commit', 'true'))
        if commit_wanted:
            solr.commit()
    except Exception as err:
        log.exception(err)
        raise SearchIndexError(err)
    finally:
        solr.close()

    return dict(id=source_id)
def harvest_source_index_clear(context, data_dict):
    '''
    Deletes from the Solr search index the documents of this site whose
    ``harvest_source_id`` matches the given harvest source.

    :param id: the id of the harvest source to clear
    :type id: string

    :raises NotFound: if ``HarvestSource.get`` finds no source for the id
    :raises SearchIndexError: if deleting or committing fails
    '''

    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id')

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # Use the id stored on the source object from here on.
    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))

    # Committing can be switched off through configuration.
    solr_commit = toolkit.asbool(config.get('ckan.search.solr_commit', 'true'))
    if toolkit.check_ckan_version(max_version='2.5.99'):
        # conn is solrpy
        try:
            conn.delete_query(query)
            if solr_commit:
                conn.commit()
        except Exception, e:
            log.exception(e)
            raise SearchIndexError(e)
        finally:
            # NOTE(review): the body of this finally clause and whatever
            # follows the version check are missing from this excerpt.
 def commit():
   """Commit the Solr index on a fresh connection.

   The commit is issued with ``wait_searcher=False``.

   Raises SearchIndexError (after logging) if connecting or committing fails.
   """
   try:
     conn = make_connection()
     conn.commit(wait_searcher=False)
   except Exception as e:
     log.exception(e)
     raise SearchIndexError(e)
Exemple #11
0
    def proxy_solr(self, action):
        """Forward the current request's parameters to Solr and relay the reply.

        Parameters are read from the URL query string, or from the request
        body when the query string is empty.  ``wt`` (default ``xml``) picks
        the response content type from ``CONTENT_TYPES``; ``csv`` replies are
        sent as an attachment and start with a UTF-8 byte-order mark.  A
        ``SolrException`` is relayed with Solr's status code, reason and body.

        :param action: not used by this method
        """
        url = urlparse.urlparse(h.full_current_url())
        if url.query != '':
            data = urlparse.parse_qs(urllib.unquote(url.query).decode('utf-8'))
        else:
            # NOTE(review): client-controlled body parsed as a Python literal
            # and passed unchecked to Solr as keyword arguments.
            data = ast.literal_eval(p.toolkit.request.body)
        content_type = data.get('wt', 'xml')
        # parse_qs produces a list per key; a literal body may hold a scalar.
        if isinstance(content_type, list):
            content_type = content_type[0]
        ckan_response = p.toolkit.response
        ckan_response.content_type = CONTENT_TYPES[content_type]
        solr_response = ''

        if content_type == 'csv':
            ckan_response.headers[
                'Content-Disposition'] = 'attachment; filename=query.csv'
            # Start the CSV payload with the UTF-8 byte-order mark.
            solr_response = str(codecs.BOM_UTF8)

        conn = make_connection()
        try:
            solr_response += conn.raw_query(**data)
            ckan_response.body = solr_response
        except SolrException as e:
            # Mirror Solr's failure to the client.
            ckan_response.status_int = e.httpcode
            ckan_response.status = str(e.httpcode) + ' ' + e.reason
            ckan_response.body = e.body
def get_product_issues(context, data_dict):
    # noinspection PyUnresolvedReferences
    """
    Returns the issues Solr holds for a product ID, one entry per distinct
    ``issue_number_int`` in descending order.

    :param: productId: A non-data product ID.

    :return: A list of dicts with the keys ``issue`` and ``number_articles``
    :rtype: list
    """
    product_id = _get_or_bust(data_dict, 'productId')

    slr = make_connection()

    solr_params = dict(
        q='top_parent_id:{pid}'.format(pid=product_id),
        group='true',
        group_field='issue_number_int',
        wt='json',
        sort='issue_number_int desc',
        # FIXME: We need to actually paginate on this, but the daily
        #        team will not accept it (yet).
        rows='2000000',
    )
    response = json.loads(slr.raw_query(**solr_params))

    def _summarise(group):
        return {
            'issue': group['groupValue'],
            'number_articles': group['doclist']['numFound'],
        }

    issue_groups = response['grouped']['issue_number_int']['groups']
    return [_summarise(group) for group in issue_groups]
Exemple #13
0
def get_similar_datasets(id, max_num=5):
    '''
    Ask Solr's more-like-this for datasets similar to a given one.

    Each hit is written to the debug log and to standard output.

    :param string id: ID of the target dataset. This must be the actual
        ID, passing the name is not supported.

    :param int max_num: Maximum number of datasets to return.

    :return: A list of dataset dicts decoded from each hit's
        ``validated_data_dict`` field, in the order Solr returned them.
    '''
    solr = make_connection()
    mlt_args = {
        'q': 'id:"{}"'.format(id),
        'mltfl': 'text',
        'fl': 'id validated_data_dict score',
    }
    mlt_args['fq'] = '''
        +site_id:"{}"
        +dataset_type:dataset
        +state:active
        +capacity:public
        '''.format(config.get('ckan.site_id'))
    mlt_args['rows'] = max_num
    results = solr.more_like_this(**mlt_args)

    log.debug('Similar datasets for {}:'.format(id))
    print('Similar datasets for {}:'.format(id))
    for hit in results.docs:
        log.debug('  {id} (score {score})'.format(**hit))
        print('  {id} (score {score})'.format(**hit))

    # Decode only after every hit has been reported.
    similar = []
    for hit in results.docs:
        similar.append(json.loads(hit['validated_data_dict']))
    return similar
Exemple #14
0
    def get_all_entity_ids(self, max_results=1000):
        """
        Return the ``id`` of every active document Solr returns for this
        site, limited to ``max_results`` rows.
        """
        site_clause = "+site_id:\"%s\" " % config.get('ckan.site_id')
        filters = site_clause + "+state:active "

        solr = make_connection()
        found = solr.search("*:*", fq=filters, rows=max_results, fields='id')

        entity_ids = []
        for doc in found.docs:
            entity_ids.append(doc.get('id'))
        return entity_ids
Exemple #15
0
    def get_all_entity_ids(self, max_results=1000):
        """
        Collect the ``id`` field of the active documents indexed for this
        site (at most ``max_results`` of them).
        """
        filter_parts = [
            "+site_id:\"%s\" " % config.get('ckan.site_id'),
            "+state:active ",
        ]

        connection = make_connection()
        result = connection.search(
            "*:*", fq="".join(filter_parts), rows=max_results, fields='id')
        return list(map(lambda doc: doc.get('id'), result.docs))
Exemple #16
0
    def get_index(self, reference):
        """Query Solr for the index record of a package name or id.

        :param reference: package name or id, matched against both fields
        :raises SearchError: if Solr reports an error

        NOTE(review): in the code shown the raw response is only assigned
        to a local; nothing is parsed or returned.
        """
        query = {
            "rows": 1,
            "q": 'name:"%s" OR id:"%s"' % (reference, reference),
            "wt": "json",
            "fq": 'site_id:"%s"' % config.get("ckan.site_id"),
        }

        conn = make_connection()
        log.debug("Package query: %r" % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError(
                "SOLR returned an error running query: %r Error: %r"
                % (query, e.reason))
Exemple #17
0
    def get_index(self, reference):
        """Query Solr for the index record of a package name or id.

        :param reference: package name or id; inserted unquoted into the
            ``name:``/``id:`` query terms
        :raises SearchError: if Solr reports an error

        NOTE(review): in the code shown the raw response is only assigned
        to a local; nothing is parsed or returned.
        """
        query = {
            'rows': 1,
            'q': 'name:%s OR id:%s' % (reference, reference),
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')}

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError('SOLR returned an error running query: %r Error: %r' %
                              (query, e.reason))
Exemple #18
0
def ogdch_autosuggest(context, data_dict):
    """Return search suggestions from the language-specific Solr suggester.

    :param q: text to get suggestions for (required key of ``data_dict``)
    :param lang: language code; only the first two characters are used and
        they must be one of ``en``, ``it``, ``de``, ``fr``
    :param fq: optional extra filter, combined with ``NOT private``
    :returns: list of distinct suggestion terms; the part matching ``q`` is
        wrapped in ``<b>`` tags unless the term already contains ``<b>``
    :raises ValidationError: for an unsupported language
    :raises ActionError: if Solr raises ``pysolr.SolrError``
    """
    q = get_or_bust(data_dict, 'q')
    lang = get_or_bust(data_dict, 'lang')
    fq = data_dict.get('fq', '')

    if fq:
        fq = 'NOT private AND %s' % fq
    else:
        fq = 'NOT private'

    # parse language from values like de_CH
    if len(lang) > 2:
        lang = lang[:2]

    if lang not in ['en', 'it', 'de', 'fr']:
        raise ValidationError('lang must be one of [en, it, de, fr]')

    handler = '/suggest_%s' % lang
    suggester = 'ckanSuggester_%s' % lang

    solr = make_connection()
    try:
        log.debug(
            'Loading suggestions for %s (lang: %s, fq: %s)' % (q, lang, fq)
        )
        results = solr.search(
            '',
            search_handler=handler,
            **{'suggest.q': q, 'suggest.count': 10, 'suggest.cfq': fq}
        )
        # list() keeps this working where dict.values() returns a view that
        # cannot be indexed (Python 3).
        suggester_results = results.raw_response['suggest'][suggester]
        suggestions = list(suggester_results.values())[0]

        def highlight(term, q):
            # Terms that already carry markup are returned untouched.
            if '<b>' in term:
                return term
            # Match on the unidecode()d forms of query and term.
            clean_q = unidecode(q)
            clean_term = unidecode(term)

            re_q = re.escape(clean_q)
            m = re.search(re_q, clean_term, re.I)
            if m:
                # The match offsets are applied to the original term.
                replace_text = term[m.start():m.end()]
                term = term.replace(replace_text, '<b>%s</b>' % replace_text)
            return term

        terms = [highlight(suggestion['term'], q)
                 for suggestion in suggestions['suggestions']]
        return list(set(terms))
    except pysolr.SolrError as e:
        log.exception('Could not load suggestions from solr: %s' % e)
    # Only reached when the Solr call failed above.
    raise ActionError('Error retrieving suggestions from solr')
Exemple #19
0
    def get_index(self, reference):
        """Search Solr for the index record of a package name or id.

        :param reference: package name or id, matched against both fields
        :raises SearchError: if Solr raises ``pysolr.SolrError``

        NOTE(review): in the code shown the response is only assigned to a
        local; nothing is returned.
        """
        query = {
            'rows': 1,
            'q': 'name:"%s" OR id:"%s"' % (reference, reference),
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')}

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            raise SearchError('SOLR returned an error running query: %r Error: %r' %
                              (query, e))
Exemple #20
0
    def proxy_solr(self, action):
        """Forward the current URL's query string to Solr and relay the reply.

        The first ``wt`` value (default ``xml``) selects the response content
        type through ``CONTENT_TYPES``.  A ``SolrException`` is relayed with
        Solr's status code, reason and body.

        :param action: not used by this method
        """
        url = urlparse.urlparse(h.full_current_url())
        query = urlparse.parse_qs(urllib.unquote(url.query).decode('utf-8'))
        # parse_qs produces a list per key, hence the list default and [0].
        content_type = query.get('wt', ['xml'])[0]
        ckan_response = p.toolkit.response
        ckan_response.content_type = CONTENT_TYPES[content_type]

        conn = make_connection()
        try:
            solr_response = conn.raw_query(**query)
            ckan_response.body = solr_response
        except SolrException as e:
            # Mirror Solr's failure to the client.
            ckan_response.status_int = e.httpcode
            ckan_response.status = str(e.httpcode) + ' ' + e.reason
            ckan_response.body = e.body
    def textcomplete(self):
        """Proxy a text-completion query to the Solr ``/suggest`` handler.

        Reads ``q`` from the request parameters and returns, as a JSON
        string, the spellcheck suggestions listed for exactly that term, or
        an empty list when there is no response or no entry for the term.
        """

        # TODO: must autocomplete take the map-extent in consideration?
        solr = make_connection()
        suggest_handler = SearchHandler(solr, '/suggest')
        term = request.params["q"]
        response = suggest_handler(q=term, wt='json')

        payload = []
        if response and term in response.spellcheck['suggestions'].keys():
            payload = response.spellcheck['suggestions'][term]['suggestion']
        return json.dumps(payload)
Exemple #22
0
    def get_all_entity_ids(self, max_results=1000):
        """
        Return the ``id`` of every active document Solr returns for this
        site (at most ``max_results`` rows).  The connection is closed once
        the query has run, whether or not it succeeded.
        """
        site_clause = '+site_id:"%s" ' % config.get("ckan.site_id")
        filters = site_clause + "+state:active "

        solr = make_connection()
        try:
            found = solr.query("*:*", fq=filters, rows=max_results, fields="id")
        finally:
            solr.close()

        entity_ids = []
        for row in found.results:
            entity_ids.append(row.get("id"))
        return entity_ids
Exemple #23
0
    def get_index(self, reference):
        """Query Solr for the index record of a package name or id.

        :param reference: package name or id; inserted unquoted into the
            ``name:``/``id:`` query terms
        :raises SearchError: if Solr reports an error

        NOTE(review): in the code shown the raw response is only assigned
        to a local; nothing is parsed or returned.
        """
        query = {
            'rows': 1,
            'q': 'name:%s OR id:%s' % (reference, reference),
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')
        }

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e.reason))
Exemple #24
0
def update_package_stats(package_id, stats):
    """Write statistics into a package's Solr document as ``extras_*`` fields.

    The document is fetched by id, each entry of ``stats`` is stored under
    ``extras_<key>`` as a string left-padded with zeros to 24 characters
    (falsy values are stored as zero), and the document is added back with
    an immediate commit.  Nothing happens when the search finds no document.

    :param package_id: id used in the ``id:`` query (inserted unquoted)
    :param stats: mapping of statistic name to value
    :returns: None; ``pysolr.SolrError`` is logged and swallowed
    """
    try:
        conn = make_connection()
        query = "id:%s" % package_id
        res = conn.search(q=query)
        if res and res.docs:
            pkg_dict = res.docs[0]
            for key, value in stats.items():
                pkg_dict["extras_%s" % key] = str(value or '0').rjust(24, '0')

            # Drop the stored _version_ before re-adding; presumably so Solr
            # does not apply a version check to the update - TODO confirm.
            if '_version_' in pkg_dict:
                del pkg_dict['_version_']
            conn.add(docs=[pkg_dict], commit=True)
    except pysolr.SolrError as e:
        # Best effort: a failed statistics update must not break the caller.
        log.error("Solr returned error: %s", e)
        log.exception(e)
        return
Exemple #25
0
    def get_index(self, reference):
        """
        For a given package reference (ID or name), queries the SOLR index
        for its record.

        :param reference: package name or id; inserted unquoted into the
            ``name:``/``id:`` query terms
        :raises SearchError: if Solr reports an error

        NOTE(review): in the code shown the raw response is only assigned
        to a local; nothing is parsed or returned.
        """
        query = {
            "rows": 1,
            "q": "name:%s OR id:%s" % (reference, reference),
            "wt": "json",
            "fq": 'site_id:"%s"' % config.get("ckan.site_id"),
        }

        conn = make_connection()
        log.debug("Package query: %r" % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError(
                "SOLR returned an error running query: %r Error: %r"
                % (query, e.reason))
Exemple #26
0
def increment_total_downloads(package_id):
    """Add one to the ``extras_total_downloads`` field of a package in Solr.

    The document is fetched by id, the counter (0 when missing) is
    incremented and stored as a string left-padded with zeros to 24
    characters, and the document is added back with an immediate commit.
    Nothing happens when the search finds no document.

    :param package_id: id used in the ``id:`` query (inserted unquoted)
    :returns: None; ``pysolr.SolrError`` is logged and swallowed
    """
    try:
        conn = make_connection()
        query = "id:%s" % package_id
        res = conn.search(q=query)
        if res and res.docs:
            pkg_dict = res.docs[0]
            total_downloads = int(pkg_dict.get('extras_total_downloads', 0))
            total_downloads += 1
            pkg_dict["extras_total_downloads"] = str(total_downloads).rjust(
                24, '0')

            # Drop the stored _version_ before re-adding; presumably so Solr
            # does not apply a version check to the update - TODO confirm.
            if '_version_' in pkg_dict:
                del pkg_dict['_version_']
            conn.add(docs=[pkg_dict], commit=True)
    except pysolr.SolrError as e:
        # Best effort: a failed counter update must not break the caller.
        log.error("Solr returned error: %s", e)
        log.exception(e)
        return
Exemple #27
0
def harvest_source_index_clear(context, data_dict):
    """
    Delete from the Solr index every document of this site whose
    ``harvest_source_id`` matches the given harvest source.

    :param id: the id of the harvest source to clear
    :raises NotFound: if ``HarvestSource.get`` finds no source for the id
    :raises SearchIndexError: if deleting or committing fails
    """

    check_access("harvest_source_clear", context, data_dict)
    harvest_source_id = data_dict.get("id", None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error("Harvest source %s does not exist", harvest_source_id)
        raise NotFound("Harvest source %s does not exist" % harvest_source_id)

    # Use the id stored on the source object from here on.
    harvest_source_id = source.id

    conn = make_connection()
    query = """ +%s:"%s" +site_id:"%s" """ % (
        "harvest_source_id", harvest_source_id, config.get("ckan.site_id"))
    try:
        conn.delete_query(query)
        # Committing can be switched off through configuration.
        if asbool(config.get("ckan.search.solr_commit", "true")):
            conn.commit()
    except Exception as e:
        log.exception(e)
        raise SearchIndexError(e)
Exemple #28
0
def harvest_source_index_clear(context, data_dict):
    '''
    Delete from the Solr index every document of this site whose
    ``harvest_source_id`` matches the given harvest source.

    :param id: the id of the harvest source to clear
    :raises NotFound: if ``HarvestSource.get`` finds no source for the id
    :raises SearchIndexError: if deleting or committing fails
    '''

    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id', None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # Use the id stored on the source object from here on.
    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        # Committing can be switched off through configuration.
        if asbool(config.get('ckan.search.solr_commit', 'true')):
            conn.commit()
    except Exception as e:
        log.exception(e)
        raise SearchIndexError(e)
  def delete_asset(ast_dict, defer_commit=False):
    '''
    Delete asset documents of this site from the Solr index.

    What is deleted depends on ``ast_dict``:
    - ``remove_all_assets`` truthy: no extra restriction is added
    - ``whole_resource`` set: documents whose ``id`` equals that value
    - otherwise: the document whose ``index_id`` is derived from
      ``ast_dict['id']`` and ``ast_dict['assetID']``

    @param ast_dict - dictionary selecting the documents as described above
    @param defer_commit - when true, no commit is issued after the delete

    May raise SearchIndexError.
    '''
    conn = make_connection()
    if ast_dict.get('remove_all_assets'):
      index = ''
    elif ast_dict.get('whole_resource'):
      index = ' +id:{id} '.format(id=ast_dict['whole_resource'])
    else:
      index = ' +index_id:\"{index}\"'.format(
        index=_get_index_id(ast_dict['id'], ast_dict['assetID'])
      )
    query = "+{type}:{asset} {index} +site_id:\"{site}\"".format(
      type=TYPE_FIELD,
      asset=ASSET_TYPE,
      index=index,
      site=config.get('ckan.site_id'))
    try:
      conn.delete_query(query)

      if not defer_commit:
        conn.commit()
    except Exception as e:
      log.exception(e)
      raise SearchIndexError(e)
Exemple #30
0
 def get_connection(self):
     u'''Return whatever ``make_connection()`` produces for this call.
     '''
     connection = make_connection()
     return connection
  def run(query):
    '''
    Performs an asset search using the given query.

    The query dictionary is modified in place: defaults are filled in and
    filters are added for the site, the asset entity type, hidden/deleted
    state, x-image mime types and, for non-sysadmin users, private packages
    whose owner organization is outside the user's organizations.

    @param query - dictionary with keys like: q, fq, sort, rows, facet
    @return - NOTE(review): described upstream as a dictionary with keys
              results and count; the code shown here stops after running
              the Solr query and returns nothing

    May raise SearchQueryError or SearchError.
    '''
    # check that query keys are valid
    if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
      invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
      raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
      # Keep the local in sync with the dict: it is searched for '+type:'
      # and friends below, which raises TypeError when it is still None.
      q = query['q'] = "*:*"

    # number of results
    rows_to_return = min(1000, int(query.get('rows', 20)))
    if rows_to_return > 0:
      # #1683 Work around problem of last result being out of order
      #       in SOLR 1.4
      rows_to_query = rows_to_return + 1
    else:
      rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if '+site_id:' not in fq:
      fq += ' +site_id:"%s"' % config.get('ckan.site_id')
    # exclude image/x* types unless the caller filters on type/mimetype
    if ('+type:' not in q and '+mimetype:' not in q
        and '+type:' not in fq and '+mimetype:' not in fq):
      fq += ' -type:image/x* -mimetype:image/x* '

    # filter for asset entity_type
    if '+entity_type:' not in fq:
      fq += " +entity_type:asset"
    if '+state:' not in q and '+state:' not in fq:
      fq += " -state:hidden -state:deleted"

    user = c.userobj
    # if user and (user.sysadmin or user.email.endswith('@act.gov.au')): pass
    # sysadmins see everything; everybody else gets private packages of
    # foreign organizations filtered out
    if not (user and user.sysadmin):
      user_groups = []
      if user:
        for group in user.get_groups():
          user_groups.append(group.id)

          #get all child orgs
          user_groups.extend([
            item.table_id for item
            in filter(
              lambda x: x.capacity=='child_organization' and x.state == 'active',
              group.member_all
            )
          ])

          #get all brothers
          parents = model.Session.query(model.Group)\
            .filter(model.Group.id.in_([
              item.table_id for item
              in filter(
                lambda x: x.capacity=='parent_organization' and x.state == 'active',
                group.member_all
              )
            ])).all()
          for parent in parents:
            user_groups.extend([
              item.table_id for item
              in filter(
                lambda x: x.capacity=='child_organization' and x.state == 'active',
                parent.member_all
              )
            ])
      private_query = model.Session.query(model.Package.id, model.Package.owner_org).\
        filter(model.Package.private==True)
      if user_groups:
        private_query = private_query.filter(~model.Package.owner_org.in_(user_groups))

      private = private_query.all()
      # each row is (package id, owner_org); exclude by package id
      for row in private:
        fq += " -package_id:{id}".format(id=row[0])
    query['fq'] = [fq]

    fq_list = query.get('fq_list', [])
    query['fq'].extend(fq_list)

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit', config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the asset ID and search scores
    query['fl'] = query.get('fl', 'data_dict')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # If the query has a colon in it then consider it a fielded search and
    # do not apply the dismax settings, unless edismax was asked for.
    defType = query.get('defType', 'dismax')
    if ':' not in query['q'] or defType == 'edismax':
      query['defType'] = defType
      query['tie'] = query.get('tie', '0.1')
      # this minimum match is explained
      # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
      query['mm'] = query.get('mm', '2<-1 5<80%')
      query['qf'] = query.get('qf', QUERY_FIELDS)


    conn = make_connection()
    # log.debug('Asset query: %r' % query)
    try:
      solr_response = conn.raw_query(**query)
    except Exception as e:
      # Not every exception carries a ``reason``; fall back to the
      # exception itself so the SearchError is always raised.
      raise SearchError('SOLR returned an error running query: %r Error: %r' %
                        (query, getattr(e, 'reason', e)))
Exemple #32
0
    def run(self, query):
        '''
        Performs a dataset search using the given query.

        The query dictionary is modified in place: defaults are filled in
        for q, rows, sort, facets, fl and wt, and filters for this site and
        for active state are appended to fq when not already present.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - NOTE(review): described upstream as a dictionary with keys
                  results and count; the code shown here stops after running
                  the Solr query and returns nothing

        May raise SearchQueryError or SearchError.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
            invalid_params = [
                s for s in set(query.keys()) - VALID_SOLR_PARAMETERS
            ]
            raise SearchQueryError("Invalid search parameters: %s" %
                                   invalid_params)

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results
        rows_to_return = min(1000, int(query.get('rows', 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        # order by score if no 'sort' term given
        order_by = query.get('sort')
        if order_by == 'rank' or order_by is None:
            query['sort'] = 'score desc, name asc'

        # show only results from this CKAN instance
        fq = query.get('fq', '')
        if '+site_id:' not in fq:
            fq += ' +site_id:"%s"' % config.get('ckan.site_id')

        # filter for package status
        if '+state:' not in fq:
            fq += " +state:active"
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get(
            'facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # return the package ID and search scores
        query['fl'] = query.get('fl', 'name')

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # If the query has a colon in it then consider it a fielded search
        # and do not use dismax.
        if ':' not in query['q']:
            query['defType'] = 'dismax'
            query['tie'] = '0.1'
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = '2<-1 5<80%'
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e.reason))
  def index_asset(ast_dict, defer_commit=False):
    # Normalise an asset dict in place-ish and send it to Solr as one document.
    #
    # ast_dict:     asset description; the code below reads 'lastModified',
    #               'metadata', 'id' and 'assetID' from it and optionally
    #               'package_id'. A None value makes the function a no-op.
    # defer_commit: when true the document is added without asking Solr to
    #               commit; the 'ckan.search.solr_commit' config option can
    #               also switch committing off.
    #
    # Returns None. Raises SearchIndexError when a socket.error occurs while
    # talking to Solr. Written for Python 2 (unicode, "except X, e").
    if ast_dict is None:
      return
    ast_dict[TYPE_FIELD] = ASSET_TYPE
    ast_dict['capacity'] = 'public'
    if not ast_dict.get('package_id'):
      # NOTE(review): `resource` is not defined anywhere in this function;
      # unless it is a module-level name this branch raises NameError.
      ast_dict['package_id']  = session.query(model.Resource).filter_by(id=resource).first().get_package_id()

    # Sentinel default for dateutil parsing: an empty date string parses to
    # exactly this value, which lets the code below recognise "no date given".
    bogus_date = datetime.datetime(1, 1, 1)
    try:
      # Only the first 19 characters (YYYY-MM-DDTHH:MM:SS) are parsed.
      ast_dict['metadata_created'] = parse(ast_dict['lastModified'][:19],  default=bogus_date).isoformat() + 'Z'
    except ValueError:
      ast_dict['metadata_created'] = None

    # NOTE(review): datetime.now() is naive local time but is labelled with a
    # trailing 'Z' here - confirm whether utcnow() was intended.
    ast_dict['metadata_modified'] = datetime.datetime.now().isoformat()[:19] + 'Z'

    # 'metadata' may arrive as a serialised string; decode it, falling back to
    # the second un-JSON helper when the first result is not valid JSON.
    if type(ast_dict['metadata']) in (unicode, str):
      try:
        ast_dict['metadata'] = json.loads(_unjson_base(ast_dict['metadata']))
      except ValueError:
        ast_dict['metadata'] = json.loads(_unjson(ast_dict['metadata']))
    try:
      # Promote a few string-valued EXIF entries to top-level metadata keys,
      # with ':' replaced by '_' in the key.
      if 'exif' in ast_dict['metadata']:
        for ex_key, ex_val in ast_dict['metadata']['exif'].items():
          if ex_key in ['EXIF:CreateDate', 'EXIF:Model', 'EXIF:Artist', 'EXIF_CreateDate', 'EXIF_Model', 'EXIF_Artist']:
            if type(ex_val) in (unicode, str):
              ast_dict['metadata'][ex_key.replace(':','_')] = ex_val
    except:
      # Best effort: any failure while reading EXIF data is ignored.
      pass

    # Fields missing from the metadata are set to None on the asset itself.
    for field in ('organization', 'text', 'notes'):
      if not ast_dict['metadata'].get(field):
        if field == 'text':
          # NOTE(review): this rebinds the key to '', so the assignment below
          # writes ast_dict[''] rather than ast_dict['text'] - confirm intent.
          field = ''
        ast_dict[field] = None
    # Fall back to the metadata 'text' or 'description' for the notes.
    # NOTE(review): ast_dict['notes'] is only guaranteed to exist when the
    # metadata had no 'notes'; otherwise this lookup may raise KeyError.
    if 'text' in ast_dict['metadata'] and not ast_dict['notes']:
      ast_dict['notes'] = ast_dict['metadata']['text']
    elif 'description' in ast_dict['metadata'] and not ast_dict['notes']:
      ast_dict['notes'] = ast_dict['metadata']['description']

    if not 'state' in ast_dict['metadata']:
      ast_dict['metadata']['state'] = 'active'
    # Mirror 'type' and 'mimetype' into each other when only one is present.
    for field in (('type', 'mimetype'),('mimetype', 'type')):
      if field[0] in ast_dict['metadata'] and field[1] not in ast_dict['metadata']:
        ast_dict['metadata'][field[1]] = ast_dict['metadata'][field[0]]
    if not 'mimetype' in ast_dict['metadata']:
      ast_dict['metadata']['mimetype'] = 'image/jpeg'

    # Tags: a comma-separated string is split into names; anything that is not
    # a list/tuple/set afterwards is replaced by an empty list.
    tags = ast_dict['metadata'].get('tags')
    if type(tags) in (str, unicode): tags = [name.strip() for name in tags.split(',') if name]
    if type(tags) not in (list, tuple, set): tags = []
    ast_dict['tags'] = tags

    # Snapshot of the asset (taken before the extras are flattened below).
    ast_dict['data_dict'] = json.dumps(ast_dict)

    # Keys that the flattened metadata must not overwrite. The list
    # concatenation relies on dict.keys() returning a list (Python 2).
    index_fields = RESERVED_FIELDS + ast_dict.keys()

    # include the extras in the main namespace
    extras = ast_dict['metadata']
    for extra in extras:
      key, value = extra, extras[extra]
      if isinstance(value, (tuple, list)):
          value = " ".join(map(unicode, value))
      # Drop every character of the key that is not in KEY_CHARS.
      key = ''.join([c for c in key if c in KEY_CHARS])
      ast_dict['extras_' + key] = value
      if key not in index_fields:
        ast_dict[key] = value
    ast_dict.pop('metadata', None)

    # NOTE(review): `context` is not used in the rest of this function.
    context = {'model': model}

    # clean the dict fixing keys
    new_dict = {}
    for key, value in ast_dict.items():
      key = key.encode('ascii', 'ignore')
      new_dict[key] = value
    ast_dict = new_dict

    for k in ('title', 'notes', 'title_string', 'name'):
      if k in ast_dict and ast_dict[k]:
        ast_dict[k] = escape_xml_illegal_chars(ast_dict[k])

    # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
    # with UTC time (i.e trailing Z)
    # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
    new_dict = {}
    for key, value in ast_dict.items():
      key = key.encode('ascii', 'ignore')
      if key.endswith('_date'):
        try:
          date = parse(value, default=bogus_date)
          if date != bogus_date:
            value = date.isoformat() + 'Z'
          else:
            # The date field was empty, so dateutil filled it with
            # the default bogus date
            value = None
        except ValueError:
          # Unparseable dates are dropped from the document altogether.
          continue
      new_dict[key] = value
    ast_dict = new_dict

    # mark this CKAN instance as data source:
    ast_dict['site_id'] = config.get('ckan.site_id')

    # Strip a selection of the fields.
    # These fields are possible candidates for sorting search results on,
    # so we strip leading spaces because solr will sort " " before "a" or "A".
    for field_name in ['title', 'name']:
      try:
        value = ast_dict.get(field_name)
        if value:
          ast_dict[field_name] = value.lstrip()
      except KeyError:
        pass

    # add a unique index_id to avoid conflicts
    ast_dict['index_id'] = _get_index_id(ast_dict['id'], ast_dict['assetID'])


    # send to solr:
    try:
      conn = make_connection()
      # Commit immediately unless the caller deferred it or the site
      # configuration disables commits.
      commit = not defer_commit
      if not asbool(config.get('ckan.search.solr_commit', 'true')):
        commit = False
      conn.add_many([ast_dict], _commit=commit)
    except socket.error, e:
      # NOTE(review): if make_connection() itself raised, `conn` is unbound
      # here and formatting the message fails with a NameError.
      err = 'Could not connect to Solr using {0}: {1}'.format(conn.url, str(e))
      log.error(err)
      raise SearchIndexError(err)
Exemple #34
0
    def run(self, query, permission_labels=None, **kwargs):
        '''
        Performs a dataset search using the given query.

        The ``query`` dict is completed with defaults (rows, fq, facet
        settings, fl, wt, dismax options) and is modified in place before
        being passed to Solr. The outcome is also stored on ``self.count``,
        ``self.results`` and ``self.facets``.

        :param query: dictionary with keys like: q, fq, sort, rows, facet
        :type query: dict
        :param permission_labels: filter results to those that include at
            least one of these labels. None to not filter (return everything)
        :type permission_labels: list of unicode strings; or None

        :returns: dictionary with keys results and count

        May raise SearchQueryError or SearchError.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        invalid_params = set(query.keys()) - VALID_SOLR_PARAMETERS
        if invalid_params:
            raise SearchQueryError("Invalid search parameters: %s" %
                                   list(invalid_params))

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results (capped at 1000)
        rows_to_return = min(1000, int(query.get('rows', 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        fq = []
        if 'fq' in query:
            fq.append(query['fq'])
        fq.extend(query.get('fq_list', []))

        # show only results from this CKAN instance
        fq.append('+site_id:%s' % solr_literal(config.get('ckan.site_id')))

        # filter for package status, unless the caller's fq already does
        if '+state:' not in query.get('fq', ''):
            fq.append('+state:active')

        # only return things we should be able to see
        if permission_labels is not None:
            fq.append('+permission_labels:(%s)' %
                      ' OR '.join(solr_literal(p) for p in permission_labels))
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get(
            'facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # return the package ID and search scores
        query['fl'] = query.get('fl', 'name')

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # If the query has a colon in it then consider it a fielded search
        # and do not use dismax (unless edismax was asked for explicitly).
        defType = query.get('defType', 'dismax')
        if ':' not in query['q'] or defType == 'edismax':
            query['defType'] = defType
            query['tie'] = query.get('tie', '0.1')
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = query.get('mm', '2<-1 5<80%')
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            # Error with the sort parameter.  You see slightly different
            # error messages depending on whether the SOLR JSON comes back
            # or Jetty gets in the way converting it to HTML - not sure why
            #
            if e.args and isinstance(e.args[0], str):
                if "Can't determine a Sort Order" in e.args[0] or \
                        "Can't determine Sort Order" in e.args[0] or \
                        'Unknown sort order' in e.args[0]:
                    raise SearchQueryError('Invalid "sort" parameter')
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e))
        self.count = solr_response.hits
        self.results = solr_response.docs

        # #1683 Filter out the last row that is sometimes out of order
        self.results = self.results[:rows_to_return]

        # get any extras and add to 'extras' dict
        for result in self.results:
            # Build a real list of the matching keys before popping anything:
            # on Python 3 filter() is lazy, so popping while it walks
            # result.keys() raises "dictionary changed size during iteration",
            # and a filter object is always truthy.
            extra_keys = [key for key in result.keys()
                          if key.startswith('extras_')]
            if not extra_keys:
                continue
            extras = {}
            for extra_key in extra_keys:
                extras[extra_key[len('extras_'):]] = result.pop(extra_key)
            result['extras'] = extras

        # if just fetching the id or name, return a list instead of a dict
        if query.get('fl') in ['id', 'name']:
            self.results = [r.get(query.get('fl')) for r in self.results]

        # get facets and convert facets list to a dict
        # (Solr returns each facet as a flat [value, count, value, count, ...])
        self.facets = solr_response.facets.get('facet_fields', {})
        for field, values in six.iteritems(self.facets):
            self.facets[field] = dict(zip(values[0::2], values[1::2]))

        return {'results': self.results, 'count': self.count}
    def run(self, query):
        '''
        Performs a dataset search using the given query.
        The query may include highlighting parameters which will be
        added to package extras.

        Besides the keys in VALID_SOLR_PARAMETERS, 'hl' and any key starting
        with 'hl.' are accepted. The query is passed through
        self.normalize_query_keys() and then completed with defaults before
        being sent to Solr.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - dictionary with keys results and count

        May raise SearchQueryError or SearchError.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        invalid_params = [
            key for key in query.keys()
            if not (key in VALID_SOLR_PARAMETERS
                    or key == 'hl' or key.startswith('hl.'))
        ]
        if invalid_params:
            raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

        # This call used to be indented with a tab, which placed it after the
        # raise above (unreachable on Python 2, TabError on Python 3).
        query = self.normalize_query_keys(query)

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results (capped at 1000)
        rows_to_return = min(1000, int(query.get('rows', 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        # show only results from this CKAN instance
        fq = query.get('fq', '')
        if '+site_id:' not in fq:
            fq += ' +site_id:"%s"' % config.get('ckan.site_id')

        # filter for package status
        if '+state:' not in fq:
            fq += " +state:active"
        query['fq'] = [fq]

        fq_list = query.get('fq_list', [])
        query['fq'].extend(fq_list)

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get('facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # return the package ID and search scores; index_id is always
        # requested in addition to the caller's field list
        query['fl'] = query.get('fl', 'name') + ' index_id'

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # If the query has a colon in it then consider it a fielded search
        # and do not use dismax (unless edismax was asked for explicitly).
        defType = query.get('defType', 'dismax')
        if ':' not in query['q'] or defType == 'edismax':
            query['defType'] = defType
            query['tie'] = query.get('tie', '0.1')
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = query.get('mm', '2<-1 5<80%')
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            # NOTE(review): the visible code stops here; solr_response is not
            # processed or returned within this block.
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError('SOLR returned an error running query: %r Error: %r' %
                              (query, e.reason))
Exemple #36
0
def update_package_search_schema():
    """Register the custom dataset fields in the Solr schema.

    Posts one ``add-field`` command per custom field to the ``schema``
    endpoint, followed by one ``add-copy-field`` command for each field
    whose content should also be copied into the ``text`` field. The result
    of every request is logged at debug level.

    The payloads are generated with ``json.dumps`` so that they are always
    well-formed JSON (the hand-written literals left the ``stored`` key
    unquoted).
    """
    import json

    # (field name, Solr field type), in the order the commands are sent
    field_types = (
        ("associated_tasks", "textgen"),
        ("collection_period", "textgen"),
        ("geographical_area", "textgen"),
        ("number_of_instances", "textgen"),
        ("number_of_attributes", "textgen"),
        ("pkg_description", "textgen"),
        ("creation_date", "date"),
        ("expiry_date", "date"),
        ("has_missing_values", "boolean"),
    )

    # fields that are additionally copied into the "text" field
    copied_to_text = (
        "associated_tasks",
        "collection_period",
        "geographical_area",
        "pkg_description",
    )

    conn = make_connection()
    path = "schema"

    for fieldname, field_type in field_types:
        command = {
            "add-field": {
                "name": fieldname,
                "type": field_type,
                "indexed": "true",
                "stored": "true",
            }
        }
        res = conn._send_request(
            "post", path, json.dumps(command).encode("utf-8"))
        log.debug("Result of update {result}".format(result=res))

    for fieldname in copied_to_text:
        command = {"add-copy-field": {"source": fieldname, "dest": "text"}}
        res = conn._send_request(
            "post", path, json.dumps(command).encode("utf-8"))
        log.debug("Result of update {result}".format(result=res))
Exemple #37
0
    def run(self, query):
        """
        Performs a dataset search using the given query.

        The query dict is completed with defaults (rows, fq, facet settings,
        fl, wt, dismax options) and is modified in place before being sent
        to Solr.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - dictionary with keys results and count

        May raise SearchQueryError or SearchError.
        """
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        invalid_params = set(query.keys()) - VALID_SOLR_PARAMETERS
        if invalid_params:
            raise SearchQueryError("Invalid search parameters: %s" % list(invalid_params))

        # default query is to return all documents
        q = query.get("q")
        if not q or q == '""' or q == "''":
            query["q"] = "*:*"

        # number of results (capped at 1000)
        rows_to_return = min(1000, int(query.get("rows", 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query["rows"] = rows_to_query

        # show only results from this CKAN instance
        fq = query.get("fq", "")
        if "+site_id:" not in fq:
            fq += ' +site_id:"%s"' % config.get("ckan.site_id")

        # filter for package status
        if "+state:" not in fq:
            fq += " +state:active"
        query["fq"] = [fq]

        fq_list = query.get("fq_list", [])
        query["fq"].extend(fq_list)

        # faceting
        query["facet"] = query.get("facet", "true")
        query["facet.limit"] = query.get("facet.limit", config.get("search.facets.limit", "50"))
        query["facet.mincount"] = query.get("facet.mincount", 1)

        # return the package ID and search scores
        query["fl"] = query.get("fl", "name")

        # return results as json encoded string
        query["wt"] = query.get("wt", "json")

        # If the query has a colon in it then consider it a fielded search
        # and do not use dismax (unless edismax was asked for explicitly).
        defType = query.get("defType", "dismax")
        if ":" not in query["q"] or defType == "edismax":
            query["defType"] = defType
            query["tie"] = query.get("tie", "0.1")
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query["mm"] = query.get("mm", "2<-1 5<80%")
            query["qf"] = query.get("qf", QUERY_FIELDS)

        conn = make_connection()
        log.debug("Package query: %r" % query)
        try:
            # NOTE(review): the visible code stops here; solr_response is not
            # processed or returned within this block.
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            # "as" form: valid on Python 2.6+ and Python 3
            raise SearchError("SOLR returned an error running query: %r Error: %r" % (query, e.reason))
    def run(self, query, permission_labels=None, **kwargs):
        '''
        Performs a dataset search using the given query.

        IPackageController plugins that define
        ``update_valid_solr_parameters`` may extend the set of accepted
        query keys. The ``query`` dict is completed with defaults and
        modified in place before being sent to Solr.

        :param query: dictionary with keys like: q, fq, sort, rows, facet
        :type query: dict
        :param permission_labels: filter results to those that include at
            least one of these labels. None to not filter (return everything)
        :type permission_labels: list of unicode strings; or None

        :returns: dictionary with keys results and count

        May raise SearchQueryError or SearchError.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid; each plugin receives the current
        # set and returns the one to use from then on
        valid_solr_parameters = VALID_SOLR_PARAMETERS
        for item in plugins.PluginImplementations(plugins.IPackageController):
            if hasattr(item, 'update_valid_solr_parameters'):
                valid_solr_parameters = item.update_valid_solr_parameters(
                    valid_solr_parameters)

        invalid_params = set(query.keys()) - valid_solr_parameters
        if invalid_params:
            raise SearchQueryError("Invalid search parameters: %s" %
                                   list(invalid_params))

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results (capped at 1000)
        rows_to_return = min(1000, int(query.get('rows', 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        fq = []
        if 'fq' in query:
            fq.append(query['fq'])
        fq.extend(query.get('fq_list', []))

        # show only results from this CKAN instance
        fq.append('+site_id:%s' % solr_literal(config.get('ckan.site_id')))

        # filter for package status, unless the caller's fq already does
        if '+state:' not in query.get('fq', ''):
            fq.append('+state:active')

        # only return things we should be able to see
        if permission_labels is not None:
            fq.append('+permission_labels:(%s)' %
                      ' OR '.join(solr_literal(p) for p in permission_labels))
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get(
            'facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # return the package ID and search scores
        query['fl'] = query.get('fl', 'name')

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # If the query has a colon in it then consider it a fielded search
        # and do not use dismax (unless edismax was asked for explicitly).
        defType = query.get('defType', 'dismax')
        if ':' not in query['q'] or defType == 'edismax':
            query['defType'] = defType
            query['tie'] = query.get('tie', '0.1')
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = query.get('mm', '2<-1 5<80%')
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            # NOTE(review): the visible code stops here; solr_response is not
            # processed or returned within this block.
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            # Error with the sort parameter.  You see slightly different
            # error messages depending on whether the SOLR JSON comes back
            # or Jetty gets in the way converting it to HTML - not sure why
            #
            if e.args and isinstance(e.args[0], str):
                if "Can't determine a Sort Order" in e.args[0] or \
                        "Can't determine Sort Order" in e.args[0] or \
                        'Unknown sort order' in e.args[0]:
                    raise SearchQueryError('Invalid "sort" parameter')
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e))
Exemple #39
0
    def run(self, query, permission_labels=None, **kwargs):
        '''
        Performs a dataset search using the given query.

        The ``query`` dict is completed with defaults (rows, fq, facet
        settings, fl, wt, dismax options) and is modified in place before
        being passed to Solr. Queries starting with Solr local parameters
        (``{!``) are rejected. The outcome is also stored on ``self.count``,
        ``self.results`` and ``self.facets``.

        :param query: dictionary with keys like: q, fq, sort, rows, facet
        :type query: dict
        :param permission_labels: filter results to those that include at
            least one of these labels. None to not filter (return everything)
        :type permission_labels: list of unicode strings; or None

        :returns: dictionary with keys results and count

        May raise SearchQueryError or SearchError.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        invalid_params = set(query.keys()) - VALID_SOLR_PARAMETERS
        if invalid_params:
            raise SearchQueryError("Invalid search parameters: %s" % list(invalid_params))

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results
        rows_to_return = int(query.get('rows', 10))
        # query['rows'] should be a defaulted int, due to schema, but make
        # certain, for legacy tests
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        fq = []
        if 'fq' in query:
            fq.append(query['fq'])
        fq.extend(query.get('fq_list', []))

        # show only results from this CKAN instance
        fq.append('+site_id:%s' % solr_literal(config.get('ckan.site_id')))

        # filter for package status, unless the caller's fq already does
        if '+state:' not in query.get('fq', ''):
            fq.append('+state:active')

        # only return things we should be able to see
        if permission_labels is not None:
            fq.append('+permission_labels:(%s)' % ' OR '.join(
                solr_literal(p) for p in permission_labels))
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get('facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # return the package ID and search scores
        query['fl'] = query.get('fl', 'name')

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # If the query has a colon in it then consider it a fielded search
        # and do not use dismax (unless edismax was asked for explicitly).
        defType = query.get('defType', 'dismax')
        if ':' not in query['q'] or defType == 'edismax':
            query['defType'] = defType
            query['tie'] = query.get('tie', '0.1')
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = query.get('mm', '2<-1 5<80%')
            query['qf'] = query.get('qf', QUERY_FIELDS)

        # query['q'] is always set by now, so no KeyError guard is needed
        if query['q'].startswith('{!'):
            raise SearchError('Local parameters are not supported.')

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            # Error with the sort parameter.  You see slightly different
            # error messages depending on whether the SOLR JSON comes back
            # or Jetty gets in the way converting it to HTML - not sure why
            #
            if e.args and isinstance(e.args[0], str):
                if "Can't determine a Sort Order" in e.args[0] or \
                        "Can't determine Sort Order" in e.args[0] or \
                        'Unknown sort order' in e.args[0]:
                    raise SearchQueryError('Invalid "sort" parameter')
            raise SearchError('SOLR returned an error running query: %r Error: %r' %
                              (query, e))
        self.count = solr_response.hits
        self.results = solr_response.docs

        # #1683 Filter out the last row that is sometimes out of order
        self.results = self.results[:rows_to_return]

        # get any extras and add to 'extras' dict
        for result in self.results:
            # Build a real list of the matching keys before popping anything:
            # on Python 3 filter() is lazy, so popping while it walks
            # result.keys() raises "dictionary changed size during iteration",
            # and a filter object is always truthy.
            extra_keys = [key for key in result.keys()
                          if key.startswith('extras_')]
            if not extra_keys:
                continue
            extras = {}
            for extra_key in extra_keys:
                extras[extra_key[len('extras_'):]] = result.pop(extra_key)
            result['extras'] = extras

        # if just fetching the id or name, return a list instead of a dict
        if query.get('fl') in ['id', 'name']:
            self.results = [r.get(query.get('fl')) for r in self.results]

        # get facets and convert facets list to a dict
        # (Solr returns each facet as a flat [value, count, value, count, ...])
        self.facets = solr_response.facets.get('facet_fields', {})
        for field, values in six.iteritems(self.facets):
            self.facets[field] = dict(zip(values[0::2], values[1::2]))

        return {'results': self.results, 'count': self.count}
Exemple #40
0
    def run(self, query):
        '''
        Performs a dataset search using the given query.

        The query dict is completed with defaults (rows, fq, facet settings,
        fl, wt, dismax options) and is modified in place before being sent
        to Solr.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - dictionary with keys results and count

        May raise SearchQueryError or SearchError.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        invalid_params = set(query.keys()) - VALID_SOLR_PARAMETERS
        if invalid_params:
            raise SearchQueryError("Invalid search parameters: %s" %
                                   list(invalid_params))

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results (capped at 1000)
        rows_to_return = min(1000, int(query.get('rows', 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        # show only results from this CKAN instance
        fq = query.get('fq', '')
        if '+site_id:' not in fq:
            fq += ' +site_id:"%s"' % config.get('ckan.site_id')

        # filter for package status
        if '+state:' not in fq:
            fq += " +state:active"
        query['fq'] = [fq]

        fq_list = query.get('fq_list', [])
        query['fq'].extend(fq_list)

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get(
            'facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # return the package ID and search scores
        query['fl'] = query.get('fl', 'name')

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # If the query has a colon in it then consider it a fielded search
        # and do not use dismax (unless edismax was asked for explicitly).
        defType = query.get('defType', 'dismax')
        if ':' not in query['q'] or defType == 'edismax':
            query['defType'] = defType
            query['tie'] = query.get('tie', '0.1')
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = query.get('mm', '2<-1 5<80%')
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            # NOTE(review): the visible code stops here; solr_response is not
            # processed or returned within this block.
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            # Error with the sort parameter.  You see slightly different
            # error messages depending on whether the SOLR JSON comes back
            # or Jetty gets in the way converting it to HTML - not sure why
            #
            if e.args and isinstance(e.args[0], str):
                if "Can't determine a Sort Order" in e.args[0] or \
                        "Can't determine Sort Order" in e.args[0] or \
                        'Unknown sort order' in e.args[0]:
                    raise SearchQueryError('Invalid "sort" parameter')
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e))
def _run(query):
    '''
    Apply the final defaults to a solr query and send it to the solr api.

    The ``query`` dict is modified in place: defaults are filled in for
    ``q``, ``rows``, ``fq``, the faceting options, ``fl`` and ``wt``, and
    the dismax settings are added for non-fielded queries.

    :param query: dict of solr parameters. Keys read here include ``q``,
        ``rows``, ``fq``, ``fq_list``, ``facet``, ``facet.limit``,
        ``facet.mincount``, ``fl``, ``wt``, ``qf`` and ``extras`` (whose
        ``ext_boolean`` entry selects 'all', 'any' or 'exact' matching).
    :return: nothing -- the solr response is fetched but not returned.
        NOTE(review): confirm whether callers expect the response back.
    :raises SearchError: if the solr call raises ``SolrException``.
    '''

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results
    rows_to_return = query.get('rows', 0)
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        #       in SOLR 1.4
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if '+site_id:' not in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if '+state:' not in fq:
        fq += " +state:active"
    query['fq'] = [fq]

    # any additional filter queries are sent as separate fq values
    fq_list = query.get('fq_list', [])
    query['fq'].extend(fq_list)

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit',
                                     config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # return the package ID and search scores
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # how the individual search terms are combined; anything unknown
    # falls back to 'all'
    boolean = query.get('extras', {}).get('ext_boolean', 'all')
    if boolean not in ['all', 'any', 'exact']:
        log.error('Ignoring unknown boolean search operator %r', boolean)
        boolean = 'all'

    # If the query has a colon in it then consider it a fielded search
    # and do not use dismax.
    if ':' not in query['q']:
        query['defType'] = 'dismax'
        query['tie'] = '0.1'
        if boolean == 'any':
            query['mm'] = '0'
        elif boolean == 'all':
            query['mm'] = '100%'
        elif boolean == 'exact':
            # q cannot be empty here: an empty q was replaced by "*:*"
            # above, which contains a colon and so skips this branch
            query['q'] = '"' + q.replace('"', '\\"') + '"'
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection()
    log.info('Package query: %r', query)
    try:
        start_time = time.time()
        # NOTE(review): solr_response is not used after this point in the
        # visible code -- confirm whether it should be parsed or returned.
        solr_response = conn.raw_query(**query)
        duration = time.time() - start_time
        log.info("Solr returned the result after {0}".format(duration))
    except SolrException as e:
        raise SearchError(
            'SOLR returned an error running query: %r Error: %r' %
            (query, e.reason))
Exemple #42
0
    def run(self, query):
        '''
        Run a dataset search against solr for the supplied query.

        The query is checked against VALID_SOLR_PARAMETERS, completed with
        defaults (q, rows, sort, fq, faceting, fl, wt and, for non-fielded
        searches, the dismax settings) and sent with ``raw_query``.  The
        ``query`` mapping is modified in place.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - documented as a dictionary with keys results and count.
                  NOTE(review): no return statement is visible in this
                  method body - confirm.

        May raise SearchQueryError or SearchError.
        '''
        assert isinstance(query, (dict, MultiDict))

        # refuse any parameter that is not explicitly allowed
        supplied_keys = set(query.keys())
        if not supplied_keys <= VALID_SOLR_PARAMETERS:
            invalid_params = list(supplied_keys - VALID_SOLR_PARAMETERS)
            raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

        # an empty (or empty-quoted) query means "match every document"
        q = query.get('q')
        if not q or q in ('""', "''"):
            query['q'] = "*:*"

        # number of results, capped at 1000
        # #1683 Work around problem of last result being out of order
        #       in SOLR 1.4: ask for one extra row when any are wanted
        wanted_rows = min(1000, int(query.get('rows', 10)))
        query['rows'] = wanted_rows + 1 if wanted_rows > 0 else wanted_rows

        # order by score if no 'sort' term given (or 'rank' was asked for)
        if query.get('sort') in (None, 'rank'):
            query['sort'] = 'score desc, name asc'

        # restrict to this CKAN instance and to active packages unless the
        # caller already filters on those fields
        filter_query = query.get('fq', '')
        if '+site_id:' not in filter_query:
            filter_query += ' +site_id:"%s"' % config.get('ckan.site_id')
        if '+state:' not in filter_query:
            filter_query += " +state:active"
        query['fq'] = filter_query

        # faceting, returned fields (package ID / name) and json output:
        # keep the caller's value when given, otherwise use the default
        defaults = (
            ('facet', 'true'),
            ('facet.limit', config.get('search.facets.limit', '50')),
            ('facet.mincount', 1),
            ('fl', 'name'),
            ('wt', 'json'),
        )
        for key, fallback in defaults:
            query[key] = query.get(key, fallback)

        # A colon in the query marks a fielded search, which must not go
        # through dismax; everything else does.
        if ':' not in query['q']:
            query['defType'] = 'dismax'
            query['tie'] = '0.1'
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = '2<-1 5<80%'
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            message = ('SOLR returned an error running query: %r Error: %r' %
                       (query, e.reason))
            raise SearchError(message)
Exemple #43
0
    def run(self, query):
        '''
        Run a dataset search against solr for the supplied query.

        The query is checked against VALID_SOLR_PARAMETERS, completed with
        defaults (q, rows, fq plus fq_list, faceting, fl, wt and the
        dismax/edismax settings) and sent with ``conn.search``.  The
        ``query`` mapping is modified in place.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - documented as a dictionary with keys results and count.
                  NOTE(review): no return statement is visible in this
                  method body - confirm.

        May raise SearchQueryError or SearchError.
        '''
        assert isinstance(query, (dict, MultiDict))

        # refuse any parameter that is not explicitly allowed
        supplied_keys = set(query.keys())
        if not supplied_keys <= VALID_SOLR_PARAMETERS:
            invalid_params = list(supplied_keys - VALID_SOLR_PARAMETERS)
            raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

        # an empty (or empty-quoted) query means "match every document"
        q = query.get('q')
        if not q or q in ('""', "''"):
            query['q'] = "*:*"

        # number of results, capped at 1000
        # #1683 Work around problem of last result being out of order
        #       in SOLR 1.4: ask for one extra row when any are wanted
        wanted_rows = min(1000, int(query.get('rows', 10)))
        query['rows'] = wanted_rows + 1 if wanted_rows > 0 else wanted_rows

        # restrict to this CKAN instance and to active packages unless the
        # caller already filters on those fields
        filter_query = query.get('fq', '')
        if '+site_id:' not in filter_query:
            filter_query += ' +site_id:"%s"' % config.get('ckan.site_id')
        if '+state:' not in filter_query:
            filter_query += " +state:active"

        # fq is sent as a list: the main filter first, then every entry
        # of the optional fq_list
        filters = query['fq'] = [filter_query]
        filters.extend(query.get('fq_list', []))

        # faceting, returned fields (package ID / name) and json output:
        # keep the caller's value when given, otherwise use the default
        defaults = (
            ('facet', 'true'),
            ('facet.limit', config.get('search.facets.limit', '50')),
            ('facet.mincount', 1),
            ('fl', 'name'),
            ('wt', 'json'),
        )
        for key, fallback in defaults:
            query[key] = query.get(key, fallback)

        # A colon in the query marks a fielded search, which skips the
        # dismax settings unless edismax was explicitly requested.
        parser = query.get('defType', 'dismax')
        if parser == 'edismax' or ':' not in query['q']:
            query['defType'] = parser
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            for key, fallback in (('tie', '0.1'),
                                  ('mm', '2<-1 5<80%'),
                                  ('qf', QUERY_FIELDS)):
                query[key] = query.get(key, fallback)

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            # Error with the sort parameter.  You see slightly different
            # error messages depending on whether the SOLR JSON comes back
            # or Jetty gets in the way converting it to HTML - not sure why
            detail = e.args[0] if e.args else None
            if isinstance(detail, str):
                sort_error_markers = (
                    "Can't determine a Sort Order",
                    "Can't determine Sort Order",
                    'Unknown sort order',
                )
                if any(marker in detail for marker in sort_error_markers):
                    raise SearchQueryError('Invalid "sort" parameter')
            message = ('SOLR returned an error running query: %r Error: %r' %
                       (query, e))
            raise SearchError(message)