Example #1
def check_bad_query_data(kwargs):
    solr_search = SolrSearch(interface)
    try:
        solr_search.query(**kwargs).params()
    except SolrError:
        # Expected: bad query data must make params() raise SolrError.
        pass
    else:
        # No SolrError was raised, so the bad input was accepted; fail.
        assert False
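These check_* helpers are written to be driven by a loop over shared test data; a minimal sketch of such a driver, assuming a module-level bad_query_data sequence of kwargs dicts (the name and contents are assumptions):

def test_bad_query_data():
    # Hypothetical driver: each entry should make params() raise SolrError.
    for kwargs in bad_query_data:
        check_bad_query_data(kwargs)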
Example #2
    def search_dataset(self, request, **kwargs):
        """
        Perform a full-text search on only one dataset.

        TKTK -- implement field searches
        TKTK -- implement wildcard + boolean searches
        """
        self.method_check(request, allowed=['get'])
        self.is_authenticated(request)
        self.throttle_check(request)

        if 'pk' in kwargs:
            dataset_id = kwargs['pk']
        else:
            dataset_id = request.GET.get('id')

        d = Dataset.objects.get(id=dataset_id)

        limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_ROWS))
        offset = int(request.GET.get('offset', 0))

        s = SolrSearch(self._solr())
        s = s.query(full_text=request.GET.get('q'))
        s = s.filter(dataset_id=dataset_id)
        s = s.paginate(offset, limit)
        s = s.execute()

        paginator = Paginator(request.GET, s, resource_uri=request.path_info)
        page = paginator.page()

        dataset_url = reverse('api_dispatch_detail', kwargs={'api_name': kwargs['api_name'], 'resource_name': 'dataset', 'pk': dataset_id })

        # Update with attributes from the dataset
        # (Resulting object matches a group from the search endpoint)
        page.update({
            'id': d.id,
            'name': d.name,
            'resource_uri': dataset_url,
            'row_count': d.row_count,
            'schema': d.schema
        })

        objects = []

        for obj in s.result.docs:
            bundle = self.build_bundle(obj=SolrObject(obj), request=request)
            bundle = self.full_dehydrate(bundle)
            objects.append(bundle)

        page['objects'] = objects

        self.log_throttled_access(request)

        return self.create_response(request, page)
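For reference, the response assembled above merges the paginator's meta block, the dataset's own attributes, and the dehydrated Solr documents; a sketch of the resulting shape, with every value hypothetical:

# Illustrative shape only; all values are made up.
{
    'meta': {'limit': 50, 'offset': 0, 'next': None, 'previous': None, 'total_count': 42},
    'id': 1,
    'name': 'Example dataset',
    'resource_uri': '/api/1.0/dataset/1/',
    'row_count': 100,
    'schema': [...],
    'objects': [...]  # one dehydrated bundle per matching row
}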
Example #3
def check_multiple_call_data(arg_kw_list, query_output, filter_output):
    solr_search = SolrSearch(interface)
    # query() with no arguments returns a fresh chainable copy of the search,
    # so the query chain and the filter chain build up independently.
    q = solr_search.query()
    f = solr_search.query()
    for args, kwargs in arg_kw_list:
        q = q.query(*args, **kwargs)
        f = f.filter(*args, **kwargs)
    qp = q.params()
    fp = f.params()
    check_equal_with_debug(qp, query_output)
    check_equal_with_debug(fp, filter_output)
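A hypothetical invocation, using the list-of-tuples params() format seen in the wildcard tests below (the exact tuple ordering and the default '*:*' query alongside a filter are assumptions):

# One chained call; query() should emit q, filter() should emit fq.
check_multiple_call_data(
    [((), {'text': 'hello'})],
    [('q', 'text:hello')],
    [('fq', 'text:hello'), ('q', '*:*')])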
Example #4
    def test_wildcard_search_cleaned_up(self):
        from adhocracy.lib.search.query import add_wildcard_query
        search = SolrSearch(interface)
        query = add_wildcard_query(search, 'text', 'one** two*')
        self.assertEqual(
            query.params(),
            [('q', '(text:one OR text:one*) AND (text:two OR text:two*)')])
Example #5
    def test_wildcard_search_ignore_none(self):
        from adhocracy.lib.search.query import add_wildcard_query
        search = SolrSearch(interface)

        query = add_wildcard_query(search, 'text', None)
        self.assertEqual(
            query.params(),
            [('q', '*:*')])
Example #6
    def test_wildcard_search_added_to_search(self):
        from adhocracy.lib.search.query import add_wildcard_query
        search = SolrSearch(interface).query(text='passedin')

        query = add_wildcard_query(search, 'text', 'wild')
        self.assertEqual(
            query.params(),
            [('q', 'text:passedin AND (text:wild OR text:wild*)')])
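Together, these three tests pin down add_wildcard_query's contract: None leaves the search untouched, runs of trailing asterisks collapse to a single wildcard, and each word must match either exactly or as a prefix. A minimal sketch of a helper satisfying that contract; the interface's Q factory and sunburnt's WildcardString are assumptions, not adhocracy's actual implementation:

from sunburnt.strings import WildcardString

def add_wildcard_query(search, field, string):
    # None means "nothing to add": return the search unchanged.
    if string is None:
        return search
    Q = search.interface.Q  # assumed: the interface exposes sunburnt's Q factory
    for word in string.split():
        word = word.rstrip('*')  # collapse 'one**' to 'one'
        # Match the exact term OR any term with this prefix; WildcardString
        # keeps the trailing '*' from being escaped.
        search = search.query(Q(**{field: word}) | Q(**{field: WildcardString(word + '*')}))
    return search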
Example #7
def test_complex_boolean_queries():
    solr_search = SolrSearch(interface)
    for query, output in complex_boolean_queries:
        check_complex_boolean_query(solr_search, query, output)
Example #8
def check_query_data(method, args, kwargs, output):
    solr_search = SolrSearch(interface)
    p = getattr(solr_search, method)(*args, **kwargs).params()
    check_equal_with_debug(p, output)
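A hypothetical call, using the params() output format seen in the wildcard tests above:

# Hypothetical case: method name, positional args, kwargs, expected output.
check_query_data('query', (), {'text': 'hello'}, [('q', 'text:hello')])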
Example #9
  def get(rows, start, **kwargs):
    """
    Input
      id
      start_date
      end_date
      phrase
      rows - the number of records to get from solr
      start - where to start getting records in solr (offset)
      frame
      order
      states - list of 2 letter state abbreviations

    Output
      List of output
    """

    solr_query = Speech.build_sunburnt_query(**kwargs).paginate(rows=rows, start=start)

    if kwargs.get('order') and kwargs.get('order') not in ["frame", "tfidf", "idf", "termFreq"]:
      solr_query = solr_query.sort_by(kwargs.get('order'))

    # solr_query = solr_query.terms('speaking').terms(tf=True)
    params = solr_query.params()
    dict_params = dict(params)

    dict_params['norm'] = 'norm(speaking)'
    dict_params['tf'] = 'tf(speaking, %s)' % kwargs.get('phrase')
    dict_params['idf'] = 'idf(speaking, %s)' % kwargs.get('phrase')
    dict_params['tfidf'] = 'mul($tf, $idf)'
    dict_params['termFreq'] = 'termfreq(speaking, %s)' % kwargs.get('phrase')
    dict_params['fl'] = "*, score, $norm, $termFreq, $tf, $idf, $tfidf"
    dict_params['q'] += " AND {!frange l=8}$tfidf"
    if kwargs.get('order') is None or kwargs.get('order') == "tfidf":
      dict_params["sort"] = "$tfidf desc"

    if kwargs.get('frame') and kwargs.get('order') == "frame" and kwargs.get('analysis_id'):

      # Analysis is only needed by the probability-weighting code that is
      # commented out below.
      from app.models.analysis import Analysis

      frame_words = Frame.get(Frame.id == kwargs['frame']).word_string
      # analysis_obj = Analysis.get(Analysis.id == kwargs['analysis_id'])
      # key = "%s - %s" % (kwargs.get('start_date'), kwargs.get('end_date'))
      # vocabulary_proba = json.loads(analysis_obj.speech_windows)[key]
      # frame_vocabulary_proba =  { word: (abs(exp(vocabulary_proba.get(word)[0]) - exp(vocabulary_proba.get(word)[1]))) if vocabulary_proba.get(word) != None else 0 for word in frame_words.split() }
      # dict_params['frameFreq'] = "mul(sum(" + ", ".join(map(lambda word: "mul(termfreq(speaking,\"%s\"), %f)" % (word, frame_vocabulary_proba[word]), frame_words.split())) + "), $norm)"

      # Placeholder weight of 1.0 per word; the commented-out lines above
      # computed per-word probabilities from the analysis instead.
      dict_params['frameFreq'] = "mul(sum(" + ", ".join(map(lambda word: "mul(termfreq(speaking,\"%s\"), %f)" % (word, 1), frame_words.split())) + "), $norm)"

      if dict_params.get('fl'):
        dict_params['fl'] += ", $frameFreq"
      else:
        dict_params['fl'] = '$frameFreq'

      dict_params["sort"] = "$frameFreq desc"

    # dict.items() already yields the list of (key, value) tuples that
    # si.conn.select() expects.
    params = dict_params.items()

    # print params

    result = si.schema.parse_response(si.conn.select(params))
    q = SolrSearch(si)
    response = q.transform_result(result, q.result_constructor)

    speeches = response.result.docs
    highlighting = response.highlighting
    term_vectors = response.term_vectors

    current_count = response.result.numFound
    current_start = response.result.start

    # TODO: improve this
    if kwargs.get('frame') and kwargs.get('highlight'):
      frame = Frame.get(Frame.id == kwargs['frame'])
      # pdb.set_trace()
      for speech in speeches:
          # The rebinding discards the return value, so this relies on
          # highlight_speech mutating the speech object in place.
          speech = Speech.highlight_speech(speech, frame)

    speeches_dict = {
      'count': current_count,
      'start': current_start,
      'speeches': speeches,
      'term_vectors': term_vectors,
      'highlighting': highlighting
    }

    return speeches_dict
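A hypothetical call, with illustrative argument values; the keyword names mirror the docstring above:

# All values here are made up for illustration.
results = get(rows=20, start=0,
              phrase='health care',
              start_date='2010-01-01', end_date='2010-12-31',
              order='tfidf',
              states=['CA', 'NY'])
print results['count'], len(results['speeches'])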
Example #10
#!/usr/bin/env python

from sunburnt import SolrInterface
from sunburnt.search import SolrSearch

solr = SolrInterface('http://localhost:8983/solr')
s = SolrSearch(solr)

print 'Testing basic query'

response = s.query(full_text='Education').execute()

print response.result

print 'Testing group query'

response = s.query(full_text='Education').group_by('dataset_id', limit=2, sort='+row').execute()

for k, g in response.result.groups.items():
    print k, g.docs
Example #11
    def search(self, request, **kwargs):
        """
        An endpoint for performing full-text searches.

        TKTK -- implement field searches
        TKTK -- implement wildcard + boolean searches
        """
        self.method_check(request, allowed=['get'])
        self.is_authenticated(request)
        self.throttle_check(request)

        limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_GROUPS))
        offset = int(request.GET.get('offset', 0))

        s = SolrSearch(self._solr())
        s = s.query(full_text=request.GET.get('q'))
        s = s.group_by('dataset_id', limit=settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP, offset=0, sort='+row')
        s = s.paginate(offset, limit)
        s = s.execute()

        paginator = Paginator(request.GET, s, resource_uri=request.path_info)

        page = paginator.page()

        datasets = []

        for dataset_id, group in s.result.groups.items():
            dataset_url = reverse('api_dispatch_detail', kwargs={'api_name': kwargs['api_name'], 'resource_name': 'dataset', 'pk': dataset_id })
            dataset_search_url = reverse('api_search_dataset', kwargs={'api_name': kwargs['api_name'], 'resource_name': 'dataset', 'pk': dataset_id })

            d = Dataset.objects.get(id=dataset_id)

            dataset = {
                'id': d.id,
                'name': d.name,
                'resource_uri': dataset_url,
                'row_count': d.row_count,
                'schema': d.schema,
                'meta': {
                    'limit': settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP,
                    'next': None,
                    'offset': 0,
                    'previous': None,
                    'total_count': group.numFound
                },
                'objects': []
            }

            if group.numFound > settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP:
                dataset['meta']['next'] = '?'.join([dataset_search_url, 'limit=%i&offset=%i' % (settings.PANDA_DEFAULT_SEARCH_ROWS, settings.PANDA_DEFAULT_SEARCH_ROWS)])

            for obj in group.docs:
                bundle = self.build_bundle(obj=SolrObject(obj), request=request)
                bundle = self.full_dehydrate(bundle)
                dataset['objects'].append(bundle)

            datasets.append(dataset)

        page['objects'] = datasets

        self.log_throttled_access(request)

        return self.create_response(request, page)
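For reference, each entry appended to the page's objects list above has this shape; a sketch with hypothetical values, mirroring the dict built in the loop:

# Illustrative shape only; all values are made up.
{
    'id': 1,
    'name': 'Example dataset',
    'resource_uri': '/api/1.0/dataset/1/',
    'row_count': 100,
    'schema': [...],
    'meta': {
        'limit': 5,        # PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP
        'next': '/api/1.0/dataset/1/search/?limit=...&offset=...',
        'offset': 0,
        'previous': None,
        'total_count': 42
    },
    'objects': [...]       # dehydrated Solr documents for this group
}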