Esempio n. 1
0
 def query(cls, text):
     """Search this class's index for ``text``, with fuzzy fallbacks.

     A highlighted query-string search is run first. If it reports no
     hits, fuzzy searches against the ``title``, ``content`` and
     ``author`` fields are tried in that order, stopping at the first one
     that reports hits.

     Args:
         text: Raw query text; surrounding whitespace is stripped.

     Returns:
         The result set of the first search whose ``total`` is non-zero,
         or the result set of the last fallback if nothing matched.
     """
     text = text.strip()
     indices = ['%s-index' % cls.name]
     cls.ES.refresh()

     # Wrap matches in the title and content fields with <em>...</em>.
     highlighter = HighLighter(['<em>'], ['</em>'])
     search = Search(StringQuery(text), highlight=highlighter)
     search.add_highlight('title')
     search.add_highlight('content')
     results = cls.ES.search(search, indices=indices)

     # Fall back to fuzzy matching one field at a time. Each fallback is
     # restricted to the same index as the primary search; omitting
     # ``indices`` would leave the target up to the connection's defaults
     # instead of this class's own index.
     for field in ('title', 'content', 'author'):
         if results.total:
             break
         results = cls.ES.search(FuzzyQuery(field, text), indices=indices)

     return results
Esempio n. 2
0
 def test_iterator(self):
     """Check iteration, slicing, indexing and ``total`` on a result set."""
     # Iterating a search created with size=20 yields 20 documents.
     twenty = self.conn.search(
         Search(MatchAllQuery(), size=20),
         self.index_name,
         self.document_type)
     self.assertEqual(sum(1 for _ in twenty), 20)

     # Slicing, indexing past the requested size, and the overall total.
     ten = self.conn.search(
         Search(MatchAllQuery(), size=10),
         self.index_name,
         self.document_type)
     self.assertEqual(sum(1 for _ in ten[:10]), 10)
     self.assertEqual(ten[10].uuid, "11111")
     self.assertEqual(ten.total, 1000)
Esempio n. 3
0
    def test_QueryHighlight(self):
        """Highlighting without an explicit highlighter wraps hits in ``<em>``."""
        search = Search(QueryStringQuery("joe"))
        for field in ("parsedtext", "name"):
            search.add_highlight(field)

        resultset = self.conn.search(search, indices=self.index_name)
        self.assertEqual(resultset.total, 2)

        # The first hit must carry highlight data for the requested fields.
        self.assertNotEqual(resultset[0]._meta.highlight, None)
        fragment = resultset[0]._meta.highlight[u"parsedtext"][0]
        self.assertEqual(fragment.strip(), u'<em>Joe</em> Testere nice guy')
Esempio n. 4
0
    def test_QueryHighlightWithHighLighter(self):
        """A custom ``HighLighter`` replaces the highlight tags with ``<b>``."""
        bold = HighLighter(['<b>'], ['</b>'])
        search = Search(QueryStringQuery("joe"), highlight=bold)
        for field in ("parsedtext", "name"):
            search.add_highlight(field)

        resultset = self.conn.search(search, indices=self.index_name)
        self.assertEqual(resultset.total, 2)

        # The first hit must carry highlight data using the custom tags.
        self.assertNotEqual(resultset[0]._meta.highlight, None)
        fragment = resultset[0]._meta.highlight[u"parsedtext"][0]
        self.assertEqual(fragment.strip(), u'<b>Joe</b> Testere nice guy')
Esempio n. 5
0
    def test_iterator_offset(self):
        """A search starting at offset 10 iterates records 10 onwards."""
        # Ask for a block of 10 starting at position 10, ordered by the
        # ``position`` field. The offset and size are given both on the
        # Search object and on the search() call itself.
        by_position = {'position': {'order': 'asc'}}
        search = Search(MatchAllQuery(), start=10, size=10, sort=by_position)
        resultset = self.conn.search(search,
                                     self.index_name,
                                     self.document_type,
                                     start=10,
                                     size=10)

        # len() reports all 1000 matching documents, not the page size.
        self.assertEqual(len(resultset), 1000)

        # The records must start at position 10 rather than 0.
        for offset, record in enumerate(resultset):
            self.assertEqual(record.position, offset + 10)
Esempio n. 6
0
    def get_search(self, request, **kwargs):
        """
        Execute a search query to elasticsearch

        Request parameters are:
        - `q`: string query
        - `types`: set of document types (`contact`, `organization`, `invoice`, ...)

        A minimum of 2 chars are required for the query to be processed (wildcards excluded).

        The search runs against the index named after the request tenant's
        slug. Each hit is extended with `id`, `resource_type` and `score`
        taken from its metadata, and the hits are returned paginated.

        Returns an `HttpBadRequest` when the tenant or parameters cannot be
        read, when the query is too short, or when the search itself fails.
        """
        import re
        import pyes
        from pyes.query import Search, StringQuery

        self.method_check(request, allowed=['get'])
        self.is_authenticated(request)
        self.throttle_check(request)

        try:
            # Tenant (slug) must be present
            tenant = request.tenant.slug
            # By default, search is made among all types, can be overridden passing a types argument
            doc_types = request.GET.getlist('types')
            # The 'q' parameter represents the query
            query = request.GET.get('q')
        except Exception:
            return http.HttpBadRequest()

        # The query must be a string and composed by at least 2 chars (ES
        # wildcards excluded). This is an explicit check rather than an
        # ``assert`` so that it is still enforced when Python runs with -O.
        if (not isinstance(query, basestring)
                or len(re.sub('[?*]', '', query)) < 2):
            return http.HttpBadRequest()

        try:
            conn = pyes.ES(settings.ES_SERVERS, basic_auth=settings.ES_AUTH)
            q = Search(StringQuery(query))
            resultset = conn.search(
                q,
                indices=tenant,
                doc_types=u",".join(doc_types) if doc_types else None)
            searched_items = []
            for res in resultset:
                # Expose the hit's metadata alongside its document fields.
                res.update({
                    'id': res._meta['id'],
                    'resource_type': res._meta['type'],
                    'score': res._meta['score']
                })
                searched_items.append(res)
        except Exception:
            # NOTE: search failures are reported as a bad request, as
            # before; only SystemExit/KeyboardInterrupt now propagate.
            return http.HttpBadRequest()

        self.log_throttled_access(request)

        paginator = self._meta.paginator_class(
            request.GET,
            searched_items,
            resource_uri=self.get_resource_uri(),
            limit=self._meta.limit)
        to_be_serialized = paginator.page()
        return self.create_response(request,
                                    to_be_serialized,
                                    response_class=http.HttpResponse)
def find_most_similar_movie(movie_title, movie_year):
    """Return the indexed movie from ``movie_year`` most similar to ``movie_title``.

    Up to 50 hits of a multi-match query on ``title``, ``original_title``
    and ``aka`` are fetched from the ``rightchannel`` index. Only hits
    whose ``year`` equals ``movie_year`` are scored. A hit's score is the
    highest ``calc_similarity`` value among its title, original title and
    each of its ``aka`` entries.

    Returns:
        A ``(movie, score)`` tuple for the best-scoring hit, or
        ``(None, 0)`` if no hit scored above zero.
    """
    search = Search(
        query=MultiMatchQuery(['title', 'original_title', 'aka'], movie_title),
        size=50)
    hits = elasticsearch.search(search,
                                indices='rightchannel',
                                doc_types='movie')

    best_movie = None
    best_score = 0
    for hit in hits:
        year = hit.get('year')
        if not (year and year == movie_year):
            continue

        # Collect every non-empty name this hit is known by.
        names = [name
                 for name in (hit.get('title'), hit.get('original_title'))
                 if name]
        aliases = hit.get('aka')
        if aliases:
            names.extend(aliases)

        score = 0
        for name in names:
            score = max(calc_similarity(name, movie_title), score)

        if score > best_score:
            best_score, best_movie = score, hit

        # Close enough to a perfect match: stop scanning.
        if best_score > 0.9999:
            break

    return best_movie, best_score
Esempio n. 8
0
 def test_sorting_by_geolocation(self):
     """A geo sort on ``location`` from lat=1, lon=1 returns ids 1, 2, 3."""
     geo_search = Search(MatchAllQuery())
     geo_search.sort.add(GeoSortOrder(field='location', lat=1, lon=1))

     hits = self.conn.search(
         geo_search,
         indices=self.index_name,
         doc_types=[self.document_type]).hits
     self.assertEqual([hit['_id'] for hit in hits], ['1', '2', '3'])
Esempio n. 9
0
 def test_sorting_by_script(self):
     """A numeric script sort on ``1.0/doc['foo'].value`` returns ids 3, 2, 1."""
     script_search = Search(MatchAllQuery())
     script_search.sort.add(
         ScriptSortOrder("1.0/doc['foo'].value", type='number'))

     hits = self.conn.search(
         script_search,
         indices=self.index_name,
         doc_types=[self.document_type]).hits
     self.assertEqual([hit['_id'] for hit in hits], ['3', '2', '1'])
Esempio n. 10
0
 def test_sorting_by_foo(self):
     """A descending sort on the ``foo`` field returns ids 3, 2, 1."""
     foo_search = Search(MatchAllQuery())
     foo_search.sort.add(SortOrder('foo', order='desc'))

     hits = self.conn.search(
         foo_search,
         indices=self.index_name,
         doc_types=[self.document_type]).hits
     self.assertEqual([hit['_id'] for hit in hits], ['3', '2', '1'])
Esempio n. 11
0
    def query(cls, text):
        """Search this class's index for ``text``, with fuzzy fallbacks.

        A highlighted query-string search is run first. If it reports no
        hits, fuzzy searches against the ``title``, ``content`` and
        ``author`` fields are tried in that order, stopping at the first one
        that reports hits.

        Args:
            text: Raw query text; surrounding whitespace is stripped.

        Returns:
            The result set of the first search whose ``total`` is non-zero,
            or the result set of the last fallback if nothing matched.
        """
        text = text.strip()
        indices = ['%s-index' % cls.name]
        cls.ES.refresh()

        # Wrap matches in the title and content fields with <em>...</em>.
        highlighter = HighLighter(['<em>'], ['</em>'])
        search = Search(StringQuery(text), highlight=highlighter)
        search.add_highlight('title')
        search.add_highlight('content')
        results = cls.ES.search(search, indices=indices)

        # Fall back to fuzzy matching one field at a time. Each fallback is
        # restricted to the same index as the primary search; omitting
        # ``indices`` would leave the target up to the connection's defaults
        # instead of this class's own index.
        for field in ('title', 'content', 'author'):
            if results.total:
                break
            results = cls.ES.search(FuzzyQuery(field, text), indices=indices)

        return results
Esempio n. 12
0
    def test_QueryHighlight(self):
        """Highlighting without an explicit highlighter wraps hits in ``<em>``."""
        q = Search(StringQuery("joe"))
        q.add_highlight("parsedtext")
        q.add_highlight("name")
        resultset = self.conn.search(q, indices=self.index_name)
        # assertEqual is used instead of the deprecated assertEquals alias.
        self.assertEqual(resultset.total, 2)
        self.assertNotEqual(resultset[0]._meta.highlight, None)

        self.assertEqual(
            resultset[0]._meta.highlight[u"parsedtext"][0].strip(),
            u'<em>Joe</em> Testere nice guy')
Esempio n. 13
0
    def test_QueryHighlightWithHighLighter(self):
        """A custom ``HighLighter`` replaces the highlight tags with ``<b>``."""
        h = HighLighter(['<b>'], ['</b>'])
        q = Search(StringQuery("joe"), highlight=h)
        q.add_highlight("parsedtext")
        q.add_highlight("name")
        resultset = self.conn.search(q, indices=self.index_name)
        # assertEqual is used instead of the deprecated assertEquals alias.
        self.assertEqual(resultset.total, 2)
        self.assertNotEqual(resultset[0]._meta.highlight, None)

        self.assertEqual(
            resultset[0]._meta.highlight[u"parsedtext"][0].strip(),
            u'<b>Joe</b> Testere nice guy')
Esempio n. 14
0
def search_add_sort(query, sort_field, sort_type):
    """Wrap ``query`` in a ``Search`` that carries one sort clause.

    Args:
        query: The query object to wrap.
        sort_field: Passed to ``SortOrder`` as its first positional argument.
        sort_type: Passed to ``SortOrder`` as its second positional argument.

    Returns:
        The new ``Search`` with the sort order added.
    """
    sorted_search = Search(query)
    sorted_search.sort.add(SortOrder(sort_field, sort_type))
    return sorted_search
Esempio n. 15
0
    def run(self):
        """Consume ``movie_pool`` forever, linking each item to its best match.

        For every item taken from ``movie_pool`` the method:

        1. Fetches up to 10 candidates with a multi-match query on
           ``title``, ``original_title`` and ``aka``.
        2. Scores each candidate: the best ``calc_similarity`` value among
           its names, plus one point for a matching year and one point per
           shared country, director and cast member.
        3. Stores the item's URL, the score and a UTC timestamp under
           ``resources.online.<provider>`` on the best candidate's MongoDB
           document, or logs a warning when no candidate scored above zero.

        Errors are logged and the loop moves on to the next item; the
        method never returns.
        """
        while True:
            try:
                movie_item = movie_pool.get()
                query = MultiMatchQuery(['title', 'original_title', 'aka'],
                                        movie_item.title)
                results = elasticsearch.search(
                    Search(query=query, size=10),
                    indices=settings['elasticsearch']['index'],
                    doc_types='movie')
                max_score = 0
                max_movie = None
                for r in results:
                    year = r.get('year')

                    # Title similarity: best match over every known name.
                    names = [name
                             for name in (r.get('title'),
                                          r.get('original_title'))
                             if name]
                    aka = r.get('aka')
                    if aka:
                        names.extend(aka)
                    score = 0
                    for name in names:
                        score = max(calc_similarity(name, movie_item.title),
                                    score)

                    # Metadata bonuses: +1 for the same year, +1 for each
                    # value the item and the candidate have in common.
                    if movie_item.year and year and movie_item.year == year:
                        score += 1
                    overlaps = (
                        (movie_item.countries, r.get('countries')),
                        (movie_item.directors, r.get('directors')),
                        (movie_item.casts, r.get('casts')),
                    )
                    for wanted, found in overlaps:
                        if wanted and found:
                            score += sum(1 for value in wanted
                                         if value in found)

                    if score > max_score:
                        max_score = score
                        max_movie = r

                if max_movie:
                    mongodb['movies'].update(
                        {'_id': ObjectId(max_movie.get('_id'))}, {
                            '$set': {
                                'resources.online.%s' % movie_item.provider: {
                                    'url': movie_item.url,
                                    'similarity': max_score,
                                    'last_updated': datetime.datetime.utcnow()
                                }
                            }
                        })
                    self.logger.info('%s(%s) %s(douban) %s(similarity)',
                                     movie_item.title, movie_item.provider,
                                     max_movie.get('title'), max_score)
                else:
                    self.logger.warning('No similar movie for %s(%s)',
                                        movie_item.title, movie_item.provider)
            except PyMongoError as e:
                self.logger.error('Mongodb error %s', e)
            except Exception as e:
                # Keep the worker alive: log the failure, take the next item.
                self.logger.error(e)