Example #1
async def fetchArticleBody(*,
                           projectName: str = Path(...),
                           urlItem: str,
                           word: str):
    # Query the documents in the project database's articles collection

    # Convert projectName to projectId
    projectId = await findProjectIdFromProjectName(
        dbPrefix,
        'Project',
        queryDict={'projectName': projectName},
        showDict={'_id': 1})
    if not projectId:
        raise HTTPException(status_code=503, detail='projectNotExist')

    # Pagination counters
    start = 0
    end = 0
    # ES index to search against (equivalent to a database in MongoDB)
    _index = f'kwm-{projectId}.articles'.lower()
    # print('_index', _index)

    s = Search()

    q1 = Q("match_phrase", url=f"\"{urlItem}\"")  # url 匹配
    q2 = Q('match_phrase', body=f"\"{word}\"")  # word 匹配
    s = s.query(q1)
    s = s.query(q2)
    s = s.source(includes=[''])  # return no _source fields
    s = s.highlight_options(
        order='score',
        pre_tags="<strong style=\"background: yellow;color: red\">",
        post_tags="</strong>",
        fragment_size=300)
    s = s.highlight('body')
    s = s[0:10000]

    # common setting
    #print(s.to_dict())

    # Execute the search
    response = await esRun(s.to_dict(), _index)  # s.execute(ignore_cache=True)
    #totalCount = response.hits.total.value
    temp = response.to_dict()['hits']['hits']
    result = []
    for item in temp:
        tt = {'_id': {'$oid': item['_id']}}
        tt.update(item['_source'])
        if item.get('highlight'):
            tt.update({'highlight': item['highlight']})
        if start >= 0 and end > 0:
            tt.update({'id': start + 1})
        result.append(tt)
        start = start + 1
    return result
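For reference, a minimal self-contained sketch of the query-plus-highlight flow these examples share (index and field names here are placeholders, and a reachable Elasticsearch node is assumed):

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

client = Elasticsearch()

s = Search(using=client, index="articles")          # placeholder index
s = s.query(Q("match_phrase", body="open source"))  # phrase to look for
s = s.highlight_options(order="score",
                        pre_tags=["<em>"],
                        post_tags=["</em>"])
s = s.highlight("body", fragment_size=150)
s = s[0:10]  # first ten hits

response = s.execute()
for hit in response:
    # hit.meta.highlight is only present on hits that produced fragments
    if hasattr(hit.meta, "highlight"):
        print(hit.meta.highlight.body)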
Example #2
def elasticsearch_pages(context, sort, page):
    result_limit = int(os.environ['RESULT_LIMIT'])
    max_result_limit = int(os.environ['MAX_RESULT_LIMIT'])
    limit = max_result_limit if context["more"] else result_limit
    start = (page - 1) * limit
    end = start + limit
    domain_query = Q("term", is_banned=False)
    if context["is_up"]:
        domain_query = domain_query & Q("term", is_up=True)
    if not context["show_fh_default"]:
        domain_query = domain_query & Q("term", is_crap=False)
    if not context["show_subdomains"]:
        domain_query = domain_query & Q("term", is_subdomain=False)
    if context["rep"] == "genuine":
        domain_query = domain_query & Q("term", is_genuine=True)
    if context["rep"] == "fake":
        domain_query = domain_query & Q("term", is_fake=True)

    has_parent_query = Q("has_parent", type="domain", query=domain_query)
    query = Search().filter(has_parent_query).query(
        Q("match", body_stripped=context['search']))
    query = query.highlight_options(
        order='score', encoder='html').highlight('body_stripped')[start:end]
    query = query.source(['title', 'domain_id', 'created_at',
                          'visited_at']).params(request_cache=True)
    return query.execute()
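For reference, a minimal sketch of the parent/child shape used above, assuming pages are indexed as children of "domain" documents (names taken from this example):

from elasticsearch_dsl import Search, Q

# Restrict child (page) hits to those whose parent domain passes the
# domain-level filters, then run the text query against the page body.
domain_query = Q("term", is_banned=False) & Q("term", is_up=True)
s = Search().filter(Q("has_parent", type="domain", query=domain_query))
s = s.query(Q("match", body_stripped="example"))
s = s.highlight_options(order="score", encoder="html").highlight("body_stripped")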
Example #3
	def GetAuditDataMain(self, data):
		s = Search()
		s = s[0:1000]
		s = s.highlight('*')
		s = s.highlight_options(require_field_match=False)
		t = Q('query_string', query=data) & ~Q('query_string', default_field="AuditType.Generator", query="stateagentinspector") & ~Q('query_string', default_field="AuditType.Generator", query="w32processes-tree")

		query = s.query(t)

		try:
			r = requests.post(self.es_host + ":" + self.es_port + self.index + self.type_audit_type + '/_search', data=json.dumps(query.to_dict()), auth=(self.elastic_user, self.elastic_pass), verify=False)
		except ConnectionError as e:
			ret = {"connection_error": e.args[0]}
			return ret

		data = []

		try:
			for x in r.json()['hits']['hits']:
			for y, v in x['highlight'].items():
					data.append({
							"doc_id": x['_id'],
							"endpoint": x['_parent'],
							"audittype": x['_source']['AuditType']['Generator'],
							"field": y,
							"response": v
						})
		except KeyError:
			pass

		return data
Example #4
    def get(self, request, *args, **kwargs):
        query = self.request.query_params.get('query')
        country = self.request.query_params.get('country')
        points = self.request.query_params.get('points')

        search = Search(index=constants.ES_INDEX)
        q = {'should': [], 'filter': []}

        if query:
            q['should'] = [
                Match(variety={
                    'query': query,
                    'boost': 3.0
                }),
                Match(winery={
                    'query': query,
                    'boost': 2.0
                }),
                Match(description={
                    'query': query,
                    'boost': 1.0
                })
            ]
            q['minimum_should_match'] = 1

            search = search.highlight_options(number_of_fragments=0,
                                              pre_tags=['<mark>'],
                                              post_tags=['</mark>'])
            search = search.highlight('variety', 'winery', 'description')

        if country:
            q['filter'].append(Term(country=country))
        if points:
            q['filter'].append(Term(points=points))

        response = search.query('bool', **q).params(size=100).execute()

        if response.hits.total.value > 0:
            return Response(data=[{
                'id': hit.meta.id,
                'country': hit.country,
                'description': (hit.meta.highlight.description[0]
                                if 'highlight' in hit.meta
                                and 'description' in hit.meta.highlight
                                else hit.description),
                'points': hit.points,
                'price': hit.price,
                'variety': (hit.meta.highlight.variety[0]
                            if 'highlight' in hit.meta
                            and 'variety' in hit.meta.highlight
                            else hit.variety),
                'winery': (hit.meta.highlight.winery[0]
                           if 'highlight' in hit.meta
                           and 'winery' in hit.meta.highlight
                           else hit.winery)
            } for hit in response])
        else:
            return Response(data=[])
Example #5
    def search_close(self, origin_timestamp, channel, qterm, number_results):
        """
        Find log entries close to origin timestamp, filter by channel, highlight qterm and return them sorted by date.

        :param origin_timestamp: origin timestamp to find logs around
        :param channel: Channel to be filtered
        :param qterm: Term to be highlighted
        :param number_results: how many results
        :return: List of sorted log entries (Elastic-search response)
        :rtype: ``list``
        """
        # Prepare query
        s = DslSearch(using=self._es, index=self._index_prefix.format('*'))

        # Function score
        main_query_boosting = 1e-15  # only used for highlighting, not for scoring -> give very low significance
        pos = MatchPhrase(msg={'query': qterm, 'boost': main_query_boosting}) | \
              Match(**{'username': {'query': qterm, 'boost': main_query_boosting}}) | \
              Match(channel={'query': qterm, 'boost': main_query_boosting}) | \
              Match(msg={'query': qterm, 'boost': main_query_boosting})
        main_query = (pos | Q('match_all'))

        function_score_query = Q('function_score',
                                 query=main_query,
                                 functions=[
                                     SF(
                                         'exp', **{
                                             '@timestamp': {
                                                 "origin": origin_timestamp,
                                                 "scale": "1m",
                                                 "decay": 0.999
                                             }
                                         })
                                 ])

        s = s.query(function_score_query)

        # filter channel
        s = s.filter('term', **{'channel.keyword': channel})

        # Number of results
        s = s[0:number_results]

        # Highlight
        s = s.highlight_options(order='score')
        s = s.highlight('msg', number_of_fragments=0)
        s = s.highlight('username')
        s = s.highlight('channel')

        # Execute
        response = s.execute()

        # Sort results
        response_sorted = sorted(response, key=lambda hit: hit['@timestamp'])

        return response_sorted
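A stripped-down sketch of the exp-decay scoring used above: documents are ranked purely by how close their @timestamp is to the origin, while the match clauses only feed highlighting (the origin value below is a placeholder):

from elasticsearch_dsl import Q, SF

# Score every document by temporal proximity to the origin; "scale" and
# "decay" control how quickly the score falls off away from it.
decay_query = Q('function_score',
                query=Q('match_all'),
                functions=[SF('exp', **{'@timestamp': {
                    'origin': '2020-01-01T00:00:00',  # placeholder
                    'scale': '1m',
                    'decay': 0.999,
                }})])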
Example #6
    def portalSearch(expression, start=0, end=25):
        client = Elasticsearch()
        ret = {'nodes': [], 'Counts': {}}
        q = Q("bool", must=[Q('match', _all=expression)])
        s = Search(using=client,
                   index="neo4j-inquisite-node",
                   doc_type="Repository,Data").query(q)
        q_total = s.count()
        s = s[0:q_total]
        s = s.highlight_options(require_field_match=False)
        s = s.highlight('*', fragment_size=45)
        res = s.execute()
        data = {}
        uuids = []
        pub_uuids = {}
        if res:
            for r in res:
                d = r.to_dict()
                if r.meta.doc_type == 'Repository':
                    if int(d['published']) == 0:
                        continue
                    repo_id = r.meta.id
                    ret['nodes'].append({
                        "id": r.meta.id,
                        "type": "Repository",
                        "name": d['name'],
                        "description": d['readme']
                    })
                    repo_uuids = SearchManager._getDataUUIDsForRepo(repo_id)
                    pub_uuids[repo_id] = repo_uuids
                else:
                    hits = []
                    highs = r.meta.highlight.to_dict()
                    for high_field, high_value in highs.items():
                        hits.append({high_field: high_value})
                    data[r.meta.id] = {'id': r.meta.id, "hits": hits}
                    uuids.append(r.meta.id)
            qString = "MATCH (r:Repository)--(t:SchemaType)--(d:Data) WHERE d.uuid IN {uuids} AND r.published = '1' RETURN d.uuid as uuid, r.name as repo_name, r.uuid as repo_id"
            pub_data = db.run(qString, {"uuids": uuids})
            data_max = 0
            for checked in pub_data:
                if data_max >= 32:
                    break
                ret['nodes'].append({
                    "id": checked['uuid'],
                    "type": "Data",
                    "repo_id": checked['repo_id'],
                    "repo_name": checked['repo_name'],
                    "hits": data[checked['uuid']]['hits']
                })
                data_max += 1

            return ret
        else:
            return ret
Example #7
def match_phrase_in_text(phrase):
    s = Search(using=client, index="sample_film_index")
    q = Q('match_phrase', text=phrase)
    s = s.query(q)
    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>')  # for html
    s = s.highlight('text', fragment_size=999999999, number_of_fragments=1)
    response = s.execute()
    print "Num hits for", phrase, len(response.to_dict()['hits']['hits'])
    for hit in response:
        print hit.meta.score  #doc score
        print hit.meta.highlight  #highlighted snippet
Example #8
def search_content(keyword, limit=50):
    client = Elasticsearch()
    q = Q("multi_match", query=keyword, fields=['title', 'content'])
    s = Search(using=client)
    # s = Search(using=client, index="pet-index").query("match", content="金毛")
    s = Search(using=client, index="pet-index").query(q)
    s = s[0:limit]
    s = s.highlight_options(order='score')
    s = s.highlight('content')
    response = s.execute()
    return response
Example #9
def free_search_in_title(word):
    s = Search(using=client, index="sample_film_index")
    # Q is a shortcut for constructing a query object
    q = Q('match', title=word)
    # At some point, q has to be added to the search object.
    s = s.query(q)
    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>') # for html
    s = s.highlight('title', fragment_size=999999999, number_of_fragments=1)
    response = s.execute()
    print("Num hits for", word, len(response.to_dict()['hits']['hits']))
    for hit in response:
        print(hit.meta.score)      # doc score
        print(hit.meta.highlight)  # highlighted snippet
Example #10
    def get_queryset(self):
        if not self.index_manager.connected_to_es:
            messages.warning(self.request, _(u'Impossible de se connecter à Elasticsearch'))
            return []

        if self.search_query:

            # find forums the user is allowed to visit
            self.authorized_forums = get_authorized_forums(self.request.user)

            search_queryset = Search()

            # setting the different querysets (according to the selected models, if any)
            part_querysets = []
            chosen_groups = self.search_form.cleaned_data['models']

            if chosen_groups:
                models = []
                for group in chosen_groups:
                    if group in settings.ZDS_APP['search']['search_groups']:
                        models.append(settings.ZDS_APP['search']['search_groups'][group][1])
            else:
                models = [v[1] for k, v in settings.ZDS_APP['search']['search_groups'].items()]

            models = reduce(operator.concat, models)

            for model in models:
                part_querysets.append(getattr(self, 'get_queryset_{}s'.format(model))())

            queryset = part_querysets[0]
            for query in part_querysets[1:]:
                queryset |= query

            # weighting:
            weight_functions = []
            for _type, weights in settings.ZDS_APP['search']['boosts'].items():
                if _type in models:
                    weight_functions.append({'filter': Match(_type=_type), 'weight': weights['global']})

            scored_queryset = FunctionScore(query=queryset, boost_mode='multiply', functions=weight_functions)
            search_queryset = search_queryset.query(scored_queryset)

            # highlighting:
            search_queryset = search_queryset.highlight_options(
                fragment_size=150, number_of_fragments=5, pre_tags=['[hl]'], post_tags=['[/hl]'])
            search_queryset = search_queryset.highlight('text').highlight('text_html')

            # executing:
            return self.index_manager.setup_search(search_queryset)

        return []
Example #11
def highlight(search: Search) -> Search:
    """
    Apply hit highlighting to the search, before execution.

    Parameters
    ----------
    search : :class:`.Search`

    Returns
    -------
    :class:`.Search`
        The search object that was originally passed, updated to include
        requests for hit highlighting.

    """
    # Highlight class .search-hit defined in search.sass
    search = search.highlight_options(pre_tags=[HIGHLIGHT_TAG_OPEN],
                                      post_tags=[HIGHLIGHT_TAG_CLOSE])
    search = search.highlight('title', type='plain', number_of_fragments=0)
    search = search.highlight('title.english',
                              type='plain',
                              number_of_fragments=0)
    search = search.highlight('title.tex', type='plain', number_of_fragments=0)

    search = search.highlight('comments', number_of_fragments=0)
    # Highlight any field the name of which begins with "author".
    search = search.highlight('author*')
    search = search.highlight('owner*')
    search = search.highlight('submitter*')
    search = search.highlight('journal_ref', type='plain')
    search = search.highlight('acm_class', number_of_fragments=0)
    search = search.highlight('msc_class', number_of_fragments=0)
    search = search.highlight('doi', type='plain')
    search = search.highlight('report_num', type='plain')

    # Setting number_of_fragments to 0 tells ES to highlight the entire
    # abstract.
    search = search.highlight('abstract', type='plain', number_of_fragments=0)
    search = search.highlight('abstract.tex',
                              type='plain',
                              number_of_fragments=0)
    search = search.highlight('abstract.english',
                              type='plain',
                              number_of_fragments=0)

    search = search.highlight('primary_classification*',
                              type='plain',
                              number_of_fragments=0)
    return search
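A hypothetical usage of the helper above. HIGHLIGHT_TAG_OPEN and HIGHLIGHT_TAG_CLOSE are module constants in the original project; the values and the index name below are stand-ins:

from elasticsearch_dsl import Search

HIGHLIGHT_TAG_OPEN = '<span class="search-hit">'   # stand-in value
HIGHLIGHT_TAG_CLOSE = '</span>'                    # stand-in value

s = Search(index='arxiv')                # placeholder index name
s = s.query('match', abstract='entanglement')
s = highlight(s)                         # attach all highlight requests
response = s.execute()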
Example #12
def find(query, company_id, proposal_id):
    client = get_client()
    index = current_app.config["ES_IMPORT_INDEX"]
    s = Search(using=client, index=index)
    s = s.filter("term", company_id=company_id)
    # s = s.filter(~Q("term", proposal_id=proposal_id))
    # Weighting title more than the content since a user writing an exact title
    # should yield that section rather than the same query in a content
    s = s.query(Q("multi_match", query=query, fields=["title^4", "content"]))
    s = s.highlight_options(order="score",
                            pre_tags=["<span class='search-highlight'>"],
                            post_tags=["</span>"])
    s = s.highlight("title", "content")
    # Only get the first 20 results
    response = s[:20].execute()
    return response.hits
Example #13
    def GetAuditDataMain(self, data):
        s = Search()
        s = s[0:1000]
        s = s.highlight('*')
        s = s.highlight_options(require_field_match=False)
        t = Q('query_string',
              query=data) & ~Q('query_string',
                               default_field="AuditType.Generator",
                               query="stateagentinspector") & ~Q(
                                   'query_string',
                                   default_field="AuditType.Generator",
                                   query="w32processes-tree")

        query = s.query(t)

        try:
            r = requests.post(self.es_host + ":" + self.es_port + self.index +
                              self.type_audit_type + '/_search',
                              data=json.dumps(query.to_dict()),
                              auth=(self.elastic_user, self.elastic_pass),
                              verify=False)
        except ConnectionError as e:
            ret = {"connection_error": e.args[0]}
            return ret

        data = []

        try:
            for x in r.json()['hits']['hits']:
                for y, v in x['highlight'].items():
                    data.append({
                        "doc_id": x['_id'],
                        "endpoint": x['_parent'],
                        "audittype": x['_source']['AuditType']['Generator'],
                        "field": y,
                        "response": v
                    })
        except KeyError:
            pass

        return data
Example #14
    async def get(self):
        """Get the results from Elasticsearch."""
        q = self.request.query.get("q")
        if not q:
            return web.json_response([])

        es = Elasticsearch(
            hosts=[self.request.app["settings"].ELASTICSEARCH_URL],
            timeout=ELASTICSEARCH_TIMEOUT,
            verify_certs=ELASTICSEARCH_VERIFY_CERTS,
        )
        mapping = es.indices.get_mapping(ELASTICSEARCH_INDEX,
                                         include_type_name=True)
        search = Search(index=ELASTICSEARCH_INDEX, using=es)
        search = search.highlight_options(
            pre_tags=[PRE_HIGHLIGHT_TAG],
            post_tags=[POST_HIGHLIGHT_TAG],
        )
        query = self.queries(mapping, q)
        search = search.query(query)
        highlights = self.build_highlight(
            mapping[ELASTICSEARCH_INDEX]["mappings"]["_doc"]["properties"])

        for highlight in highlights:
            search = search.highlight(highlight, type="plain")

        search = search.extra(
            from_=0,
            size=MAX_RESULTS,
        )

        values = []
        for hit in search.execute():
            hit._d_.pop(META, None)
            if HIGHLIGHT and hasattr(hit.meta, "highlight"):
                highlight = hit.meta.highlight
                query = DictQuery(hit._d_)
                for key in highlight:
                    path = key.split(".")[:-1]
                    value = highlight[key][0]
                    query.set("/".join(path), value)
                values.append(query)
            else:
                values.append(hit._d_)
        return web.json_response(values)
Example #15
def highlight(search: Search) -> Search:
    """
    Apply hit highlighting to the search, before execution.

    Parameters
    ----------
    search : :class:`.Search`

    Returns
    -------
    :class:`.Search`
        The search object that was originally passed, updated to include
        requests for hit highlighting.

    """
    # Highlight class .search-hit defined in search.sass
    search = search.highlight_options(pre_tags=[HIGHLIGHT_TAG_OPEN],
                                      post_tags=[HIGHLIGHT_TAG_CLOSE])
    search = search.highlight("title", type="plain", number_of_fragments=0)
    search = search.highlight("title.english",
                              type="plain",
                              number_of_fragments=0)
    search = search.highlight("title.tex", type="plain", number_of_fragments=0)

    search = search.highlight("comments", number_of_fragments=0)
    # Highlight any field the name of which begins with "author".
    search = search.highlight("author*")
    search = search.highlight("owner*")
    search = search.highlight("announced_date_first")
    search = search.highlight("submitter*")
    search = search.highlight("journal_ref", type="plain")
    search = search.highlight("acm_class", number_of_fragments=0)
    search = search.highlight("msc_class", number_of_fragments=0)
    search = search.highlight("doi", type="plain")
    search = search.highlight("report_num", type="plain")

    # Setting number_of_fragments to 0 tells ES to highlight the entire field.
    search = search.highlight("abstract", number_of_fragments=0)
    search = search.highlight("abstract.tex",
                              type="plain",
                              number_of_fragments=0)
    search = search.highlight("abstract.english", number_of_fragments=0)
    return search
Example #16
    def portalSearch(expression, start=0, end=25):
        client = Elasticsearch()
        ret = {'nodes': [], 'Counts': {}}
        q = Q("bool", must=[Q('match', _all=expression)])
        s = Search(using=client, index="neo4j-inquisite-node", doc_type="Repository,Data").query(q)
        q_total = s.count()
        s = s[0:q_total]
        s = s.highlight_options(require_field_match=False)
        s = s.highlight('*', fragment_size=45)
        res = s.execute()
        data = {}
        uuids = []
        pub_uuids = {}
        if res:
            for r in res:
                d = r.to_dict()
                if r.meta.doc_type == 'Repository':
                    if int(d['published']) == 0:
                        continue
                    repo_id = r.meta.id
                    ret['nodes'].append({"id": r.meta.id, "type": "Repository", "name": d['name'], "description": d['readme']})
                    repo_uuids = SearchManager._getDataUUIDsForRepo(repo_id)
                    pub_uuids[repo_id] = repo_uuids
                else:
                    hits = []
                    highs = r.meta.highlight.to_dict()
                    for high_field,high_value in highs.items():
                        hits.append({high_field: high_value})
                    data[r.meta.id] = {'id': r.meta.id, "hits": hits}
                    uuids.append(r.meta.id)
            qString = "MATCH (r:Repository)--(t:SchemaType)--(d:Data) WHERE d.uuid IN {uuids} AND r.published = '1' RETURN d.uuid as uuid, r.name as repo_name, r.uuid as repo_id"
            pub_data = db.run(qString, {"uuids": uuids})
            data_max = 0
            for checked in pub_data:
                if data_max >= 32:
                    break
                ret['nodes'].append({"id": checked['uuid'], "type": "Data", "repo_id": checked['repo_id'], "repo_name": checked['repo_name'], "hits": data[checked['uuid']]['hits']})
                data_max += 1

            return ret
        else:
            return ret
Example #17
    def search_by_keywords(self, keywords, subject):
        search = Search(using=self.es, index='arxiv-index')
        query_content = Q()
        keywords = re.sub('[^A-Za-z0-9 ]+', '', keywords).lower()
        for keyword in keywords.split(' '):
            query_content = query_content + (
                Q('wildcard', pdf='*' + keyword + '*') |
                Q('wildcard', abstract='*' + keyword + '*') |
                Q('wildcard', authors='*' + keyword + '*'))
        query_subject = Q()
        query_other = Q()
        if subject and subject != 'all':
            query_subject = Q('wildcard', subject='*' + subject + '.*')
            query_other = Q('wildcard', other_subjects='*' + subject + '.*')
        final_query = Q('bool',
                        must=[query_content],
                        should=[query_subject, query_other],
                        minimum_should_match=1)
        search = search.query(final_query)
        search = search.source([
            'title', 'authors', 'subject', 'other_subjects', 'abstract',
            'abstract_url', 'pdf_url', 'submit_date'
        ])
        search = search.highlight_options(order='score')
        search = search.highlight('abstract', fragment_size=400)

        total = search.count()
        search = search[0:total]
        search = self._extend_query(search, keywords)
        request = search.execute()

        for hit in request:
            response = hit.to_dict()
            if 'highlight' in hit.meta:
                response.update({'fragment': hit.meta.highlight.abstract})
            else:
                response.update({'fragment': []})
            yield response
Example #18
def esearch(username="", gender="", address="", email="", photo=""):
    client = Elasticsearch()
    q = Q("bool",
          should=[
              Q("match", username=username),
              Q("match", photo=photo),
              Q("match", address=address),
              Q("match", email=email),
              Q("match", gender=gender)
          ],
          minimum_should_match=1)
    s = Search(using=client, index="users").query(q)
    s = s.highlight_options(order='score',
                            require_field_match=False,
                            fields={
                            fields={
                                "*": {
                                    "pre_tags": ["<font color='red'>"],
                                    "post_tags": ["</font>"]
                                }
                            })
    # s = s.highlight('username', fragment_size=50)
    response = s.execute()
    search = get_results(response)
    return search
Example #19
    def get_queryset(self):
        if not self.index_manager.connected_to_es:
            messages.warning(self.request, _('Impossible de se connecter à Elasticsearch'))
            return []

        if self.search_query:

            # Searches forums the user is allowed to visit
            self.authorized_forums = get_authorized_forums(self.request.user)

            search_queryset = Search()

            # Restrict (sub)category if any
            if self.search_form.cleaned_data['category']:
                self.content_category = self.search_form.cleaned_data['category']
            if self.search_form.cleaned_data['subcategory']:
                self.content_subcategory = self.search_form.cleaned_data['subcategory']

            # Mark that contents must come from library if required
            self.from_library = False
            if self.search_form.cleaned_data['from_library'] == 'on':
                self.from_library = True

            # Setting the different querysets (according to the selected models, if any)
            part_querysets = []
            chosen_groups = self.search_form.cleaned_data['models']

            if chosen_groups:
                models = []
                for group in chosen_groups:
                    if group in settings.ZDS_APP['search']['search_groups']:
                        models.append(settings.ZDS_APP['search']['search_groups'][group][1])
            else:
                models = [v[1] for k, v in settings.ZDS_APP['search']['search_groups'].items()]

            models = reduce(operator.concat, models)

            for model in models:
                part_querysets.append(getattr(self, 'get_queryset_{}s'.format(model))())

            queryset = part_querysets[0]
            for query in part_querysets[1:]:
                queryset |= query

            # Weighting:
            weight_functions = []
            for _type, weights in list(settings.ZDS_APP['search']['boosts'].items()):
                if _type in models:
                    weight_functions.append({'filter': Match(_type=_type), 'weight': weights['global']})

            scored_queryset = FunctionScore(query=queryset, boost_mode='multiply', functions=weight_functions)
            search_queryset = search_queryset.query(scored_queryset)

            # Highlighting:
            search_queryset = search_queryset.highlight_options(
                fragment_size=150, number_of_fragments=5, pre_tags=['[hl]'], post_tags=['[/hl]'])
            search_queryset = search_queryset.highlight('text').highlight('text_html')

            # Executing:
            return self.index_manager.setup_search(search_queryset)

        return []
Example #20
    def get_queryset(self):
        if not self.index_manager.connected_to_es:
            messages.warning(self.request,
                             _("Impossible de se connecter à Elasticsearch"))
            return []

        if self.search_query:

            # Searches forums the user is allowed to visit
            self.authorized_forums = get_authorized_forums(self.request.user)

            search_queryset = Search()

            # Restrict (sub)category if any
            if self.search_form.cleaned_data["category"]:
                self.content_category = self.search_form.cleaned_data[
                    "category"]
            if self.search_form.cleaned_data["subcategory"]:
                self.content_subcategory = self.search_form.cleaned_data[
                    "subcategory"]

            # Mark that contents must come from library if required
            self.from_library = False
            if self.search_form.cleaned_data["from_library"] == "on":
                self.from_library = True

            # Setting the different querysets (according to the selected models, if any)
            part_querysets = []
            chosen_groups = self.search_form.cleaned_data["models"]

            if chosen_groups:
                models = []
                for group in chosen_groups:
                    if group in settings.ZDS_APP["search"]["search_groups"]:
                        models.append(settings.ZDS_APP["search"]
                                      ["search_groups"][group][1])
            else:
                models = [
                    v[1] for k, v in settings.ZDS_APP["search"]
                    ["search_groups"].items()
                ]

            models = reduce(operator.concat, models)

            for model in models:
                part_querysets.append(
                    getattr(self, f"get_queryset_{model}s")())

            queryset = part_querysets[0]
            for query in part_querysets[1:]:
                queryset |= query

            # Weighting:
            weight_functions = []
            for _type, weights in list(
                    settings.ZDS_APP["search"]["boosts"].items()):
                if _type in models:
                    weight_functions.append({
                        "filter": Match(_type=_type),
                        "weight": weights["global"]
                    })

            scored_queryset = FunctionScore(query=queryset,
                                            boost_mode="multiply",
                                            functions=weight_functions)
            search_queryset = search_queryset.query(scored_queryset)

            # Highlighting:
            search_queryset = search_queryset.highlight_options(
                fragment_size=150,
                number_of_fragments=5,
                pre_tags=["[hl]"],
                post_tags=["[/hl]"])
            search_queryset = search_queryset.highlight("text").highlight(
                "text_html")

            # Executing:
            return self.index_manager.setup_search(search_queryset)

        return []
Example #21
    def search_keyword(self, keyword, doc_filter=None, size=10):
        '''
        Create the search object and get the number of hits.
        '''

        s = Search(index='lucid').using(self.client)
        print(doc_filter)
        if 'divtype' in doc_filter:
            for i, types in enumerate(doc_filter['divtype']):
                if i == 0:
                    filt = Q("match", divtype=types)
                else:
                    filt = filt | Q("match", divtype=types)
            s = s.filter(filt)
        if 'docsource' in doc_filter:
            for i, types in enumerate(doc_filter['docsource']):
                if i == 0:
                    filt = Q("match", docsource=types)
                else:
                    filt = filt | Q("match", docsource=types)
            s = s.filter(filt)

        flag = 0
        if 'end' in doc_filter:
            flag = 1
            end_year = datetime.datetime(int(doc_filter['end']), 12, 31)
        else:
            end_year = datetime.datetime.now()

        if 'start' in doc_filter:
            flag = 0
            start_year = datetime.datetime(int(doc_filter['start']), 1, 1)
            s = s.filter('range',
                         publishdate={
                             'gte': start_year,
                             'lte': end_year
                         })

        if flag:
            s = s.filter('range', publishdate={'lte': end_year})
        # --------------------------------------query-------------------------------------------------------
        q1 = Q("multi_match",
               query=keyword,
               fields=["title", "keywords", "doc"],
               type="best_fields",
               cutoff_frequency=0.0007,
               operator="and",
               fuzziness="AUTO")
        q2 = Q("multi_match",
               query=keyword,
               fields=["title", "keywords", "doc"],
               type="phrase")
        q3 = Q("bool", must=[q1], should=[q2])
        s = s.query(q3)

        s = s.suggest("didYouMean", keyword, phrase={'field': 'did_you_mean'})

        s = s.highlight_options(order="score",
                                pre_tags=["<mark>"],
                                post_tags=["</mark>"],
                                fragment_size=80,
                                no_match_size=0)
        s = s.highlight('title', number_of_fragments=0)
        s = s.highlight('keywords', number_of_fragments=10)
        s = s.highlight('doc', number_of_fragments=10)
        # ---------------------------------------------------------------------------------------------------
        n_hits = s.count()
        print "hits = ", n_hits
        hits_start = 0
        return s, n_hits
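A hypothetical caller for the method above: since it returns an unexecuted Search plus the hit count, the caller slices out a page and executes it (searcher stands in for an instance of this class):

s, n_hits = searcher.search_keyword('habeas corpus', doc_filter={}, size=10)
page = s[0:10].execute()  # fetch the first page of hits
for hit in page:
    print(hit.meta.score, hit.meta.id)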
Example #22
def search(search_params,
           index,
           page_size,
           ip,
           request,
           filter_dead,
           page=1) -> Tuple[List[Hit], int, int, Optional[str]]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages, the result count, and a query suggestion (if any).
    """
    s = Search(index=index)
    # Apply term filters. Each tuple pairs a filter's parameter name in the API
    # with its corresponding field in Elasticsearch. "None" means that the
    # names are identical.
    filters = [('extension', None), ('categories', None),
               ('aspect_ratio', None), ('size', None), ('source', 'provider'),
               ('license', 'license__keyword'),
               ('license_type', 'license__keyword')]
    for tup in filters:
        api_field, elasticsearch_field = tup
        s = _apply_filter(s, search_params, api_field, elasticsearch_field)
    # Get suggestions for any route
    s = s.suggest('get_suggestion', '', term={'field': 'creator'})
    # Exclude mature content unless explicitly enabled by the requester
    if not search_params.data['mature']:
        s = s.exclude('term', mature=True)
    # Hide data sources from the catalog dynamically.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = models.ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    to_exclude = [f['provider_identifier'] for f in filtered_providers]
    s = s.exclude('terms', provider=to_exclude)

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query('simple_query_string', query=query, fields=search_fields)
        # Get suggestions for term query
        s = s.suggest('get_suggestion', query, term={'field': 'creator'})
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query('simple_query_string',
                        query=creator,
                        fields=['creator'])
            # Get suggestions for creator
            s = s.suggest('get_suggestion', creator, term={'field': 'creator'})
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query('simple_query_string', query=title, fields=['title'])
            # Get suggestions for title
            s = s.suggest('get_suggestion', title, term={'field': 'title'})
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query('simple_query_string',
                        fields=['tags.name'],
                        query=tags)
            # Get suggestions for tags
            s = s.suggest('get_suggestion', tags, term={'field': 'tags.name'})
    # Boost by popularity metrics
    if POPULARITY_BOOST:
        queries = []
        factors = ['comments', 'views', 'likes']
        boost_factor = 100 / len(factors)
        for factor in factors:
            rank_feature_query = Q('rank_feature',
                                   field=factor,
                                   boost=boost_factor)
            queries.append(rank_feature_query)
        s = Search().query(
            Q('bool', must=s.query, should=queries, minimum_should_match=1))

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip), request_timeout=7)
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    try:
        search_response = s.execute()
        log.info(f'query={s.to_dict()}, es_took_ms={search_response.took}')
    except RequestError as e:
        raise ValueError(e)
    results = _post_process_results(s, start, end, page_size, search_response,
                                    request, filter_dead)

    suggestion = _query_suggestions(search_response)

    result_count, page_count = _get_result_and_page_count(
        search_response, results, page_size)

    return results, page_count, result_count, suggestion
Example #23
    def get(self, q, from_hit=0, hits_returned=20, type='all', **kwargs):
        if type == 'all':
            types = ['statutes', 'regulations', 'advisory_opinions', 'murs']
        else:
            types = [type]

        parsed_query = parse_query_string(q)
        terms = parsed_query.get('terms')
        phrases = parsed_query.get('phrases')
        hits_returned = min([200, hits_returned])

        results = {}
        total_count = 0
        for type in types:
            must_query = [Q('term', _type=type)]
            text_highlight_query = Q()

            if len(terms):
                term_query = Q('match', _all=' '.join(terms))
                must_query.append(term_query)
                text_highlight_query = text_highlight_query & term_query

            if len(phrases):
                phrase_queries = [Q('match_phrase', _all=phrase) for phrase in phrases]
                must_query.extend(phrase_queries)
                text_highlight_query = text_highlight_query & Q('bool', must=phrase_queries)

            query = Search().using(es) \
                .query(Q('bool',
                         must=must_query,
                         should=[Q('match', no=q), Q('match_phrase', _all={"query": q, "slop": 50})])) \
                .highlight('description', 'name', 'no', 'summary', 'text') \
                .source(exclude='text') \
                .extra(size=hits_returned, from_=from_hit) \
                .index('docs')

            if type == 'advisory_opinions':
                query = query.query("match", category="Final Opinion")

            if text_highlight_query:
                query = query.highlight_options(highlight_query=text_highlight_query.to_dict())

            es_results = query.execute()

            formatted_hits = []
            for hit in es_results:
                formatted_hit = hit.to_dict()
                formatted_hit['highlights'] = []
                formatted_hits.append(formatted_hit)

                if 'highlight' in hit.meta:
                    for key in hit.meta.highlight:
                        formatted_hit['highlights'].extend(hit.meta.highlight[key])

            count = es_results.hits.total
            total_count += count

            results[type] = formatted_hits
            results['total_%s' % type] = count

        results['total_all'] = total_count
        return results
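A minimal sketch of the highlight_query decoupling used above: documents are ranked by the full query, while a narrower query drives the highlight fragments (index and field names are placeholders):

from elasticsearch_dsl import Search, Q

ranking = Q('bool',
            must=[Q('match', text='campaign finance')],
            should=[Q('match_phrase', text='campaign finance')])
s = Search(index='docs').query(ranking)
s = s.highlight('text')
s = s.highlight_options(highlight_query=Q('match_phrase', text='campaign finance').to_dict())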
Example #24
def results(page):
    global tmp_text
    global tmp_title
    global tmp_star
    global tmp_min
    global tmp_max
    global tmp_director
    global tmp_lan
    global tmp_country
    global tmp_loc
    global tmp_minyear
    global tmp_maxyear
    global tmp_cats
    global gresults

    # convert the <page> parameter in url to integer
    if type(page) is not int:
        page = int(page.encode('utf-8'))

    # if the method of request is post (for initial query), store the query in global variables;
    # if the method of request is get (for "next" results), read the query back from those globals
    if request.method == 'POST':
        # if has query, strip() all whitespace
        text_query = request.form['query'].strip()
        star_query = request.form['starring'].strip()

        mintime_query = request.form['mintime'].strip()
        if len(mintime_query) != 0:
            mintime_query = int(mintime_query)

        maxtime_query = request.form['maxtime'].strip()
        if len(maxtime_query) != 0:
            maxtime_query = int(maxtime_query)

        director_query = request.form['director'].strip()
        lan_query = request.form['language'].strip()
        country_query = request.form['country'].strip()
        loc_query = request.form['location'].strip()

        minyear_query = request.form['minplottime'].strip()
        if len(minyear_query) != 0:
            minyear_query = int(minyear_query)

        maxyear_query = request.form['maxplottime'].strip()
        if len(maxyear_query) != 0:
            maxyear_query = int(maxyear_query)

        cats_query = request.form['categories'].strip()

        # update global variable template data
        tmp_text = text_query
        tmp_star = star_query
        tmp_min = mintime_query
        tmp_max = maxtime_query
        tmp_director = director_query
        tmp_lan = lan_query
        tmp_country = country_query
        tmp_loc = loc_query
        tmp_minyear = minyear_query
        tmp_maxyear = maxyear_query
        tmp_cats = cats_query
    else:
        # use the current values stored in global variables.
        text_query = tmp_text
        star_query = tmp_star
        mintime_query = tmp_min
        maxtime_query = tmp_max
        director_query = tmp_director
        lan_query = tmp_lan
        country_query = tmp_country
        loc_query = tmp_loc
        minyear_query = tmp_minyear
        maxyear_query = tmp_maxyear
        cats_query = tmp_cats

    # store query values to display in search boxes in UI
    shows = {}
    shows['text'] = text_query
    shows['star'] = star_query
    shows['maxtime'] = maxtime_query
    shows['mintime'] = mintime_query
    shows['director'] = director_query
    shows['lan'] = lan_query
    shows['country'] = country_query
    shows['loc'] = loc_query
    shows['minyear'] = minyear_query
    shows['maxyear'] = maxyear_query
    shows['cats'] = cats_query
    # keep a copy of the original text query, since explicit phrases are culled out below
    full_text_query = text_query

    # Create a search object to query our index
    s = Search(index=index_name)

    # Build up your elasticsearch query in piecemeal fashion based on the user's parameters passed in.
    # The search API is "chainable".
    # Each call to the search.query method adds criteria to our growing elasticsearch query.
    # You will change this section based on how you want to process the query data input into your interface.

    # set flag to default to indicate all terms have been matched
    all_matched = True

    # compile a Regex pattern to extract explicit phrases enclosed by ""
    pattern = re.compile(r'(?:\B\")(.*?)(?:\b\")')
    phrases = pattern.findall(text_query)
    # get the rest free terms
    text_query = pattern.sub('', text_query).strip()

    # First do a conjunctive search over multiple fields (title and text) using the text_query and phrases passed in
    if len(text_query) + len(phrases) > 0:
        # save deep copies for disjunctive search later
        tmp_s = s.__copy__()
        tmp_phrases = phrases.copy()

        # conjunctive search for text_query AND phrases, with boosted field weight
        if len(text_query) > 0:
            s = s.query('multi_match',
                        query=text_query,
                        type='cross_fields',
                        fields=['title^2', 'text'],
                        operator='and')
        while len(phrases) > 0:
            s = s.query('multi_match',
                        query=phrases.pop(),
                        type='phrase_prefix',
                        fields=['title^2', 'text'])

        # if the conjunctive search has no results, do a disjunctive search ( text_query OR phrases )
        if s.count() == 0:
            # indicate not all terms are matched
            all_matched = False

            if len(text_query) > 0:
                q = Q('multi_match',
                      query=text_query,
                      type='cross_fields',
                      fields=['title^2', 'text'],
                      operator='or')
            else:
                q = Q('multi_match',
                      query=tmp_phrases.pop(),
                      type='phrase_prefix',
                      fields=['title^2', 'text'])

            while len(tmp_phrases) > 0:
                q |= Q('multi_match',
                       query=tmp_phrases.pop(),
                       type='phrase_prefix',
                       fields=['title^2', 'text'])

            s = tmp_s.query(q)

    # search for multiple fields using chained query (AND)
    if len(mintime_query) > 0:
        s = s.query('range', runtime={'gte': mintime_query})

    if len(maxtime_query) > 0:
        s = s.query('range', runtime={'lte': maxtime_query})

    if len(minyear_query) > 0:
        s = s.query('range', runtime={'gte': minyear_query})

    if len(maxyear_query) > 0:
        s = s.query('range', runtime={'lte': maxyear_query})

    if len(star_query) > 0:
        s = s.query('match', starring=star_query)

    if len(director_query) > 0:
        s = s.query('match', director=director_query)

    if len(lan_query) > 0:
        s = s.query('match', language=lan_query)

    if len(country_query) > 0:
        s = s.query('match', country=country_query)

    if len(loc_query) > 0:
        s = s.query('match', location=loc_query)

    if len(cats_query) > 0:
        s = s.query('match', categories=cats_query)

    # highlight
    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>')
    s = s.highlight('text', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('title', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('starring', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('director', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('language', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('country', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('location', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('categories',
                    fragment_size=999999999,
                    number_of_fragments=1)

    # determine the subset of results to display (based on current <page> value)
    start = 0 + (page - 1) * 10
    end = 10 + (page - 1) * 10

    # execute search and return results in specified range.
    response = s[start:end].execute()

    # insert data into response
    resultList = {}
    for hit in response.hits:
        result = {}
        result['score'] = hit.meta.score

        # for each field, prefer the highlighted fragment when one exists,
        # otherwise fall back to the stored field value
        fields = ['title', 'text', 'starring', 'director',
                  'language', 'country', 'location', 'categories']
        if 'highlight' in hit.meta:
            for field in fields:
                if field in hit.meta.highlight:
                    result[field] = hit.meta.highlight[field][0]
                else:
                    result[field] = getattr(hit, field)
        else:
            for field in fields:
                result[field] = getattr(hit, field)

        resultList[hit.meta.id] = result

    # make the result list available globally
    gresults = resultList

    # get the total number of matching results
    result_num = response.hits.total

    # if we find the results, extract title and text information from doc_data, else do nothing
    if result_num > 0:
        return render_template('page_SERP.html',
                               results=resultList,
                               res_num=result_num,
                               page_num=page,
                               queries=shows,
                               all_matched=all_matched)
    else:
        message = []
        if len(full_text_query) > 0:
            message.append('Unknown search term: ' + full_text_query)
        if len(star_query) > 0:
            message.append('Cannot find star: ' + star_query)
        if len(director_query) > 0:
            message.append('Cannot find director: ' + director_query)
        if len(lan_query) > 0:
            message.append('Cannot find language: ' + lan_query)
        if len(country_query) > 0:
            message.append('Cannot find country: ' + country_query)
        if len(loc_query) > 0:
            message.append('Cannot find location: ' + loc_query)
        if len(cats_query) > 0:
            message.append('Cannot find categories: ' + cats_query)

        return render_template('page_SERP.html',
                               results=message,
                               res_num=result_num,
                               page_num=page,
                               queries=shows)
Example #25
def search_tblescalation_symptoms(value):
    es = Elasticsearch()
    query = Search(using=es, index="tblescalation-index").query("match", symptoms=value)
    s = query.highlight_options(order='score')
    s = s.highlight('symptoms')  # without a field request, no fragments are returned
    response = s.execute()
    return response
Example #26
def search_elastic(term='',
                   user=None,
                   sort='id',
                   order='desc',
                   category='0_0',
                   quality_filter='0',
                   page=1,
                   rss=False,
                   admin=False,
                   logged_in_user=None,
                   per_page=75,
                   max_search_results=1000):
    # This function can easily be memcached now
    if page > 4294967295:
        flask.abort(404)

    es_client = Elasticsearch()

    es_sort_keys = {
        'id': 'id',
        'size': 'filesize',
        # 'name': 'display_name',  # This is slow and buggy
        'comments': 'comment_count',
        'seeders': 'seed_count',
        'leechers': 'leech_count',
        'downloads': 'download_count'
    }

    sort_ = sort.lower()
    if sort_ not in es_sort_keys:
        flask.abort(400)

    es_sort = es_sort_keys[sort_]

    order_keys = {'desc': 'desc', 'asc': 'asc'}

    order_ = order.lower()
    if order_ not in order_keys:
        flask.abort(400)

    # Only allow ID, desc if RSS
    if rss:
        es_sort = es_sort_keys['id']
        order = 'desc'

    # funky, es sort is default asc, prefixed by '-' if desc
    if 'desc' == order:
        es_sort = '-' + es_sort

    # Quality filter
    quality_keys = [
        '0',  # Show all
        '1',  # No remakes
        '2',  # Only trusted
        '3'  # Only completed
    ]

    if quality_filter.lower() not in quality_keys:
        flask.abort(400)

    quality_filter = int(quality_filter)

    # Category filter
    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(
                    main_cat_id, sub_cat_id)
                if not sub_category:
                    flask.abort(400)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)
                if not main_category:
                    flask.abort(400)

    # This might be useless since we validate users
    # before coming into this method, but just to be safe...
    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    same_user = False
    if logged_in_user:
        same_user = user == logged_in_user.id

    s = Search(using=es_client,
               index=app.config.get('ES_INDEX_NAME'))  # todo, sukebei prefix

    # Apply search term
    if term:
        # Do some preprocessing on the search terms for literal "" matching
        s = _parse_es_search_terms(s, term)

    # User view (/user/username)
    if user:
        s = s.filter('term', uploader_id=user)

        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in user is not the same as the user being viewed,
            # show only torrents that aren't hidden or anonymous.
            #
            # If logged in user is the same as the user being viewed,
            # show all torrents including hidden and anonymous ones.
            #
            # On RSS pages in user view, show only torrents that
            # aren't hidden or anonymous no matter what
            if not same_user or rss:
                s = s.filter('term', hidden=False)
                s = s.filter('term', anonymous=False)
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in, show all torrents that aren't hidden unless they belong to you
            # On RSS pages, show all public torrents and nothing more.
            if logged_in_user and not rss:
                hiddenFilter = Q('term', hidden=False)
                userFilter = Q('term', uploader_id=logged_in_user.id)
                combinedFilter = hiddenFilter | userFilter
                s = s.filter('bool', filter=[combinedFilter])
            else:
                s = s.filter('term', hidden=False)

    if main_category:
        s = s.filter('term', main_category_id=main_cat_id)
    elif sub_category:
        s = s.filter('term', main_category_id=main_cat_id)
        s = s.filter('term', sub_category_id=sub_cat_id)

    if quality_filter == 0:
        pass
    elif quality_filter == 1:
        s = s.filter('term', remake=False)
    elif quality_filter == 2:
        s = s.filter('term', trusted=True)
    elif quality_filter == 3:
        s = s.filter('term', complete=True)

    # Apply sort
    s = s.sort(es_sort)

    # Only show first RESULTS_PER_PAGE items for RSS
    if rss:
        s = s[0:per_page]
    else:
        max_page = min(page,
                       int(math.ceil(max_search_results / float(per_page))))
        from_idx = (max_page - 1) * per_page
        to_idx = min(max_search_results, max_page * per_page)
        s = s[from_idx:to_idx]

    highlight = app.config.get('ENABLE_ELASTIC_SEARCH_HIGHLIGHT')
    if highlight:
        s = s.highlight_options(tags_schema='styled')
        s = s.highlight("display_name")

    # Return query, uncomment print line to debug query
    # from pprint import pprint
    # print(json.dumps(s.to_dict()))
    return s.execute()
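As an aside, the hiddenFilter | userFilter combination above collapses into a single bool query with two should clauses; a quick sketch (the uploader id is made up):

from elasticsearch_dsl import Q

visibility = Q('term', hidden=False) | Q('term', uploader_id=42)
print(visibility.to_dict())
# -> {'bool': {'should': [{'term': {'hidden': False}}, {'term': {'uploader_id': 42}}]}}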
Example #27
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple of (list of Hits from Elasticsearch, total page count,
    total result count).
    """
    s = Search(index=index)
    # Add requested filters.
    if 'li' in search_params.data:
        s = _filter_licenses(s, search_params.data['li'])
    elif 'lt' in search_params.data:
        s = _filter_licenses(s, search_params.data['lt'])

    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)
    if 'extension' in search_params.data:
        extension = search_params.data['extension']
        extension_filter = Q('term', extension=extension)
        s = s.filter('bool', should=extension_filter, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(
            key=filter_cache_key,
            timeout=CACHE_TIMEOUT,
            value=filtered_providers
        )
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query(
            'query_string',
            query=query,
            fields=search_fields,
            type='most_fields'
        )
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query(
                'query_string', query=creator, default_field='creator'
            )
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query(
                'query_string', query=title, default_field='title'
            )
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query(
                'query_string',
                default_field='tags.name',
                query=tags
            )

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip))
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    search_response = s.execute()
    results = _post_process_results(
        s,
        start,
        end,
        page_size,
        search_response,
        request,
        filter_dead
    )

    result_count, page_count = _get_result_and_page_count(
        search_response,
        results,
        page_size
    )

    return results, page_count, result_count
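Note that Search methods return a modified copy rather than mutating in place, which is why every call in these examples is reassigned to s; a minimal sketch:

from elasticsearch_dsl import Search

s = Search()
s.extra(track_scores=True)      # no effect: the returned copy is discarded
s = s.extra(track_scores=True)  # correct: keep the returned copy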
Example #28
async def get_urls(*,
                   projectName: str = Path(...),
                   dateRange: Optional[List[str]] = Query(['', '']),
                   urlPart: Optional[str] = '',
                   UrlId: Optional[str] = '',
                   statusFilter: Optional[List[str]] = Query(['']),
                   categoryFilter: Optional[List[str]] = Query(['']),
                   highlight: Optional[List[str]] = Query(['']),
                   showReturn: Optional[List[str]] = Query(['']),
                   currentPage: Optional[int] = 1,
                   pageSize: Optional[int] = 10):
    # query the documents in the urls table
    # print(projectName,dateRange,urlPart, UrlId, currentPage,pageSize,statusFilter,categoryFilter)

    # resolve projectName to projectId
    projectId = await findProjectIdFromProjectName(
        dbPrefix,
        'Project',
        queryDict={'projectName': projectName},
        showDict={'_id': 1})
    #print(projectId)
    if not projectId:
        raise HTTPException(status_code=503, detail='projectNotExist')

    # pagination start/end
    start = 0
    end = 0
    # the ES index to search (equivalent to a database in MongoDB)
    _index = f'kwm-{projectId}.urls'.lower()
    #print('_index', _index)

    ## First, update the index mapping with {'fielddata': True} so the word and topicWord fields can be operated on as whole values
    #xindex = Index(_index, using=esconnection)
    ## add the "fielddata": True property to the topicWord and word fields
    #xindex.put_mapping(using=esconnection, body={"properties": {"rootUrl": {"type": "text", "fielddata": True}}})

    s = Search()

    # urlPart
    if urlPart:  # wildcard match query
        # split the url into pieces on its separators: '/'
        urlPart = urlPart.replace(':', '')
        urlParts = urlPart.split('/')
        all = []
        for ele in urlParts:
            all.extend(ele.split('.'))
        q = ''
        for urlPart in all:
            if urlPart:
                q += f'Q("wildcard", rootUrl=f"*{urlPart.strip()}*") &'
            #q = Q("wildcard", rootUrl=f"*{urlPart.strip()}*") & Q("wildcard", rootUrl=f"*{urlPart.strip()}*")
        q = q.rstrip('&')
        s = s.query(eval(q))

    # category
    if categoryFilter != ['']:
        categoryFilter = unquote(categoryFilter[0], 'utf-8').split(',')
        # print(categoryFilter)
        categoryFilter = '\"' + '\" \"'.join(categoryFilter) + '\"'
        #print('ccc',categoryFilter)
        q = Q("query_string", query=categoryFilter, fields=['category'])
        s = s.query(q)

    # statusFilter
    if statusFilter != ['']:
        statusFilter = unquote(statusFilter[0], 'utf-8').split(',')
        statusFilter = '\"' + '\" \"'.join(statusFilter) + '\"'
        #print('ccc',statusFilter)
        q = Q("query_string", query=f"{statusFilter}", fields=['status'])
        s = s.query(q)

    # dateRange
    # dateRange arrives in an awkward format, so it is checked twice as a workaround
    if dateRange != ['', '']:
        dateRange = unquote(dateRange[0], 'utf-8').split(',')
        #print('dateRange', dateRange)
        if dateRange != ['', '']:
            #s = s.query('range',**{'timestamp': {'gte': dateRange[0], 'lt': dateRange[1]}}) # this form also works; the Q expression below is kept for consistency
            r = Q(
                'range',
                **{'modifiedTime': {
                    'gte': dateRange[0],
                    'lt': dateRange[1]
                }})
            s = s.query(r)

    # default source filtering; refined below when showReturn is given
    s = s.source(includes=[])

    # which fields to return
    if showReturn != ['']:
        showReturn = unquote(showReturn[0], 'utf-8').split(',')
        s = s.source(includes=showReturn)
    else:
        s = s.source(includes=[])

    # which fields to highlight
    if highlight != ['']:
        #highlight = ['rootUrl']
        highlight = unquote(highlight[0], 'utf-8').split(',')
        #print(highlight)
        s = s.highlight_options(order='score')
        s = s.highlight_options(pre_tags="<strong>")
        s = s.highlight_options(post_tags="</strong>")
        for ele in highlight:  # add highlighting for each field, one by one
            s = s.highlight(ele)

    # pagination
    if currentPage == 0 and pageSize == 0:
        # return all data
        s = s[0:10000]  # hard-coded at 10000; exceeding it raises an error. scan() would be the proper fix, but scan() does not sort; revisit later
    else:
        start = (currentPage - 1) * pageSize
        end = start + pageSize
        s = s[start:end]

    # common setting
    #print(s.to_dict())

    # execute
    try:
        response = await esRun(s.to_dict(),
                               _index)  #s.execute(ignore_cache=True)
    except Exception as e:
        print(e)
        return ({'count': 0, 'content': []})
    else:
        totalCount = response.hits.total.value
        temp = response.to_dict()['hits']['hits']
        result = []
        for item in temp:
            tt = {'_id': {'$oid': item['_id']}}
            tt.update(item['_source'])
            if item.get('highlight'):
                tt.update({'highlight': item['highlight']})
            if start >= 0 and end > 0:
                tt.update({'id': start + 1})
            result.append(tt)
            start = start + 1
        #print('final',result)
        return ({'count': totalCount, 'content': result})
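Building the combined wildcard query as source text and eval()-ing it, as above, works but is fragile; the same query can be composed directly from Q objects. A sketch, with assumed sample fragments standing in for the cleaned url parts:

from functools import reduce
from operator import and_

from elasticsearch_dsl import Q, Search

parts = ['example', 'com', 'path']  # stands in for the cleaned url fragments
queries = [Q('wildcard', rootUrl=f'*{p.strip()}*') for p in parts if p]
s = Search().query(reduce(and_, queries))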
def results(page):
    global tmp_name
    global tmp_pinyin
    global tmp_zodiac
    global tmp_difficulty
    global tmp_sentiment
    global tmp_char_num
    global gresults

    # convert the <page> parameter in the url to an integer
    if type(page) is not int:
        page = int(page)
    # if the request method is POST (the initial query), store the query in module-level globals
    if request.method == 'POST':
        name_query = request.form['name']
        pinyin_query = request.form['pinyin']
        zodiac_query = request.form['zodiac']
        difficulty_query = request.form['difficulty']
        sentiment_query = request.form['sentiment']
        char_num_query = request.form['char_num']

        tmp_name = name_query
        tmp_pinyin = pinyin_query
        tmp_zodiac = zodiac_query
        tmp_difficulty = difficulty_query
        tmp_sentiment = sentiment_query
        tmp_char_num = char_num_query

    else:
        name_query = tmp_name
        pinyin_query = tmp_pinyin
        zodiac_query = tmp_zodiac
        difficulty_query = tmp_difficulty
        sentiment_query = tmp_sentiment
        char_num_query = tmp_char_num

    shows = {}
    shows['name'] = name_query
    shows['pinyin'] = pinyin_query
    shows['zodiac'] = zodiac_query
    shows['difficulty'] = difficulty_query
    shows['sentiment'] = sentiment_query
    shows['char_num'] = char_num_query


    s = Search(index='idioms_search')

    if len(name_query) > 0:
        s = s.query('multi_match', query=name_query, type='cross_fields', fields=['name^4', 'english^4', 'desc_segmentation', 'desc_translation',
                                                                                  'synonym^2', 'source_translation',
                                                                                  'source_segmentation^2', 'story_translation',
                                                                                  'story_segmentation', 'usage_translation',
                                                                                  'usage_segmentation'], operator='and')
    if len(pinyin_query) > 0:
        q = Q('match', pinyin={'query': pinyin_query, 'operator': 'and'})
        s = s.query(q)

    if len(zodiac_query) > 0:
        q = Q('match', zodiac=zodiac_query)
        s = s.query(q)

    if len(difficulty_query) > 0:
        q = Q('match', difficulty=difficulty_query)
        s = s.query(q)

    if len(sentiment_query) > 0:
        q = Q('match', sentiment=sentiment_query)
        s = s.query(q)

    if len(char_num_query) > 0:
        q = Q('match', char_num=char_num_query)
        s = s.query(q)


    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>')
    # highlight whole-field fragments; the story fields allow more fragments
    for field in ('name', 'pinyin', 'english', 'zodiac',
                  'desc_segmentation', 'desc_translation',
                  'source_segmentation', 'source_translation',
                  'usage_segmentation', 'usage_translation'):
        s = s.highlight(field, fragment_size=999999999, number_of_fragments=5)
    for field in ('story_segmentation', 'story_translation'):
        s = s.highlight(field, fragment_size=999999999, number_of_fragments=20)
    for field in ('difficulty', 'sentiment'):
        s = s.highlight(field, fragment_size=999999999, number_of_fragments=1)

    # determine the subset of results to display (based on current <page> value)
    start = 0 + (page - 1) * 10
    end = 10 + (page - 1) * 10

    response = s[start:end].execute()

    # if response.hits.total == 0:
    #     # if conjunction failed, make the query disjunctive for text field
    #     search = Search(index='idioms_search')

    resultList = {}
    translation_hits = []
    source_translation_hits = []
    story_translation_hits = []
    usage_translation_hits = []
    for hit in response.hits:
        result = dict()
        result['score'] = hit.meta.score

        if 'highlight' in hit.meta:
            if 'name' in hit.meta.highlight:
                result['name'] = hit.meta.highlight.name[0]
            else:
                result['name'] = hit.name
            if 'english' in hit.meta.highlight:
                result['english'] = hit.meta.highlight.english[0]
            else:
                result['english'] = hit.english
            if 'pinyin' in hit.meta.highlight:
                result['pinyin'] = hit.meta.highlight.pinyin[0]
            else:
                result['pinyin'] = hit.pinyin
            if 'zodiac' in hit.meta.highlight:
                result['zodiac'] = hit.meta.highlight.zodiac
            else:
                result['zodiac'] = hit.zodiac
            if 'difficulty' in hit.meta.highlight:
                result['difficulty'] = hit.meta.highlight.difficulty[0]
            else:
                result['difficulty'] = hit.difficulty
            if 'sentiment' in hit.meta.highlight:
                result['sentiment'] = hit.meta.highlight.sentiment[0]
            else:
                result['sentiment'] = hit.sentiment
            if 'desc_translation' in hit.meta.highlight:
                result['desc_translation'] = hit.meta.highlight.desc_translation[0]
                translation_hits = [re.sub(r'</?mark>', '', t) for t in hit.meta.highlight.desc_translation]
            if 'source_translation' in hit.meta.highlight:
                result['source_translation'] = hit.meta.highlight.source_translation[0]
                source_translation_hits = [re.sub(r'</?mark>', '', t) for t in hit.meta.highlight.source_translation]
            if 'story_translation' in hit.meta.highlight:
                result['story_translation'] = hit.meta.highlight.story_translation[0]
                story_translation_hits = [re.sub(r'</?mark>', '', t) for t in hit.meta.highlight.story_translation]
            if 'usage_translation' in hit.meta.highlight:
                result['usage_translation'] = hit.meta.highlight.usage_translation[0]
                usage_translation_hits = [re.sub(r'</?mark>', '', t) for t in hit.meta.highlight.usage_translation]
        else:
            result['name'] = hit.name
            result['pinyin'] = hit.pinyin
            result['english'] = hit.english
            # result['description'] = hit.description
            result['zodiac'] = hit.zodiac
            result['difficulty'] = hit.difficulty
            result['sentiment'] = hit.sentiment
        sgmt = json_data[hit.meta.id]['Description_Segmentation']
        sent_code = json_data[hit.meta.id]['Description_Sentence_Code']
        sgmt_dict = dict()
        src_sgmt = json_data[hit.meta.id]['Source_Segmentation']
        src_sent_code = json_data[hit.meta.id]['Source_Sentence_Code']
        src_sgmt_dict = dict()
        story_sgmt = json_data[hit.meta.id]['Story_Segmentation']
        story_sent_code = json_data[hit.meta.id]['Story_Sentence_Code']
        story_sgmt_dict = dict()
        usage_sgmt = json_data[hit.meta.id]['Usage_Segmentation']
        usage_sent_code = json_data[hit.meta.id]['Usage_Sentence_Code']
        usage_sgmt_dict = dict()
        translation_index_hits = find_translations(translation_hits, sent_code)
        translation_src_hits = find_translations(source_translation_hits, src_sent_code)
        translation_story_hits = find_translations(story_translation_hits, story_sent_code)
        translation_usage_hits = find_translations(usage_translation_hits, usage_sent_code)
        make_sgmt_dict(sgmt, sgmt_dict)
        make_sgmt_dict(src_sgmt, src_sgmt_dict)
        make_sgmt_dict(story_sgmt, story_sgmt_dict)
        make_sgmt_dict(usage_sgmt, usage_sgmt_dict)



        result['desc_segmentation'] = sgmt_dict
        result['desc_sentence_code'] = sent_code
        result['translation_hits'] = translation_index_hits
        result['source_segmentation'] = src_sgmt_dict
        result['source_sentence_code'] = src_sent_code
        result['source_translation_hits'] = translation_src_hits
        result['story_segmentation'] = story_sgmt_dict
        result['story_sentence_code'] = story_sent_code
        result['story_translation_hits'] = translation_story_hits
        result['usage_segmentation'] = usage_sgmt_dict
        result['usage_sentence_code'] = usage_sent_code
        result['usage_translation_hits'] = translation_usage_hits
        resultList[hit.meta.id] = result

    # make the result list available globally
    gresults = resultList

    # total number of matching results
    result_num = response.hits.total

    # if results were found, render them; otherwise report that a field could not be matched
    message = []
    if result_num > 0:
        if result_num > 500:
            message.append('Over 500 search results! We recommend you narrow your search.')
        return render_template('page_SERP.html', results=resultList, res_num=result_num, page_num=page, queries=shows,
                               zodiac=zodiac, sentiment=sentiment, difficulty=difficulty, char_num=char_num, warning=message,
                               json_data=json_data)
    else:
        warning = None
        message.append('One of the fields you typed in cannot be found.')
        return render_template('page_SERP.html', results=message, res_num=result_num, page_num=page, queries=shows,
                               warning=warning, zodiac=zodiac, sentiment=sentiment, difficulty=difficulty, char_num=char_num,
                               json_data=json_data)
Example #30
async def getBasicWords(*,
                        projectName: str = Path(...),
                        dateRange: Optional[List[str]] = Query(['', '']),
                        basicWordItemId: Optional[str] = None,
                        highlight: Optional[List[str]] = Query(['']),
                        showReturn: Optional[List[str]] = Query(['']),
                        statusFilter: Optional[List[str]] = Query(['']),
                        lengthFilter: Optional[List[str]] = Query(['']),
                        weightFilter: Optional[List[str]] = Query(['']),
                        categoryFilter: Optional[List[str]] = Query(['']),
                        wordPart: Optional[str] = None,
                        sortDict: Optional[str] = '{}',
                        fullMatch: Optional[bool] = False,
                        currentPage: Optional[int] = 1,
                        pageSize: Optional[int] = 10):
    # query all matching basic words under the given project, using ES
    # the ES index to search, analogous to a database in MongoDB
    # resolve projectName to projectId
    #print(projectName,currentPage,pageSize,dateRange,fullMatch,basicWordItemId,statusFilter,lengthFilter,weightFilter,categoryFilter,wordPart,sortDict)

    projectId = await findProjectIdFromProjectName(
        dbPrefix,
        'Project',
        queryDict={'projectName': projectName},
        showDict={'_id': 1})
    if not projectId:
        raise HTTPException(status_code=503, detail='projectNotExist')

    # pagination start/end
    start = 0
    end = 0
    # the ES index to search (equivalent to a database in MongoDB)
    _index = f'kwm-{projectId}.basicwords'.lower()
    #print('_index', _index)

    s = Search()

    #wordPart
    if wordPart:
        q = Q("multi_match", query=f"{wordPart.strip()}", fields=['word'])
        s = s.query(q)

    # category
    if categoryFilter != ['']:
        categoryFilter = unquote(categoryFilter[0], 'utf-8').split(',')
        # print(categoryFilter)
        categoryFilter = '\"' + '\" \"'.join(categoryFilter) + '\"'
        #print('ccc',categoryFilter)
        q = Q("query_string", query=categoryFilter, fields=['category'])
        s = s.query(q)

    # statusFilter
    if statusFilter != ['']:
        statusFilter = unquote(statusFilter[0], 'utf-8').split(',')
        statusFilter = '\"' + '\" \"'.join(statusFilter) + '\"'
        #print('ccc',statusFilter)
        q = Q("query_string", query=f"{statusFilter}", fields=['status'])
        s = s.query(q)

    # dateRange
    # dateRange arrives in an awkward format, so it is checked twice as a workaround
    if dateRange != ['', '']:
        dateRange = unquote(dateRange[0], 'utf-8').split(',')
        #print('dateRange', dateRange)
        if dateRange != ['', '']:
            #s = s.query('range',**{'timestamp': {'gte': dateRange[0], 'lt': dateRange[1]}}) # this form also works; the Q expression below is kept for consistency
            r = Q('range',
                  **{'timestamp': {
                      'gte': dateRange[0],
                      'lt': dateRange[1]
                  }})
            s = s.query(r)
    # length
    if lengthFilter != ['']:
        lengthFilter = unquote(lengthFilter[0], 'utf-8').split(',')

        # a lengthFilter query is present
        # length-bucket lookup table
        lengthDict = {
            '1': [0, 3],
            '2': [3, 5],
            '3': [5, 8],
            '4': [8, 13],
            '5': [13, 18],
            '6': [18, 25]
        }

        ss = ''
        for ele in lengthFilter:
            ss = ss + '|' + f'Q("range",**{{"Length": {{"gte": {lengthDict[ele][0]},"lt": {lengthDict[ele][1]}}}}})'
        #print(ss[1:])
        s = s.query(eval(ss[1:]))

    # weight filter
    if weightFilter != ['']:
        weightFilter = unquote(weightFilter[0], 'utf-8').split(',')
        # weight-bucket lookup table
        weightDict = {
            '1': [0, 0.3],
            '2': [0.3, 0.5],
            '3': [0.5, 1],
            '4': [1, 5],
            '5': [5, 10],
            '6': [10, 20],
            '7': [20, 50]
        }

        ss = ''
        for ele in weightFilter:
            ss = ss + '|' + f'Q("range",**{{"weight": {{"gte": {weightDict[ele][0]},"lt": {weightDict[ele][1]}}}}})'
        #print(ss[1:])
        s = s.query(eval(ss[1:]))

    # sort settings: build the sort expression if sorting was requested
    sortMap = {'desc': -1, 'asc': 1}
    #print('sortDict',sortDict)
    if sortDict != '{}':
        # the frontend sent sort info; check that it is valid
        # convert sortDict (a JSON string) into a dict
        sortDict = json.loads(sortDict)
        for ele in list(sortDict.keys()):
            if sortDict[ele] == 'normal':
                sortDict.pop(ele)
        #print('sortDict',sortDict)

        if sortDict != {}:
            # non-empty: convert to (field, direction) tuples
            sortDicttemp = [(ele, sortMap[sortDict[ele]]) for ele in sortDict]
            sortDict = sortDicttemp
        else:
            sortDict = []
        #print('sortDict',sortDict)
        # build the sort directives
        sorts = []
        for ss in sortDict:
            if ss[1] == 1:
                # asc
                sorts.append(ss[0])
            else:
                # desc
                sorts.append('-' + ss[0])
        #print('sorts', sorts)
        s = s.sort(*sorts)
    else:
        s = s.sort('_id')

    # which fields to return
    if showReturn != ['']:
        showReturn = unquote(showReturn[0], 'utf-8').split(',')
        s = s.source(includes=showReturn)
    else:
        s = s.source(includes=[])

    # which fields to highlight
    if highlight != ['']:
        highlight = unquote(highlight[0], 'utf-8').split(',')
        #print(highlight)
        s = s.highlight_options(order='score')
        s = s.highlight_options(pre_tags="<strong>")
        s = s.highlight_options(post_tags="</strong>")
        for ele in highlight:  # add highlighting for each field, one by one
            s = s.highlight(ele)

    # pagination
    if currentPage == 0 and pageSize == 0:
        # return all data
        s = s[0:10000]  # hard-coded at 10000; exceeding it raises an error. scan() would be the proper fix, but scan() does not sort; revisit later
    else:
        start = (currentPage - 1) * pageSize
        end = start + pageSize
        s = s[start:end]

    # common setting
    #print(s.to_dict())

    # execute
    try:
        response = await esRun(s.to_dict(),
                               _index)  #s.execute(ignore_cache=True)
    except Exception as e:
        print(e)
        return ({'count': 0, 'content': []})
    else:
        totalCount = response.hits.total.value
        temp = response.to_dict()['hits']['hits']
        result = []
        for item in temp:
            tt = {'_id': {'$oid': item['_id']}}
            tt.update(item['_source'])
            if item.get('highlight'):
                tt.update({'highlight': item['highlight']})
            if start >= 0 and end > 0:
                tt.update({'id': start + 1})
            result.append(tt)
            start = start + 1
        #print(result)
        return ({'count': totalCount, 'content': result})
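The eval()-built '|' chains for the length and weight buckets can likewise be composed directly from Q objects; a sketch using an abbreviated lengthDict:

from functools import reduce
from operator import or_

from elasticsearch_dsl import Q, Search

lengthDict = {'1': [0, 3], '2': [3, 5]}  # abbreviated from above
selected = ['1', '2']                    # stands in for lengthFilter
ranges = [Q('range', **{'Length': {'gte': lengthDict[k][0],
                                   'lt': lengthDict[k][1]}})
          for k in selected]
s = Search().query(reduce(or_, ranges))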
Example #31
    def search_by_fields(self, title, authors, abstract, content, subject):
        search = Search(using=self.es, index='arxiv-index')
        query_title = Q()
        query_authors = Q()
        query_subject = Q()
        query_other = Q()
        query_abstract = Q()
        query_content = Q()

        if title:
            title = re.sub('[^A-Za-z0-9 ]+', '', title).lower()
            for word in title.split(' '):
                query_title = query_title + \
                    Q('wildcard', title='*' + word + '*')

        if authors:
            authors = re.sub('[^A-Za-z0-9 ]+', '', authors).lower()
            for author in authors.split(' '):
                query_authors = query_authors + \
                    Q('wildcard', authors='*' + author + '*')

        if subject and subject != 'all':
            query_subject = Q('wildcard', subject='*' + subject + '.*')
            query_other = Q('wildcard', other_subjects='*' + subject + '.*')

        if abstract:
            abstract = re.sub('[^A-Za-z0-9 ]+', '', abstract).lower()
            for word in abstract.split(' '):
                query_abstract = query_abstract + \
                    Q('wildcard', abstract='*' + word + '*')

        if content:
            content = re.sub('[^A-Za-z0-9 ]+', '', content).lower()
            for word in content.split(' '):
                query_content = query_content + \
                    Q('wildcard', pdf='*' + word + '*')

        final_query = Q('bool',
                        must=[query_title, query_authors, query_subject],
                        should=[query_abstract, query_content, query_other],
                        minimum_should_match=2)

        # size the result window to the full index so every match is returned
        total = search.count()
        search = search[0:total]
        search = search.query(final_query)
        search = search.source([
            'title', 'authors', 'subject', 'other_subjects', 'abstract',
            'abstract_url', 'pdf_url', 'submit_date'
        ])

        if content:
            search = self._extend_query(search, content)
        if abstract:
            search = self._extend_query(search, abstract)

        search = search.highlight_options(order='score')
        search = search.highlight('abstract', fragment_size=400)
        request = search.execute()

        for hit in request:
            response = hit.to_dict()
            if 'highlight' in hit.meta:
                response.update({'fragment': hit.meta.highlight.abstract})
            else:
                response.update({'fragment': []})
            yield response
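The empty Q() placeholders above serialize to match_all; inside a bool must clause they do not restrict the result set (though inside should they always match, counting toward minimum_should_match). A quick check:

from elasticsearch_dsl import Q

print(Q().to_dict())
# -> {'match_all': {}}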
Example #32
def run_query(q,
              model,
              size,
              offset=0,
              facets={},
              fuzzy=False,
              connection="default",
              page=None):
    """Run an Elasticsearch query.

    Arguments:
        q (str): the string to search
        model (str): one of 'chem', 'puc', 'product', or 'datadocument'
        size (int): the number of objects to return
        offset (optional int): the value to start at [default=0]
        page (optional int): the Django paginator page to return [default=None]
        facets (optional dict): a key, value pair to filter on; the value can
            be a str or a list of strings [default={}],
            e.g. {'datadocument_grouptype': 'CO'} or
            {'datadocument_grouptype': ['CO', 'FU']}
        fuzzy (optional bool): enable fuzzy search [default=False]
        connection (optional str): which Elasticsearch instance to use [default="default"]

    Returns:
        {
        'hits': a list of results,
        'facets': a dictionary of facets,
        'took': time in seconds of search,
        'total': total results found
        }

    """
    # make sure the model is valid
    validate_model(model)
    # get index to search on based on ELASTICSEARCH setting and con
    index = settings.ELASTICSEARCH.get(connection, {}).get("INDEX", "_all")
    # get the search object
    s = Search(using=connection, index=index)
    # filter on the facets
    for term, filter_array in facets.items():
        s = s.filter("terms", **{term: filter_array})
    # pull relevant fields
    id_field = get_id_field(model)
    fields = FIELD_DICT[id_field]
    # filter null id
    s = s.filter("exists", field=id_field)
    # Enable highlighting
    s = s.highlight_options(order="score")
    s = s.highlight("*")
    # add the query with optional fuzziness
    if fuzzy:
        s = s.query(MultiMatch(query=q, fields=fields, fuzziness="AUTO"))
    else:
        s = s.query(MultiMatch(query=q, fields=fields))
    # collapse on id_field
    dict_update = {}
    inner_hits = []
    for f in list(FIELD_DICT.keys()) + ["rawchem_id"]:
        inner_hits.append({"name": f, "collapse": {"field": f}, "size": 0})
    dict_update.update(
        {"collapse": {
            "field": id_field,
            "inner_hits": inner_hits
        }})
    # set the size of the result
    if page is not None:
        dict_update.update({"size": 0, "from": 0})
    else:
        dict_update.update({"size": size, "from": offset})
    s.update_from_dict(dict_update)
    # aggregate facets
    for facet in FACETS:
        a = A("terms", field=facet)
        a.metric("unique_count", "cardinality", field=id_field)
        s.aggs.bucket(facet, a)
    # add cardinal aggregation on id_field to get unique total count
    s.aggs.bucket(TOTAL_COUNT_AGG, A("cardinality", field=id_field))
    # execute the search
    response = s.execute().to_dict()
    # gather the results
    # hits
    results_hits = []
    for h in response["hits"]["hits"]:
        results_hits_object = {
            "id":
            h["_source"][id_field],
            "num_rawchem":
            h["inner_hits"]["rawchem_id"]["hits"]["total"]["value"],
            "num_truechem":
            h["inner_hits"]["truechem_dtxsid"]["hits"]["total"]["value"],
            "num_datadocument":
            h["inner_hits"]["datadocument_id"]["hits"]["total"]["value"],
            "num_product":
            h["inner_hits"]["product_id"]["hits"]["total"]["value"],
            "num_puc":
            h["inner_hits"]["puc_id"]["hits"]["total"]["value"],
            "highlights":
            h["highlight"],
            "source":
            h["_source"],
        }
        results_hits.append(results_hits_object)
    # available facets
    results_facets = {}
    response_aggs = response["aggregations"]
    for facet in FACETS:
        results_facets_data = response_aggs[facet]
        results_facets_list = []
        for b in results_facets_data["buckets"]:
            results_facets_object = {
                "key": b["key"],
                "count": b["unique_count"]["value"],
            }
            results_facets_list.append(results_facets_object)
        results_facets[facet] = results_facets_list
    # get unique total count
    length = response_aggs[TOTAL_COUNT_AGG]["value"]
    # replace hits with paginator
    if page is not None:
        espaginator = ElasticPaginator(length,
                                       q,
                                       model,
                                       facets,
                                       fuzzy,
                                       connection="default")
        results_hits = Paginator(espaginator, size).get_page(page)
    return {
        "hits": results_hits,
        "facets": results_facets,
        "took": response["took"] / 1000,
        "total": length,
    }
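A hypothetical call, following the docstring (the query string and facet values are made up for illustration):

results = run_query(
    q='acetone',
    model='product',
    size=10,
    facets={'datadocument_grouptype': ['CO', 'FU']},
    fuzzy=True,
)
print(results['total'], 'results in', results['took'], 's')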
Example #33
def query(word: str,
          page: int,
          size: int,
          post_type: str,
          boards: list,
          sort: str,
          order: str,
          start: datetime = None,
          end: datetime = None,
          pos: bool = False,
          window_size: int = 10) -> dict:
    """Query word."""
    s = Search(using=client, index='ptt')
    must = [Q('match', content=word)]

    if isinstance(post_type, str):
        must.append(Q('match', post_type=int(post_type)), )

    s.query = Q(
        'bool',
        must=must,
        should=[Q('match', board=board) for board in boards],
        minimum_should_match=1,
    )

    # sort and order
    s = s.sort({sort: {'order': order}})

    # filter date range
    s = s.filter('range', published={'gte': start, 'lte': end})

    # highlight
    s = s.highlight_options(number_of_fragments=0)
    s = s.highlight('content')
    total = s.count()
    left_bound = page * size
    right_bound = left_bound + size
    data = []
    if total:
        for i in s[left_bound:right_bound]:
            d = i.to_dict()
            if pos:
                # tag each token as 'char|pos' ('p' avoids shadowing the 'pos' flag)
                segments = j.seg(d['content'], pos=True)
                for idx, (char, p) in enumerate(segments):
                    segments[idx] = f'{char}|{p}'
                    if char == word:
                        segments[idx] = f'<em>{segments[idx]}</em>'
                        left = idx - window_size
                        if left < 0:
                            left = 0
                        right = idx + window_size + 1
                        break
                d['concordance'] = (
                    ' '.join(segments[left:idx]),
                    segments[idx],
                    ' '.join(f'{char}|{p}'
                             for (char, p) in segments[idx + 1:right]),
                )

            else:
                concordance = i.meta.highlight.content[0].replace('\n ', '')
                concordance = concordance.split(' ')
                for idx, token in enumerate(concordance):
                    if token.startswith('<em>'):
                        left = idx - window_size
                        if left < 0:
                            left = 0
                        right = idx + window_size + 1
                        d['concordance'] = (
                            ' '.join(concordance[left:idx]),
                            concordance[idx],
                            ' '.join(concordance[idx + 1:right]),
                        )
                        break
            data.append(d)
    output = {
        'total': total,
        'page': page,
        'size': size,
        'data': data,
    }
    return output
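A hypothetical call (the word, board, and date range are made up for illustration):

from datetime import datetime

out = query('颱風', page=0, size=20, post_type='1',
            boards=['Gossiping'], sort='published', order='desc',
            start=datetime(2020, 1, 1), end=datetime(2020, 12, 31))
print(out['total'])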