Example #1
    def find_tweetable(self, limit=10, timeout=30):

        s = Search(using=self.es_client, index=self.resource_index)
        s = s.filter('term', **{'resource.keyword': 'gpc'})

        # Only tweet about sites where the last scan succeeded, a gpc.json was
        # found, and it indicates support for GPC.
        s = s.filter('term', **{'status.keyword': 'ok'})
        s = s.filter('term', **{'scan_data.found': True})
        s = s.filter('term', **{'scan_data.gpc.parsed.gpc': True})
        # Only tweet about base domains, not subdomains.
        s = s.filter('term', **{'is_base_domain': True})
        # Don't tweet about sites we've previously tweeted about (or may have).
        # We may have set `tweeting` and failed before we could set `tweeted`. In this case, it's
        # unclear if the tweet went out or not - needs to be checked manually.
        s = s.exclude('term', **{'gpcsup.tweeting': True})
        s = s.exclude('term', **{'gpcsup.tweeted': True})

        s = s.sort('update_dt')
        s = s[:limit]
        s = s.params(request_timeout=timeout)

        response = s.execute()

        return [r.domain for r in response]
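# A hypothetical caller sketch for the method above; everything outside
# find_tweetable (names, helpers) is assumed, not taken from the source:
#
#     for domain in bot.find_tweetable(limit=5):
#         mark_tweeting(domain)   # set gpcsup.tweeting before posting
#         post_tweet(domain)
#         mark_tweeted(domain)    # set gpcsup.tweeted once the post succeeds
#
# Setting `tweeting` before the post and `tweeted` after it is what lets an
# interrupted run be detected and checked manually, as the comments above note.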
def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
    """
    Queries the fatcat search index (the full regular fatcat.wiki release index)
    for the given search string (plus some filters), iterates over the result set
    (using scroll), and fetches the full release entity (via api.fatcat.wiki) for
    each hit.

    TODO: group by work_id
    """
    api_session = requests_retry_session()

    es_backend = os.environ.get(
        "ELASTICSEARCH_FATCAT_BASE", "https://search.fatcat.wiki"
    )
    es_index = os.environ.get("ELASTICSEARCH_FATCAT_RELEASE_INDEX", "fatcat_release")
    es_client = elasticsearch.Elasticsearch(es_backend)

    search = Search(using=es_client, index=es_index)

    search = search.exclude("terms", release_type=["stub", "component", "abstract"])

    # "Emerald Expert Briefings"
    search = search.exclude("terms", container_id=["fnllqvywjbec5eumrbavqipfym"])

    # ResearchGate
    search = search.exclude("terms", doi_prefix=["10.13140"])

    if fulltext_only:
        search = search.filter("terms", in_ia=True)

    search = search.query(
        Q("query_string", query=query, default_operator="AND", fields=["biblio"])
    )

    print(f"Expecting {search.count()} search hits", file=sys.stderr)

    search = search.params(clear_scroll=False)
    search = search.params(_source=False)

    results = search.scan()
    for hit in results:
        release_id = hit.meta.id
        resp = api_session.get(
            f"https://api.fatcat.wiki/v0/release/{release_id}",
            params={
                "expand": "container,files,filesets,webcaptures",
                "hide": "references",
            },
        )
        resp.raise_for_status()
        row = dict(
            fatcat_hit=hit.meta._d_,
            release_id=release_id,
            fatcat_release=resp.json(),
        )
        print(json.dumps(row, sort_keys=True), file=json_output)
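# A minimal invocation sketch for the function above; the query string and output
# target are assumed, and a reachable Elasticsearch backend is required:
#
#     import sys
#     run_query_fatcat('"dark matter" detection', fulltext_only=True, json_output=sys.stdout)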
Example #3
def remove_cropped_if_asset_exists(asset):
    try:
        search = Search(index=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
                              cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_asset_meta))
        search.query = Q('match', asset_id=asset.asset_id)
        for hit in search:
            idx = '{}-{}'.format(asset.asset_id, hit.cropped_id)
            s = Search(index=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
                             cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_cropped))
            s.query = Q('match', id=idx)
            s.delete()
        search.delete()
    except Exception:
        print(sys.exc_info()[0])
def related_images(uuid, index, request, filter_dead):
    """
    Given a UUID, find related search results.
    """
    # Convert UUID to sequential ID.
    item = Search(index=index)
    item = item.query('match', identifier=uuid)
    _id = item.execute().hits[0].id

    s = Search(index=index)
    s = s.query('more_like_this',
                fields=['tags.name', 'title', 'creator'],
                like={
                    '_index': index,
                    '_id': _id
                },
                min_term_freq=1,
                max_query_terms=50)
    # Never show mature content in recommendations.
    s = s.exclude('term', mature=True)
    page_size = 10
    page = 1
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    response = s.execute()
    results = _post_process_results(s, start, end, page_size, response,
                                    request, filter_dead)

    result_count, _ = _get_result_and_page_count(response, results, page_size)

    return results, result_count
Example #5
def exclude_filtered_providers(s: Search) -> Search:
    """
    Hide data sources from the catalog dynamically. This excludes providers with
    ``filter_content`` enabled from the search results.

    :param s: the search query to issue to Elasticsearch
    :return: the modified search query
    """
    logger = parent_logger.getChild("exclude_filtered_providers")
    filter_cache_key = "filtered_providers"
    filtered_providers = cache.get(key=filter_cache_key)
    if filtered_providers is None:
        filtered_providers = ContentProvider.objects.filter(
            filter_content=True).values("provider_identifier")
        logger.debug("adding filtered providers to cache")
        cache.set(
            key=filter_cache_key,
            timeout=FILTER_CACHE_TIMEOUT,
            value=filtered_providers,
        )

    logger.info(f'filtered_providers={",".join(p["provider_identifier"] for p in filtered_providers)}')
    if len(filtered_providers) != 0:
        to_exclude = [f["provider_identifier"] for f in filtered_providers]
        logger.info("auto-excluding filtered providers")
        s = s.exclude("terms", provider=to_exclude)
    return s
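# A minimal usage sketch for the helper above (the index name is assumed):
#
#     s = Search(index="image")
#     s = exclude_filtered_providers(s)  # drops providers flagged with filter_content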
Example #6
def get_warnings_by_package(package_name, package_warnings):
    '''
        Returns all the warnings for a specific package

        Arguments:
            package_name: the package in the database
            package_warnings: a dict keyed by warning_type we will populate in this function
        Returns:
            None, but populates the package_warnings dict
    '''
    client = Elasticsearch(host=HOST)
    s = Search(using=client)
    s = s.source(['package', 'type', 'severity', 'score'])
    #q = Q("match", type=warning)  & Q("match", severity=severity)
    s = s.query("match", package__keyword=package_name)
    s = s.exclude("match", tag="test_code")
    #print(s.to_dict())

    # process the query
    for hit in s.scan():
        #print(hit.type)
        #print(hit.severity)
        #print(hit.package)

        if hit.type not in package_warnings:
            package_warnings[hit.type] = {}
        if hit.severity in package_warnings[hit.type]:
            package_warnings[hit.type][hit.severity] += 1
        else:
            package_warnings[hit.type][hit.severity] = 1
def init_last_datetime(**kwargs):
    from airflow.models import Variable
    from elasticsearch_dsl import Search
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT

    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT)
    s = s.exclude('exists', field="is_english")
    Variable.set("lemmatize_number_of_documents_eng", s.count())
Example #8
    def get(self, request, *args, **kwargs):
        if "q" in request.GET:
            self.search_query = "".join(request.GET["q"])
        excluded_content_ids = request.GET.get("excluded", "").split(",")
        results = []
        if self.index_manager.connected_to_es and self.search_query:
            self.authorized_forums = get_authorized_forums(self.request.user)

            search_queryset = Search()
            if len(excluded_content_ids) > 0 and excluded_content_ids != [""]:
                search_queryset = search_queryset.exclude(
                    "terms", content_pk=excluded_content_ids)
            query = Match(_type="publishedcontent") & MultiMatch(
                query=self.search_query, fields=["title", "description"])

            functions_score = [
                {
                    "filter":
                    Match(content_type="TUTORIAL"),
                    "weight":
                    settings.ZDS_APP["search"]["boosts"]["publishedcontent"]
                    ["if_tutorial"],
                },
                {
                    "filter":
                    Match(content_type="ARTICLE"),
                    "weight":
                    settings.ZDS_APP["search"]["boosts"]["publishedcontent"]
                    ["if_article"],
                },
                {
                    "filter":
                    Match(content_type="OPINION"),
                    "weight":
                    settings.ZDS_APP["search"]["boosts"]["publishedcontent"]
                    ["if_opinion"],
                },
            ]

            scored_query = FunctionScore(query=query,
                                         boost_mode="multiply",
                                         functions=functions_score)
            search_queryset = search_queryset.query(scored_query)[:10]

            # Build the result
            for hit in search_queryset.execute():
                result = {
                    "id": hit.content_pk,
                    "pubdate": hit.publication_date,
                    "title": str(hit.title),
                    "description": str(hit.description),
                }
                results.append(result)

        data = {"results": results}

        return HttpResponse(json_handler.dumps(data),
                            content_type="application/json")
Example #9
    def _build_search(self, index, **kwargs):
        """
        Internal method that builds the query using the elasticsearch-dsl package.
        :param index: index for search
        :param kwargs: see getDocumentsCount and getDocuments
        :return:
        """
        startdate = kwargs.get('startdate', None)
        if startdate:
            timefield = kwargs.get('timefield')
            enddate = kwargs.get('enddate', 'now')
        filters = kwargs.get('filters', None)
        exclude = kwargs.get('exclude', None)
        ranges = kwargs.get('ranges', None)
        fields_to_include = kwargs.get('field_to_include', None)
        wildcards = kwargs.get('wildcard', None)
        start_from = kwargs.get('from_', None)
        size = kwargs.get('size', None)
        sort_ = kwargs.get('sort', None)

        search = Search(using=self.es, index=index, doc_type=self.doc_type)\
            .params(request_timeout=2000)

        if startdate:
            if startdate != enddate:
                timeRange = {timefield: {'gte': startdate, 'lt': enddate}}
            else:
                timeRange = {timefield: {'gte': startdate, 'lte': enddate}}
            search = search.filter('range', **timeRange)
        if filters:
            for key, val in filters.items():
                search = search.filter(
                    'terms' if isinstance(val, list) else 'term', **{key: val})
        if exclude:
            for ex in exclude.keys():
                search = search.exclude('terms', **{ex: exclude[ex]})
        if ranges:
            # ranges are expected in format:
            # [{field:{'gte':value, 'lte':value}}, {field: {'gte': value}}, {field: {'lte': value}}]
            for range_filter in ranges:
                search = search.filter('range', **range_filter)
        if fields_to_include:
            for field in fields_to_include.keys():
                search = search.source(**{field: fields_to_include[field]})
        if wildcards:
            for wild in wildcards:
                search = search.filter('wildcard', **{wild: wildcards[wild]})
        if start_from:
            search = search.extra(**{"from_": start_from})
        if size:
            search = search.extra(**{"size": size})
        if sort_:
            search = search.sort(*sort_)

        self._logger.info(json.dumps(search.to_dict()))

        return search
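# A hypothetical call sketch illustrating the kwargs shapes _build_search expects;
# the instance name, index, field names, and values below are all assumed:
#
#     search = connector._build_search(
#         'logs-*',
#         startdate='now-1d', enddate='now', timefield='@timestamp',
#         filters={'status': ['ok', 'warn']},   # term/terms filter per field
#         exclude={'tag': ['test_code']},       # excluded via a terms clause
#         ranges=[{'score': {'gte': 10}}],
#         wildcard={'host': 'web-*'},
#         sort=['-@timestamp'], from_=0, size=100,
#     )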
def make_elasticsearch(index,
                       filters,
                       queries=None,
                       exclusion_filters=None,
                       range_filters=None,
                       prefix_filters=None,
                       terms_filters=None,
                       es_url='http://elasticsearch.lco.gtn:9200'):
    """
    Make an ElasticSearch query

    Parameters
    ----------
    index : str
            Name of index to search
    filters : list of dicts
              Each dict has a criterion for an ElasticSearch "filter"
    queries : list of dicts
              Each dict has a "type" and "query" entry. The 'query' entry is a dict that has a criterion for an
              ElasticSearch "query"
    exclusion_filters : list of dicts
                        Each dict has a criterion for an ElasticSearch "exclude"
    range_filters : list of dicts
                    Each dict has a criterion for an ElasticSearch "range" filter
    prefix_filters : list of dicts
                     Each dict has a criterion for an ElasticSearch "prefix" filter
    terms_filters : list of dicts
                    Each dict has a criterion for an ElasticSearch "terms" filter
    es_url : str
             URL of the ElasticSearch host

    Returns
    -------
    search : elasticsearch_dsl.Search
             The ElasticSearch object
    """
    if queries is None:
        queries = []
    if exclusion_filters is None:
        exclusion_filters = []
    if range_filters is None:
        range_filters = []
    if terms_filters is None:
        terms_filters = []
    if prefix_filters is None:
        prefix_filters = []
    es = Elasticsearch(es_url)
    s = Search(using=es, index=index)
    for f in filters:
        s = s.filter('term', **f)
    for f in terms_filters:
        s = s.filter('terms', **f)
    for f in range_filters:
        s = s.filter('range', **f)
    for f in prefix_filters:
        s = s.filter('prefix', **f)
    for f in exclusion_filters:
        s = s.exclude('term', **f)
    for q in queries:
        s = s.query(q['type'], **q['query'])
    return s
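# A minimal usage sketch; the index and field names below are assumed. The call
# only builds the query, so nothing is executed against Elasticsearch yet:
s = make_elasticsearch(
    index='observations',
    filters=[{'site': 'lsc'}],
    terms_filters=[{'obstype': ['EXPOSE', 'STANDARD']}],
    range_filters=[{'dateobs': {'gte': '2021-01-01'}}],
    exclusion_filters=[{'public': False}],
)
print(s.to_dict())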
Example #11
    def get(self, request, *args, **kwargs):
        if 'q' in request.GET:
            self.search_query = ''.join(request.GET['q'])
        excluded_content_ids = request.GET.get('excluded', '').split(',')
        results = []
        if self.index_manager.connected_to_es and self.search_query:
            self.authorized_forums = get_authorized_forums(self.request.user)

            search_queryset = Search()
            if len(excluded_content_ids) > 0 and excluded_content_ids != ['']:
                search_queryset = search_queryset.exclude(
                    'terms', content_pk=excluded_content_ids)
            query = Match(_type='publishedcontent') & MultiMatch(
                query=self.search_query, fields=['title', 'description'])

            functions_score = [{
                'filter':
                Match(content_type='TUTORIAL'),
                'weight':
                settings.ZDS_APP['search']['boosts']['publishedcontent']
                ['if_tutorial']
            }, {
                'filter':
                Match(content_type='ARTICLE'),
                'weight':
                settings.ZDS_APP['search']['boosts']['publishedcontent']
                ['if_article']
            }, {
                'filter':
                Match(content_type='OPINION'),
                'weight':
                settings.ZDS_APP['search']['boosts']['publishedcontent']
                ['if_opinion']
            }]

            scored_query = FunctionScore(query=query,
                                         boost_mode='multiply',
                                         functions=functions_score)
            search_queryset = search_queryset.query(scored_query)[:10]

            # Build the result
            for hit in search_queryset.execute():
                result = {
                    'id': hit.content_pk,
                    'pubdate': hit.publication_date,
                    'title': str(hit.title),
                    'description': str(hit.description)
                }
                results.append(result)

        data = {'results': results}

        return HttpResponse(json_handler.dumps(data),
                            content_type='application/json')
Example #12
    def test_exclude_org(self):
        """Test add organization name exclusion filter.
        """

        s = Search()
        s.exclude = MagicMock(return_value='test')

        result = esc.exclude_org(s, esc.UNKNOWN_ORG_NAME)

        s.exclude.assert_called_with('term',
                                     author_org_name=esc.UNKNOWN_ORG_NAME)
        self.assertEqual(result, 'test')
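# The helper under test is not shown in this example; a minimal sketch consistent
# with the assertion above (an assumption, not the actual project code):
#
#     def exclude_org(s, org_name):
#         """Exclude documents whose author_org_name matches org_name."""
#         return s.exclude('term', author_org_name=org_name)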
Example #13
def test_filters():
    s = Search()
    s = s.filter('terms', tags=['search', 'python'])
    print(s.to_dict())
    # {'query': {'bool': {'filter': [{'terms': {'tags': ['search', 'python']}}]}}}

    s = s.query('bool', filter=[Q('terms', tags=['search', 'python'])])
    print(s.to_dict())
    # {'query': {'bool': {'filter': [{'terms': {'tags': ['search', 'python']}}]}}}

    s = s.exclude('terms', tags=['search', 'python'])
    # or, equivalently:
    # s = s.query('bool', filter=[~Q('terms', tags=['search', 'python'])])
    print(s.to_dict())
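# For reference (behaviour assumed from current elasticsearch_dsl versions):
# exclude() wraps its clause in a must_not inside the filter context, so on a
# fresh Search object:
#
#     Search().exclude('terms', tags=['search', 'python']).to_dict()
#     # {'query': {'bool': {'filter': [
#     #     {'bool': {'must_not': [{'terms': {'tags': ['search', 'python']}}]}}]}}}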
def _exclude_filtered(s: Search):
    """
    Hide data sources from the catalog dynamically.
    """
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = models.ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=FILTER_CACHE_TIMEOUT,
                  value=filtered_providers)
    to_exclude = [f['provider_identifier'] for f in filtered_providers]
    s = s.exclude('terms', provider=to_exclude)
    return s
Example #15
def get_all_warnings_x(warning_type, all_warnings):
    """
    get all warnings for all packages, specified by a specific warning type

    Arguments
    ---------
    warning_type : the name of the warning type
    all_warnings : a dict of all the warnings of that type and their metadata, keyed by package name,
        which is empty to start

    Returns
    -------
    None, but populates all_warnings

    """
    client = Elasticsearch(host=HOST)
    #s = Search(using=client, index='production-logs-2021.04.14').params(request_timeout=60)
    s = Search(using=client)
    s = s.source(['package', 'type', 'severity', 'score', 'line', 'line_no'])
    s = s.query("match", type=warning_type)
    s = s.exclude("match", tag="test_code")
    #s = s.query("multi_match", type=warning_type, fields=['package', 'type', 'severity', 'score', 'line','line_no'])
    #print(s.to_dict())
    #response = s.execute()
    #print(response)
    #for i in response:
    #print(i)

    # process the query
    for hit in s.scan():
        if hit.package not in all_warnings.keys():
            all_warnings[hit.package] = []
        if not hasattr(hit, "severity"):
            hit.severity = None
        if not hasattr(hit, "score"):
            hit.score = None
        if not hasattr(hit, "line"):
            hit.line = None
        if not hasattr(hit, "line_no"):
            hit.line_no = None
        all_warnings[hit.package].append({
            'warning_type': warning_type,
            'severity': hit.severity,
            'score': hit.score,
            'line': hit.line,
            'line_no': hit.line_no
        })
Example #16
def searchAllByGeonameid(geonameid):
    es = ELASTICSEARCHAUTHENTIFICATION
    geonameSearch = Search(using=es, index=GEONAMESINDEX)
    geonameSearch = geonameSearch[0:5]
    geonameSearch = geonameSearch.query("match", geonameid=geonameid)
    geonameSearch = geonameSearch.query("exists", field="admin2Code")
    geonameSearch = geonameSearch.exclude("match", featureClass="A")
    geonameSearch = geonameSearch.exclude("match", featureClass="P")

    response = geonameSearch.execute()
    responseDict = response.to_dict()
    found = None

    nbRes = responseDict['hits']['total']['value']
    if nbRes > 0:
        found = responseDict['hits']['hits'][0]['_source']
    return found
Example #17
def get_all_warnings_counts_x(warning_type, all_warnings, all_unique_warnings,
                              all_severities, all_raw_scores):
    """
    populates the incoming dictionaries by the warning type specified

    Arguments
    ---------
    warning_type : the AuraScan warning type to search the database for
    all_warnings : a dict that stores the total number of warnings, keyed by package
    all_unique_warnings : a dict that stores the total number of unique warnings, keyed by package
    all_severities : a dict that stores all the severities for each package
    all_raw_scores : a list of all the scores of all packages, used to calculate percentile scores

    Returns
    ---------
    None, but populates all_warnings, all_unique_warnings, and all_severities
    """
    client = Elasticsearch(host=HOST)
    s = Search(using=client).params(request_timeout=30)
    s = s.source(['package', 'type', 'severity', 'score'])
    s = s.query("match", type=warning_type)
    s = s.exclude("match", tag="test_code")
    #print(s.to_dict())

    # process the query
    for hit in s.scan():
        #print(hit)
        #print(hit.score)
        if not hasattr(hit, "severity"):
            hit.severity = None
        if not hasattr(hit, "score"):
            hit.score = None

        if hit.package not in all_warnings.keys():
            all_warnings[hit.package] = 0
        all_warnings[hit.package] += 1

        if hit.package not in all_unique_warnings.keys():
            all_unique_warnings[hit.package] = {}
        if warning_type not in all_unique_warnings[hit.package].keys():
            all_unique_warnings[hit.package][warning_type] = 1

        if hit.package not in all_severities.keys():
            all_severities[hit.package] = 0
        all_severities[hit.package] += get_score_percentiles(
            all_raw_scores, int(hit.score))
Example #18
def get_LOC_by_warning(package_name):
    """
    get all the LOC for a specific package

    Arguments
    ---------
    package_name : the package you want warnings for (do not specify a version)

    Returns
    ---------
    warnings : a list of lines of code (and their metadata)
    """
    client = Elasticsearch(host=HOST)
    s = Search(using=client)
    s = s.source([
        'package', 'type', 'severity', 'score', 'line', 'line_no', 'location'
    ])
    #q = Q("match", type=warning)  & Q("match", severity=severity)
    s = s.query("match", package__keyword=package_name)
    s = s.exclude("match", tag="test_code")
    #print(s.to_dict())

    # process the query
    results = []
    for hit in s.scan():
        if not hasattr(hit, "line"):
            hit.line = None
        if not hasattr(hit, "line_no"):
            hit.line_no = None
        if not hasattr(hit, "location"):
            hit.location = None
        #print(hit.to_dict())
        results.append(
            [hit.line, hit.line_no, hit.location, hit.type, hit.severity])
    #print(results)
    return results
Example #19
def example10():
    """
    DSL objects for common entities instead of dict/json.
    All importable from elasticsearch_dsl
    """
    from elasticsearch_dsl import Q, Search
    """
    Straightforward mapping to json - kwargs are translated into keys into json.
    You can use the to_dict() method to see the result json.
    """

    q = Q("terms", tags=["python", "search"])
    q.to_dict()
    """
    All objects can also be constructed using the raw dict.
    """

    q = Q({"terms": {"tags": ["python", "search"]}})
    q.to_dict()
    """
    Query objects support logical operators which result in bool queries
    """
    q = q | Q("match", title="python")
    q.to_dict()
    """
    DSL objects also allow for attribute access instead of ['key']
    """
    q.minimum_should_match = 2
    q.minimum_should_match
    q.to_dict()

    from datetime import date

    q = q & Q("range", **{"@timestamp": {"lt": date(2019, 1, 1)}})
    q.to_dict()
    """
    Configuration is global so no client needs to be passed around.
    """
    from elasticsearch_dsl import connections
    """
    Default connection used where no other connection specified. Any configuration
    methods just pass all parameters to the underlying elasticsearch-py client.
    """
    connections.create_connection(hosts=["localhost"])
    """
    Optionally specify an alias for the connection in case of multiple connections.
    """
    connections.create_connection("prod", hosts=["localhost"])
    s = Search(using="prod")
    s.count()
    """
    You can always just pass in your own client instance
    """
    from elasticsearch import Elasticsearch

    s = Search(using=Elasticsearch())
    s.count()
    """
    Any method on Search returns a clone so you need to always assign it back to
    the same variable.
    """
    s = Search()
    s = s.params(q="fix")
    """
    Multiple queries are combined together using the AND operator
    """
    s = Search()
    s = s.query("match", description="fix")
    s = s.query("match", author="Honza")
    """
    Filter shortcut to use {bool: {filter: []}}
    """
    s = s.filter("range", committed_date={"lt": date(2016, 1, 1)})
    s.to_dict()
    """
    Exclude as a wrapper around must_not, use __ instead of dots for convenience.
    """
    s = s.exclude("term", committer__name__keyword="Honza Král")
    """
    Search is executed when iterated on or when .execute() is called.
    """
    for hit in s:
        """
        Hit class offers direct access to fields and via .meta any other properties
        on the returned hit (_id, _seq_no, ...)
        """
        print(f"{hit.meta.id[:6]} ({hit.author.name}): {hit.description[:50]}")
    """
    Aggregations are implemented in place to allow for chaining
    """
    s = Search(index="git")
    s.aggs.bucket("tags", "terms", field="terms").metric(
        "lines", "sum",
        field="stats.lines").metric("authors",
                                    "cardinality",
                                    field="author.name.keyword")
    r = s.execute()
    """
    Or modify aggregation in place
    """
    s.aggs["tags"].bucket("months",
                          "date_histogram",
                          field="committed_date",
                          interval="month")
    """
    Analysis
    """

    from elasticsearch_dsl import analyzer, token_filter

    a = analyzer(
        "file_analyzer",
        tokenizer="path_hierarchy",
        filter=[
            "lowercase",
            token_filter(
                "split_ext",
                "pattern_capture",
                preserve_original=True,
                patterns=[r"^([^\.]+)"],
            ),
        ],
    )

    a.simulate("test/integration/search.py")
    """
    """

    from elasticsearch_dsl import Document, Text, Keyword, InnerDoc, Date, Nested

    class FileDiff(InnerDoc):
        filename = Text(analyzer=a)
        patch = Text()

    class Commit(Document):
        description = Text()
        committed_date = Date()
        author = Text(fields={"keyword": Keyword()})

        files = Nested(FileDiff)

        def subject(self):
            return self.description.split("\n", 1)[0][:80]

        class Index:
            name = "git*"
            settings = {"number_of_replicas": 0}

    """
    Create the index
    """

    Commit.init(index="git-v2")
    """
    Search now returns Commit objects
    """
    for c in Commit.search():
        print(f"{c.meta.id}: {c.subject()}")
def search(search_params,
           index,
           page_size,
           ip,
           request,
           filter_dead,
           page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with the list of Hits from Elasticsearch, the page count, the
    result count, and a query suggestion.
    """
    s = Search(index=index)
    # Apply term filters. Each tuple pairs a filter's parameter name in the API
    # with its corresponding field in Elasticsearch. "None" means that the
    # names are identical.
    filters = [('extension', None), ('categories', None),
               ('aspect_ratio', None), ('size', None), ('source', 'provider'),
               ('license', 'license__keyword'),
               ('license_type', 'license__keyword')]
    for tup in filters:
        api_field, elasticsearch_field = tup
        s = _apply_filter(s, search_params, api_field, elasticsearch_field)
    # Get suggestions for any route
    s = s.suggest('get_suggestion', '', term={'field': 'creator'})
    # Exclude mature content unless explicitly enabled by the requester
    if not search_params.data['mature']:
        s = s.exclude('term', mature=True)
    # Hide data sources from the catalog dynamically.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = models.ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    to_exclude = [f['provider_identifier'] for f in filtered_providers]
    s = s.exclude('terms', provider=to_exclude)

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query('simple_query_string', query=query, fields=search_fields)
        # Get suggestions for term query
        s = s.suggest('get_suggestion', query, term={'field': 'creator'})
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query('simple_query_string',
                        query=creator,
                        fields=['creator'])
            # Get suggestions for creator
            s = s.suggest('get_suggestion', creator, term={'field': 'creator'})
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query('simple_query_string', query=title, fields=['title'])
            # Get suggestions for title
            s = s.suggest('get_suggestion', title, term={'field': 'title'})
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query('simple_query_string',
                        fields=['tags.name'],
                        query=tags)
            # Get suggestions for tags
            s = s.suggest('get_suggestion', tags, term={'field': 'tags.name'})
    # Boost by popularity metrics
    if POPULARITY_BOOST:
        queries = []
        factors = ['comments', 'views', 'likes']
        boost_factor = 100 / len(factors)
        for factor in factors:
            rank_feature_query = Q('rank_feature',
                                   field=factor,
                                   boost=boost_factor)
            queries.append(rank_feature_query)
        s = Search().query(
            Q('bool', must=s.query, should=queries, minimum_should_match=1))

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip), request_timeout=7)
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    try:
        search_response = s.execute()
        log.info(f'query={s.to_dict()}, es_took_ms={search_response.took}')
    except RequestError as e:
        raise ValueError(e)
    results = _post_process_results(s, start, end, page_size, search_response,
                                    request, filter_dead)

    suggestion = _query_suggestions(search_response)

    result_count, page_count = _get_result_and_page_count(
        search_response, results, page_size)

    return results, page_count, result_count, suggestion
Example #21
def query_fatcat(json_output):
    """
    Queries fatcat search index (the full regular fatcat.wiki release index)
    for COVID-19 keywords and phrases, iterates over the result set (using
    scroll), and fetches the full release entity (via api.fatcat.wiki) for each.
    """
    api_session = requests_retry_session()

    es_backend = os.environ.get(
        "ELASTICSEARCH_BACKEND",
        default="https://search.fatcat.wiki",
    )
    es_index = "fatcat_release"
    es_client = elasticsearch.Elasticsearch(es_backend)

    search = Search(using=es_client, index=es_index)

    search = search.exclude("terms",
                            release_type=["stub", "component", "abstract"])

    # "Emerald Expert Briefings"
    search = search.exclude("terms",
                            container_id=["fnllqvywjbec5eumrbavqipfym"])

    # ResearchGate
    search = search.exclude("terms", doi_prefix=["10.13140"])

    # some industrial thing
    search = search.exclude("query_string",
                            query='"Report on SARS backfit evaluation"',
                            fields=["title"])

    # physics experiment
    search = search.exclude("query_string",
                            query='"TOF-SARS"',
                            fields=["title"])

    # species not related to SARS
    # something based on excluding "lake" in title might be easier?
    search = search.exclude("query_string",
                            query='"G.O. Sars"',
                            fields=["title"])
    search = search.exclude("query_string",
                            query='"Gomphocythere Sars"',
                            fields=["title"])
    search = search.exclude("query_string",
                            query='"Australis Sars"',
                            fields=["title"])
    search = search.exclude("query_string",
                            query='"scutifer Sars"',
                            fields=["title"])
    search = search.exclude("query_string",
                            query='"lumholtzi Sars"',
                            fields=["title"])

    search = search.query(
        Q("query_string",
          query=
          '"COVID-19" coronavirus coronaviruses "sars-cov-2" "2019-nCoV" "SARS-CoV" "MERS-CoV" SARS',
          default_operator="OR",
          fields=["title", "original_title"]) | Q("query_string",
                                                  query='pandemic influenza',
                                                  default_operator="AND",
                                                  fields=["biblio"])
        | Q("query_string",
            query='epidemic influenza',
            default_operator="AND",
            fields=["biblio"]) | Q("query_string",
                                   query='pandemic ventilator',
                                   default_operator="AND",
                                   fields=["biblio"]))

    print("Expecting {} search hits".format(search.count()), file=sys.stderr)

    search = search.params(clear_scroll=False)
    search = search.params(_source=False)

    results = search.scan()
    for hit in results:
        release_id = hit.meta.id
        resp = api_session.get(
            'https://api.fatcat.wiki/v0/release/{}'.format(release_id),
            params={
                'expand': 'container,files,filesets,webcaptures',
                'hide': 'references',
            })
        resp.raise_for_status()
        row = dict(
            fatcat_hit=hit.meta._d_,
            release_id=release_id,
            fatcat_release=resp.json(),
        )
        print(json.dumps(row, sort_keys=True), file=json_output)
Example #22
    def emit_compute_dict(self, uuid, compute_map, index, identifier, alias):
        """
        Returns the normalized data from the ES query
        """
        output_dict = {}
        if "aggregations" not in compute_map:
            logger.critical(
                f"Incorrect JSON data: a nested 'aggregations' dictionary is required in {compute_map}"
            )
            return output_dict
        buckets = compute_map.get("buckets", [])
        aggregations = compute_map["aggregations"]
        filters = compute_map.get("filter", {})

        logger.debug("Initializing search object")
        kw_identifier = identifier + ".keyword"  # append .keyword
        s = Search(using=self._conn_object,
                   index=str(index)).query("match", **{kw_identifier: uuid})

        # Apply filters
        for key, value in filters.items():
            s = s.filter("wildcard", **{key: value})

        # Apply excludes
        for key, value in compute_map.get("exclude", {}).items():
            s = s.exclude("match", **{key: value})
        if buckets:
            logger.debug("Building buckets")
            a = A("terms", field=buckets[0], size=10000)
            x = s.aggs.bucket(buckets[0].split(".keyword")[0], a)
            for bucket in buckets[1:]:
                a = A("terms", field=bucket, size=10000)
                # Name the bucket after the field, trimming the '.keyword' suffix
                x = x.bucket(bucket.split(".keyword")[0], a)
            logger.debug("Finished adding buckets to query")
        else:
            a = a = A("terms")
        logger.debug("Adding aggregations to query")
        for key, agg_list in aggregations.items():
            for aggs in agg_list:
                if isinstance(aggs, str):
                    _temp_agg_str = "{}({})".format(aggs, key)
                    # Create aggregation based on the key
                    a.metric(_temp_agg_str, aggs, field=key)
                    self._aggs_list.append(_temp_agg_str)
                # If there's a dictionary of aggregations. i.e different percentiles
                # we have to iterate through keys and values
                elif isinstance(aggs, dict):
                    for dict_key, dict_value in aggs.items():
                        _temp_agg_str = "{}({})".format(dict_key, key)
                        # Add nested dict as aggregation
                        a.metric(_temp_agg_str,
                                 dict_key,
                                 field=key,
                                 **dict_value)
                        self._aggs_list.append(_temp_agg_str)
                else:
                    logger.warning("Ignoring aggregation {}".format(aggs))
        logger.debug("Finished adding aggregations to query")
        logger.debug("Built the following query: {}".format(
            json.dumps(s.to_dict(), indent=4)))
        response = s.execute()
        logger.debug("Succesfully executed the search query")

        if len(response.hits.hits) == 0:
            return {}
        _output_dict = self.gen_result_dict(response, buckets, self._aggs_list,
                                            uuid, alias)
        if filters:
            output_dict = _output_dict
            filter_list = []
            for key, value in filters.items():
                filter_list.append(key)
                filter_list.append(value)
            # Include all k,v from filters as keys in the output dictionary
            for key in reversed(filter_list):
                output_dict = {key.split(".keyword")[0]: output_dict}
        else:
            output_dict = _output_dict
        logger.debug("output compute dictionary with summaries is: {}".format(
            json.dumps(output_dict, indent=4)))
        return output_dict
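# A hypothetical compute_map illustrating the structure emit_compute_dict expects;
# every field name and value below is assumed, not taken from the source:
#
#     compute_map = {
#         "filter": {"test_type.keyword": "smallfile*"},   # wildcard filters
#         "exclude": {"tag": "debug"},                     # match clauses to exclude
#         "buckets": ["host.keyword"],                     # nested terms buckets
#         "aggregations": {
#             "latency": ["avg", "max", {"percentiles": {"percents": [95, 99]}}],
#         },
#     }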
Example #23
def browse(request):

    s = Search(using=es)
    description = None

    s.query = FunctionScore(
        query=s.query, functions=[SF('random_score', seed=int(time.time()))])

    if 'source' in request.GET:
        source = request.GET['source']
        s = s.filter('terms', **{'analysis.source': [source]})
        description = SOURCE_MAP.get(source, {}).get('name') or source
    elif 'titleii' in request.GET:
        title_ii = request.GET['titleii']
        if title_ii == 'pro':
            s = s.filter('terms', **{'analysis.titleii': [True]})
            description = "Pro Title II"
        elif title_ii == 'anti':
            description = 'Anti Title II'
            s = s.filter('terms', **{'analysis.titleii': [False]})
        elif title_ii == 'unknown':
            description = 'Uncategorized'
            s = s.exclude('exists', field='analysis.titleii')

    s.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    s.aggs.bucket('site', A('terms', field='analysis.onsite'))

    s.aggs.bucket(
        'email_confirmation',
        A('filters',
          filters={
              'true': {
                  'term': {
                      'emailConfirmation': 'true'
                  }
              },
              'false': {
                  'term': {
                      'emailConfirmation': 'false'
                  }
              }
          }))

    s.aggs.bucket('unique_emails', A('cardinality', field='contact_email.raw'))

    # s.aggs.bucket('email_confirmation', A('filters', field='analysis.fulladdress'))

    stats = OrderedDict({
        'Comment Form': {
            'On-site': 0,
            'Off-site': 0
        },
        'Emails': {
            'Unique': 0,
        },
        'Address': {
            'Full Address': 0,
            'Partial Address': 0,
        },
        'Email Confirmation': {
            'True': 0,
            'False': 0,
            'Missing': 0
        }
    })

    response = s[:50].execute()
    total = s.count()
    for bucket in response.aggregations.address.buckets:
        if bucket.key == 1:
            stats['Address']['Full Address'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Address']['Partial Address'] = bucket.doc_count

    for bucket in response.aggregations.site.buckets:
        if bucket.key == 1:
            stats['Comment Form']['On-site'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Comment Form']['Off-site'] = bucket.doc_count

    stats['Emails']['Unique'] = response.aggregations.unique_emails.value

    for bucket, value in response.aggs.email_confirmation.to_dict(
    )['buckets'].items():
        if bucket == 'true':
            stats['Email Confirmation']['True'] = value['doc_count']
        elif bucket == 'false':
            stats['Email Confirmation']['False'] = value['doc_count']
    stats['Email Confirmation']['Missing'] = (
        total - stats['Email Confirmation']['True'] -
        stats['Email Confirmation']['False'])

    context = {
        'description': description,
        'stats': stats,
        'results': response,
        'comment_count': total
    }

    return render(request, 'listing.html', context)
Example #24
def search(search_params,
           index,
           page_size,
           ip,
           request,
           filter_dead,
           page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages and results.
    """
    s = Search(index=index)
    if 'li' in search_params.data:
        s = _filter_licenses(s, search_params.data['li'])
    elif 'lt' in search_params.data:
        s = _filter_licenses(s, search_params.data['lt'])

    # Apply term filters.
    filters = ['provider', 'extension', 'categories', 'aspect_ratio', 'size']
    for _filter in filters:
        s = _apply_filter(_filter, search_params, s)

    # Hide data sources from the catalog dynamically.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query('query_string',
                    query=query,
                    fields=search_fields,
                    type='most_fields')
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query('query_string', query=creator, default_field='creator')
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query('query_string', query=title, default_field='title')
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query('query_string', default_field='tags.name', query=tags)

    # Boost by popularity metrics
    if POPULARITY_BOOST:
        queries = []
        factors = ['comments', 'views', 'likes']
        boost_factor = 100 / len(factors)
        for factor in factors:
            rank_feature_query = Q('rank_feature',
                                   field=factor,
                                   boost=boost_factor)
            queries.append(rank_feature_query)
        s = Search().query(
            Q('bool', must=s.query, should=queries, minimum_should_match=1))

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip))
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    try:
        search_response = s.execute()
    except RequestError as e:
        raise ValueError(e)
    results = _post_process_results(s, start, end, page_size, search_response,
                                    request, filter_dead)

    result_count, page_count = _get_result_and_page_count(
        search_response, results, page_size)

    return results, page_count, result_count
Example #25
def search(search_params, index, page_size, ip, page=1) -> Response:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `~cccatalog.api.search_serializers.SearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param page: The results page number.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :return: An Elasticsearch Response object.
    """
    s = Search(index=index)

    # Paginate search query.
    start_slice = page_size * (page - 1)
    end_slice = page_size * page
    if start_slice + end_slice > ELASTICSEARCH_MAX_RESULT_WINDOW:
        raise ValueError("Deep pagination is not allowed.")
    s = s[start_slice:end_slice]

    # If any filters are specified, add them to the query.
    if 'li' in search_params.data or 'lt' in search_params.data:
        license_field = 'li' if 'li' in search_params.data else 'lt'
        license_filters = []
        for _license in search_params.data[license_field].split(','):
            license_filters.append(Q('term', license__keyword=_license))
        s = s.filter('bool', should=license_filters, minimum_should_match=1)
    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    if 'q' in search_params.data:
        s = s.query('constant_score',
                    filter=Q(
                        'query_string',
                        query=search_params.data['q'],
                        fields=['tags.name', 'title'],
                    ))
    else:
        if 'creator' in search_params.data:
            creator = search_params.data['creator']
            s = s.query('constant_score',
                        filter=Q('query_string',
                                 query=creator,
                                 default_field='creator'))
        if 'title' in search_params.data:
            title = search_params.data['title']
            s = s.query('constant_score',
                        filter=Q('query_string',
                                 query=title,
                                 default_field='title'))
        if 'tags' in search_params.data:
            tags = search_params.data['tags']
            s = s.query('constant_score',
                        filter=Q('query_string',
                                 default_field='tags.name',
                                 query=tags))

    s = s.extra(track_scores=True)
    s = s.params(preference=str(ip))
    search_response = s.execute()
    return search_response
q = Q('bool',
        should=[Q(...), Q(...)],
        minimum_should_match=1
)
s = Search().query(q)


##################################################
# FILTERS

s = Search()
s = s.filter('terms', tags=['search', 'python'])
# Same as
s = s.query('bool', filter=[Q('terms', tags=['search', 'python'])])

# We can use exclude too
s = s.exclude('terms', tags=['search', 'python'])



#####################################################
# AGGREGATIONS

from elasticsearch_dsl.aggs import A

a = A('terms', field='category')


a.metric('clicks_per_category', 'sum', field='clicks')\
        .bucket('tags_per_category', 'terms', field='tags')

# This is how you add aggregations to the search object
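# The attach step itself appears to be cut off above; a sketch of the usual
# pattern, with the bucket name assumed:
s = Search()
s.aggs.bucket('categories', a)   # aggregations are attached to s.aggs in place
print(s.to_dict())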
Example #27
class AbstractSearch:
    def __init__(self, es_client=None, index=None):
        self.es_client = es_client
        self.index = index
        self.search = None
        self.multi_search = None

    @staticmethod
    def es2api(response, start, limit):
        return {
            'total': response['hits']['total']['value'],
            'results': response['hits']['hits'],
            'aggregations': {key: value['buckets'] for key, value \
                in response.get('aggregations', {}).items()},
            'start': start,
            'limit': limit
        }


    async def get(self, query='*', filters=None, aggregations=None, exclude=None, 
        to_date=None, from_date=None, start=0, limit=30):
        self.search = Search(using=self.es_client, index=self.index)
        self.__filters(filters, to_date, from_date)
        # print(self.search.to_dict())
        self.__exclude(exclude)
        self.__query(query)
        self.__aggregations(aggregations)
        try:
            return AbstractSearch.es2api(
                self.search.sort('-publish_date')[start:start+limit].execute().to_dict(),
                start, limit
            )
        except Exception as err:
            print(err)

    async def get_hist(self, query='*', filters=None, field=None, interval=None, exclude=None,
        to_date=None, from_date=None, start=0, limit=30):
        self.__filters(filters, to_date, from_date)
        self.__exclude(exclude)
        self.__query(query)
        self.__histogram(field, interval)
        return AbstractSearch.es2api(
            self.search[start:limit].execute().to_dict(),
            start, limit
        )

    def __histogram(self, field, interval):
        try:
            ah = A('terms', field='tags.keyword', size=5)
            aggregation = A('date_histogram', field='publish_date', interval=interval)
            self.search.aggs.bucket('histogram_data', aggregation).bucket('tags', ah)
        except Exception as err:
            print(err)


    def __aggregations(self, aggregations):
        for key, value in (aggregations or {}).items():
            try:
                aggregation = A(value['type'], field=f'{key}.keyword')
                self.search.aggs.bucket(key, aggregation)
            except Exception as err:
                print(err)
                pass
    
    def __filters(self, filters, to_date, from_date):
        self.search = self.search.filter('range', publish_date={'gte': from_date, 'lte': to_date})
        for key, value in (filters or {}).items():
            self.search = self.search.filter(value['type'], **{f'{key}.keyword': value[value['type']]})

    def __exclude(self, exclude):
        for value in exclude or []:
            self.search = self.search.exclude(value['type'], **{f'{value["field"]}.keyword': value[value['type']]})

    def __query(self, query):
        self.search = self.search.query('simple_query_string', query=query)
    
    async def __aenter__(self):
        self.search = Search(using=self.es_client, index=self.index)
        return self

    async def __aexit__(self, *args, **kwargs):
        return self
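# A hypothetical call sketch showing the argument shapes the private helpers above
# expect; the client, index, and field names are assumed, and the coroutine must
# be awaited:
#
#     searcher = AbstractSearch(es_client=Elasticsearch(), index='news')
#     payload = await searcher.get(
#         query='climate',
#         filters={'source': {'type': 'terms', 'terms': ['bbc', 'reuters']}},
#         exclude=[{'type': 'term', 'field': 'language', 'term': 'de'}],
#         aggregations={'source': {'type': 'terms'}},
#         from_date='2021-01-01', to_date='2021-12-31',
#     )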
Example #28
def federal_fec_compute_load_graph_candidates(message, context):

    # configure ElasticSearch search
    s = Search(using=es, index="federal_fec_candidates")
    q = s.exclude("exists", field="context.last_graphed")

    # get start time
    start = time.time()

    # loop for 520s
    while time.time() - start < 520:

        docs = q[0:1000].execute()
        if len(docs) == 0:
            logger.info(' - '.join(['NO CANDIDATES FOUND FOR LOADING']))
            break

        # batches for neo4j and elasticsearch
        candidates = []
        parties = []
        races = []
        linkages = []
        actions = []

        for doc in docs:

            # prepare docs for loading
            if "row" in doc:
                candidates.append({
                    "cand_id": doc.row["cand_id"],
                    "cand_name": doc.processed["cand_name"].strip()
                        if doc.processed["cand_name"] is not None else "",
                    "cand_pty_affiliation": doc.row["cand_pty_affiliation"],
                    "cand_election_yr": doc.row["cand_election_yr"],
                    "cand_office_st": doc.row["cand_office_st"],
                    "cand_office": doc.row["cand_office"],
                    "cand_office_district": doc.row["cand_office_district"],
                    "cand_ici": doc.row["cand_ici"]
                })
                if doc.row["cand_pty_affiliation"] is not None:
                    parties.append({
                        "cand_id": doc.row["cand_id"],
                        "cand_pty_affiliation": doc.row["cand_pty_affiliation"]
                    })
                races.append({
                    "cand_id": doc.row["cand_id"],
                    "cand_election_yr": doc.row["cand_election_yr"] or "",
                    "cand_office_st": doc.row["cand_office_st"] or "",
                    "cand_office": doc.row["cand_office"] or "",
                    "cand_office_district": doc.row["cand_office_district"] or ""
                })
            if "linkages" in doc:
                if "committees" in doc.linkages:
                    for linkage in doc.linkages.committees:
                        linkages.append({
                            "cmte_id":
                            linkage["cmte_id"],
                            "cand_id":
                            doc.meta.id,
                            "cand_election_yr":
                            linkage["cand_election_yr"],
                            "linkage_id":
                            linkage["linkage_id"]
                        })

            # prepare to mark as in graph in elasticsearch
            actions.append({
                "_op_type": "update",
                "_index": "federal_fec_candidates",
                "_id": doc.meta.id,
                "doc": {
                    "context": {
                        "last_graphed": datetime.datetime.now(datetime.timezone.utc)
                    }
                }
            })

        # load into neo4j
        with driver.session() as neo4j:
            neo4j.write_transaction(cypher.merge_node_candidate,
                                    batch=candidates)
            neo4j.write_transaction(cypher.merge_rel_candidate_party,
                                    batch=parties)
            neo4j.write_transaction(cypher.merge_rel_candidate_race,
                                    batch=races)
            neo4j.write_transaction(cypher.merge_rel_candidate_committee,
                                    batch=linkages)

        # mark as graphed in elasticsearch
        helpers.bulk(es, actions)
        logger.info(' - '.join(['CANDIDATES LOADED', str(len(actions))]))

    return True
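
# The cypher helper module is not included in this example. A hedged sketch of
# how a transaction function such as cypher.merge_node_candidate might consume
# the batch built above (the Cypher statement, node label, and property names
# are assumptions, not the project's actual queries):
def merge_node_candidate(tx, batch):
    tx.run(
        """
        UNWIND $batch AS row
        MERGE (c:Candidate {cand_id: row.cand_id})
        SET c += row
        """,
        batch=batch,
    )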
Example #29
0
from datetime import date

from elasticsearch_dsl import Search

"""
Multiple queries are combined together using the AND operator.
"""
s = Search()
s = s.query("match", description="fix")
s = s.query("match", author="Honza")

"""
Filter shortcut to use {bool: {filter: []}}
"""
s = s.filter("range", committed_date={"lt": date(2016, 1, 1)})
s.to_dict()

"""
exclude() is a wrapper around must_not; use __ instead of dots in field names for convenience.
"""
s = s.exclude("term", committer__name__keyword="Honza Král")

"""
Search is executed when iterated on or when .execute() is called.
"""
for hit in s:
    """
    The Hit class offers direct attribute access to fields; any other properties
    on the returned hit (_id, _seq_no, ...) are available via .meta.
    """
    print(f"{hit.meta.id[:6]} ({hit.author.name}): {hit.description[:50]}")

"""
Aggregations are implemented in place to allow for chaining
"""
s = Search(index="git")
Example #30
0
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP, used to anonymously but uniquely identify
    users, solely to keep query results consistent across Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed from the results.
    :param page: The results page number.
    :return: Tuple of the list of Hits from Elasticsearch, the page count, and
    the result count.
    """
    s = Search(index=index)
    # Add requested filters.
    if 'li' in search_params.data:
        s = _filter_licenses(s, search_params.data['li'])
    elif 'lt' in search_params.data:
        s = _filter_licenses(s, search_params.data['lt'])

    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)
    if 'extension' in search_params.data:
        extension = search_params.data['extension']
        extension_filter = Q('term', extension=extension)
        s = s.filter('bool', should=extension_filter, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(
            key=filter_cache_key,
            timeout=CACHE_TIMEOUT,
            value=filtered_providers
        )
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query(
            'query_string',
            query=query,
            fields=search_fields,
            type='most_fields'
        )
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query(
                'query_string', query=creator, default_field='creator'
            )
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query(
                'query_string', query=title, default_field='title'
            )
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query(
                'query_string',
                default_field='tags.name',
                query=tags
            )

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip))
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    search_response = s.execute()
    results = _post_process_results(
        s,
        start,
        end,
        page_size,
        search_response,
        request,
        filter_dead
    )

    result_count, page_count = _get_result_and_page_count(
        search_response,
        results,
        page_size
    )

    return results, page_count, result_count
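
# The helper functions referenced above (_quote_escape, _get_query_slice,
# _post_process_results, _get_result_and_page_count) are not part of this
# snippet. As a hedged illustration only (not the project's actual code),
# _quote_escape presumably escapes characters that are special to the
# query_string syntax so raw user input cannot break the query:
import re

_QUERY_SPECIAL_CHARS = re.compile(r'([+\-=&|><!(){}\[\]^"~*?:\\/])')


def _quote_escape(query_string):
    # Prefix each special character with a backslash.
    return _QUERY_SPECIAL_CHARS.sub(r'\\\1', query_string)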